From 959c697c8248f8fb129e4285f562280ca17f6daf Mon Sep 17 00:00:00 2001
From: Basil Hess <bhe@zurich.ibm.com>
Date: Tue, 21 Jan 2025 13:48:19 +0100
Subject: [PATCH] Pull mlkem-native update (move debug/debug.{c,h} to debug.{c,h})

Signed-off-by: Basil Hess <bhe@zurich.ibm.com>
---
 src/kem/ml_kem/CMakeLists.txt                 |  18 +-
 .../aarch64/src/arith_native_aarch64.h        |   4 +-
 .../aarch64/src/clean_impl.h                  |   1 -
 .../aarch64/src/intt_clean.S                  |  85 ++--
 .../aarch64/src/intt_opt.S                    |  85 ++--
 .../aarch64/src/ntt_clean.S                   |  63 ++-
 .../aarch64/src/ntt_opt.S                     |  63 ++-
 .../aarch64/src/opt_impl.h                    |   2 -
 .../aarch64/src/poly_clean.S                  |  78 ++--
 .../aarch64/src/poly_opt.S                    |  78 ++--
 .../aarch64/src/polyvec_clean.S               |  97 +++--
 .../aarch64/src/polyvec_opt.S                 |  97 +++--
 .../aarch64/src/rej_uniform_asm_clean.S       |  79 +++-
 .../arith_backend.h                           |   2 +
 .../mlkem-native_ml-kem-1024_aarch64/cbd.c    | 106 ++---
 .../mlkem-native_ml-kem-1024_aarch64/cbd.h    |  35 +-
 .../mlkem-native_ml-kem-1024_aarch64/cbmc.h   |  12 +-
 .../mlkem-native_ml-kem-1024_aarch64/common.h |  17 +-
 .../mlkem-native_ml-kem-1024_aarch64/config.h |  92 ++--
 .../mlkem-native_ml-kem-1024_aarch64/debug.c  |  60 +++
 .../mlkem-native_ml-kem-1024_aarch64/debug.h  | 130 ++++++
 .../debug/debug.c                             |  56 ---
 .../debug/debug.h                             | 224 ----------
 .../mlkem-native_ml-kem-1024_aarch64/indcpa.c | 170 +-------
 .../mlkem-native_ml-kem-1024_aarch64/indcpa.h |   8 +-
 .../mlkem-native_ml-kem-1024_aarch64/kem.c    |   4 +-
 .../mlkem-native_ml-kem-1024_aarch64/kem.h    |   9 +
 .../mlkem_native.h                            |  14 +-
 .../mlkem-native_ml-kem-1024_aarch64/ntt.c    |  74 ++--
 .../mlkem-native_ml-kem-1024_aarch64/ntt.h    |   7 +-
 .../mlkem-native_ml-kem-1024_aarch64/params.h |  23 +-
 .../mlkem-native_ml-kem-1024_aarch64/poly.c   | 404 ++++++++----------
 .../mlkem-native_ml-kem-1024_aarch64/poly.h   | 301 +++++--------
 .../polyvec.c                                 | 194 ++++++++-
 .../polyvec.h                                 | 293 ++++++++++++-
 .../mlkem-native_ml-kem-1024_aarch64/reduce.h |  19 +-
 .../rej_uniform.c                             | 209 +++++++--
 .../rej_uniform.h                             |  85 ++--
 .../symmetric.h                               |   1 +
 .../mlkem-native_ml-kem-1024_aarch64/verify.c |  11 +-
 .../mlkem-native_ml-kem-1024_aarch64/verify.h |   4 +-
 .../mlkem-native_ml-kem-1024_aarch64/zetas.c  |   9 +
 .../arith_backend.h                           |   2 +
 .../ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c | 106 ++---
 .../ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h |  35 +-
 .../mlkem-native_ml-kem-1024_ref/cbmc.h       |  12 +-
 .../mlkem-native_ml-kem-1024_ref/common.h     |  17 +-
 .../mlkem-native_ml-kem-1024_ref/config.h     |  92 ++--
 .../mlkem-native_ml-kem-1024_ref/debug.c      |  60 +++
 .../mlkem-native_ml-kem-1024_ref/debug.h      | 130 ++++++
 .../debug/debug.c                             |  56 ---
 .../debug/debug.h                             | 224 ----------
 .../mlkem-native_ml-kem-1024_ref/indcpa.c     | 170 +-------
 .../mlkem-native_ml-kem-1024_ref/indcpa.h     |   8 +-
 .../ml_kem/mlkem-native_ml-kem-1024_ref/kem.c |   4 +-
 .../ml_kem/mlkem-native_ml-kem-1024_ref/kem.h |   9 +
 .../mlkem_native.h                            |  14 +-
 .../ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c |  74 ++--
 .../ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h |   7 +-
 .../mlkem-native_ml-kem-1024_ref/params.h     |  23 +-
 .../mlkem-native_ml-kem-1024_ref/poly.c       | 404 ++++++++----------
 .../mlkem-native_ml-kem-1024_ref/poly.h       | 301 +++++--------
 .../mlkem-native_ml-kem-1024_ref/polyvec.c    | 194 ++++++++-
 .../mlkem-native_ml-kem-1024_ref/polyvec.h    | 293 ++++++++++++-
 .../mlkem-native_ml-kem-1024_ref/reduce.h     |  19 +-
 .../rej_uniform.c                             | 209 +++++++--
 .../rej_uniform.h                             |  85 ++--
 .../mlkem-native_ml-kem-1024_ref/symmetric.h  |   1 +
 .../mlkem-native_ml-kem-1024_ref/verify.c     |  11 +-
 .../mlkem-native_ml-kem-1024_ref/verify.h     |   4 +-
 .../mlkem-native_ml-kem-1024_ref/zetas.c      |   9 +
 .../arith_backend.h                           |   2 +
 .../mlkem-native_ml-kem-1024_x86_64/cbd.c     | 106 ++---
 .../mlkem-native_ml-kem-1024_x86_64/cbd.h     |  35 +-
 .../mlkem-native_ml-kem-1024_x86_64/cbmc.h    |  12 +-
 .../mlkem-native_ml-kem-1024_x86_64/common.h  |  17 +-
 .../mlkem-native_ml-kem-1024_x86_64/config.h  |  92 ++--
 .../mlkem-native_ml-kem-1024_x86_64/debug.c   |  60 +++
 .../mlkem-native_ml-kem-1024_x86_64/debug.h   | 130 ++++++
 .../debug/debug.c                             |  56 ---
 .../debug/debug.h                             | 224 ----------
 .../mlkem-native_ml-kem-1024_x86_64/indcpa.c  | 170 +-------
 .../mlkem-native_ml-kem-1024_x86_64/indcpa.h  |   8 +-
 .../mlkem-native_ml-kem-1024_x86_64/kem.c     |   4 +-
 .../mlkem-native_ml-kem-1024_x86_64/kem.h     |   9 +
 .../mlkem_native.h                            |  14 +-
 .../mlkem-native_ml-kem-1024_x86_64/ntt.c     |  74 ++--
 .../mlkem-native_ml-kem-1024_x86_64/ntt.h     |   7 +-
 .../mlkem-native_ml-kem-1024_x86_64/params.h  |  23 +-
 .../mlkem-native_ml-kem-1024_x86_64/poly.c    | 404 ++++++++----------
 .../mlkem-native_ml-kem-1024_x86_64/poly.h    | 301 +++++--------
 .../mlkem-native_ml-kem-1024_x86_64/polyvec.c | 194 ++++++++-
 .../mlkem-native_ml-kem-1024_x86_64/polyvec.h | 293 ++++++++++++-
 .../mlkem-native_ml-kem-1024_x86_64/reduce.h  |  19 +-
 .../rej_uniform.c                             | 209 +++++++--
 .../rej_uniform.h                             |  85 ++--
 .../symmetric.h                               |   1 +
 .../mlkem-native_ml-kem-1024_x86_64/verify.c  |  11 +-
 .../mlkem-native_ml-kem-1024_x86_64/verify.h  |   4 +-
 .../x86_64/src/arith_native_x86_64.h          |   2 +-
 .../x86_64/src/default_impl.h                 |   3 -
 .../mlkem-native_ml-kem-1024_x86_64/zetas.c   |   9 +
 .../aarch64/src/arith_native_aarch64.h        |   4 +-
 .../aarch64/src/clean_impl.h                  |   1 -
 .../aarch64/src/intt_clean.S                  |  85 ++--
 .../aarch64/src/intt_opt.S                    |  85 ++--
 .../aarch64/src/ntt_clean.S                   |  63 ++-
 .../aarch64/src/ntt_opt.S                     |  63 ++-
 .../aarch64/src/opt_impl.h                    |   2 -
 .../aarch64/src/poly_clean.S                  |  78 ++--
 .../aarch64/src/poly_opt.S                    |  78 ++--
 .../aarch64/src/polyvec_clean.S               |  97 +++--
 .../aarch64/src/polyvec_opt.S                 |  97 +++--
 .../aarch64/src/rej_uniform_asm_clean.S       |  79 +++-
 .../arith_backend.h                           |   2 +
 .../mlkem-native_ml-kem-512_aarch64/cbd.c     | 106 ++---
 .../mlkem-native_ml-kem-512_aarch64/cbd.h     |  35 +-
 .../mlkem-native_ml-kem-512_aarch64/cbmc.h    |  12 +-
 .../mlkem-native_ml-kem-512_aarch64/common.h  |  17 +-
 .../mlkem-native_ml-kem-512_aarch64/config.h  |  92 ++--
 .../mlkem-native_ml-kem-512_aarch64/debug.c   |  60 +++
 .../mlkem-native_ml-kem-512_aarch64/debug.h   | 130 ++++++
 .../debug/debug.c                             |  56 ---
 .../debug/debug.h                             | 224 ----------
 .../mlkem-native_ml-kem-512_aarch64/indcpa.c  | 170 +-------
 .../mlkem-native_ml-kem-512_aarch64/indcpa.h  |   8 +-
 .../mlkem-native_ml-kem-512_aarch64/kem.c     |   4 +-
 .../mlkem-native_ml-kem-512_aarch64/kem.h     |   9 +
 .../mlkem_native.h                            |  14 +-
 .../mlkem-native_ml-kem-512_aarch64/ntt.c     |  74 ++--
 .../mlkem-native_ml-kem-512_aarch64/ntt.h     |   7 +-
 .../mlkem-native_ml-kem-512_aarch64/params.h  |  23 +-
 .../mlkem-native_ml-kem-512_aarch64/poly.c    | 404 ++++++++----------
 .../mlkem-native_ml-kem-512_aarch64/poly.h    | 301 +++++--------
 .../mlkem-native_ml-kem-512_aarch64/polyvec.c | 194 ++++++++-
 .../mlkem-native_ml-kem-512_aarch64/polyvec.h | 293 ++++++++++++-
 .../mlkem-native_ml-kem-512_aarch64/reduce.h  |  19 +-
 .../rej_uniform.c                             | 209 +++++++--
 .../rej_uniform.h                             |  85 ++--
 .../symmetric.h                               |   1 +
 .../mlkem-native_ml-kem-512_aarch64/verify.c  |  11 +-
 .../mlkem-native_ml-kem-512_aarch64/verify.h  |   4 +-
 .../mlkem-native_ml-kem-512_aarch64/zetas.c   |   9 +
 .../arith_backend.h                           |   2 +
 .../ml_kem/mlkem-native_ml-kem-512_ref/cbd.c  | 106 ++---
 .../ml_kem/mlkem-native_ml-kem-512_ref/cbd.h  |  35 +-
 .../ml_kem/mlkem-native_ml-kem-512_ref/cbmc.h |  12 +-
 .../mlkem-native_ml-kem-512_ref/common.h      |  17 +-
 .../mlkem-native_ml-kem-512_ref/config.h      |  92 ++--
 .../mlkem-native_ml-kem-512_ref/debug.c       |  60 +++
 .../mlkem-native_ml-kem-512_ref/debug.h       | 130 ++++++
 .../mlkem-native_ml-kem-512_ref/debug/debug.c |  56 ---
 .../mlkem-native_ml-kem-512_ref/debug/debug.h | 224 ----------
 .../mlkem-native_ml-kem-512_ref/indcpa.c      | 170 +-------
 .../mlkem-native_ml-kem-512_ref/indcpa.h      |   8 +-
 .../ml_kem/mlkem-native_ml-kem-512_ref/kem.c  |   4 +-
 .../ml_kem/mlkem-native_ml-kem-512_ref/kem.h  |   9 +
 .../mlkem_native.h                            |  14 +-
 .../ml_kem/mlkem-native_ml-kem-512_ref/ntt.c  |  74 ++--
 .../ml_kem/mlkem-native_ml-kem-512_ref/ntt.h  |   7 +-
 .../mlkem-native_ml-kem-512_ref/params.h      |  23 +-
 .../ml_kem/mlkem-native_ml-kem-512_ref/poly.c | 404 ++++++++----------
 .../ml_kem/mlkem-native_ml-kem-512_ref/poly.h | 301 +++++--------
 .../mlkem-native_ml-kem-512_ref/polyvec.c     | 194 ++++++++-
 .../mlkem-native_ml-kem-512_ref/polyvec.h     | 293 ++++++++++++-
 .../mlkem-native_ml-kem-512_ref/reduce.h      |  19 +-
 .../mlkem-native_ml-kem-512_ref/rej_uniform.c | 209 +++++++--
 .../mlkem-native_ml-kem-512_ref/rej_uniform.h |  85 ++--
 .../mlkem-native_ml-kem-512_ref/symmetric.h   |   1 +
 .../mlkem-native_ml-kem-512_ref/verify.c      |  11 +-
 .../mlkem-native_ml-kem-512_ref/verify.h      |   4 +-
 .../mlkem-native_ml-kem-512_ref/zetas.c       |   9 +
 .../arith_backend.h                           |   2 +
 .../mlkem-native_ml-kem-512_x86_64/cbd.c      | 106 ++---
 .../mlkem-native_ml-kem-512_x86_64/cbd.h      |  35 +-
 .../mlkem-native_ml-kem-512_x86_64/cbmc.h     |  12 +-
 .../mlkem-native_ml-kem-512_x86_64/common.h   |  17 +-
 .../mlkem-native_ml-kem-512_x86_64/config.h   |  92 ++--
 .../mlkem-native_ml-kem-512_x86_64/debug.c    |  60 +++
 .../mlkem-native_ml-kem-512_x86_64/debug.h    | 130 ++++++
 .../debug/debug.c                             |  56 ---
 .../debug/debug.h                             | 224 ----------
 .../mlkem-native_ml-kem-512_x86_64/indcpa.c   | 170 +-------
 .../mlkem-native_ml-kem-512_x86_64/indcpa.h   |   8 +-
 .../mlkem-native_ml-kem-512_x86_64/kem.c      |   4 +-
 .../mlkem-native_ml-kem-512_x86_64/kem.h      |   9 +
 .../mlkem_native.h                            |  14 +-
 .../mlkem-native_ml-kem-512_x86_64/ntt.c      |  74 ++--
 .../mlkem-native_ml-kem-512_x86_64/ntt.h      |   7 +-
 .../mlkem-native_ml-kem-512_x86_64/params.h   |  23 +-
 .../mlkem-native_ml-kem-512_x86_64/poly.c     | 404 ++++++++----------
 .../mlkem-native_ml-kem-512_x86_64/poly.h     | 301 +++++--------
 .../mlkem-native_ml-kem-512_x86_64/polyvec.c  | 194 ++++++++-
 .../mlkem-native_ml-kem-512_x86_64/polyvec.h  | 293 ++++++++++++-
 .../mlkem-native_ml-kem-512_x86_64/reduce.h   |  19 +-
 .../rej_uniform.c                             | 209 +++++++--
 .../rej_uniform.h                             |  85 ++--
 .../symmetric.h                               |   1 +
 .../mlkem-native_ml-kem-512_x86_64/verify.c   |  11 +-
 .../mlkem-native_ml-kem-512_x86_64/verify.h   |   4 +-
 .../x86_64/src/arith_native_x86_64.h          |   2 +-
 .../x86_64/src/default_impl.h                 |   3 -
 .../mlkem-native_ml-kem-512_x86_64/zetas.c    |   9 +
 .../aarch64/src/arith_native_aarch64.h        |   4 +-
 .../aarch64/src/clean_impl.h                  |   1 -
 .../aarch64/src/intt_clean.S                  |  85 ++--
 .../aarch64/src/intt_opt.S                    |  85 ++--
 .../aarch64/src/ntt_clean.S                   |  63 ++-
 .../aarch64/src/ntt_opt.S                     |  63 ++-
 .../aarch64/src/opt_impl.h                    |   2 -
 .../aarch64/src/poly_clean.S                  |  78 ++--
 .../aarch64/src/poly_opt.S                    |  78 ++--
 .../aarch64/src/polyvec_clean.S               |  97 +++--
 .../aarch64/src/polyvec_opt.S                 |  97 +++--
 .../aarch64/src/rej_uniform_asm_clean.S       |  79 +++-
 .../arith_backend.h                           |   2 +
 .../mlkem-native_ml-kem-768_aarch64/cbd.c     | 106 ++---
 .../mlkem-native_ml-kem-768_aarch64/cbd.h     |  35 +-
 .../mlkem-native_ml-kem-768_aarch64/cbmc.h    |  12 +-
 .../mlkem-native_ml-kem-768_aarch64/common.h  |  17 +-
 .../mlkem-native_ml-kem-768_aarch64/config.h  |  92 ++--
 .../mlkem-native_ml-kem-768_aarch64/debug.c   |  60 +++
 .../mlkem-native_ml-kem-768_aarch64/debug.h   | 130 ++++++
 .../debug/debug.c                             |  56 ---
 .../debug/debug.h                             | 224 ----------
 .../mlkem-native_ml-kem-768_aarch64/indcpa.c  | 170 +-------
 .../mlkem-native_ml-kem-768_aarch64/indcpa.h  |   8 +-
 .../mlkem-native_ml-kem-768_aarch64/kem.c     |   4 +-
 .../mlkem-native_ml-kem-768_aarch64/kem.h     |   9 +
 .../mlkem_native.h                            |  14 +-
 .../mlkem-native_ml-kem-768_aarch64/ntt.c     |  74 ++--
 .../mlkem-native_ml-kem-768_aarch64/ntt.h     |   7 +-
 .../mlkem-native_ml-kem-768_aarch64/params.h  |  23 +-
 .../mlkem-native_ml-kem-768_aarch64/poly.c    | 404 ++++++++----------
 .../mlkem-native_ml-kem-768_aarch64/poly.h    | 301 +++++--------
 .../mlkem-native_ml-kem-768_aarch64/polyvec.c | 194 ++++++++-
 .../mlkem-native_ml-kem-768_aarch64/polyvec.h | 293 ++++++++++++-
 .../mlkem-native_ml-kem-768_aarch64/reduce.h  |  19 +-
 .../rej_uniform.c                             | 209 +++++++--
 .../rej_uniform.h                             |  85 ++--
 .../symmetric.h                               |   1 +
 .../mlkem-native_ml-kem-768_aarch64/verify.c  |  11 +-
 .../mlkem-native_ml-kem-768_aarch64/verify.h  |   4 +-
 .../mlkem-native_ml-kem-768_aarch64/zetas.c   |   9 +
 .../arith_backend.h                           |   2 +
 .../ml_kem/mlkem-native_ml-kem-768_ref/cbd.c  | 106 ++---
 .../ml_kem/mlkem-native_ml-kem-768_ref/cbd.h  |  35 +-
 .../ml_kem/mlkem-native_ml-kem-768_ref/cbmc.h |  12 +-
 .../mlkem-native_ml-kem-768_ref/common.h      |  17 +-
 .../mlkem-native_ml-kem-768_ref/config.h      |  92 ++--
 .../mlkem-native_ml-kem-768_ref/debug.c       |  60 +++
 .../mlkem-native_ml-kem-768_ref/debug.h       | 130 ++++++
 .../mlkem-native_ml-kem-768_ref/debug/debug.c |  56 ---
 .../mlkem-native_ml-kem-768_ref/debug/debug.h | 224 ----------
 .../mlkem-native_ml-kem-768_ref/indcpa.c      | 170 +-------
 .../mlkem-native_ml-kem-768_ref/indcpa.h      |   8 +-
 .../ml_kem/mlkem-native_ml-kem-768_ref/kem.c  |   4 +-
 .../ml_kem/mlkem-native_ml-kem-768_ref/kem.h  |   9 +
 .../mlkem_native.h                            |  14 +-
 .../ml_kem/mlkem-native_ml-kem-768_ref/ntt.c  |  74 ++--
 .../ml_kem/mlkem-native_ml-kem-768_ref/ntt.h  |   7 +-
 .../mlkem-native_ml-kem-768_ref/params.h      |  23 +-
 .../ml_kem/mlkem-native_ml-kem-768_ref/poly.c | 404 ++++++++----------
 .../ml_kem/mlkem-native_ml-kem-768_ref/poly.h | 301 +++++--------
 .../mlkem-native_ml-kem-768_ref/polyvec.c     | 194 ++++++++-
 .../mlkem-native_ml-kem-768_ref/polyvec.h     | 293 ++++++++++++-
 .../mlkem-native_ml-kem-768_ref/reduce.h      |  19 +-
 .../mlkem-native_ml-kem-768_ref/rej_uniform.c | 209 +++++++--
 .../mlkem-native_ml-kem-768_ref/rej_uniform.h |  85 ++--
 .../mlkem-native_ml-kem-768_ref/symmetric.h   |   1 +
 .../mlkem-native_ml-kem-768_ref/verify.c      |  11 +-
 .../mlkem-native_ml-kem-768_ref/verify.h      |   4 +-
 .../mlkem-native_ml-kem-768_ref/zetas.c       |   9 +
 .../arith_backend.h                           |   2 +
 .../mlkem-native_ml-kem-768_x86_64/cbd.c      | 106 ++---
 .../mlkem-native_ml-kem-768_x86_64/cbd.h      |  35 +-
 .../mlkem-native_ml-kem-768_x86_64/cbmc.h     |  12 +-
 .../mlkem-native_ml-kem-768_x86_64/common.h   |  17 +-
 .../mlkem-native_ml-kem-768_x86_64/config.h   |  92 ++--
 .../mlkem-native_ml-kem-768_x86_64/debug.c    |  60 +++
 .../mlkem-native_ml-kem-768_x86_64/debug.h    | 130 ++++++
 .../debug/debug.c                             |  56 ---
 .../debug/debug.h                             | 224 ----------
 .../mlkem-native_ml-kem-768_x86_64/indcpa.c   | 170 +-------
 .../mlkem-native_ml-kem-768_x86_64/indcpa.h   |   8 +-
 .../mlkem-native_ml-kem-768_x86_64/kem.c      |   4 +-
 .../mlkem-native_ml-kem-768_x86_64/kem.h      |   9 +
 .../mlkem_native.h                            |  14 +-
 .../mlkem-native_ml-kem-768_x86_64/ntt.c      |  74 ++--
 .../mlkem-native_ml-kem-768_x86_64/ntt.h      |   7 +-
 .../mlkem-native_ml-kem-768_x86_64/params.h   |  23 +-
 .../mlkem-native_ml-kem-768_x86_64/poly.c     | 404 ++++++++----------
 .../mlkem-native_ml-kem-768_x86_64/poly.h     | 301 +++++--------
 .../mlkem-native_ml-kem-768_x86_64/polyvec.c  | 194 ++++++++-
 .../mlkem-native_ml-kem-768_x86_64/polyvec.h  | 293 ++++++++++++-
 .../mlkem-native_ml-kem-768_x86_64/reduce.h   |  19 +-
 .../rej_uniform.c                             | 209 +++++++--
 .../rej_uniform.h                             |  85 ++--
 .../symmetric.h                               |   1 +
 .../mlkem-native_ml-kem-768_x86_64/verify.c   |  11 +-
 .../mlkem-native_ml-kem-768_x86_64/verify.h   |   4 +-
 .../x86_64/src/arith_native_x86_64.h          |   2 +-
 .../x86_64/src/default_impl.h                 |   3 -
 .../mlkem-native_ml-kem-768_x86_64/zetas.c    |   9 +
 304 files changed, 14028 insertions(+), 11358 deletions(-)
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug.h
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug/debug.c
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug/debug.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug.h
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.c
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug.h
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug/debug.c
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug/debug.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug.h
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug/debug.c
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug/debug.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug.h
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.c
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug.h
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug/debug.c
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug/debug.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug.h
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug/debug.c
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug/debug.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug.h
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.c
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.h
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug.c
 create mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug.h
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug/debug.c
 delete mode 100644 src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug/debug.h

diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt
index edd305ce8..fc2655ddf 100644
--- a/src/kem/ml_kem/CMakeLists.txt
+++ b/src/kem/ml_kem/CMakeLists.txt
@@ -6,7 +6,7 @@
 set(_ML_KEM_OBJS "")
 
 if(OQS_ENABLE_KEM_ml_kem_512)
-    add_library(ml_kem_512_ref OBJECT kem_ml_kem_512.c mlkem-native_ml-kem-512_ref/cbd.c mlkem-native_ml-kem-512_ref/debug/debug.c mlkem-native_ml-kem-512_ref/indcpa.c mlkem-native_ml-kem-512_ref/kem.c mlkem-native_ml-kem-512_ref/ntt.c mlkem-native_ml-kem-512_ref/poly.c mlkem-native_ml-kem-512_ref/polyvec.c mlkem-native_ml-kem-512_ref/rej_uniform.c mlkem-native_ml-kem-512_ref/verify.c mlkem-native_ml-kem-512_ref/zetas.c)
+    add_library(ml_kem_512_ref OBJECT kem_ml_kem_512.c mlkem-native_ml-kem-512_ref/cbd.c mlkem-native_ml-kem-512_ref/debug.c mlkem-native_ml-kem-512_ref/indcpa.c mlkem-native_ml-kem-512_ref/kem.c mlkem-native_ml-kem-512_ref/ntt.c mlkem-native_ml-kem-512_ref/poly.c mlkem-native_ml-kem-512_ref/polyvec.c mlkem-native_ml-kem-512_ref/rej_uniform.c mlkem-native_ml-kem-512_ref/verify.c mlkem-native_ml-kem-512_ref/zetas.c)
     target_compile_options(ml_kem_512_ref PUBLIC -DMLKEM_K=2 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_C)
     target_include_directories(ml_kem_512_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_ref)
     target_include_directories(ml_kem_512_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
@@ -15,7 +15,7 @@ if(OQS_ENABLE_KEM_ml_kem_512)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_512_x86_64)
-    add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/cbd.c mlkem-native_ml-kem-512_x86_64/debug/debug.c mlkem-native_ml-kem-512_x86_64/indcpa.c mlkem-native_ml-kem-512_x86_64/kem.c mlkem-native_ml-kem-512_x86_64/ntt.c mlkem-native_ml-kem-512_x86_64/poly.c mlkem-native_ml-kem-512_x86_64/polyvec.c mlkem-native_ml-kem-512_x86_64/rej_uniform.c mlkem-native_ml-kem-512_x86_64/verify.c mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-512_x86_64/zetas.c)
+    add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/cbd.c mlkem-native_ml-kem-512_x86_64/debug.c mlkem-native_ml-kem-512_x86_64/indcpa.c mlkem-native_ml-kem-512_x86_64/kem.c mlkem-native_ml-kem-512_x86_64/ntt.c mlkem-native_ml-kem-512_x86_64/poly.c mlkem-native_ml-kem-512_x86_64/polyvec.c mlkem-native_ml-kem-512_x86_64/rej_uniform.c mlkem-native_ml-kem-512_x86_64/verify.c mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-512_x86_64/zetas.c)
     target_include_directories(ml_kem_512_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_x86_64)
     target_include_directories(ml_kem_512_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(ml_kem_512_x86_64 PRIVATE  -mavx2  -mbmi2  -mpopcnt )
@@ -24,7 +24,7 @@ if(OQS_ENABLE_KEM_ml_kem_512_x86_64)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_512_aarch64)
-    add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-512_aarch64/cbd.c mlkem-native_ml-kem-512_aarch64/debug/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/ntt.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/polyvec.c mlkem-native_ml-kem-512_aarch64/rej_uniform.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c)
+    add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-512_aarch64/cbd.c mlkem-native_ml-kem-512_aarch64/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/ntt.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/polyvec.c mlkem-native_ml-kem-512_aarch64/rej_uniform.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c)
     target_include_directories(ml_kem_512_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_aarch64)
     target_include_directories(ml_kem_512_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(ml_kem_512_aarch64 PUBLIC -DMLKEM_K=2 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT)
@@ -32,7 +32,7 @@ if(OQS_ENABLE_KEM_ml_kem_512_aarch64)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_768)
-    add_library(ml_kem_768_ref OBJECT kem_ml_kem_768.c mlkem-native_ml-kem-768_ref/cbd.c mlkem-native_ml-kem-768_ref/debug/debug.c mlkem-native_ml-kem-768_ref/indcpa.c mlkem-native_ml-kem-768_ref/kem.c mlkem-native_ml-kem-768_ref/ntt.c mlkem-native_ml-kem-768_ref/poly.c mlkem-native_ml-kem-768_ref/polyvec.c mlkem-native_ml-kem-768_ref/rej_uniform.c mlkem-native_ml-kem-768_ref/verify.c mlkem-native_ml-kem-768_ref/zetas.c)
+    add_library(ml_kem_768_ref OBJECT kem_ml_kem_768.c mlkem-native_ml-kem-768_ref/cbd.c mlkem-native_ml-kem-768_ref/debug.c mlkem-native_ml-kem-768_ref/indcpa.c mlkem-native_ml-kem-768_ref/kem.c mlkem-native_ml-kem-768_ref/ntt.c mlkem-native_ml-kem-768_ref/poly.c mlkem-native_ml-kem-768_ref/polyvec.c mlkem-native_ml-kem-768_ref/rej_uniform.c mlkem-native_ml-kem-768_ref/verify.c mlkem-native_ml-kem-768_ref/zetas.c)
     target_compile_options(ml_kem_768_ref PUBLIC -DMLKEM_K=3 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_C)
     target_include_directories(ml_kem_768_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_ref)
     target_include_directories(ml_kem_768_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
@@ -41,7 +41,7 @@ if(OQS_ENABLE_KEM_ml_kem_768)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_768_x86_64)
-    add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/cbd.c mlkem-native_ml-kem-768_x86_64/debug/debug.c mlkem-native_ml-kem-768_x86_64/indcpa.c mlkem-native_ml-kem-768_x86_64/kem.c mlkem-native_ml-kem-768_x86_64/ntt.c mlkem-native_ml-kem-768_x86_64/poly.c mlkem-native_ml-kem-768_x86_64/polyvec.c mlkem-native_ml-kem-768_x86_64/rej_uniform.c mlkem-native_ml-kem-768_x86_64/verify.c mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-768_x86_64/zetas.c)
+    add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/cbd.c mlkem-native_ml-kem-768_x86_64/debug.c mlkem-native_ml-kem-768_x86_64/indcpa.c mlkem-native_ml-kem-768_x86_64/kem.c mlkem-native_ml-kem-768_x86_64/ntt.c mlkem-native_ml-kem-768_x86_64/poly.c mlkem-native_ml-kem-768_x86_64/polyvec.c mlkem-native_ml-kem-768_x86_64/rej_uniform.c mlkem-native_ml-kem-768_x86_64/verify.c mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-768_x86_64/zetas.c)
     target_include_directories(ml_kem_768_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_x86_64)
     target_include_directories(ml_kem_768_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(ml_kem_768_x86_64 PRIVATE  -mavx2  -mbmi2  -mpopcnt )
@@ -50,7 +50,7 @@ if(OQS_ENABLE_KEM_ml_kem_768_x86_64)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_768_aarch64)
-    add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-768_aarch64/cbd.c mlkem-native_ml-kem-768_aarch64/debug/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/ntt.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/polyvec.c mlkem-native_ml-kem-768_aarch64/rej_uniform.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c)
+    add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-768_aarch64/cbd.c mlkem-native_ml-kem-768_aarch64/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/ntt.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/polyvec.c mlkem-native_ml-kem-768_aarch64/rej_uniform.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c)
     target_include_directories(ml_kem_768_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_aarch64)
     target_include_directories(ml_kem_768_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(ml_kem_768_aarch64 PUBLIC -DMLKEM_K=3 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT)
@@ -58,7 +58,7 @@ if(OQS_ENABLE_KEM_ml_kem_768_aarch64)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_1024)
-    add_library(ml_kem_1024_ref OBJECT kem_ml_kem_1024.c mlkem-native_ml-kem-1024_ref/cbd.c mlkem-native_ml-kem-1024_ref/debug/debug.c mlkem-native_ml-kem-1024_ref/indcpa.c mlkem-native_ml-kem-1024_ref/kem.c mlkem-native_ml-kem-1024_ref/ntt.c mlkem-native_ml-kem-1024_ref/poly.c mlkem-native_ml-kem-1024_ref/polyvec.c mlkem-native_ml-kem-1024_ref/rej_uniform.c mlkem-native_ml-kem-1024_ref/verify.c mlkem-native_ml-kem-1024_ref/zetas.c)
+    add_library(ml_kem_1024_ref OBJECT kem_ml_kem_1024.c mlkem-native_ml-kem-1024_ref/cbd.c mlkem-native_ml-kem-1024_ref/debug.c mlkem-native_ml-kem-1024_ref/indcpa.c mlkem-native_ml-kem-1024_ref/kem.c mlkem-native_ml-kem-1024_ref/ntt.c mlkem-native_ml-kem-1024_ref/poly.c mlkem-native_ml-kem-1024_ref/polyvec.c mlkem-native_ml-kem-1024_ref/rej_uniform.c mlkem-native_ml-kem-1024_ref/verify.c mlkem-native_ml-kem-1024_ref/zetas.c)
     target_compile_options(ml_kem_1024_ref PUBLIC -DMLKEM_K=4 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_C)
     target_include_directories(ml_kem_1024_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_ref)
     target_include_directories(ml_kem_1024_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
@@ -67,7 +67,7 @@ if(OQS_ENABLE_KEM_ml_kem_1024)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_1024_x86_64)
-    add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/cbd.c mlkem-native_ml-kem-1024_x86_64/debug/debug.c mlkem-native_ml-kem-1024_x86_64/indcpa.c mlkem-native_ml-kem-1024_x86_64/kem.c mlkem-native_ml-kem-1024_x86_64/ntt.c mlkem-native_ml-kem-1024_x86_64/poly.c mlkem-native_ml-kem-1024_x86_64/polyvec.c mlkem-native_ml-kem-1024_x86_64/rej_uniform.c mlkem-native_ml-kem-1024_x86_64/verify.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-1024_x86_64/zetas.c)
+    add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/cbd.c mlkem-native_ml-kem-1024_x86_64/debug.c mlkem-native_ml-kem-1024_x86_64/indcpa.c mlkem-native_ml-kem-1024_x86_64/kem.c mlkem-native_ml-kem-1024_x86_64/ntt.c mlkem-native_ml-kem-1024_x86_64/poly.c mlkem-native_ml-kem-1024_x86_64/polyvec.c mlkem-native_ml-kem-1024_x86_64/rej_uniform.c mlkem-native_ml-kem-1024_x86_64/verify.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-1024_x86_64/zetas.c)
     target_include_directories(ml_kem_1024_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_x86_64)
     target_include_directories(ml_kem_1024_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(ml_kem_1024_x86_64 PRIVATE  -mavx2  -mbmi2  -mpopcnt )
@@ -76,7 +76,7 @@ if(OQS_ENABLE_KEM_ml_kem_1024_x86_64)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_1024_aarch64)
-    add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_aarch64/cbd.c mlkem-native_ml-kem-1024_aarch64/debug/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/ntt.c mlkem-native_ml-kem-1024_aarch64/poly.c mlkem-native_ml-kem-1024_aarch64/polyvec.c mlkem-native_ml-kem-1024_aarch64/rej_uniform.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c)
+    add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_aarch64/cbd.c mlkem-native_ml-kem-1024_aarch64/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/ntt.c mlkem-native_ml-kem-1024_aarch64/poly.c mlkem-native_ml-kem-1024_aarch64/polyvec.c mlkem-native_ml-kem-1024_aarch64/rej_uniform.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c)
     target_include_directories(ml_kem_1024_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_aarch64)
     target_include_directories(ml_kem_1024_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(ml_kem_1024_aarch64 PUBLIC -DMLKEM_K=4 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h
index 6a5ee8a7d..fc4e7dd38 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h
@@ -75,14 +75,14 @@ void poly_tobytes_asm_clean(uint8_t *r, const int16_t *a);
 void poly_tobytes_asm_opt(uint8_t *r, const int16_t *a);
 
 #define polyvec_basemul_acc_montgomery_cached_asm_clean \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 void polyvec_basemul_acc_montgomery_cached_asm_clean(int16_t *r,
                                                      const int16_t *a,
                                                      const int16_t *b,
                                                      const int16_t *b_cache);
 
 #define polyvec_basemul_acc_montgomery_cached_asm_opt \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 void polyvec_basemul_acc_montgomery_cached_asm_opt(int16_t *r, const int16_t *a,
                                                    const int16_t *b,
                                                    const int16_t *b_cache);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h
index b0ff3d597..548b1eebb 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h
@@ -31,7 +31,6 @@ static INLINE void ntt_native(poly *data)
                 aarch64_ntt_zetas_layer56);
 }
 
-#define INVNTT_BOUND_NATIVE (8 * MLKEM_Q)
 static INLINE void intt_native(poly *data)
 {
   intt_asm_clean(data->coeffs, aarch64_invntt_zetas_layer01234,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S
index 623a82ae9..b243a569d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S
@@ -149,7 +149,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -193,40 +193,20 @@
         t3  .req v28
 
         ninv             .req v29
-        q_ninv           .req q29
         ninv_tw          .req v30
-        q_ninv_tw        .req q30
-
-/* Literal pool */
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_consts:         .short 3329
-                  .short 20159
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-c_ninv:           dup8h 512
-c_ninv_tw:        dup8h 5040
 
 MLKEM_ASM_NAMESPACE(intt_asm_clean):
         push_stack
 
-        ldr q_consts,  c_consts
-        ldr q_ninv,    c_ninv
-        ldr q_ninv_tw, c_ninv_tw
+        // Setup constants from immediates, using the scalar scratch register wtmp
+        mov wtmp, #3329           // ML-KEM modulus
+        mov consts.h[0], wtmp     // written to lane 0 only
+        mov wtmp, #20159          // Barrett twist of 1 wrt 2^27
+        mov consts.h[1], wtmp     // written to lane 1 only; no other lane of consts is set here
+        mov wtmp, #512            // NOTE(review): presumably the inverse-NTT scaling factor; confirm
+        dup ninv.8h, wtmp         // broadcast to all eight 16-bit lanes
+        mov wtmp, #5040           // NOTE(review): equals round(512 * 2^15 / 3329), presumably the Barrett twist of 512; confirm
+        dup ninv_tw.8h, wtmp      // broadcast to all eight 16-bit lanes
 
         mov inp, in
         mov count, #8
@@ -361,4 +341,49 @@ layer012_start:
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq consts
+    .unreq q_consts
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+    .unreq ninv
+    .unreq ninv_tw
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S
index e332efef8..c94746e17 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S
@@ -149,7 +149,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -193,40 +193,20 @@
         t3  .req v28
 
         ninv             .req v29
-        q_ninv           .req q29
         ninv_tw          .req v30
-        q_ninv_tw        .req q30
-
-/* Literal pool */
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_consts:         .short 3329
-                  .short 20159
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-c_ninv:           dup8h 512
-c_ninv_tw:        dup8h 5040
 
 MLKEM_ASM_NAMESPACE(intt_asm_opt):
         push_stack
 
-        ldr q_consts,  c_consts
-        ldr q_ninv,    c_ninv
-        ldr q_ninv_tw, c_ninv_tw
+        // Setup constants from immediates, using the scalar scratch register wtmp
+        mov wtmp, #3329           // ML-KEM modulus
+        mov consts.h[0], wtmp     // written to lane 0 only
+        mov wtmp, #20159          // Barrett twist of 1 wrt 2^27
+        mov consts.h[1], wtmp     // written to lane 1 only; no other lane of consts is set here
+        mov wtmp, #512            // NOTE(review): presumably the inverse-NTT scaling factor; confirm
+        dup ninv.8h, wtmp         // broadcast to all eight 16-bit lanes
+        mov wtmp, #5040           // NOTE(review): equals round(512 * 2^15 / 3329), presumably the Barrett twist of 512; confirm
+        dup ninv_tw.8h, wtmp      // broadcast to all eight 16-bit lanes
 
         mov inp, in
         mov count, #8
@@ -1017,4 +997,49 @@ layer012_start:
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq consts
+    .unreq q_consts
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+    .unreq ninv
+    .unreq ninv_tw
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S
index 877a5f689..cd63cc4d6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S
@@ -121,7 +121,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -156,7 +156,6 @@
         q_root2_tw .req q6
 
         consts    .req v7
-        q_consts  .req q7
 
         tmp .req v24
         t0  .req v25
@@ -167,21 +166,13 @@
         .text
         .global MLKEM_ASM_NAMESPACE(ntt_asm_clean)
 
-/* Literal pool */
-.p2align 4
-c_consts:
-        .short 3329
-        .short 20159
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-
 MLKEM_ASM_NAMESPACE(ntt_asm_clean):
         push_stack
-        ldr q_consts, c_consts
+
+        mov wtmp, #3329           // ML-KEM modulus
+        mov consts.h[0], wtmp     // written to lane 0 only
+        mov wtmp, #20159          // Barrett twist of 1 wrt 2^27
+        mov consts.h[1], wtmp     // written to lane 1 only; no other lane of consts is set here
 
         mov inp, in
         mov count, #4
@@ -280,4 +271,46 @@ layer3456_start:
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq consts
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S
index 15103a595..8705615b7 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S
@@ -121,7 +121,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -167,21 +167,13 @@
         .text
         .global MLKEM_ASM_NAMESPACE(ntt_asm_opt)
 
-/* Literal pool */
-.p2align 4
-c_consts:
-        .short 3329
-        .short 20159
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-
 MLKEM_ASM_NAMESPACE(ntt_asm_opt):
         push_stack
-        ldr q_consts, c_consts
+
+        mov wtmp, #3329           // ML-KEM modulus
+        mov consts.h[0], wtmp     // written to lane 0 only
+        mov wtmp, #20159          // Barrett twist of 1 wrt 2^27
+        mov consts.h[1], wtmp     // written to lane 1 only; no other lane of consts is set here
 
         mov inp, in
         mov count, #4
@@ -916,4 +908,47 @@ MLKEM_ASM_NAMESPACE(ntt_asm_opt):
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq consts
+    .unreq q_consts
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h
index b22674026..ec1bf6587 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h
@@ -25,14 +25,12 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_REJ_UNIFORM
 
-#define NTT_BOUND_NATIVE (6 * MLKEM_Q)
 static INLINE void ntt_native(poly *data)
 {
   ntt_asm_opt(data->coeffs, aarch64_ntt_zetas_layer01234,
               aarch64_ntt_zetas_layer56);
 }
 
-#define INVNTT_BOUND_NATIVE (8 * MLKEM_Q)
 static INLINE void intt_native(poly *data)
 {
   intt_asm_opt(data->coeffs, aarch64_invntt_zetas_layer01234,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S
index f70a40221..809f9667e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S
@@ -6,33 +6,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 20159  // Barrett twist of 1 wrt 2^27
-c_mont_constant:   dup8h -1044  // 2^16 % 3329
-c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
-
 /*
  * Some modular arithmetic macros
  */
@@ -70,6 +43,7 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
 
         ptr               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -77,14 +51,15 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
         tmp               .req v1
         mask              .req v2
         modulus           .req v3
-        q_modulus         .req q3
         modulus_twisted   .req v4
-        q_modulus_twisted .req q4
 
 MLKEM_ASM_NAMESPACE(poly_reduce_asm_clean):
 
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
 
         mov count, #8
 loop_start:
@@ -115,6 +90,7 @@ loop_start:
 
         .unreq ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -122,9 +98,7 @@ loop_start:
         .unreq tmp
         .unreq mask
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *          poly_mulcache_compute()         *
@@ -137,6 +111,7 @@ loop_start:
         zeta_ptr          .req x2
         zeta_twisted_ptr  .req x3
         count             .req x4
+        wtmp              .req w5
 
         data_odd          .req v0
         zeta              .req v1
@@ -152,13 +127,14 @@ loop_start:
         q_dst             .req q5
 
         modulus           .req v6
-        q_modulus         .req q6
         modulus_twisted   .req v7
-        q_modulus_twisted .req q7
 
 MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_clean):
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all eight 16-bit lanes
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp // broadcast to all eight 16-bit lanes
 
         mov count, #16
 mulcache_compute_loop_start:
@@ -185,6 +161,7 @@ mulcache_compute_loop_start:
         .unreq zeta_ptr
         .unreq zeta_twisted_ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data_odd
         .unreq zeta
@@ -200,9 +177,7 @@ mulcache_compute_loop_start:
         .unreq q_dst
 
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *             poly_tobytes()               *
@@ -261,6 +236,7 @@ poly_tobytes_asm_clean_asm_loop_start:
 
         src               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -268,22 +244,25 @@ poly_tobytes_asm_clean_asm_loop_start:
         q_res             .req q1
 
         factor            .req v2
-        q_factor          .req q2
         factor_t          .req v3
-        q_factor_t        .req q3
         modulus           .req v4
-        q_modulus         .req q4
         modulus_twisted   .req v5
-        q_modulus_twisted .req q5
 
         tmp0              .req v6
 
 MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean):
 
-        ldr q_modulus,         c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
-        ldr q_factor,          c_mont_constant
-        ldr q_factor_t,        c_barrett_twist
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
+
+        mov wtmp, #-1044 // 2^16 % 3329
+        dup factor.8h, wtmp
+
+        mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
+        dup factor_t.8h, wtmp
 
         mov count, #8
 poly_tomont_asm_loop:
@@ -311,6 +290,7 @@ poly_tomont_asm_loop:
 
         .unreq src
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -318,13 +298,9 @@ poly_tomont_asm_loop:
         .unreq q_res
 
         .unreq factor
-        .unreq q_factor
         .unreq factor_t
-        .unreq q_factor_t
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
         .unreq tmp0
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S
index e58ee77c4..815a9dd1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S
@@ -6,33 +6,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 20159  // Barrett twist of 1 wrt 2^27
-c_mont_constant:   dup8h -1044  // 2^16 % 3329
-c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
-
 /*
  * Some modular arithmetic macros
  */
@@ -70,6 +43,7 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
 
         ptr               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -77,14 +51,15 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
         tmp               .req v1
         mask              .req v2
         modulus           .req v3
-        q_modulus         .req q3
         modulus_twisted   .req v4
-        q_modulus_twisted .req q4
 
 MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
 
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
 
         mov count, #8
                                                // Instructions:    15
@@ -278,6 +253,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
 
         .unreq ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -285,9 +261,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         .unreq tmp
         .unreq mask
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *          poly_mulcache_compute()         *
@@ -300,6 +274,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         zeta_ptr          .req x2
         zeta_twisted_ptr  .req x3
         count             .req x4
+        wtmp              .req w5
 
         data_odd          .req v0
         zeta              .req v1
@@ -315,13 +290,14 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         q_dst             .req q5
 
         modulus           .req v6
-        q_modulus         .req q6
         modulus_twisted   .req v7
-        q_modulus_twisted .req q7
 
 MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all eight 16-bit lanes
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp // broadcast to all eight 16-bit lanes
 
         mov count, #16
                                               // Instructions:    7
@@ -426,6 +402,7 @@ MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
         .unreq zeta_ptr
         .unreq zeta_twisted_ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data_odd
         .unreq zeta
@@ -441,9 +418,7 @@ MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
         .unreq q_dst
 
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *             poly_tobytes()               *
@@ -502,6 +477,7 @@ poly_tobytes_asm_opt_asm_loop_start:
 
         src               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -509,22 +485,25 @@ poly_tobytes_asm_opt_asm_loop_start:
         q_res             .req q1
 
         factor            .req v2
-        q_factor          .req q2
         factor_t          .req v3
-        q_factor_t        .req q3
         modulus           .req v4
-        q_modulus         .req q4
         modulus_twisted   .req v5
-        q_modulus_twisted .req q5
 
         tmp0              .req v6
 
 MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
 
-        ldr q_modulus,         c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
-        ldr q_factor,          c_mont_constant
-        ldr q_factor_t,        c_barrett_twist
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
+
+        mov wtmp, #-1044 // 2^16 % 3329
+        dup factor.8h, wtmp
+
+        mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
+        dup factor_t.8h, wtmp
 
         mov count, #8
                                              // Instructions:    5
@@ -670,6 +649,7 @@ MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
 
         .unreq src
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -677,13 +657,9 @@ MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
         .unreq q_res
 
         .unreq factor
-        .unreq q_factor
         .unreq factor_t
-        .unreq q_factor_t
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
         .unreq tmp0
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S
index 99fb05de5..c91675b44 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S
@@ -12,31 +12,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 3327
-
 // Input:
 // - Vectors al, ah of 32-bit entries
 // Output:
@@ -136,11 +111,10 @@ c_modulus_twisted: dup8h 3327
         b3_ptr       .req x11
         b3_cache_ptr .req x12
         count        .req x13
+        wtmp         .req w14
 
         modulus           .req v0
-        q_modulus         .req q0
         modulus_twisted   .req v2
-        q_modulus_twisted .req q2
 
         aa0      .req v3
         aa1      .req v4
@@ -164,12 +138,16 @@ c_modulus_twisted: dup8h 3327
         t0   .req v28
 
 #if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -198,12 +176,15 @@ k2_loop_start:
 #endif /* MLKEM_K == 2 */
 
 #if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -237,12 +218,15 @@ k3_loop_start:
 #endif /* MLKEM_K == 3 */
 
 #if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -285,4 +269,39 @@ k4_loop_start:
         ret
 #endif /* MLKEM_K == 4 */
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq out
+    .unreq a0_ptr
+    .unreq b0_ptr
+    .unreq b0_cache_ptr
+    .unreq a1_ptr
+    .unreq b1_ptr
+    .unreq b1_cache_ptr
+    .unreq a2_ptr
+    .unreq b2_ptr
+    .unreq b2_cache_ptr
+    .unreq a3_ptr
+    .unreq b3_ptr
+    .unreq b3_cache_ptr
+    .unreq count
+    .unreq modulus
+    .unreq modulus_twisted
+    .unreq aa0
+    .unreq aa1
+    .unreq bb0
+    .unreq bb1
+    .unreq bb1t
+    .unreq res0l
+    .unreq res1l
+    .unreq res0h
+    .unreq wtmp
+    .unreq res1h
+    .unreq tmp0
+    .unreq tmp1
+    .unreq q_tmp0
+    .unreq q_tmp1
+    .unreq out0
+    .unreq out1
+    .unreq t0
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S
index 16ed77c3f..8300b682c 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S
@@ -12,31 +12,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 3327
-
 // Input:
 // - Vectors al, ah of 32-bit entries
 // Output:
@@ -136,11 +111,10 @@ c_modulus_twisted: dup8h 3327
         b3_ptr       .req x11
         b3_cache_ptr .req x12
         count        .req x13
+        wtmp         .req w14
 
         modulus           .req v0
-        q_modulus         .req q0
         modulus_twisted   .req v2
-        q_modulus_twisted .req q2
 
         aa0      .req v3
         aa1      .req v4
@@ -164,12 +138,16 @@ c_modulus_twisted: dup8h 3327
         t0   .req v28
 
 #if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -530,12 +508,15 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
 #endif /* MLKEM_K == 2 */
 
 #if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -1001,12 +982,15 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
 #endif /* MLKEM_K == 3 */
 
 #if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -1581,4 +1565,39 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
         ret
 #endif /* MLKEM_K == 4 */
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq out
+    .unreq a0_ptr
+    .unreq b0_ptr
+    .unreq b0_cache_ptr
+    .unreq a1_ptr
+    .unreq b1_ptr
+    .unreq b1_cache_ptr
+    .unreq a2_ptr
+    .unreq b2_ptr
+    .unreq b2_cache_ptr
+    .unreq a3_ptr
+    .unreq b3_ptr
+    .unreq b3_cache_ptr
+    .unreq count
+    .unreq modulus
+    .unreq modulus_twisted
+    .unreq wtmp
+    .unreq aa0
+    .unreq aa1
+    .unreq bb0
+    .unreq bb1
+    .unreq bb1t
+    .unreq res0l
+    .unreq res1l
+    .unreq res0h
+    .unreq res1h
+    .unreq tmp0
+    .unreq tmp1
+    .unreq q_tmp0
+    .unreq q_tmp1
+    .unreq out0
+    .unreq out1
+    .unreq t0
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S
index 722dc0f49..5151a05d0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S
@@ -45,6 +45,7 @@
     len                         .req w4
 
     /* Temporary output on the stack */
+    xtmp                        .req x7
     output_tmp                  .req x7
     output_tmp_base             .req x8
 
@@ -110,20 +111,26 @@
 
     mlkem_q                     .req v30
     bits                        .req v31
-    bits_q                      .req q31
 
 .text
-/* Literal pool */
-.p2align 4
-c_bit_table:
-    .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-
 .align 4
 .global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean)
 MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean):
     push_stack
 
-    ldr  bits_q, c_bit_table
+    // Load 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+    movz xtmp, 0x1
+    movk xtmp, 0x2, lsl 16
+    movk xtmp, 0x4, lsl 32
+    movk xtmp, 0x8, lsl 48
+    mov bits.d[0], xtmp
+
+    movz xtmp, 0x10
+    movk xtmp, 0x20, lsl 16
+    movk xtmp, 0x40, lsl 32
+    movk xtmp, 0x80, lsl 48
+    mov bits.d[1], xtmp
+
     movz tmp, #MLKEM_Q
     dup  mlkem_q.8h, tmp
 
@@ -337,5 +344,63 @@ return:
     pop_stack
     ret
 
+
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq output
+    .unreq buf
+    .unreq buflen
+    .unreq table_idx
+    .unreq len
+    .unreq output_tmp
+    .unreq output_tmp_base
+    .unreq count
+    .unreq buf_consumed
+    .unreq tmp
+    .unreq xtmp
+    .unreq final_copy_count
+    .unreq rec_idx_0
+    .unreq rec_idx_1
+    .unreq rec_idx_2
+    .unreq rec_idx_3
+    .unreq ctr0
+    .unreq ctr1
+    .unreq ctr2
+    .unreq ctr3
+    .unreq ctr01
+    .unreq ctr23
+    .unreq buf0
+    .unreq buf1
+    .unreq buf2
+    .unreq tmp0
+    .unreq tmp1
+    .unreq tmp2
+    .unreq tmp3
+    .unreq sign0
+    .unreq sign1
+    .unreq sign2
+    .unreq sign3
+    .unreq val0
+    .unreq val0q
+    .unreq val1
+    .unreq val1q
+    .unreq val2
+    .unreq val2q
+    .unreq val3
+    .unreq val3q
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+    .unreq table0
+    .unreq table0q
+    .unreq table1
+    .unreq table1q
+    .unreq table2
+    .unreq table2q
+    .unreq table3
+    .unreq table3q
+    .unreq mlkem_q
+    .unreq bits
+
 #endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) ||
           defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h
index 09e30f207..0543b1bd1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h
@@ -16,7 +16,9 @@
  *
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 #include "api.h"
 #endif
+#endif
 
 #endif /* MLKEM_NATIVE_ARITH_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.c
index 433bdc954..1e6b7c5d1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.c
@@ -2,8 +2,11 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "cbd.h"
+#include "common.h"
+#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+
 #include <stdint.h>
+#include "cbd.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -11,8 +14,6 @@
  * within a single compilation unit. */
 #define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
 #define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-#define cbd2 MLKEM_NAMESPACE(cbd2)
-#define cbd3 MLKEM_NAMESPACE(cbd3)
 /* End of static namespacing */
 
 /*************************************************
@@ -35,44 +36,13 @@ static uint32_t load32_littleendian(const uint8_t x[4])
   return r;
 }
 
-#if MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif /* MLKEM_ETA1 == 3 */
-
-/*************************************************
- * Name:        cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
   {
     unsigned j;
@@ -82,7 +52,7 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
     {
       const int16_t a = (d >> (4 * j + 0)) & 0x3;
@@ -92,24 +62,34 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
   }
 }
 
-#if MLKEM_ETA1 == 3
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
 /*************************************************
- * Name:        cbd3
+ * Name:        load24_littleendian
  *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
  *              This function is only needed for ML-KEM-512
  *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
  **************************************************/
-static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
   {
     unsigned j;
@@ -120,7 +100,7 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 
     for (j = 0; j < 4; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(i <= MLKEM_N / 4 && j <= 4)
       invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
     {
       const int16_t a = (d >> (6 * j + 0)) & 0x7;
@@ -129,28 +109,12 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
     }
   }
 }
-#endif /* MLKEM_ETA1 == 3 */
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-{
-#if MLKEM_ETA1 == 2
-  cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-{
-#if MLKEM_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
+int empty_cu_cbd;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.h
index 15db89570..54c1f5b90 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.h
@@ -9,46 +9,35 @@
 #include "common.h"
 #include "poly.h"
 
-#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd2
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *              a centered binomial distribution with parameter eta=2
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-);
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd3
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
 
-#endif
+#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbmc.h
index baa0bfa9f..52b95bc3f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbmc.h
@@ -13,7 +13,7 @@
 
 #define __contract__(x)
 #define __loop__(x)
-#define cassert(x, y)
+#define cassert(x)
 
 #else /* CBMC _is_ defined, therefore we're doing proof */
 
@@ -30,7 +30,7 @@
 #define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
 #define decreases(...) __CPROVER_decreases(__VA_ARGS__)
 /* cassert to avoid confusion with in-built assert */
-#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
 #define assume(...) __CPROVER_assume(__VA_ARGS__)
 
 /***************************************************
@@ -119,13 +119,13 @@
   {                                                                    \
     unsigned qvar;                                                     \
     ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
-        (((value_lb) <= (array_var[(qvar)])) &&                        \
-        ((array_var[(qvar)]) < (value_ub)))                            \
+        (((int)(value_lb) <= ((array_var)[(qvar)])) &&		       \
+         (((array_var)[(qvar)]) < (int)(value_ub)))		       \
   }
 
 #define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
   array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb),      \
-                   (qvar_ub), (array_var), (value_lb), (value_ub))
+      (qvar_ub), (array_var), (value_lb), (value_ub))
 /* clang-format on */
 
 /* Wrapper around array_bound operating on absolute values.
@@ -134,6 +134,6 @@
  * bound in array_bound is inclusive, we have to raise it by 1.
  */
 #define array_abs_bound(arr, lb, ub, k) \
-  array_bound((arr), (lb), (ub), -(k) + 1, (k))
+  array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h
index da886780c..4f326333e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h
@@ -43,23 +43,30 @@
 #define MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2) x1##_##x2
 #define MLKEM_NATIVE_MAKE_NAMESPACE(x1, x2) MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2)
 
-#define FIPS202_NAMESPACE(s) \
-  MLKEM_NATIVE_MAKE_NAMESPACE(FIPS202_NAMESPACE_PREFIX, s)
-
 #define MLKEM_NAMESPACE(s) \
   MLKEM_NATIVE_MAKE_NAMESPACE(MLKEM_NAMESPACE_PREFIX, s)
 
+#if defined(MLKEM_NAMESPACE_PREFIX_ADD_LEVEL)
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3) x1##x2##_##x3
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K(x1, x2, x3) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3)
+#define MLKEM_NAMESPACE_K(s) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K(MLKEM_NAMESPACE_PREFIX, MLKEM_LVL, s)
+#else
+#define MLKEM_NAMESPACE_K(s) MLKEM_NAMESPACE(s)
+#endif
+
 /* On Apple platforms, we need to emit leading underscore
  * in front of assembly symbols. We thus introducee a separate
  * namespace wrapper for ASM symbols. */
 #if !defined(__APPLE__)
 #define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym)
-#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym)
+#define MLKEM_ASM_NAMESPACE_K(sym) MLKEM_NAMESPACE_K(sym)
 #else
 #define PREFIX_UNDERSCORE_(sym) _##sym
 #define PREFIX_UNDERSCORE(sym) PREFIX_UNDERSCORE_(sym)
 #define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym))
-#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym))
+#define MLKEM_ASM_NAMESPACE_K(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE_K(sym))
 #endif
 
 #endif /* MLKEM_NATIVE_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h
index d1441835b..fa89370ce 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h
@@ -40,10 +40,12 @@
 /* #define MLKEM_NATIVE_CONFIG_FILE "config.h" */
 
 /******************************************************************************
- * Name:        MLKEM_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/.
+ * Description: The prefix to use to namespace global symbols from mlkem/.
+ *
+ *              Level-dependent symbols will additionally be prefixed with the
+ *              security level if MLKEM_NAMESPACE_PREFIX_ADD_LEVEL is set.
  *
  *              This can also be set using CFLAGS.
  *
@@ -53,17 +55,71 @@
 #endif
 
 /******************************************************************************
- * Name:        FIPS202_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX_ADD_LEVEL
+ *
+ * Description: If set, the level (512, 768, 1024) is added to the namespace
+ *              prefix MLKEM_NAMESPACE_PREFIX for all functions which are
+ *              level-dependent. Level-independent functions will have their
+ *              symbol prefixed by MLKEM_NAMESPACE_PREFIX only.
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/fips202/.
+ *              This is intended to be used for multi-level builds where
+ *              level-independent code should be shared across levels.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(FIPS202_NAMESPACE_PREFIX)
-#define FIPS202_NAMESPACE_PREFIX FIPS202_DEFAULT_NAMESPACE_PREFIX
-#endif
+/* #define MLKEM_NAMESPACE_PREFIX_ADD_LEVEL */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, all MLKEM_K-independent code will be included
+ *              in the build, including code needed only for other security
+ *              levels.
+ *
+ *              Example: poly_cbd3 is only needed for MLKEM_K == 2. Yet, if
+ *              this option is set for a build with MLKEM_K==3/4, it would
+ *              be included.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, no MLKEM_K-independent code will be included
+ *              in the build.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
  * Name:        MLKEM_USE_NATIVE
@@ -112,25 +168,13 @@
 /* Default namespace
  *
  * Don't change this. If you need a different namespace, re-define
- * MLKEM_NAMESPACE above instead, and remove the following.
- */
-
-/*
- * The default FIPS202 namespace is
- *
- *   PQCP_MLKEM_NATIVE_FIPS202_<BACKEND>_
+ * MLKEM_NAMESPACE_PREFIX above instead, and remove the following.
  *
- * e.g., PQCP_MLKEM_NATIVE_FIPS202_C_
- */
-
-#define FIPS202_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_FIPS202
-
-/*
  * The default MLKEM namespace is
  *
- *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_<BACKEND>_
+ *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_
  *
- * e.g., PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_
+ * e.g., PQCP_MLKEM_NATIVE_MLKEM512_
  */
 
 #if MLKEM_K == 2
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug.c
new file mode 100644
index 000000000..4b4857cbc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/* NOTE: You can remove this file unless you compile with MLKEM_DEBUG. */
+
+#include "common.h"
+
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) && defined(MLKEM_DEBUG)
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+
+#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
+
+void mlkem_debug_assert(const char *file, int line, const int val)
+{
+  if (val == 0)
+  {
+    fprintf(stderr,
+            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
+            file, line, val);
+    exit(1);
+  }
+}
+
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{
+  int err = 0;
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int16_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    {
+      fprintf(
+          stderr,
+          MLKEM_NATIVE_DEBUG_ERROR_HEADER
+          "Bounds assertion failed: Index %u, value %d out of bounds (%d,%d)\n",
+          file, line, i, (int)val, lower_bound_exclusive,
+          upper_bound_exclusive);
+      err = 1;
+    }
+  }
+
+  if (err == 1)
+    exit(1);
+}
+
+#else /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
+
+#define empty_cu_debug MLKEM_NAMESPACE_K(empty_cu_debug)
+int empty_cu_debug;
+
+#endif /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug.h
new file mode 100644
index 000000000..1103124db
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+#include "common.h"
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check that a debug assertion holds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if val is zero.
+ *
+ * Arguments:   - file: filename reported in the error message
+ *              - line: line number reported in the error message
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
+void mlkem_debug_assert(const char *file, int line, const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr for every entry
+ *              that is out of bounds, and calls exit(1) if any is.
+ *
+ * Arguments:   - file: filename reported in the error message
+ *              - line: line number reported in the error message
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ */
+#define debug_assert(val) mlkem_debug_assert(__FILE__, __LINE__, (val))
+
+/* Check bounds in array of int16_t's
+ * ptr: Base of int16_t array; will be explicitly cast to const int16_t*,
+ *      so you may pass a byte-compatible type such as poly or polyvec.
+ * len: Number of int16_t in array
+ * value_lb: Inclusive lower value bound
+ * value_ub: Exclusive upper value bound */
+#define debug_assert_bound(ptr, len, value_lb, value_ub)                      \
+  mlkem_debug_check_bounds(__FILE__, __LINE__, (const int16_t *)(ptr), (len), \
+                           (value_lb)-1, (value_ub))
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of int16_t array; cast as in debug_assert_bound
+ * len: Number of int16_t in array
+ * value_abs_bd: Exclusive bound B on the absolute value: -B < x < B */
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  debug_assert_bound((ptr), (len), (-(value_abs_bd) + 1), (value_abs_bd))
+
+/* 2D variants: the len0 x len1 entries are checked as one flat array */
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  debug_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  debug_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
+
+/* When running CBMC, convert debug assertions into proof obligations */
+#elif defined(CBMC)
+
+#include "cbmc.h"
+
+#define debug_assert(val) cassert(val)
+
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  cassert(array_bound(((int16_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
+
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  cassert(array_abs_bound(((int16_t *)(ptr)), 0, (len), (value_abs_bd)))
+
+/* Because of https://github.com/diffblue/cbmc/issues/8570, each of the M
+ * rows is bounded separately instead of one flattened array_bound(...). */
+#define debug_assert_bound_2d(ptr, M, N, value_lb, value_ub)           \
+  cassert(forall(kN, 0, (M),                                           \
+                 array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                             (value_lb), (value_ub))))
+
+#define debug_assert_abs_bound_2d(ptr, M, N, value_abs_bd)                 \
+  cassert(forall(kN, 0, (M),                                               \
+                 array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                                 (value_abs_bd))))
+
+#else /* !MLKEM_DEBUG && !CBMC */
+/* Neither MLKEM_DEBUG nor CBMC: all debug assertions expand to no-ops. */
+#define debug_assert(val) \
+  do                      \
+  {                       \
+  } while (0)
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  do                                                     \
+  {                                                      \
+  } while (0)
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  do                                                   \
+  {                                                    \
+  } while (0)
+
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  do                                                               \
+  {                                                                \
+  } while (0)
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  do                                                             \
+  {                                                              \
+  } while (0)
+
+
+#endif /* MLKEM_DEBUG */
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug/debug.c
deleted file mode 100644
index 64294ebe1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug/debug.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-
-#include <stdio.h>
-#include "debug.h"
-
-#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
-
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val)
-{
-  if (val == 0)
-  {
-    fprintf(stderr,
-            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed: %s (value %d)\n",
-            file, line, description, val);
-    exit(1);
-  }
-}
-
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive)
-{
-  int err = 0;
-  unsigned i;
-  for (i = 0; i < len; i++)
-  {
-    int16_t val = ptr[i];
-    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
-    {
-      fprintf(stderr,
-              MLKEM_NATIVE_DEBUG_ERROR_HEADER
-              "%s, index %u, value %d out of bounds (%d,%d)\n",
-              file, line, description, i, (int)val, lower_bound_exclusive,
-              upper_bound_exclusive);
-      err = 1;
-    }
-  }
-
-  if (err == 1)
-    exit(1);
-}
-
-#else /* MLKEM_DEBUG */
-
-#define empty_cu_debug MLKEM_NAMESPACE(empty_cu_debug)
-int empty_cu_debug;
-
-#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug/debug.h
deleted file mode 100644
index 5ce320ea2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/debug/debug.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_DEBUG_H
-#define MLKEM_DEBUG_H
-
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-/*************************************************
- * Name:        mlkem_debug_assert
- *
- * Description: Check debug assertion
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of assertion
- *              - val: Value asserted to be non-zero
- **************************************************/
-#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val);
-
-/*************************************************
- * Name:        mlkem_debug_check_bounds
- *
- * Description: Check whether values in an array of int16_t
- *              are within specified bounds.
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of check
- *              - ptr: Base of array to be checked
- *              - len: Number of int16_t in ptr
- *              - lower_bound_exclusive: Exclusive lower bound
- *              - upper_bound_exclusive: Exclusive upper bound
- **************************************************/
-#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive);
-
-/* Check assertion, calling exit() upon failure
- *
- * val: Value that's asserted to be non-zero
- * msg: Message to print on failure
- *
- * Currently called CASSERT to avoid clash with CBMC assert.
- */
-#define CASSERT(val, msg)                                 \
-  do                                                      \
-  {                                                       \
-    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
-  } while (0)
-
-/* Check absolute bounds of scalar
- * val: Scalar to be checked
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
-
-/* Check that all coefficients in array of int16_t's are non-negative
- * and below an exclusive upper bound.
- *
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * high_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define UBOUND(ptr, len, high_bound, msg)                                 \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -1, ((high_bound)));                  \
-  } while (0)
-
-/* Check absolute bounds in array of int16_t's
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define BOUND(ptr, len, abs_bound, msg)                                   \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -(abs_bound), (abs_bound));           \
-  } while (0)
-
-/* Check absolute bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
-  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
-        msg)
-
-/* Check unsigned bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- * msg: Message to print on failure */
-#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
-  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
-         msg)
-
-/* Check absolute bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLY_BOUND(ptr, abs_bound) \
-  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
-
-/* Check unsigned bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLY_UBOUND(ptr, ubound) \
-  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
-
-/* Check absolute bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLYVEC_BOUND(ptr, abs_bound)                                      \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
-                     "polyvec absolute bound for " #ptr ".vec[i]");        \
-  } while (0)
-
-/* Check unsigned bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLYVEC_UBOUND(ptr, ubound)                                        \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
-                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
-  } while (0)
-
-#define MLKEM_CONCAT_(left, right) left##right
-#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
-
-/* Following AWS-LC to define a C99-compliant static assert */
-#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
-  typedef struct                                                         \
-  {                                                                      \
-    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
-  } MLKEM_CONCAT(MLKEM_NAMESPACE(static_assertion_), msg)                \
-      __attribute__((unused));
-
-#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
-  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
-#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
-#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
-
-#else /* MLKEM_DEBUG */
-
-#define CASSERT(val, msg) \
-  do                      \
-  {                       \
-  } while (0)
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define BOUND(ptr, len, abs_bound, msg) \
-  do                                    \
-  {                                     \
-  } while (0)
-#define POLY_BOUND(ptr, abs_bound) \
-  do                               \
-  {                                \
-  } while (0)
-#define POLYVEC_BOUND(ptr, abs_bound) \
-  do                                  \
-  {                                   \
-  } while (0)
-#define POLY_BOUND_MSG(ptr, ubound, abs_bound) \
-  do                                           \
-  {                                            \
-  } while (0)
-#define UBOUND(ptr, len, high_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define POLY_UBOUND(ptr, ubound) \
-  do                             \
-  {                              \
-  } while (0)
-#define POLYVEC_UBOUND(ptr, ubound) \
-  do                                \
-  {                                 \
-  } while (0)
-#define POLY_UBOUND_MSG(ptr, ubound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define STATIC_ASSERT(cond, error)
-
-#endif /* MLKEM_DEBUG */
-
-#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c
index 4d3133e14..0cfcc3e9e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c
@@ -17,7 +17,7 @@
 #include "symmetric.h"
 
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 #include "cbmc.h"
 
@@ -25,15 +25,13 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define pack_pk MLKEM_NAMESPACE(pack_pk)
-#define unpack_pk MLKEM_NAMESPACE(unpack_pk)
-#define pack_sk MLKEM_NAMESPACE(pack_sk)
-#define unpack_sk MLKEM_NAMESPACE(unpack_sk)
-#define pack_ciphertext MLKEM_NAMESPACE(pack_ciphertext)
-#define unpack_ciphertext MLKEM_NAMESPACE(unpack_ciphertext)
-#define gen_matrix_entry_x4 MLKEM_NAMESPACE(gen_matrix_entry_x4)
-#define gen_matrix_entry MLKEM_NAMESPACE(gen_matrix_entry)
-#define matvec_mul MLKEM_NAMESPACE(matvec_mul)
+#define pack_pk MLKEM_NAMESPACE_K(pack_pk)
+#define unpack_pk MLKEM_NAMESPACE_K(unpack_pk)
+#define pack_sk MLKEM_NAMESPACE_K(pack_sk)
+#define unpack_sk MLKEM_NAMESPACE_K(unpack_sk)
+#define pack_ciphertext MLKEM_NAMESPACE_K(pack_ciphertext)
+#define unpack_ciphertext MLKEM_NAMESPACE_K(unpack_ciphertext)
+#define matvec_mul MLKEM_NAMESPACE_K(matvec_mul)
 /* End of static namespacing */
 
 /*************************************************
@@ -51,7 +49,7 @@
 static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
                     const uint8_t seed[MLKEM_SYMBYTES])
 {
-  POLYVEC_BOUND(pk, MLKEM_Q);
+  debug_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, pk);
   memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
 }
@@ -77,7 +75,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
   /* NOTE: If a modulus check was conducted on the PK, we know at this
    * point that the coefficients of `pk` are unsigned canonical. The
    * specifications and proofs, however, do _not_ assume this, and instead
-   * work with the easily provable bound by 4096. */
+   * work with the easily provable bound by UINT12_LIMIT. */
 }
 
 /*************************************************
@@ -91,7 +89,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
  **************************************************/
 static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
 {
-  POLYVEC_BOUND(sk, MLKEM_Q);
+  debug_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, sk);
 }
 
@@ -145,131 +143,11 @@ static void unpack_ciphertext(polyvec *b, poly *v,
   poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
 }
 
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-/*
- * Generate four A matrix entries from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-/*
- * Generate a single A matrix entry from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(0 <= ctr && ctr <= MLKEM_N)
-    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr,
-                                          0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
 #if !defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
 /* This namespacing is not done at the top to avoid a naming conflict
  * with native backends, which are currently not yet namespaced. */
 #define poly_permute_bitrev_to_custom \
-  MLKEM_NAMESPACE(poly_permute_bitrev_to_custom)
+  MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 __contract__(
@@ -332,7 +210,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
      * This call writes across polyvec boundaries for K=2 and K=3.
      * This is intentional and safe.
      */
-    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+    poly_rej_uniform_x4(&a[0].vec[0] + i, seedxy);
   }
 
   /* For left over polynomial, we use single keccak. */
@@ -353,12 +231,11 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
       seed0[MLKEM_SYMBYTES + 1] = x;
     }
 
-    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    poly_rej_uniform(&a[0].vec[0] + i, seed0);
     i++;
   }
 
-  cassert(i == MLKEM_K * MLKEM_K,
-          "gen_matrix: failed to generate whole matrix");
+  debug_assert(i == MLKEM_K * MLKEM_K);
 
   /*
    * The public matrix is generated in NTT domain. If the native backend
@@ -402,16 +279,12 @@ __contract__(
   for (i = 0; i < MLKEM_K; i++)
   __loop__(
     assigns(i, object_whole(out))
-    invariant(i >= 0 && i <= MLKEM_K))
+    invariant(i <= MLKEM_K))
   {
     polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
   }
 }
 
-
-
-STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_enc_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
                            uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
@@ -461,7 +334,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
   matvec_mul(&pkpv, a, &skpv, &skpv_cache);
   polyvec_tomont(&pkpv);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&pkpv, &e);
   polyvec_reduce(&pkpv);
   polyvec_reduce(&skpv);
@@ -471,11 +343,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
 }
 
 
-/* Check that the arithmetic in indcpa_enc() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
-              indcpa_enc_bound_1)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
                 const uint8_t m[MLKEM_INDCPA_MSGBYTES],
@@ -522,7 +389,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   polyvec_invntt_tomont(&b);
   poly_invntt_tomont(&v);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&b, &ep);
   poly_add(&v, &epp);
   poly_add(&v, &k);
@@ -533,9 +399,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   pack_ciphertext(c, &b, &v);
 }
 
-/* Check that the arithmetic in indcpa_dec() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
                 const uint8_t c[MLKEM_INDCPA_BYTES],
@@ -551,7 +414,6 @@ void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
   polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
   poly_invntt_tomont(&sb);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   poly_sub(&v, &sb);
   poly_reduce(&v);
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h
index 011f1aa4f..2c4fda3c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h
@@ -10,7 +10,7 @@
 #include "common.h"
 #include "polyvec.h"
 
-#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+#define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
  * Name:        gen_matrix
  *
@@ -34,7 +34,7 @@ __contract__(
   array_bound(a[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))));
 );
 
-#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+#define indcpa_keypair_derand MLKEM_NAMESPACE_K(indcpa_keypair_derand)
 /*************************************************
  * Name:        indcpa_keypair_derand
  *
@@ -60,7 +60,7 @@ __contract__(
   assigns(object_whole(sk))
 );
 
-#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+#define indcpa_enc MLKEM_NAMESPACE_K(indcpa_enc)
 /*************************************************
  * Name:        indcpa_enc
  *
@@ -89,7 +89,7 @@ __contract__(
   assigns(object_whole(c))
 );
 
-#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+#define indcpa_dec MLKEM_NAMESPACE_K(indcpa_dec)
 /*************************************************
  * Name:        indcpa_dec
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/kem.c
index 5779d3273..88c3843be 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/kem.c
@@ -16,8 +16,8 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define check_pk MLKEM_NAMESPACE(check_pk)
-#define check_sk MLKEM_NAMESPACE(check_sk)
+#define check_pk MLKEM_NAMESPACE_K(check_pk)
+#define check_sk MLKEM_NAMESPACE_K(check_sk)
 /* End of static namespacing */
 
 #if defined(CBMC)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/kem.h
index 074e4771e..93caa796b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/kem.h
@@ -9,6 +9,7 @@
 #include "cbmc.h"
 #include "common.h"
 
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 /* Include to ensure consistency between internal kem.h
  * and external mlkem_native.h. */
 #include "mlkem_native.h"
@@ -25,6 +26,14 @@
 #error Mismatch for CIPHERTEXTBYTES between kem.h and mlkem_native.h
 #endif
 
+#else
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE_K(keypair_derand)
+#define crypto_kem_keypair MLKEM_NAMESPACE_K(keypair)
+#define crypto_kem_enc_derand MLKEM_NAMESPACE_K(enc_derand)
+#define crypto_kem_enc MLKEM_NAMESPACE_K(enc)
+#define crypto_kem_dec MLKEM_NAMESPACE_K(dec)
+#endif
+
 /*************************************************
  * Name:        crypto_kem_keypair_derand
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem_native.h
index 4aed4efbb..12d1d12e6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem_native.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem_native.h
@@ -59,9 +59,17 @@
 #error MLKEM_NAMESPACE_PREFIX not set by config file
 #endif
 
-#define BUILD_INFO_CONCAT_(x, y) x##_##y
-#define BUILD_INFO_CONCAT(x, y) BUILD_INFO_CONCAT_(x, y)
-#define BUILD_INFO_NAMESPACE(sym) BUILD_INFO_CONCAT(MLKEM_NAMESPACE_PREFIX, sym)
+#if defined(MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL)
+#define BUILD_INFO_CONCAT3_(x, y, z) x##y##_##z
+#define BUILD_INFO_CONCAT3(x, y, z) BUILD_INFO_CONCAT3_(x, y, z)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT3(MLKEM_NAMESPACE_PREFIX, BUILD_INFO_LVL, sym)
+#else /* MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL */
+#define BUILD_INFO_CONCAT2_(x, y) x##_##y
+#define BUILD_INFO_CONCAT2(x, y) BUILD_INFO_CONCAT2_(x, y)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT2(MLKEM_NAMESPACE_PREFIX, sym)
+#endif /* MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL */
 
 #endif /* BUILD_INFO_LVL */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.c
index 02b45215c..3651c8da9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.c
@@ -2,10 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include <stdint.h>
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
+#include <stdint.h>
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "ntt.h"
 #include "reduce.h"
 
@@ -45,10 +47,10 @@
  *          4 -- 6
  *             5 -- 7
  */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start,
-                                int len, int bound)
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
 __contract__(
-  requires(0 <= start && start < MLKEM_N)
+  requires(start < MLKEM_N)
   requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
   requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
   requires(-HALF_Q < zeta && zeta < HALF_Q)
@@ -60,7 +62,7 @@ __contract__(
   ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
 {
   /* `bound` is a ghost variable only needed in the CBMC specification */
-  int j;
+  unsigned j;
   ((void)bound);
   for (j = start; j < start + len; j++)
   __loop__(
@@ -93,7 +95,7 @@ __contract__(
  *   official Kyber implementation here, merely adding `layer` as
  *   a ghost variable for the specifications.
  */
-static void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
@@ -101,15 +103,15 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable only needed in the CBMC specification */
   ((void)layer);
   /* Twiddle factors for layer n start at index 2^(layer-1) */
   k = MLKEM_N / (2 * len);
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
-    invariant(0 <= start && start < MLKEM_N + 2 * len)
-    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
     invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
     invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
   {
@@ -130,9 +132,9 @@ __contract__(
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  int len, layer;
+  unsigned len, layer;
   int16_t *r;
-  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   r = p->coeffs;
 
   for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
@@ -144,30 +146,23 @@ void poly_ntt(poly *p)
   }
 
   /* Check the stronger bound */
-  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_NTT */
 
-/* Check that bound for native NTT implies contractual bound */
-STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   ntt_native(p);
-  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if !defined(MLKEM_USE_NATIVE_INTT)
 
-/* Check that bound for reference invNTT implies contractual bound */
-#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
-STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
-
 /* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, int len, int layer)
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
@@ -176,23 +171,23 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable used only in the specification */
   ((void)layer);
   k = MLKEM_N / len - 1;
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    invariant(start <= MLKEM_N && k <= 127)
     /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
     invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
   {
-    int j;
+    unsigned j;
     int16_t zeta = zetas[k--];
     for (j = start; j < start + len; j++)
     __loop__(
       invariant(start <= j && j <= start + len)
-      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(start <= MLKEM_N && k <= 127)
       invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
     {
       int16_t t = r[j];
@@ -211,13 +206,13 @@ void poly_invntt_tomont(poly *p)
    * and NTT twist. This also brings coefficients down to
    * absolute value < MLKEM_Q.
    */
-  int j, len, layer;
+  unsigned j, len, layer;
   const int16_t f = 1441;
   int16_t *r = p->coeffs;
 
   for (j = 0; j < MLKEM_N; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N)
+    invariant(j <= MLKEM_N)
     invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
   {
     r[j] = fqmul(r[j], f);
@@ -226,24 +221,21 @@ void poly_invntt_tomont(poly *p)
   /* Run the invNTT layers */
   for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
   __loop__(
-    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
   {
     invntt_layer(p->coeffs, len, layer);
   }
 
-  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_INTT */
 
-/* Check that bound for native invNTT implies contractual bound */
-STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_invntt_tomont(poly *p)
 {
   intt_native(p);
-  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_INTT */
 
@@ -252,8 +244,7 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
                     int16_t b_cached)
 {
   int32_t t0, t1;
-
-  BOUND(a, 2, 4096, "basemul input bound");
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
   t0 = (int32_t)a[1] * b_cached;
   t0 += (int32_t)a[0] * b[0];
@@ -264,5 +255,12 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
   r[0] = montgomery_reduce(t0);
   r[1] = montgomery_reduce(t1);
 
-  BOUND(r, 2, 2 * MLKEM_Q, "basemul output bound");
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
+int empty_cu_ntt;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.h
index 5592bb9a2..4e80d3ab3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.h
@@ -4,10 +4,10 @@
  */
 #ifndef NTT_H
 #define NTT_H
+#include "common.h"
 
 #include <stdint.h>
 #include "cbmc.h"
-#include "common.h"
 #include "poly.h"
 #include "reduce.h"
 
@@ -81,7 +81,7 @@ __contract__(
  *                   Upon return, coefficients are bound by
  *                   2*MLKEM_Q in absolute value.
  *            - a: Pointer to first input polynomial
- *                   Must be coefficient-wise < 4096 in absolute value.
+ *                   Every coefficient must be in [0..4095]
  *            - b: Pointer to second input polynomial
  *                   Can have arbitrary int16_t coefficients
  *            - b_cached: Some precomputed value, typically derived from
@@ -99,5 +99,4 @@ __contract__(
   ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
 );
 
-
-#endif
+#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h
index fa751f977..57ea4c8ba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h
@@ -25,23 +25,34 @@
 #define MLKEM_POLYBYTES 384
 #define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
 
+#define MLKEM_POLYCOMPRESSEDBYTES_D4 128
+#define MLKEM_POLYCOMPRESSEDBYTES_D5 160
+#define MLKEM_POLYCOMPRESSEDBYTES_D10 320
+#define MLKEM_POLYCOMPRESSEDBYTES_D11 352
+
 #if MLKEM_K == 2
 #define MLKEM_LVL 512
 #define MLKEM_ETA1 3
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 3
 #define MLKEM_LVL 768
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 4
 #define MLKEM_LVL 1024
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_DU 11
+#define MLKEM_DV 5
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D5
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D11
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c
index 5807879df..7483ebf6d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c
@@ -2,13 +2,15 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
 #include <stdint.h>
 #include <string.h>
-
 #include "arith_backend.h"
 #include "cbd.h"
 #include "cbmc.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "fips202x4.h"
 #include "ntt.h"
 #include "poly.h"
@@ -16,50 +18,46 @@
 #include "symmetric.h"
 #include "verify.h"
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
     __loop__(
-      invariant(k >= 0 && k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
     {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
     }
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
   }
+}
 
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  __loop__(invariant(j <= MLKEM_N / 4))
   {
     unsigned k;
     uint16_t t[4];
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(k >= 0 && k <= 4)
+      invariant(k <= 4)
       invariant(forall(r, 0, k, t[r] < (1u << 10))))
     {
       t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
@@ -75,51 +73,35 @@ void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
     r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
     r[5 * j + 4] = (t[3] >> 2);
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
 }
 
-
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
   {
-    int k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(0 <= k && k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
   for (j = 0; j < MLKEM_N / 4; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(j <= MLKEM_N / 4)
     invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
   {
-    int k;
+    unsigned k;
     uint16_t t[4];
     uint8_t const *base = &a[5 * j];
 
@@ -130,51 +112,33 @@ void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
 
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(0 <= k && k <= 4)
+      invariant(k <= 4)
       invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
     {
       r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     uint8_t t[8] = {0};
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_bound(t, 0, j, 0, 32)))
     {
       t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
@@ -191,33 +155,57 @@ void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
     r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
     r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 }
 
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
 {
-  unsigned i;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
   {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     uint8_t t[8];
-    const int offset = i * 5;
+    const unsigned offset = i * 5;
     /*
      * Explicitly truncate to avoid warning about
      * implicit truncation in CBMC and unwind loop for ease
@@ -240,29 +228,62 @@ void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
     /* and copy to the correct slice in r[] */
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(j <= 8 && i <= MLKEM_N / 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
 #if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
-
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  __loop__(invariant(i <= MLKEM_N / 2))
   {
     const uint16_t t0 = a->coeffs[2 * i];
     const uint16_t t1 = a->coeffs[2 * i + 1];
@@ -290,7 +311,7 @@ void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   poly_tobytes_native(r, a);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
@@ -302,7 +323,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   unsigned i;
   for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(i <= MLKEM_N / 2)
     invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
   {
     const uint8_t t0 = a[3 * i + 0];
@@ -313,7 +334,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   }
 
   /* Note that the coefficients are not canonical */
-  POLY_UBOUND(r, 4096);
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 MLKEM_NATIVE_INTERNAL_API
@@ -333,13 +354,13 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
 
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <  MLKEM_N / 8 && j <= 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       /* Prevent the compiler from recognizing this as a bit selection */
@@ -347,23 +368,23 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
       r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
     }
   }
-  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     msg[i] = 0;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+      invariant(i <= MLKEM_N / 8 && j <= 8))
     {
       uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
       msg[i] |= t << j;
@@ -371,104 +392,17 @@ void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
   }
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  POLY_BOUND_MSG(r, MLKEM_ETA1 + 1, "poly_getnoise_eta2 output");
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
-}
-#endif /* MLKEM_K == 2 */
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
                                     const poly_mulcache *b_cache)
 {
   unsigned i;
-  POLY_BOUND(b_cache, 4096);
+  debug_assert_bound(a, MLKEM_N, 0, UINT12_LIMIT);
 
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
     assigns(i, object_whole(r))
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 2 * MLKEM_Q)))
   {
     basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
@@ -476,6 +410,8 @@ void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
     basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
                    &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
   }
+
+  debug_assert_abs_bound(r, MLKEM_N, 2 * MLKEM_Q);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -486,20 +422,20 @@ void poly_tomont(poly *r)
   const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
-    invariant(array_abs_bound(r->coeffs ,0, i, MLKEM_Q)))
+    invariant(i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
   {
     r->coeffs[i] = fqmul(r->coeffs[i], f);
   }
 
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
   poly_tomont_native(r);
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
@@ -510,7 +446,7 @@ void poly_reduce(poly *r)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
   {
     /* Barrett reduction, giving signed canonical representative */
@@ -519,14 +455,14 @@ void poly_reduce(poly *r)
     r->coeffs[i] = scalar_signed_to_unsigned_q(t);
   }
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
   poly_reduce_native(r);
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
@@ -536,7 +472,7 @@ void poly_add(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
   {
@@ -550,7 +486,7 @@ void poly_sub(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
   {
@@ -564,20 +500,36 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
   {
     x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
     x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
   }
-  POLY_BOUND(x, MLKEM_Q);
+
+  /*
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
+   */
+  debug_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   poly_mulcache_compute_native(x, a);
-  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+  /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
+int empty_cu_poly;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h
index 1e8c109c6..6a14c785d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h
@@ -307,112 +307,164 @@ __contract__(
  ************************************************************/
 static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
 __contract__(
-  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
-  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
   ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
 {
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
   /* Add Q if c is negative, but in constant time */
   c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
 
-  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
-  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
-
   /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
   return (uint16_t)c;
 }
 
-#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
 /*************************************************
- * Name:        poly_compress_du
+ * Name:        poly_compress_d4
  *
- * Description: Compression (du bits) and subsequent serialization of a
- *polynomial
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-);
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
 
-#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
 /*************************************************
- * Name:        poly_decompress_du
+ * Name:        poly_decompress_d4
  *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *polynomial; approximate inverse of poly_compress_du
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
 
-#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
 /*************************************************
- * Name:        poly_compress_dv
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
  *
- * Description: Compression (dv bits) and subsequent serialization of a
- *polynomial
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
 
-#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
 /*************************************************
- * Name:        poly_decompress_dv
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
  *
  * Description: De-serialization and subsequent decompression (dv bits) of a
- *polynomial; approximate inverse of poly_compress
+ *              polynomial; approximate inverse of poly_compress_d5
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV
- *bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
 
 #define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
 /*************************************************
@@ -500,144 +552,6 @@ __contract__(
   assigns(object_whole(msg))
 );
 
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -649,8 +563,7 @@ __contract__(
  *              Bounds:
  *              - a is assumed to be coefficient-wise < q in absolute value.
  *
- *              The result is coefficient-wise bound by 3/2 q in absolute
- *              value.
+ *              The result is coefficient-wise bound by 2*q in absolute value.
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const poly *a: pointer to first input polynomial
@@ -802,4 +715,4 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#endif
+#endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.c
index 7d2016773..50ea1c34a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.c
@@ -4,18 +4,29 @@
  */
 #include "polyvec.h"
 #include <stdint.h>
+#include <string.h>
 #include "arith_backend.h"
+#include "cbd.h"
 #include "ntt.h"
 #include "poly.h"
+#include "symmetric.h"
 
-#include "debug/debug.h"
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
                          const polyvec *a)
 {
   unsigned i;
-  POLYVEC_UBOUND(a, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_K; i++)
   {
@@ -33,13 +44,15 @@ void polyvec_decompress_du(polyvec *r,
     poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
   }
 
-  POLYVEC_UBOUND(r, MLKEM_Q);
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
 {
   unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
   for (i = 0; i < MLKEM_K; i++)
   {
     poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
@@ -54,6 +67,8 @@ void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
   {
     poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -64,6 +79,8 @@ void polyvec_ntt(polyvec *r)
   {
     poly_ntt(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -74,6 +91,8 @@ void polyvec_invntt_tomont(polyvec *r)
   {
     poly_invntt_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -84,10 +103,7 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
 {
   unsigned i;
   poly t;
-
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  POLYVEC_BOUND(b_cache, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 
   poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
   for (i = 1; i < MLKEM_K; i++)
@@ -95,18 +111,15 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
     poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
                                    &b_cache->vec[i]);
     poly_add(r, &t);
-    /* abs bounds: < (i+1) * 3/2 * q */
   }
 
   /*
-   * Those bounds are true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus best to omit
-   * them from the spec to not unnecessarily constraint native implementations.
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
    */
-  cassert(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_K * 2 * MLKEM_Q),
-          "polyvec_basemul_acc_montgomery_cached output bounds");
-  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
-  POLY_BOUND(r, MLKEM_K * 2 * MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
 }
 #else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 MLKEM_NATIVE_INTERNAL_API
@@ -114,9 +127,8 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
                                            const polyvec *b,
                                            const polyvec_mulcache *b_cache)
 {
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
   polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
@@ -149,6 +161,8 @@ void polyvec_reduce(polyvec *r)
   {
     poly_reduce(&r->vec[i]);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -169,4 +183,148 @@ void polyvec_tomont(polyvec *r)
   {
     poly_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; /* bufN: input for rN */
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1]; /* extkeyN: seed || nonceN */
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0; /* nonce occupies the final byte */
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+  /* Debug-check the coefficient bound promised by poly_cbd_eta1. */
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+  /* The prf_eta2 input is the seed followed by the one-byte nonce. */
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+  /* Turn the prf_eta2 output into a polynomial via the eta2 sampler. */
+  poly_cbd_eta2(r, buf);
+  /* Check the bound promised by poly_cbd_eta2: eta2, not eta1. */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+  /* extkey[0..1] feed prf_eta1 (r0, r1); extkey[2..3] feed prf_eta2. */
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+  /* r0, r1 use the eta1 sampler; r2, r3 use the eta2 sampler. */
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+  /* Debug-check the bounds promised by the respective cbd helpers. */
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
 }
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.h
index 138724150..8be8579e0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.h
@@ -9,19 +9,144 @@
 #include "common.h"
 #include "poly.h"
 
-#define polyvec MLKEM_NAMESPACE(polyvec)
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
 typedef struct
 {
   poly vec[MLKEM_K];
 } ALIGN polyvec;
 
-#define polyvec_mulcache MLKEM_NAMESPACE(polyvec_mulcache)
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
 typedef struct
 {
   poly_mulcache vec[MLKEM_K];
 } polyvec_mulcache;
 
-#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
 /*************************************************
  * Name:        polyvec_compress_du
  *
@@ -44,7 +169,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
 /*************************************************
  * Name:        polyvec_decompress_du
  *
@@ -67,7 +192,7 @@ __contract__(
          array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
 /*************************************************
  * Name:        polyvec_tobytes
  *
@@ -88,7 +213,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
 /*************************************************
  * Name:        polyvec_frombytes
  *
@@ -110,7 +235,7 @@ __contract__(
         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
 );
 
-#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
 /*************************************************
  * Name:        polyvec_ntt
  *
@@ -136,7 +261,7 @@ __contract__(
   array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
 );
 
-#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
 /*************************************************
  * Name:        polyvec_invntt_tomont
  *
@@ -162,7 +287,7 @@ __contract__(
 );
 
 #define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery
  *
@@ -186,7 +311,7 @@ __contract__(
 
 
 #define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery_cached
  *
@@ -194,7 +319,7 @@ __contract__(
  *              using mulcache for second operand.
  *
  *              Bounds:
- *              - a is assumed to be coefficient-wise < 4096 in absolute value.
+ *              - Every coefficient of a is assumed to be in [0..4095]
  *              - No bounds guarantees for the coefficients in the result.
  *
  * Arguments:   - poly *r: pointer to output polynomial
@@ -218,7 +343,7 @@ __contract__(
   assigns(memory_slice(r, sizeof(poly)))
 );
 
-#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
 /************************************************************
  * Name: polyvec_mulcache_compute
  *
@@ -252,7 +377,7 @@ __contract__(
   assigns(object_whole(x))
 );
 
-#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
 /*************************************************
  * Name:        polyvec_reduce
  *
@@ -278,7 +403,7 @@ __contract__(
     array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
 /*************************************************
  * Name:        polyvec_add
  *
@@ -309,7 +434,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
 /*************************************************
  * Name:        polyvec_tomont
  *
@@ -329,4 +454,142 @@ __contract__(
     array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
 );
 
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3)
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+);
+#endif /* MLKEM_K == 2 */
+
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/reduce.h
index 1f502167e..b432a4201 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/reduce.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/reduce.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -109,13 +109,13 @@ static INLINE int16_t montgomery_reduce_generic(int32_t a)
  **************************************************/
 static INLINE int16_t montgomery_reduce(int32_t a)
 __contract__(
-  requires(a > -(2 * 4096 * 32768))
-  requires(a <  (2 * 4096 * 32768))
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
   ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
 )
 {
   int16_t res;
-  SCALAR_BOUND(a, 2 * UINT12_LIMIT * 32768, "montgomery_reduce input");
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
 
   res = montgomery_reduce_generic(a);
   /* Bounds:
@@ -124,7 +124,7 @@ __contract__(
    *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
    *        < 2 * MLKEM_Q */
 
-  SCALAR_BOUND(res, 2 * MLKEM_Q, "montgomery_reduce output");
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
   return res;
 }
 
@@ -150,7 +150,7 @@ __contract__(
 )
 {
   int16_t res;
-  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+  debug_assert_abs_bound(&b, 1, HALF_Q);
 
   res = montgomery_reduce((int32_t)a * (int32_t)b);
   /* Bounds:
@@ -160,7 +160,7 @@ __contract__(
    *        < MLKEM_Q
    */
 
-  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
   return res;
 }
 
@@ -200,7 +200,10 @@ __contract__(
    * t is in -10 .. +10, so we need 32-bit math to
    * evaluate t * MLKEM_Q and the subsequent subtraction
    */
-  return (int16_t)(a - t * MLKEM_Q);
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
+
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.c
index 918986e9b..cbbe4407f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.c
@@ -2,46 +2,24 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
-#include "rej_uniform.h"
 #include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
 /* End of static namespacing */
 
-/*************************************************
- * Name:        rej_uniform_scalar
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
                                        unsigned int offset, const uint8_t *buf,
                                        unsigned int buflen)
@@ -58,6 +36,8 @@ __contract__(
   unsigned int ctr, pos;
   uint16_t val0, val1;
 
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
   ctr = offset;
   pos = 0;
   /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
@@ -79,28 +59,183 @@ __contract__(
       r[ctr++] = val1;
     }
   }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
   return ctr;
 }
 
 #if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, all of the input buffer
+ * is guaranteed to have been consumed. If it is equal to target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
 {
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
 {
   int ret;
 
   /* Sample from large buffer with full lane as much as possible. */
   ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
   if (ret != -1)
-    return offset + (unsigned)ret;
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
 
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze a heuristic number of MLKEM_GEN_MATRIX_NBLOCKS blocks.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze and sample a heuristic number of blocks
+   * (MLKEM_GEN_MATRIX_NBLOCKS). This should generate the matrix entry
+   * with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
+int empty_cu_rej_uniform;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.h
index 13db836bc..801287259 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.h
@@ -9,54 +9,55 @@
 #include <stdlib.h>
 #include "cbmc.h"
 #include "common.h"
+#include "poly.h"
 
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
- * Name:        rej_uniform
+ * Name:        poly_rej_uniform_x4
  *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
  *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
  *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
  **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
 __contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-);
-#endif
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/symmetric.h
index 55ebbbd53..3563e5505 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/symmetric.h
@@ -10,6 +10,7 @@
 #include "cbmc.h"
 #include "common.h"
 #include "fips202.h"
+#include "fips202x4.h"
 
 /* Macros denoting FIPS-203 specific Hash functions */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/verify.c
index b7078fcc1..9f39dcd22 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/verify.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/verify.c
@@ -4,7 +4,8 @@
  */
 #include "verify.h"
 
-#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER) && \
+    !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 /*
  * Masking value used in constant-time functions from
  * verify.h to block the compiler's range analysis and
@@ -12,9 +13,11 @@
  */
 volatile uint64_t ct_opt_blocker_u64 = 0;
 
-#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+#else /* !MLKEM_USE_ASM_VALUE_BARRIER && \
+         !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_verify MLKEM_NAMESPACE(empty_cu_verify)
+#define empty_cu_verify MLKEM_NAMESPACE_K(empty_cu_verify)
 int empty_cu_verify;
 
-#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+#endif /* !MLKEM_USE_ASM_VALUE_BARRIER && \
+          !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/verify.h
index 8c47155dc..f6ecf5eba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/verify.h
@@ -268,7 +268,7 @@ __contract__(
 
   for (i = 0; i < len; i++)
   __loop__(
-    invariant(i >= 0 && i <= len)
+    invariant(i <= len)
     invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))))
   {
     r |= a[i] ^ b[i];
@@ -314,4 +314,4 @@ __contract__(
   }
 }
 
-#endif
+#endif /* VERIFY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c
index 1a26e0dd5..4ef887c62 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c
@@ -8,6 +8,8 @@
  *          Do not modify it directly.
  */
 
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 #include "ntt.h"
 
 /*
@@ -28,3 +30,10 @@ ALIGN const int16_t zetas[128] = {
     -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
     -308,  996,   991,   958,   -1460, 1522,  1628,
 };
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_zetas MLKEM_NAMESPACE_K(empty_cu_zetas)
+int empty_cu_zetas;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h
index 09e30f207..0543b1bd1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h
@@ -16,7 +16,9 @@
  *
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 #include "api.h"
 #endif
+#endif
 
 #endif /* MLKEM_NATIVE_ARITH_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c
index 433bdc954..1e6b7c5d1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c
@@ -2,8 +2,11 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "cbd.h"
+#include "common.h"
+#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+
 #include <stdint.h>
+#include "cbd.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -11,8 +14,6 @@
  * within a single compilation unit. */
 #define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
 #define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-#define cbd2 MLKEM_NAMESPACE(cbd2)
-#define cbd3 MLKEM_NAMESPACE(cbd3)
 /* End of static namespacing */
 
 /*************************************************
@@ -35,44 +36,13 @@ static uint32_t load32_littleendian(const uint8_t x[4])
   return r;
 }
 
-#if MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif /* MLKEM_ETA1 == 3 */
-
-/*************************************************
- * Name:        cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
   {
     unsigned j;
@@ -82,7 +52,7 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
     {
       const int16_t a = (d >> (4 * j + 0)) & 0x3;
@@ -92,24 +62,34 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
   }
 }
 
-#if MLKEM_ETA1 == 3
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
 /*************************************************
- * Name:        cbd3
+ * Name:        load24_littleendian
  *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
  *              This function is only needed for ML-KEM-512
  *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
  **************************************************/
-static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
   {
     unsigned j;
@@ -120,7 +100,7 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 
     for (j = 0; j < 4; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(i <= MLKEM_N / 4 && j <= 4)
       invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
     {
       const int16_t a = (d >> (6 * j + 0)) & 0x7;
@@ -129,28 +109,12 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
     }
   }
 }
-#endif /* MLKEM_ETA1 == 3 */
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-{
-#if MLKEM_ETA1 == 2
-  cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-{
-#if MLKEM_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
+int empty_cu_cbd;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h
index 15db89570..54c1f5b90 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h
@@ -9,46 +9,35 @@
 #include "common.h"
 #include "poly.h"
 
-#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd2
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *              a centered binomial distribution with parameter eta=2
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-);
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd3
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
 
-#endif
+#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbmc.h
index baa0bfa9f..52b95bc3f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbmc.h
@@ -13,7 +13,7 @@
 
 #define __contract__(x)
 #define __loop__(x)
-#define cassert(x, y)
+#define cassert(x)
 
 #else /* CBMC _is_ defined, therefore we're doing proof */
 
@@ -30,7 +30,7 @@
 #define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
 #define decreases(...) __CPROVER_decreases(__VA_ARGS__)
 /* cassert to avoid confusion with in-built assert */
-#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
 #define assume(...) __CPROVER_assume(__VA_ARGS__)
 
 /***************************************************
@@ -119,13 +119,13 @@
   {                                                                    \
     unsigned qvar;                                                     \
     ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
-        (((value_lb) <= (array_var[(qvar)])) &&                        \
-        ((array_var[(qvar)]) < (value_ub)))                            \
+        (((int)(value_lb) <= ((array_var)[(qvar)])) &&		       \
+         (((array_var)[(qvar)]) < (int)(value_ub)))		       \
   }
 
 #define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
   array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb),      \
-                   (qvar_ub), (array_var), (value_lb), (value_ub))
+      (qvar_ub), (array_var), (value_lb), (value_ub))
 /* clang-format on */
 
 /* Wrapper around array_bound operating on absolute values.
@@ -134,6 +134,6 @@
  * bound in array_bound is inclusive, we have to raise it by 1.
  */
 #define array_abs_bound(arr, lb, ub, k) \
-  array_bound((arr), (lb), (ub), -(k) + 1, (k))
+  array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h
index da886780c..4f326333e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h
@@ -43,23 +43,30 @@
 #define MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2) x1##_##x2
 #define MLKEM_NATIVE_MAKE_NAMESPACE(x1, x2) MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2)
 
-#define FIPS202_NAMESPACE(s) \
-  MLKEM_NATIVE_MAKE_NAMESPACE(FIPS202_NAMESPACE_PREFIX, s)
-
 #define MLKEM_NAMESPACE(s) \
   MLKEM_NATIVE_MAKE_NAMESPACE(MLKEM_NAMESPACE_PREFIX, s)
 
+#if defined(MLKEM_NAMESPACE_PREFIX_ADD_LEVEL)
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3) x1##x2##_##x3
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K(x1, x2, x3) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3)
+#define MLKEM_NAMESPACE_K(s) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K(MLKEM_NAMESPACE_PREFIX, MLKEM_LVL, s)
+#else
+#define MLKEM_NAMESPACE_K(s) MLKEM_NAMESPACE(s)
+#endif
+
 /* On Apple platforms, we need to emit leading underscore
  * in front of assembly symbols. We thus introducee a separate
  * namespace wrapper for ASM symbols. */
 #if !defined(__APPLE__)
 #define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym)
-#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym)
+#define MLKEM_ASM_NAMESPACE_K(sym) MLKEM_NAMESPACE_K(sym)
 #else
 #define PREFIX_UNDERSCORE_(sym) _##sym
 #define PREFIX_UNDERSCORE(sym) PREFIX_UNDERSCORE_(sym)
 #define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym))
-#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym))
+#define MLKEM_ASM_NAMESPACE_K(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE_K(sym))
 #endif
 
 #endif /* MLKEM_NATIVE_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h
index d1441835b..fa89370ce 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h
@@ -40,10 +40,12 @@
 /* #define MLKEM_NATIVE_CONFIG_FILE "config.h" */
 
 /******************************************************************************
- * Name:        MLKEM_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/.
+ * Description: The prefix to use to namespace global symbols from mlkem/.
+ *
+ *              Level-dependent symbols will additionally be prefixed with the
+ *              security level if MLKEM_NAMESPACE_PREFIX_ADD_LEVEL is set.
  *
  *              This can also be set using CFLAGS.
  *
@@ -53,17 +55,71 @@
 #endif
 
 /******************************************************************************
- * Name:        FIPS202_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX_ADD_LEVEL
+ *
+ * Description: If set, the level (512, 768, 1024) is added to the namespace
+ *              prefix MLKEM_NAMESPACE_PREFIX for all functions which are
+ *              level-dependent. Level-independent functions will have their
+ *              symbols prefixed by MLKEM_NAMESPACE_PREFIX only.
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/fips202/.
+ *              This is intended to be used for multi-level builds where
+ *              level-independent code should be shared across levels.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(FIPS202_NAMESPACE_PREFIX)
-#define FIPS202_NAMESPACE_PREFIX FIPS202_DEFAULT_NAMESPACE_PREFIX
-#endif
+/* #define MLKEM_NAMESPACE_PREFIX_ADD_LEVEL */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, all MLKEM_K-independent code will be included
+ *              in the build, including code needed only for other security
+ *              levels.
+ *
+ *              Example: poly_cbd3 is only needed for MLKEM_K == 2. Yet, if
+ *              this option is set for a build with MLKEM_K==3/4, it would
+ *              be included.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, no MLKEM_K-independent code will be included
+ *              in the build.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
  * Name:        MLKEM_USE_NATIVE
@@ -112,25 +168,13 @@
 /* Default namespace
  *
  * Don't change this. If you need a different namespace, re-define
- * MLKEM_NAMESPACE above instead, and remove the following.
- */
-
-/*
- * The default FIPS202 namespace is
- *
- *   PQCP_MLKEM_NATIVE_FIPS202_<BACKEND>_
+ * MLKEM_NAMESPACE_PREFIX above instead, and remove the following.
  *
- * e.g., PQCP_MLKEM_NATIVE_FIPS202_C_
- */
-
-#define FIPS202_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_FIPS202
-
-/*
  * The default MLKEM namespace is
  *
- *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_<BACKEND>_
+ *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_
  *
- * e.g., PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_
+ * e.g., PQCP_MLKEM_NATIVE_MLKEM512_
  */
 
 #if MLKEM_K == 2
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug.c
new file mode 100644
index 000000000..4b4857cbc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/* NOTE: You can remove this file unless you compile with MLKEM_DEBUG. */
+
+#include "common.h"
+
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) && defined(MLKEM_DEBUG)
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+
+#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
+
+void mlkem_debug_assert(const char *file, int line, const int val)
+{
+  if (val == 0)
+  {
+    fprintf(stderr,
+            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
+            file, line, val);
+    exit(1);
+  }
+}
+
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{
+  int err = 0;
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int16_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    {
+      fprintf(
+          stderr,
+          MLKEM_NATIVE_DEBUG_ERROR_HEADER
+          "Bounds assertion failed: Index %u, value %d out of bounds (%d,%d)\n",
+          file, line, i, (int)val, lower_bound_exclusive,
+          upper_bound_exclusive);
+      err = 1;
+    }
+  }
+
+  if (err == 1)
+    exit(1);
+}
+
+#else /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
+
+#define empty_cu_debug MLKEM_NAMESPACE_K(empty_cu_debug)
+int empty_cu_debug;
+
+#endif /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug.h
new file mode 100644
index 000000000..1103124db
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+#include "common.h"
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check debug assertion
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if val is zero.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
+void mlkem_debug_assert(const char *file, int line, const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ */
+#define debug_assert(val) mlkem_debug_assert(__FILE__, __LINE__, (val))
+
+/* Check bounds in array of int16_t's
+ * ptr: Base of int16_t array; will be explicitly cast to int16_t*,
+ *      so you may pass a byte-compatible type such as poly or polyvec.
+ * len: Number of int16_t in array
+ * value_lb: Inclusive lower value bound
+ * value_ub: Exclusive upper value bound */
+#define debug_assert_bound(ptr, len, value_lb, value_ub)                      \
+  mlkem_debug_check_bounds(__FILE__, __LINE__, (const int16_t *)(ptr), (len), \
+                           (value_lb)-1, (value_ub))
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * value_abs_bd: Exclusive absolute upper bound */
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  debug_assert_bound((ptr), (len), (-(value_abs_bd) + 1), (value_abs_bd))
+
+/* Version of bounds assertions for 2-dimensional arrays */
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  debug_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  debug_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
+
+/* When running CBMC, convert debug assertions into proof obligations */
+#elif defined(CBMC)
+
+#include "../cbmc.h"
+
+#define debug_assert(val) cassert(val)
+
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  cassert(array_bound(((int16_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
+
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  cassert(array_abs_bound(((int16_t *)(ptr)), 0, (len), (value_abs_bd)))
+
+/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
+ * just use a single flattened array_bound(...) here. */
+#define debug_assert_bound_2d(ptr, M, N, value_lb, value_ub)           \
+  cassert(forall(kN, 0, (M),                                           \
+                 array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                             (value_lb), (value_ub))))
+
+#define debug_assert_abs_bound_2d(ptr, M, N, value_abs_bd)                 \
+  cassert(forall(kN, 0, (M),                                               \
+                 array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                                 (value_abs_bd))))
+
+#else /* MLKEM_DEBUG */
+
+#define debug_assert(val) \
+  do                      \
+  {                       \
+  } while (0)
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  do                                                     \
+  {                                                      \
+  } while (0)
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  do                                                   \
+  {                                                    \
+  } while (0)
+
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  do                                                               \
+  {                                                                \
+  } while (0)
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  do                                                             \
+  {                                                              \
+  } while (0)
+
+
+#endif /* MLKEM_DEBUG */
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.c
deleted file mode 100644
index 64294ebe1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-
-#include <stdio.h>
-#include "debug.h"
-
-#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
-
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val)
-{
-  if (val == 0)
-  {
-    fprintf(stderr,
-            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed: %s (value %d)\n",
-            file, line, description, val);
-    exit(1);
-  }
-}
-
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive)
-{
-  int err = 0;
-  unsigned i;
-  for (i = 0; i < len; i++)
-  {
-    int16_t val = ptr[i];
-    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
-    {
-      fprintf(stderr,
-              MLKEM_NATIVE_DEBUG_ERROR_HEADER
-              "%s, index %u, value %d out of bounds (%d,%d)\n",
-              file, line, description, i, (int)val, lower_bound_exclusive,
-              upper_bound_exclusive);
-      err = 1;
-    }
-  }
-
-  if (err == 1)
-    exit(1);
-}
-
-#else /* MLKEM_DEBUG */
-
-#define empty_cu_debug MLKEM_NAMESPACE(empty_cu_debug)
-int empty_cu_debug;
-
-#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.h
deleted file mode 100644
index 5ce320ea2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/debug/debug.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_DEBUG_H
-#define MLKEM_DEBUG_H
-
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-/*************************************************
- * Name:        mlkem_debug_assert
- *
- * Description: Check debug assertion
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of assertion
- *              - val: Value asserted to be non-zero
- **************************************************/
-#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val);
-
-/*************************************************
- * Name:        mlkem_debug_check_bounds
- *
- * Description: Check whether values in an array of int16_t
- *              are within specified bounds.
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of check
- *              - ptr: Base of array to be checked
- *              - len: Number of int16_t in ptr
- *              - lower_bound_exclusive: Exclusive lower bound
- *              - upper_bound_exclusive: Exclusive upper bound
- **************************************************/
-#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive);
-
-/* Check assertion, calling exit() upon failure
- *
- * val: Value that's asserted to be non-zero
- * msg: Message to print on failure
- *
- * Currently called CASSERT to avoid clash with CBMC assert.
- */
-#define CASSERT(val, msg)                                 \
-  do                                                      \
-  {                                                       \
-    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
-  } while (0)
-
-/* Check absolute bounds of scalar
- * val: Scalar to be checked
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
-
-/* Check that all coefficients in array of int16_t's are non-negative
- * and below an exclusive upper bound.
- *
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * high_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define UBOUND(ptr, len, high_bound, msg)                                 \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -1, ((high_bound)));                  \
-  } while (0)
-
-/* Check absolute bounds in array of int16_t's
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define BOUND(ptr, len, abs_bound, msg)                                   \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -(abs_bound), (abs_bound));           \
-  } while (0)
-
-/* Check absolute bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
-  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
-        msg)
-
-/* Check unsigned bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- * msg: Message to print on failure */
-#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
-  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
-         msg)
-
-/* Check absolute bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLY_BOUND(ptr, abs_bound) \
-  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
-
-/* Check unsigned bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLY_UBOUND(ptr, ubound) \
-  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
-
-/* Check absolute bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLYVEC_BOUND(ptr, abs_bound)                                      \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
-                     "polyvec absolute bound for " #ptr ".vec[i]");        \
-  } while (0)
-
-/* Check unsigned bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLYVEC_UBOUND(ptr, ubound)                                        \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
-                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
-  } while (0)
-
-#define MLKEM_CONCAT_(left, right) left##right
-#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
-
-/* Following AWS-LC to define a C99-compliant static assert */
-#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
-  typedef struct                                                         \
-  {                                                                      \
-    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
-  } MLKEM_CONCAT(MLKEM_NAMESPACE(static_assertion_), msg)                \
-      __attribute__((unused));
-
-#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
-  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
-#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
-#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
-
-#else /* MLKEM_DEBUG */
-
-#define CASSERT(val, msg) \
-  do                      \
-  {                       \
-  } while (0)
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define BOUND(ptr, len, abs_bound, msg) \
-  do                                    \
-  {                                     \
-  } while (0)
-#define POLY_BOUND(ptr, abs_bound) \
-  do                               \
-  {                                \
-  } while (0)
-#define POLYVEC_BOUND(ptr, abs_bound) \
-  do                                  \
-  {                                   \
-  } while (0)
-#define POLY_BOUND_MSG(ptr, ubound, abs_bound) \
-  do                                           \
-  {                                            \
-  } while (0)
-#define UBOUND(ptr, len, high_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define POLY_UBOUND(ptr, ubound) \
-  do                             \
-  {                              \
-  } while (0)
-#define POLYVEC_UBOUND(ptr, ubound) \
-  do                                \
-  {                                 \
-  } while (0)
-#define POLY_UBOUND_MSG(ptr, ubound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define STATIC_ASSERT(cond, error)
-
-#endif /* MLKEM_DEBUG */
-
-#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c
index 4d3133e14..0cfcc3e9e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c
@@ -17,7 +17,7 @@
 #include "symmetric.h"
 
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 #include "cbmc.h"
 
@@ -25,15 +25,13 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define pack_pk MLKEM_NAMESPACE(pack_pk)
-#define unpack_pk MLKEM_NAMESPACE(unpack_pk)
-#define pack_sk MLKEM_NAMESPACE(pack_sk)
-#define unpack_sk MLKEM_NAMESPACE(unpack_sk)
-#define pack_ciphertext MLKEM_NAMESPACE(pack_ciphertext)
-#define unpack_ciphertext MLKEM_NAMESPACE(unpack_ciphertext)
-#define gen_matrix_entry_x4 MLKEM_NAMESPACE(gen_matrix_entry_x4)
-#define gen_matrix_entry MLKEM_NAMESPACE(gen_matrix_entry)
-#define matvec_mul MLKEM_NAMESPACE(matvec_mul)
+#define pack_pk MLKEM_NAMESPACE_K(pack_pk)
+#define unpack_pk MLKEM_NAMESPACE_K(unpack_pk)
+#define pack_sk MLKEM_NAMESPACE_K(pack_sk)
+#define unpack_sk MLKEM_NAMESPACE_K(unpack_sk)
+#define pack_ciphertext MLKEM_NAMESPACE_K(pack_ciphertext)
+#define unpack_ciphertext MLKEM_NAMESPACE_K(unpack_ciphertext)
+#define matvec_mul MLKEM_NAMESPACE_K(matvec_mul)
 /* End of static namespacing */
 
 /*************************************************
@@ -51,7 +49,7 @@
 static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
                     const uint8_t seed[MLKEM_SYMBYTES])
 {
-  POLYVEC_BOUND(pk, MLKEM_Q);
+  debug_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, pk);
   memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
 }
@@ -77,7 +75,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
   /* NOTE: If a modulus check was conducted on the PK, we know at this
    * point that the coefficients of `pk` are unsigned canonical. The
    * specifications and proofs, however, do _not_ assume this, and instead
-   * work with the easily provable bound by 4096. */
+   * work with the easily provable bound by UINT12_LIMIT. */
 }
 
 /*************************************************
@@ -91,7 +89,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
  **************************************************/
 static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
 {
-  POLYVEC_BOUND(sk, MLKEM_Q);
+  debug_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, sk);
 }
 
@@ -145,131 +143,11 @@ static void unpack_ciphertext(polyvec *b, poly *v,
   poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
 }
 
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-/*
- * Generate four A matrix entries from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-/*
- * Generate a single A matrix entry from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(0 <= ctr && ctr <= MLKEM_N)
-    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr,
-                                          0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
 #if !defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
 /* This namespacing is not done at the top to avoid a naming conflict
  * with native backends, which are currently not yet namespaced. */
 #define poly_permute_bitrev_to_custom \
-  MLKEM_NAMESPACE(poly_permute_bitrev_to_custom)
+  MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 __contract__(
@@ -332,7 +210,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
      * This call writes across polyvec boundaries for K=2 and K=3.
      * This is intentional and safe.
      */
-    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+    poly_rej_uniform_x4(&a[0].vec[0] + i, seedxy);
   }
 
   /* For left over polynomial, we use single keccak. */
@@ -353,12 +231,11 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
       seed0[MLKEM_SYMBYTES + 1] = x;
     }
 
-    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    poly_rej_uniform(&a[0].vec[0] + i, seed0);
     i++;
   }
 
-  cassert(i == MLKEM_K * MLKEM_K,
-          "gen_matrix: failed to generate whole matrix");
+  debug_assert(i == MLKEM_K * MLKEM_K);
 
   /*
    * The public matrix is generated in NTT domain. If the native backend
@@ -402,16 +279,12 @@ __contract__(
   for (i = 0; i < MLKEM_K; i++)
   __loop__(
     assigns(i, object_whole(out))
-    invariant(i >= 0 && i <= MLKEM_K))
+    invariant(i <= MLKEM_K))
   {
     polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
   }
 }
 
-
-
-STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_enc_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
                            uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
@@ -461,7 +334,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
   matvec_mul(&pkpv, a, &skpv, &skpv_cache);
   polyvec_tomont(&pkpv);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&pkpv, &e);
   polyvec_reduce(&pkpv);
   polyvec_reduce(&skpv);
@@ -471,11 +343,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
 }
 
 
-/* Check that the arithmetic in indcpa_enc() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
-              indcpa_enc_bound_1)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
                 const uint8_t m[MLKEM_INDCPA_MSGBYTES],
@@ -522,7 +389,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   polyvec_invntt_tomont(&b);
   poly_invntt_tomont(&v);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&b, &ep);
   poly_add(&v, &epp);
   poly_add(&v, &k);
@@ -533,9 +399,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   pack_ciphertext(c, &b, &v);
 }
 
-/* Check that the arithmetic in indcpa_dec() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
                 const uint8_t c[MLKEM_INDCPA_BYTES],
@@ -551,7 +414,6 @@ void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
   polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
   poly_invntt_tomont(&sb);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   poly_sub(&v, &sb);
   poly_reduce(&v);
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h
index 011f1aa4f..2c4fda3c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h
@@ -10,7 +10,7 @@
 #include "common.h"
 #include "polyvec.h"
 
-#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+#define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
  * Name:        gen_matrix
  *
@@ -34,7 +34,7 @@ __contract__(
   array_bound(a[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))));
 );
 
-#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+#define indcpa_keypair_derand MLKEM_NAMESPACE_K(indcpa_keypair_derand)
 /*************************************************
  * Name:        indcpa_keypair_derand
  *
@@ -60,7 +60,7 @@ __contract__(
   assigns(object_whole(sk))
 );
 
-#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+#define indcpa_enc MLKEM_NAMESPACE_K(indcpa_enc)
 /*************************************************
  * Name:        indcpa_enc
  *
@@ -89,7 +89,7 @@ __contract__(
   assigns(object_whole(c))
 );
 
-#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+#define indcpa_dec MLKEM_NAMESPACE_K(indcpa_dec)
 /*************************************************
  * Name:        indcpa_dec
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.c
index 5779d3273..88c3843be 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.c
@@ -16,8 +16,8 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define check_pk MLKEM_NAMESPACE(check_pk)
-#define check_sk MLKEM_NAMESPACE(check_sk)
+#define check_pk MLKEM_NAMESPACE_K(check_pk)
+#define check_sk MLKEM_NAMESPACE_K(check_sk)
 /* End of static namespacing */
 
 #if defined(CBMC)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.h
index 074e4771e..93caa796b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/kem.h
@@ -9,6 +9,7 @@
 #include "cbmc.h"
 #include "common.h"
 
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 /* Include to ensure consistency between internal kem.h
  * and external mlkem_native.h. */
 #include "mlkem_native.h"
@@ -25,6 +26,14 @@
 #error Mismatch for CIPHERTEXTBYTES between kem.h and mlkem_native.h
 #endif
 
+#else
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE_K(keypair_derand)
+#define crypto_kem_keypair MLKEM_NAMESPACE_K(keypair)
+#define crypto_kem_enc_derand MLKEM_NAMESPACE_K(enc_derand)
+#define crypto_kem_enc MLKEM_NAMESPACE_K(enc)
+#define crypto_kem_dec MLKEM_NAMESPACE_K(dec)
+#endif
+
 /*************************************************
  * Name:        crypto_kem_keypair_derand
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem_native.h
index 4aed4efbb..12d1d12e6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem_native.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem_native.h
@@ -59,9 +59,17 @@
 #error MLKEM_NAMESPACE_PREFIX not set by config file
 #endif
 
-#define BUILD_INFO_CONCAT_(x, y) x##_##y
-#define BUILD_INFO_CONCAT(x, y) BUILD_INFO_CONCAT_(x, y)
-#define BUILD_INFO_NAMESPACE(sym) BUILD_INFO_CONCAT(MLKEM_NAMESPACE_PREFIX, sym)
+#if defined(MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL)
+#define BUILD_INFO_CONCAT3_(x, y, z) x##y##_##z
+#define BUILD_INFO_CONCAT3(x, y, z) BUILD_INFO_CONCAT3_(x, y, z)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT3(MLKEM_NAMESPACE_PREFIX, BUILD_INFO_LVL, sym)
+#else /* MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL */
+#define BUILD_INFO_CONCAT2_(x, y) x##_##y
+#define BUILD_INFO_CONCAT2(x, y) BUILD_INFO_CONCAT2_(x, y)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT2(MLKEM_NAMESPACE_PREFIX, sym)
+#endif /* MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL */
 
 #endif /* BUILD_INFO_LVL */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c
index 02b45215c..3651c8da9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c
@@ -2,10 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include <stdint.h>
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
+#include <stdint.h>
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "ntt.h"
 #include "reduce.h"
 
@@ -45,10 +47,10 @@
  *          4 -- 6
  *             5 -- 7
  */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start,
-                                int len, int bound)
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
 __contract__(
-  requires(0 <= start && start < MLKEM_N)
+  requires(start < MLKEM_N)
   requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
   requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
   requires(-HALF_Q < zeta && zeta < HALF_Q)
@@ -60,7 +62,7 @@ __contract__(
   ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
 {
   /* `bound` is a ghost variable only needed in the CBMC specification */
-  int j;
+  unsigned j;
   ((void)bound);
   for (j = start; j < start + len; j++)
   __loop__(
@@ -93,7 +95,7 @@ __contract__(
  *   official Kyber implementation here, merely adding `layer` as
  *   a ghost variable for the specifications.
  */
-static void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
@@ -101,15 +103,15 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable only needed in the CBMC specification */
   ((void)layer);
   /* Twiddle factors for layer n start at index 2^(layer-1) */
   k = MLKEM_N / (2 * len);
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
-    invariant(0 <= start && start < MLKEM_N + 2 * len)
-    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
     invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
     invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
   {
@@ -130,9 +132,9 @@ __contract__(
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  int len, layer;
+  unsigned len, layer;
   int16_t *r;
-  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   r = p->coeffs;
 
   for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
@@ -144,30 +146,23 @@ void poly_ntt(poly *p)
   }
 
   /* Check the stronger bound */
-  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_NTT */
 
-/* Check that bound for native NTT implies contractual bound */
-STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   ntt_native(p);
-  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if !defined(MLKEM_USE_NATIVE_INTT)
 
-/* Check that bound for reference invNTT implies contractual bound */
-#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
-STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
-
 /* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, int len, int layer)
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
@@ -176,23 +171,23 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable used only in the specification */
   ((void)layer);
   k = MLKEM_N / len - 1;
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    invariant(start <= MLKEM_N && k <= 127)
     /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
     invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
   {
-    int j;
+    unsigned j;
     int16_t zeta = zetas[k--];
     for (j = start; j < start + len; j++)
     __loop__(
       invariant(start <= j && j <= start + len)
-      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(start <= MLKEM_N && k <= 127)
       invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
     {
       int16_t t = r[j];
@@ -211,13 +206,13 @@ void poly_invntt_tomont(poly *p)
    * and NTT twist. This also brings coefficients down to
    * absolute value < MLKEM_Q.
    */
-  int j, len, layer;
+  unsigned j, len, layer;
   const int16_t f = 1441;
   int16_t *r = p->coeffs;
 
   for (j = 0; j < MLKEM_N; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N)
+    invariant(j <= MLKEM_N)
     invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
   {
     r[j] = fqmul(r[j], f);
@@ -226,24 +221,21 @@ void poly_invntt_tomont(poly *p)
   /* Run the invNTT layers */
   for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
   __loop__(
-    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
   {
     invntt_layer(p->coeffs, len, layer);
   }
 
-  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_INTT */
 
-/* Check that bound for native invNTT implies contractual bound */
-STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_invntt_tomont(poly *p)
 {
   intt_native(p);
-  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_INTT */
 
@@ -252,8 +244,7 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
                     int16_t b_cached)
 {
   int32_t t0, t1;
-
-  BOUND(a, 2, 4096, "basemul input bound");
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
   t0 = (int32_t)a[1] * b_cached;
   t0 += (int32_t)a[0] * b[0];
@@ -264,5 +255,12 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
   r[0] = montgomery_reduce(t0);
   r[1] = montgomery_reduce(t1);
 
-  BOUND(r, 2, 2 * MLKEM_Q, "basemul output bound");
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
+int empty_cu_ntt;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h
index 5592bb9a2..4e80d3ab3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h
@@ -4,10 +4,10 @@
  */
 #ifndef NTT_H
 #define NTT_H
+#include "common.h"
 
 #include <stdint.h>
 #include "cbmc.h"
-#include "common.h"
 #include "poly.h"
 #include "reduce.h"
 
@@ -81,7 +81,7 @@ __contract__(
  *                   Upon return, coefficients are bound by
  *                   2*MLKEM_Q in absolute value.
  *            - a: Pointer to first input polynomial
- *                   Must be coefficient-wise < 4096 in absolute value.
+ *                   Every coefficient must be in [0..4095]
  *            - b: Pointer to second input polynomial
  *                   Can have arbitrary int16_t coefficients
  *            - b_cached: Some precomputed value, typically derived from
@@ -99,5 +99,4 @@ __contract__(
   ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
 );
 
-
-#endif
+#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h
index fa751f977..57ea4c8ba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h
@@ -25,23 +25,34 @@
 #define MLKEM_POLYBYTES 384
 #define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
 
+#define MLKEM_POLYCOMPRESSEDBYTES_D4 128
+#define MLKEM_POLYCOMPRESSEDBYTES_D5 160
+#define MLKEM_POLYCOMPRESSEDBYTES_D10 320
+#define MLKEM_POLYCOMPRESSEDBYTES_D11 352
+
 #if MLKEM_K == 2
 #define MLKEM_LVL 512
 #define MLKEM_ETA1 3
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 3
 #define MLKEM_LVL 768
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 4
 #define MLKEM_LVL 1024
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_DU 11
+#define MLKEM_DV 5
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D5
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D11
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c
index 5807879df..7483ebf6d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c
@@ -2,13 +2,15 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
 #include <stdint.h>
 #include <string.h>
-
 #include "arith_backend.h"
 #include "cbd.h"
 #include "cbmc.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "fips202x4.h"
 #include "ntt.h"
 #include "poly.h"
@@ -16,50 +18,46 @@
 #include "symmetric.h"
 #include "verify.h"
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
     __loop__(
-      invariant(k >= 0 && k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
     {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
     }
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
   }
+}
 
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  __loop__(invariant(j <= MLKEM_N / 4))
   {
     unsigned k;
     uint16_t t[4];
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(k >= 0 && k <= 4)
+      invariant(k <= 4)
       invariant(forall(r, 0, k, t[r] < (1u << 10))))
     {
       t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
@@ -75,51 +73,35 @@ void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
     r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
     r[5 * j + 4] = (t[3] >> 2);
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
 }
 
-
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
   {
-    int k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(0 <= k && k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
   for (j = 0; j < MLKEM_N / 4; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(j <= MLKEM_N / 4)
     invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
   {
-    int k;
+    unsigned k;
     uint16_t t[4];
     uint8_t const *base = &a[5 * j];
 
@@ -130,51 +112,33 @@ void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
 
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(0 <= k && k <= 4)
+      invariant(k <= 4)
       invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
     {
       r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     uint8_t t[8] = {0};
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_bound(t, 0, j, 0, 32)))
     {
       t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
@@ -191,33 +155,57 @@ void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
     r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
     r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 }
 
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
 {
-  unsigned i;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
   {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     uint8_t t[8];
-    const int offset = i * 5;
+    const unsigned offset = i * 5;
     /*
      * Explicitly truncate to avoid warning about
      * implicit truncation in CBMC and unwind loop for ease
@@ -240,29 +228,62 @@ void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
     /* and copy to the correct slice in r[] */
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(j <= 8 && i <= MLKEM_N / 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
 #if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
-
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  __loop__(invariant(i <= MLKEM_N / 2))
   {
     const uint16_t t0 = a->coeffs[2 * i];
     const uint16_t t1 = a->coeffs[2 * i + 1];
@@ -290,7 +311,7 @@ void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   poly_tobytes_native(r, a);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
@@ -302,7 +323,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   unsigned i;
   for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(i <= MLKEM_N / 2)
     invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
   {
     const uint8_t t0 = a[3 * i + 0];
@@ -313,7 +334,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   }
 
   /* Note that the coefficients are not canonical */
-  POLY_UBOUND(r, 4096);
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 MLKEM_NATIVE_INTERNAL_API
@@ -333,13 +354,13 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
 
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <  MLKEM_N / 8 && j <= 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       /* Prevent the compiler from recognizing this as a bit selection */
@@ -347,23 +368,23 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
       r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
     }
   }
-  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     msg[i] = 0;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+      invariant(i <= MLKEM_N / 8 && j <= 8))
     {
       uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
       msg[i] |= t << j;
@@ -371,104 +392,17 @@ void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
   }
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  POLY_BOUND_MSG(r, MLKEM_ETA1 + 1, "poly_getnoise_eta2 output");
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
-}
-#endif /* MLKEM_K == 2 */
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
                                     const poly_mulcache *b_cache)
 {
   unsigned i;
-  POLY_BOUND(b_cache, 4096);
+  debug_assert_bound(a, MLKEM_N, 0, UINT12_LIMIT);
 
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
     assigns(i, object_whole(r))
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 2 * MLKEM_Q)))
   {
     basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
@@ -476,6 +410,8 @@ void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
     basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
                    &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
   }
+
+  debug_assert_abs_bound(r, MLKEM_N, 2 * MLKEM_Q);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -486,20 +422,20 @@ void poly_tomont(poly *r)
   const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
-    invariant(array_abs_bound(r->coeffs ,0, i, MLKEM_Q)))
+    invariant(i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
   {
     r->coeffs[i] = fqmul(r->coeffs[i], f);
   }
 
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
   poly_tomont_native(r);
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
@@ -510,7 +446,7 @@ void poly_reduce(poly *r)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
   {
     /* Barrett reduction, giving signed canonical representative */
@@ -519,14 +455,14 @@ void poly_reduce(poly *r)
     r->coeffs[i] = scalar_signed_to_unsigned_q(t);
   }
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
   poly_reduce_native(r);
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
@@ -536,7 +472,7 @@ void poly_add(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
   {
@@ -550,7 +486,7 @@ void poly_sub(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
   {
@@ -564,20 +500,36 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
   {
     x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
     x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
   }
-  POLY_BOUND(x, MLKEM_Q);
+
+  /*
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
+   */
+  debug_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   poly_mulcache_compute_native(x, a);
-  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+  /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
+int empty_cu_poly;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h
index 1e8c109c6..6a14c785d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h
@@ -307,112 +307,164 @@ __contract__(
  ************************************************************/
 static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
 __contract__(
-  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
-  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
   ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
 {
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
   /* Add Q if c is negative, but in constant time */
   c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
 
-  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
-  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
-
   /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
   return (uint16_t)c;
 }
 
-#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
 /*************************************************
- * Name:        poly_compress_du
+ * Name:        poly_compress_d4
  *
- * Description: Compression (du bits) and subsequent serialization of a
- *polynomial
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-);
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
 
-#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
 /*************************************************
- * Name:        poly_decompress_du
+ * Name:        poly_decompress_d4
  *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *polynomial; approximate inverse of poly_compress_du
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
 
-#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
 /*************************************************
- * Name:        poly_compress_dv
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
  *
- * Description: Compression (dv bits) and subsequent serialization of a
- *polynomial
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
 
-#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
 /*************************************************
- * Name:        poly_decompress_dv
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
  *
  * Description: De-serialization and subsequent decompression (dv bits) of a
- *polynomial; approximate inverse of poly_compress
+ *              polynomial; approximate inverse of poly_compress_d5
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV
- *bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
 
 #define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
 /*************************************************
@@ -500,144 +552,6 @@ __contract__(
   assigns(object_whole(msg))
 );
 
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -649,8 +563,7 @@ __contract__(
  *              Bounds:
  *              - a is assumed to be coefficient-wise < q in absolute value.
  *
- *              The result is coefficient-wise bound by 3/2 q in absolute
- *              value.
+ *              The result is coefficient-wise bound by 2*q in absolute value.
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const poly *a: pointer to first input polynomial
@@ -802,4 +715,4 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#endif
+#endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c
index 7d2016773..50ea1c34a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c
@@ -4,18 +4,29 @@
  */
 #include "polyvec.h"
 #include <stdint.h>
+#include <string.h>
 #include "arith_backend.h"
+#include "cbd.h"
 #include "ntt.h"
 #include "poly.h"
+#include "symmetric.h"
 
-#include "debug/debug.h"
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
                          const polyvec *a)
 {
   unsigned i;
-  POLYVEC_UBOUND(a, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_K; i++)
   {
@@ -33,13 +44,15 @@ void polyvec_decompress_du(polyvec *r,
     poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
   }
 
-  POLYVEC_UBOUND(r, MLKEM_Q);
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
 {
   unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
   for (i = 0; i < MLKEM_K; i++)
   {
     poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
@@ -54,6 +67,8 @@ void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
   {
     poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -64,6 +79,8 @@ void polyvec_ntt(polyvec *r)
   {
     poly_ntt(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -74,6 +91,8 @@ void polyvec_invntt_tomont(polyvec *r)
   {
     poly_invntt_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -84,10 +103,7 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
 {
   unsigned i;
   poly t;
-
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  POLYVEC_BOUND(b_cache, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 
   poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
   for (i = 1; i < MLKEM_K; i++)
@@ -95,18 +111,15 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
     poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
                                    &b_cache->vec[i]);
     poly_add(r, &t);
-    /* abs bounds: < (i+1) * 3/2 * q */
   }
 
   /*
-   * Those bounds are true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus best to omit
-   * them from the spec to not unnecessarily constraint native implementations.
+   * This bound (MLKEM_K * 2 * MLKEM_Q on all MLKEM_N coefficients) holds
+   * for the C implementation, but is not needed in the higher level bounds
+   * reasoning. It is thus omitted from the spec to not unnecessarily
+   * constrain native implementations, but checked here nonetheless.
    */
-  cassert(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_K * 2 * MLKEM_Q),
-          "polyvec_basemul_acc_montgomery_cached output bounds");
-  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
-  POLY_BOUND(r, MLKEM_K * 2 * MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
 }
 #else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 MLKEM_NATIVE_INTERNAL_API
@@ -114,9 +127,8 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
                                            const polyvec *b,
                                            const polyvec_mulcache *b_cache)
 {
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
   polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
@@ -149,6 +161,8 @@ void polyvec_reduce(polyvec *r)
   {
     poly_reduce(&r->vec[i]);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -169,4 +183,148 @@ void polyvec_tomont(polyvec *r)
   {
     poly_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute a
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: MLKEM_ETA1 * MLKEM_N / 4 input bytes
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t key0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t key1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t key2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t key3[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t rnd0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t rnd1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t rnd2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t rnd3[MLKEM_ETA1 * MLKEM_N / 4];
+  /* Each PRF input is the shared seed followed by a per-lane nonce byte. */
+  memcpy(key0, seed, MLKEM_SYMBYTES);
+  key0[MLKEM_SYMBYTES] = nonce0;
+  memcpy(key1, seed, MLKEM_SYMBYTES);
+  key1[MLKEM_SYMBYTES] = nonce1;
+  memcpy(key2, seed, MLKEM_SYMBYTES);
+  key2[MLKEM_SYMBYTES] = nonce2;
+  memcpy(key3, seed, MLKEM_SYMBYTES);
+  key3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(rnd0, rnd1, rnd2, rnd3, key0, key1, key2, key3);
+  poly_cbd_eta1(r0, rnd0);
+  poly_cbd_eta1(r1, rnd1);
+  poly_cbd_eta1(r2, rnd2);
+  poly_cbd_eta1(r3, rnd3);
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute a
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: MLKEM_ETA2 * MLKEM_N / 4 input bytes
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+  /* PRF input is the seed followed by the one-byte nonce. */
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+  /* Turn the PRF output into a polynomial via the eta2 CBD sampler. */
+  poly_cbd_eta2(r, buf);
+  /* Output bound is governed by MLKEM_ETA2 (not MLKEM_ETA1). */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t eta1_buf[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t eta2_buf[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+  /* r0, r1 are sampled with MLKEM_ETA1; r2, r3 with MLKEM_ETA2. */
+  prf_eta1(eta1_buf[0], extkey[0]);
+  prf_eta1(eta1_buf[1], extkey[1]);
+  prf_eta2(eta2_buf[0], extkey[2]);
+  prf_eta2(eta2_buf[1], extkey[3]);
+  /* Convert each PRF output into a polynomial with the matching sampler. */
+  poly_cbd_eta1(r0, eta1_buf[0]);
+  poly_cbd_eta1(r1, eta1_buf[1]);
+  poly_cbd_eta2(r2, eta2_buf[0]);
+  poly_cbd_eta2(r3, eta2_buf[1]);
+  /* Per-polynomial coefficient bounds follow the eta used for sampling. */
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
 }
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h
index 138724150..8be8579e0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h
@@ -9,19 +9,144 @@
 #include "common.h"
 #include "poly.h"
 
-#define polyvec MLKEM_NAMESPACE(polyvec)
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
 typedef struct
 {
   poly vec[MLKEM_K];
 } ALIGN polyvec;
 
-#define polyvec_mulcache MLKEM_NAMESPACE(polyvec_mulcache)
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
 typedef struct
 {
   poly_mulcache vec[MLKEM_K];
 } polyvec_mulcache;
 
-#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
 /*************************************************
  * Name:        polyvec_compress_du
  *
@@ -44,7 +169,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
 /*************************************************
  * Name:        polyvec_decompress_du
  *
@@ -67,7 +192,7 @@ __contract__(
          array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
 /*************************************************
  * Name:        polyvec_tobytes
  *
@@ -88,7 +213,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
 /*************************************************
  * Name:        polyvec_frombytes
  *
@@ -110,7 +235,7 @@ __contract__(
         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
 );
 
-#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
 /*************************************************
  * Name:        polyvec_ntt
  *
@@ -136,7 +261,7 @@ __contract__(
   array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
 );
 
-#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
 /*************************************************
  * Name:        polyvec_invntt_tomont
  *
@@ -162,7 +287,7 @@ __contract__(
 );
 
 #define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery
  *
@@ -186,7 +311,7 @@ __contract__(
 
 
 #define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery_cached
  *
@@ -194,7 +319,7 @@ __contract__(
  *              using mulcache for second operand.
  *
  *              Bounds:
- *              - a is assumed to be coefficient-wise < 4096 in absolute value.
+ *              - Every coefficient of a is assumed to be in [0..4095]
  *              - No bounds guarantees for the coefficients in the result.
  *
  * Arguments:   - poly *r: pointer to output polynomial
@@ -218,7 +343,7 @@ __contract__(
   assigns(memory_slice(r, sizeof(poly)))
 );
 
-#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
 /************************************************************
  * Name: polyvec_mulcache_compute
  *
@@ -252,7 +377,7 @@ __contract__(
   assigns(object_whole(x))
 );
 
-#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
 /*************************************************
  * Name:        polyvec_reduce
  *
@@ -278,7 +403,7 @@ __contract__(
     array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
 /*************************************************
  * Name:        polyvec_add
  *
@@ -309,7 +434,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
 /*************************************************
  * Name:        polyvec_tomont
  *
@@ -329,4 +454,142 @@ __contract__(
     array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
 );
 
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameters MLKEM_ETA1 and MLKEM_ETA2
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+);
+#endif /* MLKEM_K == 2 */
+
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h
index 1f502167e..b432a4201 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -109,13 +109,13 @@ static INLINE int16_t montgomery_reduce_generic(int32_t a)
  **************************************************/
 static INLINE int16_t montgomery_reduce(int32_t a)
 __contract__(
-  requires(a > -(2 * 4096 * 32768))
-  requires(a <  (2 * 4096 * 32768))
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
   ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
 )
 {
   int16_t res;
-  SCALAR_BOUND(a, 2 * UINT12_LIMIT * 32768, "montgomery_reduce input");
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
 
   res = montgomery_reduce_generic(a);
   /* Bounds:
@@ -124,7 +124,7 @@ __contract__(
    *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
    *        < 2 * MLKEM_Q */
 
-  SCALAR_BOUND(res, 2 * MLKEM_Q, "montgomery_reduce output");
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
   return res;
 }
 
@@ -150,7 +150,7 @@ __contract__(
 )
 {
   int16_t res;
-  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+  debug_assert_abs_bound(&b, 1, HALF_Q);
 
   res = montgomery_reduce((int32_t)a * (int32_t)b);
   /* Bounds:
@@ -160,7 +160,7 @@ __contract__(
    *        < MLKEM_Q
    */
 
-  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
   return res;
 }
 
@@ -200,7 +200,10 @@ __contract__(
    * t is in -10 .. +10, so we need 32-bit math to
    * evaluate t * MLKEM_Q and the subsequent subtraction
    */
-  return (int16_t)(a - t * MLKEM_Q);
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
+
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c
index 918986e9b..cbbe4407f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c
@@ -2,46 +2,24 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
-#include "rej_uniform.h"
 #include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
 /* End of static namespacing */
 
-/*************************************************
- * Name:        rej_uniform_scalar
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
                                        unsigned int offset, const uint8_t *buf,
                                        unsigned int buflen)
@@ -58,6 +36,8 @@ __contract__(
   unsigned int ctr, pos;
   uint16_t val0, val1;
 
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
   ctr = offset;
   pos = 0;
   /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
@@ -79,28 +59,183 @@ __contract__(
       r[ctr++] = val1;
     }
   }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
   return ctr;
 }
 
 #if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, all of the input buffer
+ * is guaranteed to have been consumed. If it is equal to target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
 {
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
 {
   int ret;
 
   /* Sample from large buffer with full lane as much as possible. */
   ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
   if (ret != -1)
-    return offset + (unsigned)ret;
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
 
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   */
+  /* This should generate the matrix entry with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
+int empty_cu_rej_uniform;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h
index 13db836bc..801287259 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h
@@ -9,54 +9,55 @@
 #include <stdlib.h>
 #include "cbmc.h"
 #include "common.h"
+#include "poly.h"
 
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
- * Name:        rej_uniform
+ * Name:        poly_rej_uniform_x4
  *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
  *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
  *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
  **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
 __contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-);
-#endif
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric.h
index 55ebbbd53..3563e5505 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/symmetric.h
@@ -10,6 +10,7 @@
 #include "cbmc.h"
 #include "common.h"
 #include "fips202.h"
+#include "fips202x4.h"
 
 /* Macros denoting FIPS-203 specific Hash functions */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.c
index b7078fcc1..9f39dcd22 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.c
@@ -4,7 +4,8 @@
  */
 #include "verify.h"
 
-#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER) && \
+    !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 /*
  * Masking value used in constant-time functions from
  * verify.h to block the compiler's range analysis and
@@ -12,9 +13,11 @@
  */
 volatile uint64_t ct_opt_blocker_u64 = 0;
 
-#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+#else /* MLKEM_USE_ASM_VALUE_BARRIER && \
+         !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_verify MLKEM_NAMESPACE(empty_cu_verify)
+#define empty_cu_verify MLKEM_NAMESPACE_K(empty_cu_verify)
 int empty_cu_verify;
 
-#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER && \
+          !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.h
index 8c47155dc..f6ecf5eba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/verify.h
@@ -268,7 +268,7 @@ __contract__(
 
   for (i = 0; i < len; i++)
   __loop__(
-    invariant(i >= 0 && i <= len)
+    invariant(i <= len)
     invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))))
   {
     r |= a[i] ^ b[i];
@@ -314,4 +314,4 @@ __contract__(
   }
 }
 
-#endif
+#endif /* VERIFY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c
index 1a26e0dd5..4ef887c62 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c
@@ -8,6 +8,8 @@
  *          Do not modify it directly.
  */
 
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 #include "ntt.h"
 
 /*
@@ -28,3 +30,10 @@ ALIGN const int16_t zetas[128] = {
     -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
     -308,  996,   991,   958,   -1460, 1522,  1628,
 };
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_zetas MLKEM_NAMESPACE_K(empty_cu_zetas)
+int empty_cu_zetas;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h
index 09e30f207..0543b1bd1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h
@@ -16,7 +16,9 @@
  *
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 #include "api.h"
 #endif
+#endif
 
 #endif /* MLKEM_NATIVE_ARITH_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.c
index 433bdc954..1e6b7c5d1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.c
@@ -2,8 +2,11 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "cbd.h"
+#include "common.h"
+#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+
 #include <stdint.h>
+#include "cbd.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -11,8 +14,6 @@
  * within a single compilation unit. */
 #define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
 #define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-#define cbd2 MLKEM_NAMESPACE(cbd2)
-#define cbd3 MLKEM_NAMESPACE(cbd3)
 /* End of static namespacing */
 
 /*************************************************
@@ -35,44 +36,13 @@ static uint32_t load32_littleendian(const uint8_t x[4])
   return r;
 }
 
-#if MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif /* MLKEM_ETA1 == 3 */
-
-/*************************************************
- * Name:        cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
   {
     unsigned j;
@@ -82,7 +52,7 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
     {
       const int16_t a = (d >> (4 * j + 0)) & 0x3;
@@ -92,24 +62,34 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
   }
 }
 
-#if MLKEM_ETA1 == 3
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
 /*************************************************
- * Name:        cbd3
+ * Name:        load24_littleendian
  *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
  *              This function is only needed for ML-KEM-512
  *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
  **************************************************/
-static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
   {
     unsigned j;
@@ -120,7 +100,7 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 
     for (j = 0; j < 4; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(i <= MLKEM_N / 4 && j <= 4)
       invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
     {
       const int16_t a = (d >> (6 * j + 0)) & 0x7;
@@ -129,28 +109,12 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
     }
   }
 }
-#endif /* MLKEM_ETA1 == 3 */
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-{
-#if MLKEM_ETA1 == 2
-  cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-{
-#if MLKEM_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
+int empty_cu_cbd;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.h
index 15db89570..54c1f5b90 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.h
@@ -9,46 +9,35 @@
 #include "common.h"
 #include "poly.h"
 
-#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd2
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *              a centered binomial distribution with parameter eta=2
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-);
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd3
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
 
-#endif
+#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbmc.h
index baa0bfa9f..52b95bc3f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbmc.h
@@ -13,7 +13,7 @@
 
 #define __contract__(x)
 #define __loop__(x)
-#define cassert(x, y)
+#define cassert(x)
 
 #else /* CBMC _is_ defined, therefore we're doing proof */
 
@@ -30,7 +30,7 @@
 #define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
 #define decreases(...) __CPROVER_decreases(__VA_ARGS__)
 /* cassert to avoid confusion with in-built assert */
-#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
 #define assume(...) __CPROVER_assume(__VA_ARGS__)
 
 /***************************************************
@@ -119,13 +119,13 @@
   {                                                                    \
     unsigned qvar;                                                     \
     ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
-        (((value_lb) <= (array_var[(qvar)])) &&                        \
-        ((array_var[(qvar)]) < (value_ub)))                            \
+        (((int)(value_lb) <= ((array_var)[(qvar)])) &&		       \
+         (((array_var)[(qvar)]) < (int)(value_ub)))		       \
   }
 
 #define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
   array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb),      \
-                   (qvar_ub), (array_var), (value_lb), (value_ub))
+      (qvar_ub), (array_var), (value_lb), (value_ub))
 /* clang-format on */
 
 /* Wrapper around array_bound operating on absolute values.
@@ -134,6 +134,6 @@
  * bound in array_bound is inclusive, we have to raise it by 1.
  */
 #define array_abs_bound(arr, lb, ub, k) \
-  array_bound((arr), (lb), (ub), -(k) + 1, (k))
+  array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h
index da886780c..4f326333e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h
@@ -43,23 +43,30 @@
 #define MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2) x1##_##x2
 #define MLKEM_NATIVE_MAKE_NAMESPACE(x1, x2) MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2)
 
-#define FIPS202_NAMESPACE(s) \
-  MLKEM_NATIVE_MAKE_NAMESPACE(FIPS202_NAMESPACE_PREFIX, s)
-
 #define MLKEM_NAMESPACE(s) \
   MLKEM_NATIVE_MAKE_NAMESPACE(MLKEM_NAMESPACE_PREFIX, s)
 
+#if defined(MLKEM_NAMESPACE_PREFIX_ADD_LEVEL)
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3) x1##x2##_##x3
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K(x1, x2, x3) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3)
+#define MLKEM_NAMESPACE_K(s) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K(MLKEM_NAMESPACE_PREFIX, MLKEM_LVL, s)
+#else
+#define MLKEM_NAMESPACE_K(s) MLKEM_NAMESPACE(s)
+#endif
+
 /* On Apple platforms, we need to emit leading underscore
  * in front of assembly symbols. We thus introducee a separate
  * namespace wrapper for ASM symbols. */
 #if !defined(__APPLE__)
 #define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym)
-#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym)
+#define MLKEM_ASM_NAMESPACE_K(sym) MLKEM_NAMESPACE_K(sym)
 #else
 #define PREFIX_UNDERSCORE_(sym) _##sym
 #define PREFIX_UNDERSCORE(sym) PREFIX_UNDERSCORE_(sym)
 #define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym))
-#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym))
+#define MLKEM_ASM_NAMESPACE_K(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE_K(sym))
 #endif
 
 #endif /* MLKEM_NATIVE_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h
index d1441835b..fa89370ce 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h
@@ -40,10 +40,12 @@
 /* #define MLKEM_NATIVE_CONFIG_FILE "config.h" */
 
 /******************************************************************************
- * Name:        MLKEM_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/.
+ * Description: The prefix to use to namespace global symbols from mlkem/.
+ *
+ *              Level-dependent symbols will additionally be prefixed with the
+ *              security level if MLKEM_NAMESPACE_PREFIX_ADD_LEVEL is set.
  *
  *              This can also be set using CFLAGS.
  *
@@ -53,17 +55,71 @@
 #endif
 
 /******************************************************************************
- * Name:        FIPS202_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX_ADD_LEVEL
+ *
+ * Description: If set, the level (512, 768, 1024) is added to the namespace
+ *              prefix MLKEM_NAMESPACE_PREFIX for all functions which are
+ *              level-dependent. Level-independent functions will have their
+ *              symbol prefixed by MLKEM_NAMESPACE_PREFIX only.
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/fips202/.
+ *              This is intended to be used for multi-level builds where
+ *              level-independent code should be shared across levels.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(FIPS202_NAMESPACE_PREFIX)
-#define FIPS202_NAMESPACE_PREFIX FIPS202_DEFAULT_NAMESPACE_PREFIX
-#endif
+/* #define MLKEM_NAMESPACE_PREFIX_ADD_LEVEL */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, all MLKEM_K-independent code will be included
+ *              in the build, including code needed only for other security
+ *              levels.
+ *
+ *              Example: poly_cbd3 is only needed for MLKEM_K == 2. Yet, if
+ *              this option is set for a build with MLKEM_K==3/4, it would
+ *              be included.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, no MLKEM_K-independent code will be included
+ *              in the build.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
  * Name:        MLKEM_USE_NATIVE
@@ -112,25 +168,13 @@
 /* Default namespace
  *
  * Don't change this. If you need a different namespace, re-define
- * MLKEM_NAMESPACE above instead, and remove the following.
- */
-
-/*
- * The default FIPS202 namespace is
- *
- *   PQCP_MLKEM_NATIVE_FIPS202_<BACKEND>_
+ * MLKEM_NAMESPACE_PREFIX above instead, and remove the following.
  *
- * e.g., PQCP_MLKEM_NATIVE_FIPS202_C_
- */
-
-#define FIPS202_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_FIPS202
-
-/*
  * The default MLKEM namespace is
  *
- *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_<BACKEND>_
+ *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_
  *
- * e.g., PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_
+ * e.g., PQCP_MLKEM_NATIVE_MLKEM512_
  */
 
 #if MLKEM_K == 2
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug.c
new file mode 100644
index 000000000..4b4857cbc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/* NOTE: You can remove this file unless you compile with MLKEM_DEBUG. */
+
+#include "common.h"
+
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) && defined(MLKEM_DEBUG)
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+
+#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
+
+void mlkem_debug_assert(const char *file, int line, const int val)
+{
+  if (val == 0)
+  {
+    fprintf(stderr,
+            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
+            file, line, val);
+    exit(1);
+  }
+}
+
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{
+  int err = 0;
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int16_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    {
+      fprintf(
+          stderr,
+          MLKEM_NATIVE_DEBUG_ERROR_HEADER
+          "Bounds assertion failed: Index %u, value %d out of bounds (%d,%d)\n",
+          file, line, i, (int)val, lower_bound_exclusive,
+          upper_bound_exclusive);
+      err = 1;
+    }
+  }
+
+  if (err == 1)
+    exit(1);
+}
+
+#else /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
+
+#define empty_cu_debug MLKEM_NAMESPACE_K(empty_cu_debug)
+int empty_cu_debug;
+
+#endif /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug.h
new file mode 100644
index 000000000..1103124db
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+#include "common.h"
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check that a debug assertion holds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
+void mlkem_debug_assert(const char *file, int line, const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ */
+#define debug_assert(val) mlkem_debug_assert(__FILE__, __LINE__, (val))
+
+/* Check bounds in array of int16_t's
+ * ptr: Base of int16_t array; will be explicitly cast to int16_t*,
+ *      so you may pass a byte-compatible type such as poly or polyvec.
+ * len: Number of int16_t in array
+ * value_lb: Inclusive lower value bound
+ * value_ub: Exclusive upper value bound */
+#define debug_assert_bound(ptr, len, value_lb, value_ub)                      \
+  mlkem_debug_check_bounds(__FILE__, __LINE__, (const int16_t *)(ptr), (len), \
+                           (value_lb)-1, (value_ub))
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * value_abs_bd: Exclusive absolute upper bound */
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  debug_assert_bound((ptr), (len), (-(value_abs_bd) + 1), (value_abs_bd))
+
+/* Version of bounds assertions for 2-dimensional arrays */
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  debug_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  debug_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
+
+/* When running CBMC, convert debug assertions into proof obligations */
+#elif defined(CBMC)
+
+#include "../cbmc.h"
+
+#define debug_assert(val) cassert(val)
+
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  cassert(array_bound(((int16_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
+
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  cassert(array_abs_bound(((int16_t *)(ptr)), 0, (len), (value_abs_bd)))
+
+/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
+ * just use a single flattened array_bound(...) here. */
+#define debug_assert_bound_2d(ptr, M, N, value_lb, value_ub)           \
+  cassert(forall(kN, 0, (M),                                           \
+                 array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                             (value_lb), (value_ub))))
+
+#define debug_assert_abs_bound_2d(ptr, M, N, value_abs_bd)                 \
+  cassert(forall(kN, 0, (M),                                               \
+                 array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                                 (value_abs_bd))))
+
+#else /* !MLKEM_DEBUG && !CBMC */
+
+#define debug_assert(val) \
+  do                      \
+  {                       \
+  } while (0)
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  do                                                     \
+  {                                                      \
+  } while (0)
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  do                                                   \
+  {                                                    \
+  } while (0)
+
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  do                                                               \
+  {                                                                \
+  } while (0)
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  do                                                             \
+  {                                                              \
+  } while (0)
+
+
+#endif /* MLKEM_DEBUG */
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug/debug.c
deleted file mode 100644
index 64294ebe1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug/debug.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-
-#include <stdio.h>
-#include "debug.h"
-
-#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
-
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val)
-{
-  if (val == 0)
-  {
-    fprintf(stderr,
-            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed: %s (value %d)\n",
-            file, line, description, val);
-    exit(1);
-  }
-}
-
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive)
-{
-  int err = 0;
-  unsigned i;
-  for (i = 0; i < len; i++)
-  {
-    int16_t val = ptr[i];
-    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
-    {
-      fprintf(stderr,
-              MLKEM_NATIVE_DEBUG_ERROR_HEADER
-              "%s, index %u, value %d out of bounds (%d,%d)\n",
-              file, line, description, i, (int)val, lower_bound_exclusive,
-              upper_bound_exclusive);
-      err = 1;
-    }
-  }
-
-  if (err == 1)
-    exit(1);
-}
-
-#else /* MLKEM_DEBUG */
-
-#define empty_cu_debug MLKEM_NAMESPACE(empty_cu_debug)
-int empty_cu_debug;
-
-#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug/debug.h
deleted file mode 100644
index 5ce320ea2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/debug/debug.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_DEBUG_H
-#define MLKEM_DEBUG_H
-
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-/*************************************************
- * Name:        mlkem_debug_assert
- *
- * Description: Check debug assertion
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of assertion
- *              - val: Value asserted to be non-zero
- **************************************************/
-#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val);
-
-/*************************************************
- * Name:        mlkem_debug_check_bounds
- *
- * Description: Check whether values in an array of int16_t
- *              are within specified bounds.
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of check
- *              - ptr: Base of array to be checked
- *              - len: Number of int16_t in ptr
- *              - lower_bound_exclusive: Exclusive lower bound
- *              - upper_bound_exclusive: Exclusive upper bound
- **************************************************/
-#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive);
-
-/* Check assertion, calling exit() upon failure
- *
- * val: Value that's asserted to be non-zero
- * msg: Message to print on failure
- *
- * Currently called CASSERT to avoid clash with CBMC assert.
- */
-#define CASSERT(val, msg)                                 \
-  do                                                      \
-  {                                                       \
-    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
-  } while (0)
-
-/* Check absolute bounds of scalar
- * val: Scalar to be checked
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
-
-/* Check that all coefficients in array of int16_t's are non-negative
- * and below an exclusive upper bound.
- *
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * high_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define UBOUND(ptr, len, high_bound, msg)                                 \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -1, ((high_bound)));                  \
-  } while (0)
-
-/* Check absolute bounds in array of int16_t's
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define BOUND(ptr, len, abs_bound, msg)                                   \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -(abs_bound), (abs_bound));           \
-  } while (0)
-
-/* Check absolute bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
-  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
-        msg)
-
-/* Check unsigned bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- * msg: Message to print on failure */
-#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
-  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
-         msg)
-
-/* Check absolute bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLY_BOUND(ptr, abs_bound) \
-  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
-
-/* Check unsigned bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLY_UBOUND(ptr, ubound) \
-  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
-
-/* Check absolute bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLYVEC_BOUND(ptr, abs_bound)                                      \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
-                     "polyvec absolute bound for " #ptr ".vec[i]");        \
-  } while (0)
-
-/* Check unsigned bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLYVEC_UBOUND(ptr, ubound)                                        \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
-                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
-  } while (0)
-
-#define MLKEM_CONCAT_(left, right) left##right
-#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
-
-/* Following AWS-LC to define a C99-compliant static assert */
-#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
-  typedef struct                                                         \
-  {                                                                      \
-    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
-  } MLKEM_CONCAT(MLKEM_NAMESPACE(static_assertion_), msg)                \
-      __attribute__((unused));
-
-#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
-  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
-#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
-#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
-
-#else /* MLKEM_DEBUG */
-
-#define CASSERT(val, msg) \
-  do                      \
-  {                       \
-  } while (0)
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define BOUND(ptr, len, abs_bound, msg) \
-  do                                    \
-  {                                     \
-  } while (0)
-#define POLY_BOUND(ptr, abs_bound) \
-  do                               \
-  {                                \
-  } while (0)
-#define POLYVEC_BOUND(ptr, abs_bound) \
-  do                                  \
-  {                                   \
-  } while (0)
-#define POLY_BOUND_MSG(ptr, ubound, abs_bound) \
-  do                                           \
-  {                                            \
-  } while (0)
-#define UBOUND(ptr, len, high_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define POLY_UBOUND(ptr, ubound) \
-  do                             \
-  {                              \
-  } while (0)
-#define POLYVEC_UBOUND(ptr, ubound) \
-  do                                \
-  {                                 \
-  } while (0)
-#define POLY_UBOUND_MSG(ptr, ubound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define STATIC_ASSERT(cond, error)
-
-#endif /* MLKEM_DEBUG */
-
-#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c
index 4d3133e14..0cfcc3e9e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c
@@ -17,7 +17,7 @@
 #include "symmetric.h"
 
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 #include "cbmc.h"
 
@@ -25,15 +25,13 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define pack_pk MLKEM_NAMESPACE(pack_pk)
-#define unpack_pk MLKEM_NAMESPACE(unpack_pk)
-#define pack_sk MLKEM_NAMESPACE(pack_sk)
-#define unpack_sk MLKEM_NAMESPACE(unpack_sk)
-#define pack_ciphertext MLKEM_NAMESPACE(pack_ciphertext)
-#define unpack_ciphertext MLKEM_NAMESPACE(unpack_ciphertext)
-#define gen_matrix_entry_x4 MLKEM_NAMESPACE(gen_matrix_entry_x4)
-#define gen_matrix_entry MLKEM_NAMESPACE(gen_matrix_entry)
-#define matvec_mul MLKEM_NAMESPACE(matvec_mul)
+#define pack_pk MLKEM_NAMESPACE_K(pack_pk)
+#define unpack_pk MLKEM_NAMESPACE_K(unpack_pk)
+#define pack_sk MLKEM_NAMESPACE_K(pack_sk)
+#define unpack_sk MLKEM_NAMESPACE_K(unpack_sk)
+#define pack_ciphertext MLKEM_NAMESPACE_K(pack_ciphertext)
+#define unpack_ciphertext MLKEM_NAMESPACE_K(unpack_ciphertext)
+#define matvec_mul MLKEM_NAMESPACE_K(matvec_mul)
 /* End of static namespacing */
 
 /*************************************************
@@ -51,7 +49,7 @@
 static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
                     const uint8_t seed[MLKEM_SYMBYTES])
 {
-  POLYVEC_BOUND(pk, MLKEM_Q);
+  debug_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, pk);
   memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
 }
@@ -77,7 +75,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
   /* NOTE: If a modulus check was conducted on the PK, we know at this
    * point that the coefficients of `pk` are unsigned canonical. The
    * specifications and proofs, however, do _not_ assume this, and instead
-   * work with the easily provable bound by 4096. */
+   * work with the easily provable bound by UINT12_LIMIT. */
 }
 
 /*************************************************
@@ -91,7 +89,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
  **************************************************/
 static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
 {
-  POLYVEC_BOUND(sk, MLKEM_Q);
+  debug_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, sk);
 }
 
@@ -145,131 +143,11 @@ static void unpack_ciphertext(polyvec *b, poly *v,
   poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
 }
 
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-/*
- * Generate four A matrix entries from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-/*
- * Generate a single A matrix entry from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(0 <= ctr && ctr <= MLKEM_N)
-    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr,
-                                          0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
 #if !defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
 /* This namespacing is not done at the top to avoid a naming conflict
  * with native backends, which are currently not yet namespaced. */
 #define poly_permute_bitrev_to_custom \
-  MLKEM_NAMESPACE(poly_permute_bitrev_to_custom)
+  MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 __contract__(
@@ -332,7 +210,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
      * This call writes across polyvec boundaries for K=2 and K=3.
      * This is intentional and safe.
      */
-    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+    poly_rej_uniform_x4(&a[0].vec[0] + i, seedxy);
   }
 
   /* For left over polynomial, we use single keccak. */
@@ -353,12 +231,11 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
       seed0[MLKEM_SYMBYTES + 1] = x;
     }
 
-    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    poly_rej_uniform(&a[0].vec[0] + i, seed0);
     i++;
   }
 
-  cassert(i == MLKEM_K * MLKEM_K,
-          "gen_matrix: failed to generate whole matrix");
+  debug_assert(i == MLKEM_K * MLKEM_K);
 
   /*
    * The public matrix is generated in NTT domain. If the native backend
@@ -402,16 +279,12 @@ __contract__(
   for (i = 0; i < MLKEM_K; i++)
   __loop__(
     assigns(i, object_whole(out))
-    invariant(i >= 0 && i <= MLKEM_K))
+    invariant(i <= MLKEM_K))
   {
     polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
   }
 }
 
-
-
-STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_enc_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
                            uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
@@ -461,7 +334,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
   matvec_mul(&pkpv, a, &skpv, &skpv_cache);
   polyvec_tomont(&pkpv);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&pkpv, &e);
   polyvec_reduce(&pkpv);
   polyvec_reduce(&skpv);
@@ -471,11 +343,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
 }
 
 
-/* Check that the arithmetic in indcpa_enc() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
-              indcpa_enc_bound_1)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
                 const uint8_t m[MLKEM_INDCPA_MSGBYTES],
@@ -522,7 +389,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   polyvec_invntt_tomont(&b);
   poly_invntt_tomont(&v);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&b, &ep);
   poly_add(&v, &epp);
   poly_add(&v, &k);
@@ -533,9 +399,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   pack_ciphertext(c, &b, &v);
 }
 
-/* Check that the arithmetic in indcpa_dec() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
                 const uint8_t c[MLKEM_INDCPA_BYTES],
@@ -551,7 +414,6 @@ void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
   polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
   poly_invntt_tomont(&sb);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   poly_sub(&v, &sb);
   poly_reduce(&v);
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h
index 011f1aa4f..2c4fda3c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h
@@ -10,7 +10,7 @@
 #include "common.h"
 #include "polyvec.h"
 
-#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+#define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
  * Name:        gen_matrix
  *
@@ -34,7 +34,7 @@ __contract__(
   array_bound(a[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))));
 );
 
-#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+#define indcpa_keypair_derand MLKEM_NAMESPACE_K(indcpa_keypair_derand)
 /*************************************************
  * Name:        indcpa_keypair_derand
  *
@@ -60,7 +60,7 @@ __contract__(
   assigns(object_whole(sk))
 );
 
-#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+#define indcpa_enc MLKEM_NAMESPACE_K(indcpa_enc)
 /*************************************************
  * Name:        indcpa_enc
  *
@@ -89,7 +89,7 @@ __contract__(
   assigns(object_whole(c))
 );
 
-#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+#define indcpa_dec MLKEM_NAMESPACE_K(indcpa_dec)
 /*************************************************
  * Name:        indcpa_dec
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/kem.c
index 5779d3273..88c3843be 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/kem.c
@@ -16,8 +16,8 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define check_pk MLKEM_NAMESPACE(check_pk)
-#define check_sk MLKEM_NAMESPACE(check_sk)
+#define check_pk MLKEM_NAMESPACE_K(check_pk)
+#define check_sk MLKEM_NAMESPACE_K(check_sk)
 /* End of static namespacing */
 
 #if defined(CBMC)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/kem.h
index 074e4771e..93caa796b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/kem.h
@@ -9,6 +9,7 @@
 #include "cbmc.h"
 #include "common.h"
 
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 /* Include to ensure consistency between internal kem.h
  * and external mlkem_native.h. */
 #include "mlkem_native.h"
@@ -25,6 +26,14 @@
 #error Mismatch for CIPHERTEXTBYTES between kem.h and mlkem_native.h
 #endif
 
+#else
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE_K(keypair_derand)
+#define crypto_kem_keypair MLKEM_NAMESPACE_K(keypair)
+#define crypto_kem_enc_derand MLKEM_NAMESPACE_K(enc_derand)
+#define crypto_kem_enc MLKEM_NAMESPACE_K(enc)
+#define crypto_kem_dec MLKEM_NAMESPACE_K(dec)
+#endif
+
 /*************************************************
  * Name:        crypto_kem_keypair_derand
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem_native.h
index 4aed4efbb..12d1d12e6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem_native.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem_native.h
@@ -59,9 +59,17 @@
 #error MLKEM_NAMESPACE_PREFIX not set by config file
 #endif
 
-#define BUILD_INFO_CONCAT_(x, y) x##_##y
-#define BUILD_INFO_CONCAT(x, y) BUILD_INFO_CONCAT_(x, y)
-#define BUILD_INFO_NAMESPACE(sym) BUILD_INFO_CONCAT(MLKEM_NAMESPACE_PREFIX, sym)
+#if defined(MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL)
+#define BUILD_INFO_CONCAT3_(x, y, z) x##y##_##z
+#define BUILD_INFO_CONCAT3(x, y, z) BUILD_INFO_CONCAT3_(x, y, z)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT3(MLKEM_NAMESPACE_PREFIX, BUILD_INFO_LVL, sym)
+#else
+#define BUILD_INFO_CONCAT2_(x, y) x##_##y
+#define BUILD_INFO_CONCAT2(x, y) BUILD_INFO_CONCAT2_(x, y)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT2(MLKEM_NAMESPACE_PREFIX, sym)
+#endif
 
 #endif /* BUILD_INFO_LVL */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.c
index 02b45215c..3651c8da9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.c
@@ -2,10 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include <stdint.h>
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
+#include <stdint.h>
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "ntt.h"
 #include "reduce.h"
 
@@ -45,10 +47,10 @@
  *          4 -- 6
  *             5 -- 7
  */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start,
-                                int len, int bound)
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
 __contract__(
-  requires(0 <= start && start < MLKEM_N)
+  requires(start < MLKEM_N)
   requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
   requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
   requires(-HALF_Q < zeta && zeta < HALF_Q)
@@ -60,7 +62,7 @@ __contract__(
   ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
 {
   /* `bound` is a ghost variable only needed in the CBMC specification */
-  int j;
+  unsigned j;
   ((void)bound);
   for (j = start; j < start + len; j++)
   __loop__(
@@ -93,7 +95,7 @@ __contract__(
  *   official Kyber implementation here, merely adding `layer` as
  *   a ghost variable for the specifications.
  */
-static void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
@@ -101,15 +103,15 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable only needed in the CBMC specification */
   ((void)layer);
   /* Twiddle factors for layer n start at index 2^(layer-1) */
   k = MLKEM_N / (2 * len);
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
-    invariant(0 <= start && start < MLKEM_N + 2 * len)
-    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
     invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
     invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
   {
@@ -130,9 +132,9 @@ __contract__(
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  int len, layer;
+  unsigned len, layer;
   int16_t *r;
-  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   r = p->coeffs;
 
   for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
@@ -144,30 +146,23 @@ void poly_ntt(poly *p)
   }
 
   /* Check the stronger bound */
-  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_NTT */
 
-/* Check that bound for native NTT implies contractual bound */
-STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   ntt_native(p);
-  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if !defined(MLKEM_USE_NATIVE_INTT)
 
-/* Check that bound for reference invNTT implies contractual bound */
-#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
-STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
-
 /* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, int len, int layer)
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
@@ -176,23 +171,23 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable used only in the specification */
   ((void)layer);
   k = MLKEM_N / len - 1;
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    invariant(start <= MLKEM_N && k <= 127)
     /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
     invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
   {
-    int j;
+    unsigned j;
     int16_t zeta = zetas[k--];
     for (j = start; j < start + len; j++)
     __loop__(
       invariant(start <= j && j <= start + len)
-      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(start <= MLKEM_N && k <= 127)
       invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
     {
       int16_t t = r[j];
@@ -211,13 +206,13 @@ void poly_invntt_tomont(poly *p)
    * and NTT twist. This also brings coefficients down to
    * absolute value < MLKEM_Q.
    */
-  int j, len, layer;
+  unsigned j, len, layer;
   const int16_t f = 1441;
   int16_t *r = p->coeffs;
 
   for (j = 0; j < MLKEM_N; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N)
+    invariant(j <= MLKEM_N)
     invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
   {
     r[j] = fqmul(r[j], f);
@@ -226,24 +221,21 @@ void poly_invntt_tomont(poly *p)
   /* Run the invNTT layers */
   for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
   __loop__(
-    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
   {
     invntt_layer(p->coeffs, len, layer);
   }
 
-  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_INTT */
 
-/* Check that bound for native invNTT implies contractual bound */
-STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_invntt_tomont(poly *p)
 {
   intt_native(p);
-  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_INTT */
 
@@ -252,8 +244,7 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
                     int16_t b_cached)
 {
   int32_t t0, t1;
-
-  BOUND(a, 2, 4096, "basemul input bound");
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
   t0 = (int32_t)a[1] * b_cached;
   t0 += (int32_t)a[0] * b[0];
@@ -264,5 +255,12 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
   r[0] = montgomery_reduce(t0);
   r[1] = montgomery_reduce(t1);
 
-  BOUND(r, 2, 2 * MLKEM_Q, "basemul output bound");
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
+int empty_cu_ntt;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.h
index 5592bb9a2..4e80d3ab3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.h
@@ -4,10 +4,10 @@
  */
 #ifndef NTT_H
 #define NTT_H
+#include "common.h"
 
 #include <stdint.h>
 #include "cbmc.h"
-#include "common.h"
 #include "poly.h"
 #include "reduce.h"
 
@@ -81,7 +81,7 @@ __contract__(
  *                   Upon return, coefficients are bound by
  *                   2*MLKEM_Q in absolute value.
  *            - a: Pointer to first input polynomial
- *                   Must be coefficient-wise < 4096 in absolute value.
+ *                   Every coefficient must be in [0..4095]
  *            - b: Pointer to second input polynomial
  *                   Can have arbitrary int16_t coefficients
  *            - b_cached: Some precomputed value, typically derived from
@@ -99,5 +99,4 @@ __contract__(
   ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
 );
 
-
-#endif
+#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h
index fa751f977..57ea4c8ba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h
@@ -25,23 +25,34 @@
 #define MLKEM_POLYBYTES 384
 #define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
 
+#define MLKEM_POLYCOMPRESSEDBYTES_D4 128
+#define MLKEM_POLYCOMPRESSEDBYTES_D5 160
+#define MLKEM_POLYCOMPRESSEDBYTES_D10 320
+#define MLKEM_POLYCOMPRESSEDBYTES_D11 352
+
 #if MLKEM_K == 2
 #define MLKEM_LVL 512
 #define MLKEM_ETA1 3
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 3
 #define MLKEM_LVL 768
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 4
 #define MLKEM_LVL 1024
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_DU 11
+#define MLKEM_DV 5
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D5
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D11
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c
index 5807879df..7483ebf6d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c
@@ -2,13 +2,15 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
 #include <stdint.h>
 #include <string.h>
-
 #include "arith_backend.h"
 #include "cbd.h"
 #include "cbmc.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "fips202x4.h"
 #include "ntt.h"
 #include "poly.h"
@@ -16,50 +18,46 @@
 #include "symmetric.h"
 #include "verify.h"
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
     __loop__(
-      invariant(k >= 0 && k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
     {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
     }
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
   }
+}
 
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  __loop__(invariant(j <= MLKEM_N / 4))
   {
     unsigned k;
     uint16_t t[4];
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(k >= 0 && k <= 4)
+      invariant(k <= 4)
       invariant(forall(r, 0, k, t[r] < (1u << 10))))
     {
       t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
@@ -75,51 +73,35 @@ void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
     r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
     r[5 * j + 4] = (t[3] >> 2);
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
 }
 
-
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
   {
-    int k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(0 <= k && k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
   for (j = 0; j < MLKEM_N / 4; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(j <= MLKEM_N / 4)
     invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
   {
-    int k;
+    unsigned k;
     uint16_t t[4];
     uint8_t const *base = &a[5 * j];
 
@@ -130,51 +112,33 @@ void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
 
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(0 <= k && k <= 4)
+      invariant(k <= 4)
       invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
     {
       r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     uint8_t t[8] = {0};
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_bound(t, 0, j, 0, 32)))
     {
       t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
@@ -191,33 +155,57 @@ void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
     r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
     r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 }
 
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
 {
-  unsigned i;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
   {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     uint8_t t[8];
-    const int offset = i * 5;
+    const unsigned offset = i * 5;
     /*
      * Explicitly truncate to avoid warning about
      * implicit truncation in CBMC and unwind loop for ease
@@ -240,29 +228,62 @@ void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
     /* and copy to the correct slice in r[] */
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(j <= 8 && i <= MLKEM_N / 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_K == 4 */
+
 #if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
-
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  __loop__(invariant(i <= MLKEM_N / 2))
   {
     const uint16_t t0 = a->coeffs[2 * i];
     const uint16_t t1 = a->coeffs[2 * i + 1];
@@ -290,7 +311,7 @@ void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   poly_tobytes_native(r, a);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
@@ -302,7 +323,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   unsigned i;
   for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(i <= MLKEM_N / 2)
     invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
   {
     const uint8_t t0 = a[3 * i + 0];
@@ -313,7 +334,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   }
 
   /* Note that the coefficients are not canonical */
-  POLY_UBOUND(r, 4096);
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 MLKEM_NATIVE_INTERNAL_API
@@ -333,13 +354,13 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
 
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <  MLKEM_N / 8 && j <= 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       /* Prevent the compiler from recognizing this as a bit selection */
@@ -347,23 +368,23 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
       r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
     }
   }
-  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     msg[i] = 0;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+      invariant(i <= MLKEM_N / 8 && j <= 8))
     {
       uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
       msg[i] |= t << j;
@@ -371,104 +392,17 @@ void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
   }
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  POLY_BOUND_MSG(r, MLKEM_ETA1 + 1, "poly_getnoise_eta2 output");
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
-}
-#endif /* MLKEM_K == 2 */
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
                                     const poly_mulcache *b_cache)
 {
   unsigned i;
-  POLY_BOUND(b_cache, 4096);
+  debug_assert_bound(a, MLKEM_N, 0, UINT12_LIMIT);
 
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
     assigns(i, object_whole(r))
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 2 * MLKEM_Q)))
   {
     basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
@@ -476,6 +410,8 @@ void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
     basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
                    &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
   }
+
+  debug_assert_abs_bound(r, MLKEM_N, 2 * MLKEM_Q);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -486,20 +422,20 @@ void poly_tomont(poly *r)
   const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
-    invariant(array_abs_bound(r->coeffs ,0, i, MLKEM_Q)))
+    invariant(i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
   {
     r->coeffs[i] = fqmul(r->coeffs[i], f);
   }
 
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
   poly_tomont_native(r);
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
@@ -510,7 +446,7 @@ void poly_reduce(poly *r)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
   {
     /* Barrett reduction, giving signed canonical representative */
@@ -519,14 +455,14 @@ void poly_reduce(poly *r)
     r->coeffs[i] = scalar_signed_to_unsigned_q(t);
   }
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
   poly_reduce_native(r);
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
@@ -536,7 +472,7 @@ void poly_add(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
   {
@@ -550,7 +486,7 @@ void poly_sub(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
   {
@@ -564,20 +500,36 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
   {
     x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
     x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
   }
-  POLY_BOUND(x, MLKEM_Q);
+
+  /*
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
+   */
+  debug_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   poly_mulcache_compute_native(x, a);
-  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+  /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
+int empty_cu_poly;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h
index 1e8c109c6..6a14c785d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h
@@ -307,112 +307,164 @@ __contract__(
  ************************************************************/
 static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
 __contract__(
-  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
-  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
   ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
 {
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
   /* Add Q if c is negative, but in constant time */
   c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
 
-  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
-  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
-
   /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
   return (uint16_t)c;
 }
 
-#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
 /*************************************************
- * Name:        poly_compress_du
+ * Name:        poly_compress_d4
  *
- * Description: Compression (du bits) and subsequent serialization of a
- *polynomial
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-);
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
 
-#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
 /*************************************************
- * Name:        poly_decompress_du
+ * Name:        poly_decompress_d4
  *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *polynomial; approximate inverse of poly_compress_du
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
 
-#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
 /*************************************************
- * Name:        poly_compress_dv
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
  *
- * Description: Compression (dv bits) and subsequent serialization of a
- *polynomial
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
 
-#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
 /*************************************************
- * Name:        poly_decompress_dv
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
  *
  * Description: De-serialization and subsequent decompression (dv bits) of a
- *polynomial; approximate inverse of poly_compress
+ *              polynomial; approximate inverse of poly_compress_d5
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV
- *bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
 
 #define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
 /*************************************************
@@ -500,144 +552,6 @@ __contract__(
   assigns(object_whole(msg))
 );
 
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -649,8 +563,7 @@ __contract__(
  *              Bounds:
  *              - a is assumed to be coefficient-wise < q in absolute value.
  *
- *              The result is coefficient-wise bound by 3/2 q in absolute
- *              value.
+ *              The result is coefficient-wise bound by 2*q in absolute value.
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const poly *a: pointer to first input polynomial
@@ -802,4 +715,4 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#endif
+#endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.c
index 7d2016773..50ea1c34a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.c
@@ -4,18 +4,29 @@
  */
 #include "polyvec.h"
 #include <stdint.h>
+#include <string.h>
 #include "arith_backend.h"
+#include "cbd.h"
 #include "ntt.h"
 #include "poly.h"
+#include "symmetric.h"
 
-#include "debug/debug.h"
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
                          const polyvec *a)
 {
   unsigned i;
-  POLYVEC_UBOUND(a, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_K; i++)
   {
@@ -33,13 +44,15 @@ void polyvec_decompress_du(polyvec *r,
     poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
   }
 
-  POLYVEC_UBOUND(r, MLKEM_Q);
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
 {
   unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
   for (i = 0; i < MLKEM_K; i++)
   {
     poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
@@ -54,6 +67,8 @@ void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
   {
     poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -64,6 +79,8 @@ void polyvec_ntt(polyvec *r)
   {
     poly_ntt(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -74,6 +91,8 @@ void polyvec_invntt_tomont(polyvec *r)
   {
     poly_invntt_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -84,10 +103,7 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
 {
   unsigned i;
   poly t;
-
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  POLYVEC_BOUND(b_cache, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 
   poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
   for (i = 1; i < MLKEM_K; i++)
@@ -95,18 +111,15 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
     poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
                                    &b_cache->vec[i]);
     poly_add(r, &t);
-    /* abs bounds: < (i+1) * 3/2 * q */
   }
 
   /*
-   * Those bounds are true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus best to omit
-   * them from the spec to not unnecessarily constraint native implementations.
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
    */
-  cassert(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_K * 2 * MLKEM_Q),
-          "polyvec_basemul_acc_montgomery_cached output bounds");
-  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
-  POLY_BOUND(r, MLKEM_K * 2 * MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
 }
 #else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 MLKEM_NATIVE_INTERNAL_API
@@ -114,9 +127,8 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
                                            const polyvec *b,
                                            const polyvec_mulcache *b_cache)
 {
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
   polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
@@ -149,6 +161,8 @@ void polyvec_reduce(polyvec *r)
   {
     poly_reduce(&r->vec[i]);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -169,4 +183,148 @@ void polyvec_tomont(polyvec *r)
   {
     poly_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+  /* PRF input: the seed followed by the one-byte nonce. */
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+
+  poly_cbd_eta2(r, buf);
+  /* poly_cbd_eta2's contract bounds the output by MLKEM_ETA2 + 1. */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
 }
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.h
index 138724150..8be8579e0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.h
@@ -9,19 +9,144 @@
 #include "common.h"
 #include "poly.h"
 
-#define polyvec MLKEM_NAMESPACE(polyvec)
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
 typedef struct
 {
   poly vec[MLKEM_K];
 } ALIGN polyvec;
 
-#define polyvec_mulcache MLKEM_NAMESPACE(polyvec_mulcache)
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
 typedef struct
 {
   poly_mulcache vec[MLKEM_K];
 } polyvec_mulcache;
 
-#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
 /*************************************************
  * Name:        polyvec_compress_du
  *
@@ -44,7 +169,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
 /*************************************************
  * Name:        polyvec_decompress_du
  *
@@ -67,7 +192,7 @@ __contract__(
          array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
 /*************************************************
  * Name:        polyvec_tobytes
  *
@@ -88,7 +213,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
 /*************************************************
  * Name:        polyvec_frombytes
  *
@@ -110,7 +235,7 @@ __contract__(
         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
 );
 
-#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
 /*************************************************
  * Name:        polyvec_ntt
  *
@@ -136,7 +261,7 @@ __contract__(
   array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
 );
 
-#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
 /*************************************************
  * Name:        polyvec_invntt_tomont
  *
@@ -162,7 +287,7 @@ __contract__(
 );
 
 #define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery
  *
@@ -186,7 +311,7 @@ __contract__(
 
 
 #define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery_cached
  *
@@ -194,7 +319,7 @@ __contract__(
  *              using mulcache for second operand.
  *
  *              Bounds:
- *              - a is assumed to be coefficient-wise < 4096 in absolute value.
+ *              - Every coefficient of a is assumed to be in [0..4095]
  *              - No bounds guarantees for the coefficients in the result.
  *
  * Arguments:   - poly *r: pointer to output polynomial
@@ -218,7 +343,7 @@ __contract__(
   assigns(memory_slice(r, sizeof(poly)))
 );
 
-#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
 /************************************************************
  * Name: polyvec_mulcache_compute
  *
@@ -252,7 +377,7 @@ __contract__(
   assigns(object_whole(x))
 );
 
-#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
 /*************************************************
  * Name:        polyvec_reduce
  *
@@ -278,7 +403,7 @@ __contract__(
     array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
 /*************************************************
  * Name:        polyvec_add
  *
@@ -309,7 +434,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
 /*************************************************
  * Name:        polyvec_tomont
  *
@@ -329,4 +454,142 @@ __contract__(
     array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
 );
 
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3)
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+);
+#endif /* MLKEM_K == 2 */
+
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/reduce.h
index 1f502167e..b432a4201 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/reduce.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/reduce.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -109,13 +109,13 @@ static INLINE int16_t montgomery_reduce_generic(int32_t a)
  **************************************************/
 static INLINE int16_t montgomery_reduce(int32_t a)
 __contract__(
-  requires(a > -(2 * 4096 * 32768))
-  requires(a <  (2 * 4096 * 32768))
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
   ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
 )
 {
   int16_t res;
-  SCALAR_BOUND(a, 2 * UINT12_LIMIT * 32768, "montgomery_reduce input");
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
 
   res = montgomery_reduce_generic(a);
   /* Bounds:
@@ -124,7 +124,7 @@ __contract__(
    *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
    *        < 2 * MLKEM_Q */
 
-  SCALAR_BOUND(res, 2 * MLKEM_Q, "montgomery_reduce output");
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
   return res;
 }
 
@@ -150,7 +150,7 @@ __contract__(
 )
 {
   int16_t res;
-  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+  debug_assert_abs_bound(&b, 1, HALF_Q);
 
   res = montgomery_reduce((int32_t)a * (int32_t)b);
   /* Bounds:
@@ -160,7 +160,7 @@ __contract__(
    *        < MLKEM_Q
    */
 
-  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
   return res;
 }
 
@@ -200,7 +200,10 @@ __contract__(
    * t is in -10 .. +10, so we need 32-bit math to
    * evaluate t * MLKEM_Q and the subsequent subtraction
    */
-  return (int16_t)(a - t * MLKEM_Q);
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
+
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.c
index 918986e9b..cbbe4407f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.c
@@ -2,46 +2,24 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
-#include "rej_uniform.h"
 #include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
 /* End of static namespacing */
 
-/*************************************************
- * Name:        rej_uniform_scalar
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
                                        unsigned int offset, const uint8_t *buf,
                                        unsigned int buflen)
@@ -58,6 +36,8 @@ __contract__(
   unsigned int ctr, pos;
   uint16_t val0, val1;
 
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
   ctr = offset;
   pos = 0;
   /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
@@ -79,28 +59,183 @@ __contract__(
       r[ctr++] = val1;
     }
   }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
   return ctr;
 }
 
 #if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the entire input buffer
+ * is guaranteed to have been consumed. If it is equal to target, nothing
+ * is guaranteed about how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
 {
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
 {
   int ret;
 
   /* Sample from large buffer with full lane as much as possible. */
   ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
   if (ret != -1)
-    return offset + (unsigned)ret;
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
 
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   */
+  /* This should generate the matrix entry with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
+int empty_cu_rej_uniform;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.h
index 13db836bc..801287259 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.h
@@ -9,54 +9,55 @@
 #include <stdlib.h>
 #include "cbmc.h"
 #include "common.h"
+#include "poly.h"
 
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
- * Name:        rej_uniform
+ * Name:        poly_rej_uniform_x4
  *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
  *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
  *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
  **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
 __contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-);
-#endif
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/symmetric.h
index 55ebbbd53..3563e5505 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/symmetric.h
@@ -10,6 +10,7 @@
 #include "cbmc.h"
 #include "common.h"
 #include "fips202.h"
+#include "fips202x4.h"
 
 /* Macros denoting FIPS-203 specific Hash functions */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/verify.c
index b7078fcc1..9f39dcd22 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/verify.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/verify.c
@@ -4,7 +4,8 @@
  */
 #include "verify.h"
 
-#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER) && \
+    !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 /*
  * Masking value used in constant-time functions from
  * verify.h to block the compiler's range analysis and
@@ -12,9 +13,11 @@
  */
 volatile uint64_t ct_opt_blocker_u64 = 0;
 
-#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+#else /* MLKEM_USE_ASM_VALUE_BARRIER || \
+         MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_verify MLKEM_NAMESPACE(empty_cu_verify)
+#define empty_cu_verify MLKEM_NAMESPACE_K(empty_cu_verify)
 int empty_cu_verify;
 
-#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER && \
+          !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/verify.h
index 8c47155dc..f6ecf5eba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/verify.h
@@ -268,7 +268,7 @@ __contract__(
 
   for (i = 0; i < len; i++)
   __loop__(
-    invariant(i >= 0 && i <= len)
+    invariant(i <= len)
     invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))))
   {
     r |= a[i] ^ b[i];
@@ -314,4 +314,4 @@ __contract__(
   }
 }
 
-#endif
+#endif /* VERIFY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/arith_native_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/arith_native_x86_64.h
index ce13e7911..25e00a930 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/arith_native_x86_64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/arith_native_x86_64.h
@@ -42,7 +42,7 @@ void basemul_avx2(__m256i *r, const __m256i *a, const __m256i *b,
                   const __m256i *qdata);
 
 #define polyvec_basemul_acc_montgomery_cached_avx2 \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_avx2)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_avx2)
 void polyvec_basemul_acc_montgomery_cached_avx2(
     poly *r, const polyvec *a, const polyvec *b,
     const polyvec_mulcache *b_cache);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/default_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/default_impl.h
index 66de8c85f..029111c17 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/default_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/default_impl.h
@@ -28,9 +28,6 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_POLY_FROMBYTES
 
-#define INVNTT_BOUND_NATIVE (8 * MLKEM_Q)
-#define NTT_BOUND_NATIVE (8 * MLKEM_Q)
-
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 {
   nttunpack_avx2((__m256i *)(data->coeffs), qdata.vec);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c
index 1a26e0dd5..4ef887c62 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c
@@ -8,6 +8,8 @@
  *          Do not modify it directly.
  */
 
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 #include "ntt.h"
 
 /*
@@ -28,3 +30,10 @@ ALIGN const int16_t zetas[128] = {
     -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
     -308,  996,   991,   958,   -1460, 1522,  1628,
 };
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_zetas MLKEM_NAMESPACE_K(empty_cu_zetas)
+int empty_cu_zetas;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h
index 6a5ee8a7d..fc4e7dd38 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h
@@ -75,14 +75,14 @@ void poly_tobytes_asm_clean(uint8_t *r, const int16_t *a);
 void poly_tobytes_asm_opt(uint8_t *r, const int16_t *a);
 
 #define polyvec_basemul_acc_montgomery_cached_asm_clean \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 void polyvec_basemul_acc_montgomery_cached_asm_clean(int16_t *r,
                                                      const int16_t *a,
                                                      const int16_t *b,
                                                      const int16_t *b_cache);
 
 #define polyvec_basemul_acc_montgomery_cached_asm_opt \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 void polyvec_basemul_acc_montgomery_cached_asm_opt(int16_t *r, const int16_t *a,
                                                    const int16_t *b,
                                                    const int16_t *b_cache);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h
index b0ff3d597..548b1eebb 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h
@@ -31,7 +31,6 @@ static INLINE void ntt_native(poly *data)
                 aarch64_ntt_zetas_layer56);
 }
 
-#define INVNTT_BOUND_NATIVE (8 * MLKEM_Q)
 static INLINE void intt_native(poly *data)
 {
   intt_asm_clean(data->coeffs, aarch64_invntt_zetas_layer01234,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S
index 623a82ae9..b243a569d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S
@@ -149,7 +149,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -193,40 +193,20 @@
         t3  .req v28
 
         ninv             .req v29
-        q_ninv           .req q29
         ninv_tw          .req v30
-        q_ninv_tw        .req q30
-
-/* Literal pool */
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_consts:         .short 3329
-                  .short 20159
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-c_ninv:           dup8h 512
-c_ninv_tw:        dup8h 5040
 
 MLKEM_ASM_NAMESPACE(intt_asm_clean):
         push_stack
 
-        ldr q_consts,  c_consts
-        ldr q_ninv,    c_ninv
-        ldr q_ninv_tw, c_ninv_tw
+        // Setup constants
+        mov wtmp, #3329
+        mov consts.h[0], wtmp
+        mov wtmp, #20159
+        mov consts.h[1], wtmp
+        mov wtmp, #512
+        dup ninv.8h, wtmp
+        mov wtmp, #5040
+        dup ninv_tw.8h, wtmp
 
         mov inp, in
         mov count, #8
@@ -361,4 +341,49 @@ layer012_start:
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq consts
+    .unreq q_consts
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+    .unreq ninv
+    .unreq ninv_tw
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S
index e332efef8..c94746e17 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S
@@ -149,7 +149,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -193,40 +193,20 @@
         t3  .req v28
 
         ninv             .req v29
-        q_ninv           .req q29
         ninv_tw          .req v30
-        q_ninv_tw        .req q30
-
-/* Literal pool */
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_consts:         .short 3329
-                  .short 20159
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-c_ninv:           dup8h 512
-c_ninv_tw:        dup8h 5040
 
 MLKEM_ASM_NAMESPACE(intt_asm_opt):
         push_stack
 
-        ldr q_consts,  c_consts
-        ldr q_ninv,    c_ninv
-        ldr q_ninv_tw, c_ninv_tw
+        // Setup constants
+        mov wtmp, #3329
+        mov consts.h[0], wtmp
+        mov wtmp, #20159
+        mov consts.h[1], wtmp
+        mov wtmp, #512
+        dup ninv.8h, wtmp
+        mov wtmp, #5040
+        dup ninv_tw.8h, wtmp
 
         mov inp, in
         mov count, #8
@@ -1017,4 +997,49 @@ layer012_start:
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq consts
+    .unreq q_consts
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+    .unreq ninv
+    .unreq ninv_tw
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S
index 877a5f689..cd63cc4d6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S
@@ -121,7 +121,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -156,7 +156,6 @@
         q_root2_tw .req q6
 
         consts    .req v7
-        q_consts  .req q7
 
         tmp .req v24
         t0  .req v25
@@ -167,21 +166,13 @@
         .text
         .global MLKEM_ASM_NAMESPACE(ntt_asm_clean)
 
-/* Literal pool */
-.p2align 4
-c_consts:
-        .short 3329
-        .short 20159
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-
 MLKEM_ASM_NAMESPACE(ntt_asm_clean):
         push_stack
-        ldr q_consts, c_consts
+
+        mov wtmp, #3329
+        mov consts.h[0], wtmp
+        mov wtmp, #20159
+        mov consts.h[1], wtmp
 
         mov inp, in
         mov count, #4
@@ -280,4 +271,46 @@ layer3456_start:
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq consts
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S
index 15103a595..8705615b7 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S
@@ -121,7 +121,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -167,21 +167,13 @@
         .text
         .global MLKEM_ASM_NAMESPACE(ntt_asm_opt)
 
-/* Literal pool */
-.p2align 4
-c_consts:
-        .short 3329
-        .short 20159
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-
 MLKEM_ASM_NAMESPACE(ntt_asm_opt):
         push_stack
-        ldr q_consts, c_consts
+
+        mov wtmp, #3329
+        mov consts.h[0], wtmp
+        mov wtmp, #20159
+        mov consts.h[1], wtmp
 
         mov inp, in
         mov count, #4
@@ -916,4 +908,47 @@ MLKEM_ASM_NAMESPACE(ntt_asm_opt):
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq consts
+    .unreq q_consts
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h
index b22674026..ec1bf6587 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h
@@ -25,14 +25,12 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_REJ_UNIFORM
 
-#define NTT_BOUND_NATIVE (6 * MLKEM_Q)
 static INLINE void ntt_native(poly *data)
 {
   ntt_asm_opt(data->coeffs, aarch64_ntt_zetas_layer01234,
               aarch64_ntt_zetas_layer56);
 }
 
-#define INVNTT_BOUND_NATIVE (8 * MLKEM_Q)
 static INLINE void intt_native(poly *data)
 {
   intt_asm_opt(data->coeffs, aarch64_invntt_zetas_layer01234,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S
index f70a40221..809f9667e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S
@@ -6,33 +6,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 20159  // Barrett twist of 1 wrt 2^27
-c_mont_constant:   dup8h -1044  // 2^16 % 3329
-c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
-
 /*
  * Some modular arithmetic macros
  */
@@ -70,6 +43,7 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
 
         ptr               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -77,14 +51,15 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
         tmp               .req v1
         mask              .req v2
         modulus           .req v3
-        q_modulus         .req q3
         modulus_twisted   .req v4
-        q_modulus_twisted .req q4
 
 MLKEM_ASM_NAMESPACE(poly_reduce_asm_clean):
 
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
 
         mov count, #8
 loop_start:
@@ -115,6 +90,7 @@ loop_start:
 
         .unreq ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -122,9 +98,7 @@ loop_start:
         .unreq tmp
         .unreq mask
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *          poly_mulcache_compute()         *
@@ -137,6 +111,7 @@ loop_start:
         zeta_ptr          .req x2
         zeta_twisted_ptr  .req x3
         count             .req x4
+        wtmp              .req w5
 
         data_odd          .req v0
         zeta              .req v1
@@ -152,13 +127,14 @@ loop_start:
         q_dst             .req q5
 
         modulus           .req v6
-        q_modulus         .req q6
         modulus_twisted   .req v7
-        q_modulus_twisted .req q7
 
 MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_clean):
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159
+        dup modulus_twisted.8h, wtmp
 
         mov count, #16
 mulcache_compute_loop_start:
@@ -185,6 +161,7 @@ mulcache_compute_loop_start:
         .unreq zeta_ptr
         .unreq zeta_twisted_ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data_odd
         .unreq zeta
@@ -200,9 +177,7 @@ mulcache_compute_loop_start:
         .unreq q_dst
 
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *             poly_tobytes()               *
@@ -261,6 +236,7 @@ poly_tobytes_asm_clean_asm_loop_start:
 
         src               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -268,22 +244,25 @@ poly_tobytes_asm_clean_asm_loop_start:
         q_res             .req q1
 
         factor            .req v2
-        q_factor          .req q2
         factor_t          .req v3
-        q_factor_t        .req q3
         modulus           .req v4
-        q_modulus         .req q4
         modulus_twisted   .req v5
-        q_modulus_twisted .req q5
 
         tmp0              .req v6
 
 MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean):
 
-        ldr q_modulus,         c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
-        ldr q_factor,          c_mont_constant
-        ldr q_factor_t,        c_barrett_twist
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
+
+        mov wtmp, #-1044 // 2^16 % 3329
+        dup factor.8h, wtmp
+
+        mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
+        dup factor_t.8h, wtmp
 
         mov count, #8
 poly_tomont_asm_loop:
@@ -311,6 +290,7 @@ poly_tomont_asm_loop:
 
         .unreq src
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -318,13 +298,9 @@ poly_tomont_asm_loop:
         .unreq q_res
 
         .unreq factor
-        .unreq q_factor
         .unreq factor_t
-        .unreq q_factor_t
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
         .unreq tmp0
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S
index e58ee77c4..815a9dd1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S
@@ -6,33 +6,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 20159  // Barrett twist of 1 wrt 2^27
-c_mont_constant:   dup8h -1044  // 2^16 % 3329
-c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
-
 /*
  * Some modular arithmetic macros
  */
@@ -70,6 +43,7 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
 
         ptr               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -77,14 +51,15 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
         tmp               .req v1
         mask              .req v2
         modulus           .req v3
-        q_modulus         .req q3
         modulus_twisted   .req v4
-        q_modulus_twisted .req q4
 
 MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
 
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
 
         mov count, #8
                                                // Instructions:    15
@@ -278,6 +253,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
 
         .unreq ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -285,9 +261,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         .unreq tmp
         .unreq mask
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *          poly_mulcache_compute()         *
@@ -300,6 +274,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         zeta_ptr          .req x2
         zeta_twisted_ptr  .req x3
         count             .req x4
+        wtmp              .req w5
 
         data_odd          .req v0
         zeta              .req v1
@@ -315,13 +290,14 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         q_dst             .req q5
 
         modulus           .req v6
-        q_modulus         .req q6
         modulus_twisted   .req v7
-        q_modulus_twisted .req q7
 
 MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159
+        dup modulus_twisted.8h, wtmp
 
         mov count, #16
                                               // Instructions:    7
@@ -426,6 +402,7 @@ MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
         .unreq zeta_ptr
         .unreq zeta_twisted_ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data_odd
         .unreq zeta
@@ -441,9 +418,7 @@ MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
         .unreq q_dst
 
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *             poly_tobytes()               *
@@ -502,6 +477,7 @@ poly_tobytes_asm_opt_asm_loop_start:
 
         src               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -509,22 +485,25 @@ poly_tobytes_asm_opt_asm_loop_start:
         q_res             .req q1
 
         factor            .req v2
-        q_factor          .req q2
         factor_t          .req v3
-        q_factor_t        .req q3
         modulus           .req v4
-        q_modulus         .req q4
         modulus_twisted   .req v5
-        q_modulus_twisted .req q5
 
         tmp0              .req v6
 
 MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
 
-        ldr q_modulus,         c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
-        ldr q_factor,          c_mont_constant
-        ldr q_factor_t,        c_barrett_twist
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
+
+        mov wtmp, #-1044 // 2^16 % 3329
+        dup factor.8h, wtmp
+
+        mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
+        dup factor_t.8h, wtmp
 
         mov count, #8
                                              // Instructions:    5
@@ -670,6 +649,7 @@ MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
 
         .unreq src
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -677,13 +657,9 @@ MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
         .unreq q_res
 
         .unreq factor
-        .unreq q_factor
         .unreq factor_t
-        .unreq q_factor_t
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
         .unreq tmp0
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S
index 99fb05de5..c91675b44 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S
@@ -12,31 +12,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 3327
-
 // Input:
 // - Vectors al, ah of 32-bit entries
 // Output:
@@ -136,11 +111,10 @@ c_modulus_twisted: dup8h 3327
         b3_ptr       .req x11
         b3_cache_ptr .req x12
         count        .req x13
+        wtmp         .req w14
 
         modulus           .req v0
-        q_modulus         .req q0
         modulus_twisted   .req v2
-        q_modulus_twisted .req q2
 
         aa0      .req v3
         aa1      .req v4
@@ -164,12 +138,16 @@ c_modulus_twisted: dup8h 3327
         t0   .req v28
 
 #if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -198,12 +176,15 @@ k2_loop_start:
 #endif /* MLKEM_K == 2 */
 
 #if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -237,12 +218,15 @@ k3_loop_start:
 #endif /* MLKEM_K == 3 */
 
 #if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -285,4 +269,39 @@ k4_loop_start:
         ret
 #endif /* MLKEM_K == 4 */
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq out
+    .unreq a0_ptr
+    .unreq b0_ptr
+    .unreq b0_cache_ptr
+    .unreq a1_ptr
+    .unreq b1_ptr
+    .unreq b1_cache_ptr
+    .unreq a2_ptr
+    .unreq b2_ptr
+    .unreq b2_cache_ptr
+    .unreq a3_ptr
+    .unreq b3_ptr
+    .unreq b3_cache_ptr
+    .unreq count
+    .unreq modulus
+    .unreq modulus_twisted
+    .unreq aa0
+    .unreq aa1
+    .unreq bb0
+    .unreq bb1
+    .unreq bb1t
+    .unreq res0l
+    .unreq res1l
+    .unreq res0h
+    .unreq wtmp
+    .unreq res1h
+    .unreq tmp0
+    .unreq tmp1
+    .unreq q_tmp0
+    .unreq q_tmp1
+    .unreq out0
+    .unreq out1
+    .unreq t0
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S
index 16ed77c3f..8300b682c 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S
@@ -12,31 +12,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 3327
-
 // Input:
 // - Vectors al, ah of 32-bit entries
 // Output:
@@ -136,11 +111,10 @@ c_modulus_twisted: dup8h 3327
         b3_ptr       .req x11
         b3_cache_ptr .req x12
         count        .req x13
+        wtmp         .req w14
 
         modulus           .req v0
-        q_modulus         .req q0
         modulus_twisted   .req v2
-        q_modulus_twisted .req q2
 
         aa0      .req v3
         aa1      .req v4
@@ -164,12 +138,16 @@ c_modulus_twisted: dup8h 3327
         t0   .req v28
 
 #if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -530,12 +508,15 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
 #endif /* MLKEM_K == 2 */
 
 #if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -1001,12 +982,15 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
 #endif /* MLKEM_K == 3 */
 
 #if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -1581,4 +1565,39 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
         ret
 #endif /* MLKEM_K == 4 */
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq out
+    .unreq a0_ptr
+    .unreq b0_ptr
+    .unreq b0_cache_ptr
+    .unreq a1_ptr
+    .unreq b1_ptr
+    .unreq b1_cache_ptr
+    .unreq a2_ptr
+    .unreq b2_ptr
+    .unreq b2_cache_ptr
+    .unreq a3_ptr
+    .unreq b3_ptr
+    .unreq b3_cache_ptr
+    .unreq count
+    .unreq modulus
+    .unreq modulus_twisted
+    .unreq wtmp
+    .unreq aa0
+    .unreq aa1
+    .unreq bb0
+    .unreq bb1
+    .unreq bb1t
+    .unreq res0l
+    .unreq res1l
+    .unreq res0h
+    .unreq res1h
+    .unreq tmp0
+    .unreq tmp1
+    .unreq q_tmp0
+    .unreq q_tmp1
+    .unreq out0
+    .unreq out1
+    .unreq t0
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S
index 722dc0f49..5151a05d0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S
@@ -45,6 +45,7 @@
     len                         .req w4
 
     /* Temporary output on the stack */
+    xtmp                        .req x7
     output_tmp                  .req x7
     output_tmp_base             .req x8
 
@@ -110,20 +111,26 @@
 
     mlkem_q                     .req v30
     bits                        .req v31
-    bits_q                      .req q31
 
 .text
-/* Literal pool */
-.p2align 4
-c_bit_table:
-    .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-
 .align 4
 .global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean)
 MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean):
     push_stack
 
-    ldr  bits_q, c_bit_table
+    // Set bits.8h = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80} via GPR moves
+    movz xtmp, 0x1
+    movk xtmp, 0x2, lsl 16
+    movk xtmp, 0x4, lsl 32
+    movk xtmp, 0x8, lsl 48
+    mov bits.d[0], xtmp
+
+    movz xtmp, 0x10
+    movk xtmp, 0x20, lsl 16
+    movk xtmp, 0x40, lsl 32
+    movk xtmp, 0x80, lsl 48
+    mov bits.d[1], xtmp
+
     movz tmp, #MLKEM_Q
     dup  mlkem_q.8h, tmp
 
@@ -337,5 +344,63 @@ return:
     pop_stack
     ret
 
+
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq output
+    .unreq buf
+    .unreq buflen
+    .unreq table_idx
+    .unreq len
+    .unreq output_tmp
+    .unreq output_tmp_base
+    .unreq count
+    .unreq buf_consumed
+    .unreq tmp
+    .unreq xtmp
+    .unreq final_copy_count
+    .unreq rec_idx_0
+    .unreq rec_idx_1
+    .unreq rec_idx_2
+    .unreq rec_idx_3
+    .unreq ctr0
+    .unreq ctr1
+    .unreq ctr2
+    .unreq ctr3
+    .unreq ctr01
+    .unreq ctr23
+    .unreq buf0
+    .unreq buf1
+    .unreq buf2
+    .unreq tmp0
+    .unreq tmp1
+    .unreq tmp2
+    .unreq tmp3
+    .unreq sign0
+    .unreq sign1
+    .unreq sign2
+    .unreq sign3
+    .unreq val0
+    .unreq val0q
+    .unreq val1
+    .unreq val1q
+    .unreq val2
+    .unreq val2q
+    .unreq val3
+    .unreq val3q
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+    .unreq table0
+    .unreq table0q
+    .unreq table1
+    .unreq table1q
+    .unreq table2
+    .unreq table2q
+    .unreq table3
+    .unreq table3q
+    .unreq mlkem_q
+    .unreq bits
+
 #endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) ||
           defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h
index 09e30f207..0543b1bd1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h
@@ -16,7 +16,9 @@
  *
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 #include "api.h"
 #endif
+#endif
 
 #endif /* MLKEM_NATIVE_ARITH_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.c
index 433bdc954..1e6b7c5d1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.c
@@ -2,8 +2,11 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "cbd.h"
+#include "common.h"
+#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+
 #include <stdint.h>
+#include "cbd.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -11,8 +14,6 @@
  * within a single compilation unit. */
 #define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
 #define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-#define cbd2 MLKEM_NAMESPACE(cbd2)
-#define cbd3 MLKEM_NAMESPACE(cbd3)
 /* End of static namespacing */
 
 /*************************************************
@@ -35,44 +36,13 @@ static uint32_t load32_littleendian(const uint8_t x[4])
   return r;
 }
 
-#if MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif /* MLKEM_ETA1 == 3 */
-
-/*************************************************
- * Name:        cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
   {
     unsigned j;
@@ -82,7 +52,7 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
     {
       const int16_t a = (d >> (4 * j + 0)) & 0x3;
@@ -92,24 +62,34 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
   }
 }
 
-#if MLKEM_ETA1 == 3
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
 /*************************************************
- * Name:        cbd3
+ * Name:        load24_littleendian
  *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
  *              This function is only needed for ML-KEM-512
  *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
  **************************************************/
-static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
   {
     unsigned j;
@@ -120,7 +100,7 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 
     for (j = 0; j < 4; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(i <= MLKEM_N / 4 && j <= 4)
       invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
     {
       const int16_t a = (d >> (6 * j + 0)) & 0x7;
@@ -129,28 +109,12 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
     }
   }
 }
-#endif /* MLKEM_ETA1 == 3 */
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-{
-#if MLKEM_ETA1 == 2
-  cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-{
-#if MLKEM_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
+int empty_cu_cbd;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.h
index 15db89570..54c1f5b90 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.h
@@ -9,46 +9,35 @@
 #include "common.h"
 #include "poly.h"
 
-#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd2
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *              a centered binomial distribution with parameter eta=2
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-);
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd3
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
 
-#endif
+#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbmc.h
index baa0bfa9f..52b95bc3f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbmc.h
@@ -13,7 +13,7 @@
 
 #define __contract__(x)
 #define __loop__(x)
-#define cassert(x, y)
+#define cassert(x)
 
 #else /* CBMC _is_ defined, therefore we're doing proof */
 
@@ -30,7 +30,7 @@
 #define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
 #define decreases(...) __CPROVER_decreases(__VA_ARGS__)
 /* cassert to avoid confusion with in-built assert */
-#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
 #define assume(...) __CPROVER_assume(__VA_ARGS__)
 
 /***************************************************
@@ -119,13 +119,13 @@
   {                                                                    \
     unsigned qvar;                                                     \
     ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
-        (((value_lb) <= (array_var[(qvar)])) &&                        \
-        ((array_var[(qvar)]) < (value_ub)))                            \
+        (((int)(value_lb) <= ((array_var)[(qvar)])) &&		       \
+         (((array_var)[(qvar)]) < (int)(value_ub)))		       \
   }
 
 #define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
   array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb),      \
-                   (qvar_ub), (array_var), (value_lb), (value_ub))
+      (qvar_ub), (array_var), (value_lb), (value_ub))
 /* clang-format on */
 
 /* Wrapper around array_bound operating on absolute values.
@@ -134,6 +134,6 @@
  * bound in array_bound is inclusive, we have to raise it by 1.
  */
 #define array_abs_bound(arr, lb, ub, k) \
-  array_bound((arr), (lb), (ub), -(k) + 1, (k))
+  array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h
index da886780c..4f326333e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h
@@ -43,23 +43,30 @@
 #define MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2) x1##_##x2
 #define MLKEM_NATIVE_MAKE_NAMESPACE(x1, x2) MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2)
 
-#define FIPS202_NAMESPACE(s) \
-  MLKEM_NATIVE_MAKE_NAMESPACE(FIPS202_NAMESPACE_PREFIX, s)
-
 #define MLKEM_NAMESPACE(s) \
   MLKEM_NATIVE_MAKE_NAMESPACE(MLKEM_NAMESPACE_PREFIX, s)
 
+#if defined(MLKEM_NAMESPACE_PREFIX_ADD_LEVEL)
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3) x1##x2##_##x3
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K(x1, x2, x3) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3)
+#define MLKEM_NAMESPACE_K(s) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K(MLKEM_NAMESPACE_PREFIX, MLKEM_LVL, s)
+#else
+#define MLKEM_NAMESPACE_K(s) MLKEM_NAMESPACE(s)
+#endif
+
 /* On Apple platforms, we need to emit leading underscore
  * in front of assembly symbols. We thus introducee a separate
  * namespace wrapper for ASM symbols. */
 #if !defined(__APPLE__)
 #define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym)
-#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym)
+#define MLKEM_ASM_NAMESPACE_K(sym) MLKEM_NAMESPACE_K(sym)
 #else
 #define PREFIX_UNDERSCORE_(sym) _##sym
 #define PREFIX_UNDERSCORE(sym) PREFIX_UNDERSCORE_(sym)
 #define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym))
-#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym))
+#define MLKEM_ASM_NAMESPACE_K(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE_K(sym))
 #endif
 
 #endif /* MLKEM_NATIVE_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h
index d1441835b..fa89370ce 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h
@@ -40,10 +40,12 @@
 /* #define MLKEM_NATIVE_CONFIG_FILE "config.h" */
 
 /******************************************************************************
- * Name:        MLKEM_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/.
+ * Description: The prefix to use to namespace global symbols from mlkem/.
+ *
+ *              Level-dependent symbols will additionally be prefixed with the
+ *              security level if MLKEM_NAMESPACE_PREFIX_ADD_LEVEL is set.
  *
  *              This can also be set using CFLAGS.
  *
@@ -53,17 +55,71 @@
 #endif
 
 /******************************************************************************
- * Name:        FIPS202_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX_ADD_LEVEL
+ *
+ * Description: If set, the level (512, 768, 1024) is added to the namespace
+ *              prefix MLKEM_NAMESPACE_PREFIX for all functions which are
+ *              level-dependent. Level-independent functions will have their
+ *              symbol prefixed by MLKEM_NAMESPACE_PREFIX only.
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/fips202/.
+ *              This is intended to be used for multi-level builds where
+ *              level-independent code should be shared across levels.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(FIPS202_NAMESPACE_PREFIX)
-#define FIPS202_NAMESPACE_PREFIX FIPS202_DEFAULT_NAMESPACE_PREFIX
-#endif
+/* #define MLKEM_NAMESPACE_PREFIX_ADD_LEVEL */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, all MLKEM_K-independent code will be included
+ *              in the build, including code needed only for other security
+ *              levels.
+ *
+ *              Example: poly_cbd3 is only needed for MLKEM_K == 2. Yet, if
+ *              this option is set for a build with MLKEM_K==3/4, it would
+ *              be included.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, no MLKEM_K-independent code will be included
+ *              in the build.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
  * Name:        MLKEM_USE_NATIVE
@@ -112,25 +168,13 @@
 /* Default namespace
  *
  * Don't change this. If you need a different namespace, re-define
- * MLKEM_NAMESPACE above instead, and remove the following.
- */
-
-/*
- * The default FIPS202 namespace is
- *
- *   PQCP_MLKEM_NATIVE_FIPS202_<BACKEND>_
+ * MLKEM_NAMESPACE_PREFIX above instead, and remove the following.
  *
- * e.g., PQCP_MLKEM_NATIVE_FIPS202_C_
- */
-
-#define FIPS202_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_FIPS202
-
-/*
  * The default MLKEM namespace is
  *
- *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_<BACKEND>_
+ *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_
  *
- * e.g., PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_
+ * e.g., PQCP_MLKEM_NATIVE_MLKEM512_
  */
 
 #if MLKEM_K == 2
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug.c
new file mode 100644
index 000000000..4b4857cbc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/* NOTE: You can remove this file unless you compile with MLKEM_DEBUG. */
+
+#include "common.h"
+
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) && defined(MLKEM_DEBUG)
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+
+#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
+
+void mlkem_debug_assert(const char *file, int line, const int val)
+{ /* Report file:line on stderr and exit(1) when val is zero; else no-op. */
+  if (val == 0) /* zero means the asserted condition evaluated to false */
+  {
+    fprintf(stderr,
+            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
+            file, line, val);
+    exit(1); /* terminate the process on the first failed assertion */
+  }
+}
+
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{ /* Check ptr[0..len) against two exclusive bounds; exit(1) on violation. */
+  int err = 0; /* becomes 1 once any element is found out of bounds */
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int16_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    {
+      fprintf(
+          stderr,
+          MLKEM_NATIVE_DEBUG_ERROR_HEADER
+          "Bounds assertion failed: Index %u, value %d out of bounds (%d,%d)\n",
+          file, line, i, (int)val, lower_bound_exclusive,
+          upper_bound_exclusive);
+      err = 1; /* keep scanning so every offending index gets reported */
+    }
+  }
+
+  if (err == 1) /* exit only after the whole array has been inspected */
+    exit(1);
+}
+
+#else /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
+
+#define empty_cu_debug MLKEM_NAMESPACE_K(empty_cu_debug)
+int empty_cu_debug;
+
+#endif /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug.h
new file mode 100644
index 000000000..1103124db
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+#include "common.h"
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check debug assertion
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if the asserted value is zero.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
+void mlkem_debug_assert(const char *file, int line, const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ */
+#define debug_assert(val) mlkem_debug_assert(__FILE__, __LINE__, (val))
+
+/* Check bounds in array of int16_t's
+ * ptr: Base of int16_t array; will be explicitly cast to int16_t*,
+ *      so you may pass a byte-compatible type such as poly or polyvec.
+ * len: Number of int16_t in array
+ * value_lb: Inclusive lower value bound
+ * value_ub: Exclusive upper value bound */
+#define debug_assert_bound(ptr, len, value_lb, value_ub)                      \
+  mlkem_debug_check_bounds(__FILE__, __LINE__, (const int16_t *)(ptr), (len), \
+                           (value_lb)-1, (value_ub))
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * value_abs_bd: Exclusive absolute upper bound */
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  debug_assert_bound((ptr), (len), (-(value_abs_bd) + 1), (value_abs_bd))
+
+/* Version of bounds assertions for 2-dimensional arrays */
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  debug_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  debug_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
+
+/* When running CBMC, convert debug assertions into proof obligations */
+#elif defined(CBMC)
+
+#include "cbmc.h"
+
+#define debug_assert(val) cassert(val)
+
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  cassert(array_bound(((int16_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
+
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  cassert(array_abs_bound(((int16_t *)(ptr)), 0, (len), (value_abs_bd)))
+
+/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
+ * just use a single flattened array_bound(...) here. */
+#define debug_assert_bound_2d(ptr, M, N, value_lb, value_ub)           \
+  cassert(forall(kN, 0, (M),                                           \
+                 array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                             (value_lb), (value_ub))))
+
+#define debug_assert_abs_bound_2d(ptr, M, N, value_abs_bd)                 \
+  cassert(forall(kN, 0, (M),                                               \
+                 array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                                 (value_abs_bd))))
+
+#else /* MLKEM_DEBUG */
+
+#define debug_assert(val) \
+  do                      \
+  {                       \
+  } while (0)
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  do                                                     \
+  {                                                      \
+  } while (0)
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  do                                                   \
+  {                                                    \
+  } while (0)
+
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  do                                                               \
+  {                                                                \
+  } while (0)
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  do                                                             \
+  {                                                              \
+  } while (0)
+
+
+#endif /* MLKEM_DEBUG */
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug/debug.c
deleted file mode 100644
index 64294ebe1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug/debug.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-
-#include <stdio.h>
-#include "debug.h"
-
-#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
-
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val)
-{
-  if (val == 0)
-  {
-    fprintf(stderr,
-            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed: %s (value %d)\n",
-            file, line, description, val);
-    exit(1);
-  }
-}
-
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive)
-{
-  int err = 0;
-  unsigned i;
-  for (i = 0; i < len; i++)
-  {
-    int16_t val = ptr[i];
-    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
-    {
-      fprintf(stderr,
-              MLKEM_NATIVE_DEBUG_ERROR_HEADER
-              "%s, index %u, value %d out of bounds (%d,%d)\n",
-              file, line, description, i, (int)val, lower_bound_exclusive,
-              upper_bound_exclusive);
-      err = 1;
-    }
-  }
-
-  if (err == 1)
-    exit(1);
-}
-
-#else /* MLKEM_DEBUG */
-
-#define empty_cu_debug MLKEM_NAMESPACE(empty_cu_debug)
-int empty_cu_debug;
-
-#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug/debug.h
deleted file mode 100644
index 5ce320ea2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/debug/debug.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_DEBUG_H
-#define MLKEM_DEBUG_H
-
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-/*************************************************
- * Name:        mlkem_debug_assert
- *
- * Description: Check debug assertion
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of assertion
- *              - val: Value asserted to be non-zero
- **************************************************/
-#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val);
-
-/*************************************************
- * Name:        mlkem_debug_check_bounds
- *
- * Description: Check whether values in an array of int16_t
- *              are within specified bounds.
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of check
- *              - ptr: Base of array to be checked
- *              - len: Number of int16_t in ptr
- *              - lower_bound_exclusive: Exclusive lower bound
- *              - upper_bound_exclusive: Exclusive upper bound
- **************************************************/
-#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive);
-
-/* Check assertion, calling exit() upon failure
- *
- * val: Value that's asserted to be non-zero
- * msg: Message to print on failure
- *
- * Currently called CASSERT to avoid clash with CBMC assert.
- */
-#define CASSERT(val, msg)                                 \
-  do                                                      \
-  {                                                       \
-    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
-  } while (0)
-
-/* Check absolute bounds of scalar
- * val: Scalar to be checked
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
-
-/* Check that all coefficients in array of int16_t's are non-negative
- * and below an exclusive upper bound.
- *
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * high_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define UBOUND(ptr, len, high_bound, msg)                                 \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -1, ((high_bound)));                  \
-  } while (0)
-
-/* Check absolute bounds in array of int16_t's
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define BOUND(ptr, len, abs_bound, msg)                                   \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -(abs_bound), (abs_bound));           \
-  } while (0)
-
-/* Check absolute bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
-  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
-        msg)
-
-/* Check unsigned bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- * msg: Message to print on failure */
-#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
-  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
-         msg)
-
-/* Check absolute bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLY_BOUND(ptr, abs_bound) \
-  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
-
-/* Check unsigned bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLY_UBOUND(ptr, ubound) \
-  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
-
-/* Check absolute bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLYVEC_BOUND(ptr, abs_bound)                                      \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
-                     "polyvec absolute bound for " #ptr ".vec[i]");        \
-  } while (0)
-
-/* Check unsigned bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLYVEC_UBOUND(ptr, ubound)                                        \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
-                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
-  } while (0)
-
-#define MLKEM_CONCAT_(left, right) left##right
-#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
-
-/* Following AWS-LC to define a C99-compliant static assert */
-#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
-  typedef struct                                                         \
-  {                                                                      \
-    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
-  } MLKEM_CONCAT(MLKEM_NAMESPACE(static_assertion_), msg)                \
-      __attribute__((unused));
-
-#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
-  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
-#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
-#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
-
-#else /* MLKEM_DEBUG */
-
-#define CASSERT(val, msg) \
-  do                      \
-  {                       \
-  } while (0)
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define BOUND(ptr, len, abs_bound, msg) \
-  do                                    \
-  {                                     \
-  } while (0)
-#define POLY_BOUND(ptr, abs_bound) \
-  do                               \
-  {                                \
-  } while (0)
-#define POLYVEC_BOUND(ptr, abs_bound) \
-  do                                  \
-  {                                   \
-  } while (0)
-#define POLY_BOUND_MSG(ptr, ubound, abs_bound) \
-  do                                           \
-  {                                            \
-  } while (0)
-#define UBOUND(ptr, len, high_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define POLY_UBOUND(ptr, ubound) \
-  do                             \
-  {                              \
-  } while (0)
-#define POLYVEC_UBOUND(ptr, ubound) \
-  do                                \
-  {                                 \
-  } while (0)
-#define POLY_UBOUND_MSG(ptr, ubound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define STATIC_ASSERT(cond, error)
-
-#endif /* MLKEM_DEBUG */
-
-#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c
index 4d3133e14..0cfcc3e9e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c
@@ -17,7 +17,7 @@
 #include "symmetric.h"
 
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 #include "cbmc.h"
 
@@ -25,15 +25,13 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define pack_pk MLKEM_NAMESPACE(pack_pk)
-#define unpack_pk MLKEM_NAMESPACE(unpack_pk)
-#define pack_sk MLKEM_NAMESPACE(pack_sk)
-#define unpack_sk MLKEM_NAMESPACE(unpack_sk)
-#define pack_ciphertext MLKEM_NAMESPACE(pack_ciphertext)
-#define unpack_ciphertext MLKEM_NAMESPACE(unpack_ciphertext)
-#define gen_matrix_entry_x4 MLKEM_NAMESPACE(gen_matrix_entry_x4)
-#define gen_matrix_entry MLKEM_NAMESPACE(gen_matrix_entry)
-#define matvec_mul MLKEM_NAMESPACE(matvec_mul)
+#define pack_pk MLKEM_NAMESPACE_K(pack_pk)
+#define unpack_pk MLKEM_NAMESPACE_K(unpack_pk)
+#define pack_sk MLKEM_NAMESPACE_K(pack_sk)
+#define unpack_sk MLKEM_NAMESPACE_K(unpack_sk)
+#define pack_ciphertext MLKEM_NAMESPACE_K(pack_ciphertext)
+#define unpack_ciphertext MLKEM_NAMESPACE_K(unpack_ciphertext)
+#define matvec_mul MLKEM_NAMESPACE_K(matvec_mul)
 /* End of static namespacing */
 
 /*************************************************
@@ -51,7 +49,7 @@
 static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
                     const uint8_t seed[MLKEM_SYMBYTES])
 {
-  POLYVEC_BOUND(pk, MLKEM_Q);
+  debug_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, pk);
   memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
 }
@@ -77,7 +75,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
   /* NOTE: If a modulus check was conducted on the PK, we know at this
    * point that the coefficients of `pk` are unsigned canonical. The
    * specifications and proofs, however, do _not_ assume this, and instead
-   * work with the easily provable bound by 4096. */
+   * work with the easily provable bound by UINT12_LIMIT. */
 }
 
 /*************************************************
@@ -91,7 +89,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
  **************************************************/
 static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
 {
-  POLYVEC_BOUND(sk, MLKEM_Q);
+  debug_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, sk);
 }
 
@@ -145,131 +143,11 @@ static void unpack_ciphertext(polyvec *b, poly *v,
   poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
 }
 
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-/*
- * Generate four A matrix entries from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-/*
- * Generate a single A matrix entry from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(0 <= ctr && ctr <= MLKEM_N)
-    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr,
-                                          0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
 #if !defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
 /* This namespacing is not done at the top to avoid a naming conflict
  * with native backends, which are currently not yet namespaced. */
 #define poly_permute_bitrev_to_custom \
-  MLKEM_NAMESPACE(poly_permute_bitrev_to_custom)
+  MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 __contract__(
@@ -332,7 +210,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
      * This call writes across polyvec boundaries for K=2 and K=3.
      * This is intentional and safe.
      */
-    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+    poly_rej_uniform_x4(&a[0].vec[0] + i, seedxy);
   }
 
   /* For left over polynomial, we use single keccak. */
@@ -353,12 +231,11 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
       seed0[MLKEM_SYMBYTES + 1] = x;
     }
 
-    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    poly_rej_uniform(&a[0].vec[0] + i, seed0);
     i++;
   }
 
-  cassert(i == MLKEM_K * MLKEM_K,
-          "gen_matrix: failed to generate whole matrix");
+  debug_assert(i == MLKEM_K * MLKEM_K);
 
   /*
    * The public matrix is generated in NTT domain. If the native backend
@@ -402,16 +279,12 @@ __contract__(
   for (i = 0; i < MLKEM_K; i++)
   __loop__(
     assigns(i, object_whole(out))
-    invariant(i >= 0 && i <= MLKEM_K))
+    invariant(i <= MLKEM_K))
   {
     polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
   }
 }
 
-
-
-STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_enc_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
                            uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
@@ -461,7 +334,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
   matvec_mul(&pkpv, a, &skpv, &skpv_cache);
   polyvec_tomont(&pkpv);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&pkpv, &e);
   polyvec_reduce(&pkpv);
   polyvec_reduce(&skpv);
@@ -471,11 +343,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
 }
 
 
-/* Check that the arithmetic in indcpa_enc() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
-              indcpa_enc_bound_1)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
                 const uint8_t m[MLKEM_INDCPA_MSGBYTES],
@@ -522,7 +389,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   polyvec_invntt_tomont(&b);
   poly_invntt_tomont(&v);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&b, &ep);
   poly_add(&v, &epp);
   poly_add(&v, &k);
@@ -533,9 +399,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   pack_ciphertext(c, &b, &v);
 }
 
-/* Check that the arithmetic in indcpa_dec() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
                 const uint8_t c[MLKEM_INDCPA_BYTES],
@@ -551,7 +414,6 @@ void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
   polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
   poly_invntt_tomont(&sb);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   poly_sub(&v, &sb);
   poly_reduce(&v);
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h
index 011f1aa4f..2c4fda3c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h
@@ -10,7 +10,7 @@
 #include "common.h"
 #include "polyvec.h"
 
-#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+#define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
  * Name:        gen_matrix
  *
@@ -34,7 +34,7 @@ __contract__(
   array_bound(a[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))));
 );
 
-#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+#define indcpa_keypair_derand MLKEM_NAMESPACE_K(indcpa_keypair_derand)
 /*************************************************
  * Name:        indcpa_keypair_derand
  *
@@ -60,7 +60,7 @@ __contract__(
   assigns(object_whole(sk))
 );
 
-#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+#define indcpa_enc MLKEM_NAMESPACE_K(indcpa_enc)
 /*************************************************
  * Name:        indcpa_enc
  *
@@ -89,7 +89,7 @@ __contract__(
   assigns(object_whole(c))
 );
 
-#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+#define indcpa_dec MLKEM_NAMESPACE_K(indcpa_dec)
 /*************************************************
  * Name:        indcpa_dec
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/kem.c
index 5779d3273..88c3843be 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/kem.c
@@ -16,8 +16,8 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define check_pk MLKEM_NAMESPACE(check_pk)
-#define check_sk MLKEM_NAMESPACE(check_sk)
+#define check_pk MLKEM_NAMESPACE_K(check_pk)
+#define check_sk MLKEM_NAMESPACE_K(check_sk)
 /* End of static namespacing */
 
 #if defined(CBMC)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/kem.h
index 074e4771e..93caa796b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/kem.h
@@ -9,6 +9,7 @@
 #include "cbmc.h"
 #include "common.h"
 
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 /* Include to ensure consistency between internal kem.h
  * and external mlkem_native.h. */
 #include "mlkem_native.h"
@@ -25,6 +26,14 @@
 #error Mismatch for CIPHERTEXTBYTES between kem.h and mlkem_native.h
 #endif
 
+#else
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE_K(keypair_derand)
+#define crypto_kem_keypair MLKEM_NAMESPACE_K(keypair)
+#define crypto_kem_enc_derand MLKEM_NAMESPACE_K(enc_derand)
+#define crypto_kem_enc MLKEM_NAMESPACE_K(enc)
+#define crypto_kem_dec MLKEM_NAMESPACE_K(dec)
+#endif
+
 /*************************************************
  * Name:        crypto_kem_keypair_derand
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem_native.h
index 4aed4efbb..12d1d12e6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem_native.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem_native.h
@@ -59,9 +59,17 @@
 #error MLKEM_NAMESPACE_PREFIX not set by config file
 #endif
 
-#define BUILD_INFO_CONCAT_(x, y) x##_##y
-#define BUILD_INFO_CONCAT(x, y) BUILD_INFO_CONCAT_(x, y)
-#define BUILD_INFO_NAMESPACE(sym) BUILD_INFO_CONCAT(MLKEM_NAMESPACE_PREFIX, sym)
+#if defined(MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL)
+#define BUILD_INFO_CONCAT3_(x, y, z) x##y##_##z
+#define BUILD_INFO_CONCAT3(x, y, z) BUILD_INFO_CONCAT3_(x, y, z)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT3(MLKEM_NAMESPACE_PREFIX, BUILD_INFO_LVL, sym)
+#else
+#define BUILD_INFO_CONCAT2_(x, y) x##_##y
+#define BUILD_INFO_CONCAT2(x, y) BUILD_INFO_CONCAT2_(x, y)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT2(MLKEM_NAMESPACE_PREFIX, sym)
+#endif
 
 #endif /* BUILD_INFO_LVL */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.c
index 02b45215c..3651c8da9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.c
@@ -2,10 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include <stdint.h>
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
+#include <stdint.h>
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "ntt.h"
 #include "reduce.h"
 
@@ -45,10 +47,10 @@
  *          4 -- 6
  *             5 -- 7
  */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start,
-                                int len, int bound)
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
 __contract__(
-  requires(0 <= start && start < MLKEM_N)
+  requires(start < MLKEM_N)
   requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
   requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
   requires(-HALF_Q < zeta && zeta < HALF_Q)
@@ -60,7 +62,7 @@ __contract__(
   ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
 {
   /* `bound` is a ghost variable only needed in the CBMC specification */
-  int j;
+  unsigned j;
   ((void)bound);
   for (j = start; j < start + len; j++)
   __loop__(
@@ -93,7 +95,7 @@ __contract__(
  *   official Kyber implementation here, merely adding `layer` as
  *   a ghost variable for the specifications.
  */
-static void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
@@ -101,15 +103,15 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable only needed in the CBMC specification */
   ((void)layer);
   /* Twiddle factors for layer n start at index 2^(layer-1) */
   k = MLKEM_N / (2 * len);
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
-    invariant(0 <= start && start < MLKEM_N + 2 * len)
-    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
     invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
     invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
   {
@@ -130,9 +132,9 @@ __contract__(
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  int len, layer;
+  unsigned len, layer;
   int16_t *r;
-  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   r = p->coeffs;
 
   for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
@@ -144,30 +146,23 @@ void poly_ntt(poly *p)
   }
 
   /* Check the stronger bound */
-  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_NTT */
 
-/* Check that bound for native NTT implies contractual bound */
-STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   ntt_native(p);
-  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if !defined(MLKEM_USE_NATIVE_INTT)
 
-/* Check that bound for reference invNTT implies contractual bound */
-#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
-STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
-
 /* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, int len, int layer)
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
@@ -176,23 +171,23 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable used only in the specification */
   ((void)layer);
   k = MLKEM_N / len - 1;
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    invariant(start <= MLKEM_N && k <= 127)
     /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
     invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
   {
-    int j;
+    unsigned j;
     int16_t zeta = zetas[k--];
     for (j = start; j < start + len; j++)
     __loop__(
       invariant(start <= j && j <= start + len)
-      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(start <= MLKEM_N && k <= 127)
       invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
     {
       int16_t t = r[j];
@@ -211,13 +206,13 @@ void poly_invntt_tomont(poly *p)
    * and NTT twist. This also brings coefficients down to
    * absolute value < MLKEM_Q.
    */
-  int j, len, layer;
+  unsigned j, len, layer;
   const int16_t f = 1441;
   int16_t *r = p->coeffs;
 
   for (j = 0; j < MLKEM_N; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N)
+    invariant(j <= MLKEM_N)
     invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
   {
     r[j] = fqmul(r[j], f);
@@ -226,24 +221,21 @@ void poly_invntt_tomont(poly *p)
   /* Run the invNTT layers */
   for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
   __loop__(
-    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
   {
     invntt_layer(p->coeffs, len, layer);
   }
 
-  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_INTT */
 
-/* Check that bound for native invNTT implies contractual bound */
-STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_invntt_tomont(poly *p)
 {
   intt_native(p);
-  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_INTT */
 
@@ -252,8 +244,7 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
                     int16_t b_cached)
 {
   int32_t t0, t1;
-
-  BOUND(a, 2, 4096, "basemul input bound");
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
   t0 = (int32_t)a[1] * b_cached;
   t0 += (int32_t)a[0] * b[0];
@@ -264,5 +255,12 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
   r[0] = montgomery_reduce(t0);
   r[1] = montgomery_reduce(t1);
 
-  BOUND(r, 2, 2 * MLKEM_Q, "basemul output bound");
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
+int empty_cu_ntt;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.h
index 5592bb9a2..4e80d3ab3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.h
@@ -4,10 +4,10 @@
  */
 #ifndef NTT_H
 #define NTT_H
+#include "common.h"
 
 #include <stdint.h>
 #include "cbmc.h"
-#include "common.h"
 #include "poly.h"
 #include "reduce.h"
 
@@ -81,7 +81,7 @@ __contract__(
  *                   Upon return, coefficients are bound by
  *                   2*MLKEM_Q in absolute value.
  *            - a: Pointer to first input polynomial
- *                   Must be coefficient-wise < 4096 in absolute value.
+ *                   Every coefficient must be in [0..4095]
  *            - b: Pointer to second input polynomial
  *                   Can have arbitrary int16_t coefficients
  *            - b_cached: Some precomputed value, typically derived from
@@ -99,5 +99,4 @@ __contract__(
   ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
 );
 
-
-#endif
+#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h
index fa751f977..57ea4c8ba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h
@@ -25,23 +25,34 @@
 #define MLKEM_POLYBYTES 384
 #define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
 
+#define MLKEM_POLYCOMPRESSEDBYTES_D4 128
+#define MLKEM_POLYCOMPRESSEDBYTES_D5 160
+#define MLKEM_POLYCOMPRESSEDBYTES_D10 320
+#define MLKEM_POLYCOMPRESSEDBYTES_D11 352
+
 #if MLKEM_K == 2
 #define MLKEM_LVL 512
 #define MLKEM_ETA1 3
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 3
 #define MLKEM_LVL 768
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 4
 #define MLKEM_LVL 1024
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_DU 11
+#define MLKEM_DV 5
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D5
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D11
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c
index 5807879df..7483ebf6d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c
@@ -2,13 +2,15 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
 #include <stdint.h>
 #include <string.h>
-
 #include "arith_backend.h"
 #include "cbd.h"
 #include "cbmc.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "fips202x4.h"
 #include "ntt.h"
 #include "poly.h"
@@ -16,50 +18,46 @@
 #include "symmetric.h"
 #include "verify.h"
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
     __loop__(
-      invariant(k >= 0 && k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
     {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
     }
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
   }
+}
 
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  __loop__(invariant(j <= MLKEM_N / 4))
   {
     unsigned k;
     uint16_t t[4];
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(k >= 0 && k <= 4)
+      invariant(k <= 4)
       invariant(forall(r, 0, k, t[r] < (1u << 10))))
     {
       t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
@@ -75,51 +73,35 @@ void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
     r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
     r[5 * j + 4] = (t[3] >> 2);
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
 }
 
-
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
   {
-    int k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(0 <= k && k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
   for (j = 0; j < MLKEM_N / 4; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(j <= MLKEM_N / 4)
     invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
   {
-    int k;
+    unsigned k;
     uint16_t t[4];
     uint8_t const *base = &a[5 * j];
 
@@ -130,51 +112,33 @@ void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
 
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(0 <= k && k <= 4)
+      invariant(k <= 4)
       invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
     {
       r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     uint8_t t[8] = {0};
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_bound(t, 0, j, 0, 32)))
     {
       t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
@@ -191,33 +155,57 @@ void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
     r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
     r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 }
 
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
 {
-  unsigned i;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
   {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     uint8_t t[8];
-    const int offset = i * 5;
+    const unsigned offset = i * 5;
     /*
      * Explicitly truncate to avoid warning about
      * implicit truncation in CBMC and unwind loop for ease
@@ -240,29 +228,62 @@ void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
     /* and copy to the correct slice in r[] */
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(j <= 8 && i <= MLKEM_N / 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
 #if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
-
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  __loop__(invariant(i <= MLKEM_N / 2))
   {
     const uint16_t t0 = a->coeffs[2 * i];
     const uint16_t t1 = a->coeffs[2 * i + 1];
@@ -290,7 +311,7 @@ void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   poly_tobytes_native(r, a);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
@@ -302,7 +323,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   unsigned i;
   for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(i <= MLKEM_N / 2)
     invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
   {
     const uint8_t t0 = a[3 * i + 0];
@@ -313,7 +334,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   }
 
   /* Note that the coefficients are not canonical */
-  POLY_UBOUND(r, 4096);
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 MLKEM_NATIVE_INTERNAL_API
@@ -333,13 +354,13 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
 
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <  MLKEM_N / 8 && j <= 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       /* Prevent the compiler from recognizing this as a bit selection */
@@ -347,23 +368,23 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
       r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
     }
   }
-  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     msg[i] = 0;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+      invariant(i <= MLKEM_N / 8 && j <= 8))
     {
       uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
       msg[i] |= t << j;
@@ -371,104 +392,17 @@ void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
   }
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  POLY_BOUND_MSG(r, MLKEM_ETA1 + 1, "poly_getnoise_eta2 output");
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
-}
-#endif /* MLKEM_K == 2 */
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
                                     const poly_mulcache *b_cache)
 {
   unsigned i;
-  POLY_BOUND(b_cache, 4096);
+  debug_assert_bound(a, MLKEM_N, 0, UINT12_LIMIT);
 
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
     assigns(i, object_whole(r))
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 2 * MLKEM_Q)))
   {
     basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
@@ -476,6 +410,8 @@ void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
     basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
                    &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
   }
+
+  debug_assert_abs_bound(r, MLKEM_N, 2 * MLKEM_Q);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -486,20 +422,20 @@ void poly_tomont(poly *r)
   const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
-    invariant(array_abs_bound(r->coeffs ,0, i, MLKEM_Q)))
+    invariant(i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
   {
     r->coeffs[i] = fqmul(r->coeffs[i], f);
   }
 
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
   poly_tomont_native(r);
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
@@ -510,7 +446,7 @@ void poly_reduce(poly *r)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
   {
     /* Barrett reduction, giving signed canonical representative */
@@ -519,14 +455,14 @@ void poly_reduce(poly *r)
     r->coeffs[i] = scalar_signed_to_unsigned_q(t);
   }
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
   poly_reduce_native(r);
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
@@ -536,7 +472,7 @@ void poly_add(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
   {
@@ -550,7 +486,7 @@ void poly_sub(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
   {
@@ -564,20 +500,36 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
   {
     x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
     x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
   }
-  POLY_BOUND(x, MLKEM_Q);
+
+  /*
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
+   */
+  debug_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   poly_mulcache_compute_native(x, a);
-  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+  /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
+int empty_cu_poly;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h
index 1e8c109c6..6a14c785d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h
@@ -307,112 +307,164 @@ __contract__(
  ************************************************************/
 static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
 __contract__(
-  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
-  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
   ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
 {
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
   /* Add Q if c is negative, but in constant time */
   c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
 
-  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
-  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
-
   /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
   return (uint16_t)c;
 }
 
-#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
 /*************************************************
- * Name:        poly_compress_du
+ * Name:        poly_compress_d4
  *
- * Description: Compression (du bits) and subsequent serialization of a
- *polynomial
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-);
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
 
-#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
 /*************************************************
- * Name:        poly_decompress_du
+ * Name:        poly_decompress_d4
  *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *polynomial; approximate inverse of poly_compress_du
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
 
-#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
 /*************************************************
- * Name:        poly_compress_dv
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
  *
- * Description: Compression (dv bits) and subsequent serialization of a
- *polynomial
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
 
-#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
 /*************************************************
- * Name:        poly_decompress_dv
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
  *
  * Description: De-serialization and subsequent decompression (dv bits) of a
- *polynomial; approximate inverse of poly_compress
+ *              polynomial; approximate inverse of poly_compress_d5
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV
- *bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
 
 #define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
 /*************************************************
@@ -500,144 +552,6 @@ __contract__(
   assigns(object_whole(msg))
 );
 
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -649,8 +563,7 @@ __contract__(
  *              Bounds:
  *              - a is assumed to be coefficient-wise < q in absolute value.
  *
- *              The result is coefficient-wise bound by 3/2 q in absolute
- *              value.
+ *              The result is coefficient-wise bound by 2*q in absolute value.
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const poly *a: pointer to first input polynomial
@@ -802,4 +715,4 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#endif
+#endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.c
index 7d2016773..50ea1c34a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.c
@@ -4,18 +4,29 @@
  */
 #include "polyvec.h"
 #include <stdint.h>
+#include <string.h>
 #include "arith_backend.h"
+#include "cbd.h"
 #include "ntt.h"
 #include "poly.h"
+#include "symmetric.h"
 
-#include "debug/debug.h"
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
                          const polyvec *a)
 {
   unsigned i;
-  POLYVEC_UBOUND(a, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_K; i++)
   {
@@ -33,13 +44,15 @@ void polyvec_decompress_du(polyvec *r,
     poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
   }
 
-  POLYVEC_UBOUND(r, MLKEM_Q);
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
 {
   unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
   for (i = 0; i < MLKEM_K; i++)
   {
     poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
@@ -54,6 +67,8 @@ void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
   {
     poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -64,6 +79,8 @@ void polyvec_ntt(polyvec *r)
   {
     poly_ntt(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -74,6 +91,8 @@ void polyvec_invntt_tomont(polyvec *r)
   {
     poly_invntt_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -84,10 +103,7 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
 {
   unsigned i;
   poly t;
-
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  POLYVEC_BOUND(b_cache, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 
   poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
   for (i = 1; i < MLKEM_K; i++)
@@ -95,18 +111,15 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
     poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
                                    &b_cache->vec[i]);
     poly_add(r, &t);
-    /* abs bounds: < (i+1) * 3/2 * q */
   }
 
   /*
-   * Those bounds are true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus best to omit
-   * them from the spec to not unnecessarily constraint native implementations.
+   * This bound (every coefficient of r below MLKEM_K * 2 * MLKEM_Q in
+   * absolute value) holds for the C implementation but is not needed in
+   * the higher level bounds reasoning. It is omitted from the spec to not
+   * unnecessarily constrain native implementations, but checked here.
    */
-  cassert(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_K * 2 * MLKEM_Q),
-          "polyvec_basemul_acc_montgomery_cached output bounds");
-  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
-  POLY_BOUND(r, MLKEM_K * 2 * MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
 }
 #else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 MLKEM_NATIVE_INTERNAL_API
@@ -114,9 +127,8 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
                                            const polyvec *b,
                                            const polyvec_mulcache *b_cache)
 {
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
   polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
@@ -149,6 +161,8 @@ void polyvec_reduce(polyvec *r)
   {
     poly_reduce(&r->vec[i]);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -169,4 +183,148 @@ void polyvec_tomont(polyvec *r)
   {
     poly_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; /* PRF output for r0 */
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4]; /* PRF output for r1 */
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4]; /* PRF output for r2 */
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4]; /* PRF output for r3 */
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];    /* seed || nonce0 */
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];    /* seed || nonce1 */
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];    /* seed || nonce2 */
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];    /* seed || nonce3 */
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+  /* Re-check the bound that poly_cbd_eta1() ensures for each output. */
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+  /* The PRF input is the seed followed by the one-byte nonce. */
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+  /* Turn the PRF output into a polynomial via poly_cbd_eta2(). */
+  poly_cbd_eta2(r, buf);
+  /* Check the eta2 bound that poly_cbd_eta2() ensures (not the eta1 one). */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+  /* Inputs 0 and 1 go through the eta1 PRF, inputs 2 and 3 through eta2. */
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+  /* Turn each PRF output into a polynomial with the matching CBD. */
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+  /* Re-check the bounds ensured by poly_cbd_eta1() / poly_cbd_eta2(). */
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
 }
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.h
index 138724150..8be8579e0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.h
@@ -9,19 +9,144 @@
 #include "common.h"
 #include "poly.h"
 
-#define polyvec MLKEM_NAMESPACE(polyvec)
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
 typedef struct
 {
   poly vec[MLKEM_K];
 } ALIGN polyvec;
 
-#define polyvec_mulcache MLKEM_NAMESPACE(polyvec_mulcache)
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
 typedef struct
 {
   poly_mulcache vec[MLKEM_K];
 } polyvec_mulcache;
 
-#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
 /*************************************************
  * Name:        polyvec_compress_du
  *
@@ -44,7 +169,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
 /*************************************************
  * Name:        polyvec_decompress_du
  *
@@ -67,7 +192,7 @@ __contract__(
          array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
 /*************************************************
  * Name:        polyvec_tobytes
  *
@@ -88,7 +213,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
 /*************************************************
  * Name:        polyvec_frombytes
  *
@@ -110,7 +235,7 @@ __contract__(
         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
 );
 
-#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
 /*************************************************
  * Name:        polyvec_ntt
  *
@@ -136,7 +261,7 @@ __contract__(
   array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
 );
 
-#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
 /*************************************************
  * Name:        polyvec_invntt_tomont
  *
@@ -162,7 +287,7 @@ __contract__(
 );
 
 #define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery
  *
@@ -186,7 +311,7 @@ __contract__(
 
 
 #define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery_cached
  *
@@ -194,7 +319,7 @@ __contract__(
  *              using mulcache for second operand.
  *
  *              Bounds:
- *              - a is assumed to be coefficient-wise < 4096 in absolute value.
+ *              - Every coefficient of a is assumed to be in [0..4095]
  *              - No bounds guarantees for the coefficients in the result.
  *
  * Arguments:   - poly *r: pointer to output polynomial
@@ -218,7 +343,7 @@ __contract__(
   assigns(memory_slice(r, sizeof(poly)))
 );
 
-#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
 /************************************************************
  * Name: polyvec_mulcache_compute
  *
@@ -252,7 +377,7 @@ __contract__(
   assigns(object_whole(x))
 );
 
-#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
 /*************************************************
  * Name:        polyvec_reduce
  *
@@ -278,7 +403,7 @@ __contract__(
     array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
 /*************************************************
  * Name:        polyvec_add
  *
@@ -309,7 +434,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
 /*************************************************
  * Name:        polyvec_tomont
  *
@@ -329,4 +454,142 @@ __contract__(
     array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
 );
 
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+);
+#endif /* MLKEM_K == 2 */
+
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/reduce.h
index 1f502167e..b432a4201 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/reduce.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/reduce.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -109,13 +109,13 @@ static INLINE int16_t montgomery_reduce_generic(int32_t a)
  **************************************************/
 static INLINE int16_t montgomery_reduce(int32_t a)
 __contract__(
-  requires(a > -(2 * 4096 * 32768))
-  requires(a <  (2 * 4096 * 32768))
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
   ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
 )
 {
   int16_t res;
-  SCALAR_BOUND(a, 2 * UINT12_LIMIT * 32768, "montgomery_reduce input");
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
 
   res = montgomery_reduce_generic(a);
   /* Bounds:
@@ -124,7 +124,7 @@ __contract__(
    *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
    *        < 2 * MLKEM_Q */
 
-  SCALAR_BOUND(res, 2 * MLKEM_Q, "montgomery_reduce output");
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
   return res;
 }
 
@@ -150,7 +150,7 @@ __contract__(
 )
 {
   int16_t res;
-  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+  debug_assert_abs_bound(&b, 1, HALF_Q);
 
   res = montgomery_reduce((int32_t)a * (int32_t)b);
   /* Bounds:
@@ -160,7 +160,7 @@ __contract__(
    *        < MLKEM_Q
    */
 
-  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
   return res;
 }
 
@@ -200,7 +200,10 @@ __contract__(
    * t is in -10 .. +10, so we need 32-bit math to
    * evaluate t * MLKEM_Q and the subsequent subtraction
    */
-  return (int16_t)(a - t * MLKEM_Q);
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
+
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.c
index 918986e9b..cbbe4407f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.c
@@ -2,46 +2,24 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
-#include "rej_uniform.h"
 #include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
 /* End of static namespacing */
 
-/*************************************************
- * Name:        rej_uniform_scalar
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
                                        unsigned int offset, const uint8_t *buf,
                                        unsigned int buflen)
@@ -58,6 +36,8 @@ __contract__(
   unsigned int ctr, pos;
   uint16_t val0, val1;
 
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
   ctr = offset;
   pos = 0;
   /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
@@ -79,28 +59,183 @@ __contract__(
       r[ctr++] = val1;
     }
   }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
   return ctr;
 }
 
 #if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, all of the input buffer
+ * is guaranteed to have been consumed. If it equals target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
 {
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
 {
   int ret;
 
   /* Sample from large buffer with full lane as much as possible. */
   ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
   if (ret != -1)
-    return offset + (unsigned)ret;
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
 
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+  /* Absorb all MLKEM_SYMBYTES + 2 bytes of the seed. */
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample a heuristic number of blocks,
+   * MLKEM_GEN_MATRIX_NBLOCKS. This should generate the matrix entry
+   * with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
+int empty_cu_rej_uniform;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.h
index 13db836bc..801287259 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.h
@@ -9,54 +9,55 @@
 #include <stdlib.h>
 #include "cbmc.h"
 #include "common.h"
+#include "poly.h"
 
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
- * Name:        rej_uniform
+ * Name:        poly_rej_uniform_x4
  *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
  *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
  *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
  **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
 __contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-);
-#endif
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/symmetric.h
index 55ebbbd53..3563e5505 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/symmetric.h
@@ -10,6 +10,7 @@
 #include "cbmc.h"
 #include "common.h"
 #include "fips202.h"
+#include "fips202x4.h"
 
 /* Macros denoting FIPS-203 specific Hash functions */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/verify.c
index b7078fcc1..9f39dcd22 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/verify.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/verify.c
@@ -4,7 +4,8 @@
  */
 #include "verify.h"
 
-#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER) && \
+    !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 /*
  * Masking value used in constant-time functions from
  * verify.h to block the compiler's range analysis and
@@ -12,9 +13,11 @@
  */
 volatile uint64_t ct_opt_blocker_u64 = 0;
 
-#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+#else /* !MLKEM_USE_ASM_VALUE_BARRIER && \
+         !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_verify MLKEM_NAMESPACE(empty_cu_verify)
+#define empty_cu_verify MLKEM_NAMESPACE_K(empty_cu_verify)
 int empty_cu_verify;
 
-#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+#endif /* !MLKEM_USE_ASM_VALUE_BARRIER && \
+          !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/verify.h
index 8c47155dc..f6ecf5eba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/verify.h
@@ -268,7 +268,7 @@ __contract__(
 
   for (i = 0; i < len; i++)
   __loop__(
-    invariant(i >= 0 && i <= len)
+    invariant(i <= len)
     invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))))
   {
     r |= a[i] ^ b[i];
@@ -314,4 +314,4 @@ __contract__(
   }
 }
 
-#endif
+#endif /* VERIFY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c
index 1a26e0dd5..4ef887c62 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c
@@ -8,6 +8,8 @@
  *          Do not modify it directly.
  */
 
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 #include "ntt.h"
 
 /*
@@ -28,3 +30,10 @@ ALIGN const int16_t zetas[128] = {
     -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
     -308,  996,   991,   958,   -1460, 1522,  1628,
 };
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_zetas MLKEM_NAMESPACE_K(empty_cu_zetas)
+int empty_cu_zetas;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h
index 09e30f207..0543b1bd1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h
@@ -16,7 +16,9 @@
  *
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 #include "api.h"
 #endif
+#endif
 
 #endif /* MLKEM_NATIVE_ARITH_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c
index 433bdc954..1e6b7c5d1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c
@@ -2,8 +2,11 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "cbd.h"
+#include "common.h"
+#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+
 #include <stdint.h>
+#include "cbd.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -11,8 +14,6 @@
  * within a single compilation unit. */
 #define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
 #define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-#define cbd2 MLKEM_NAMESPACE(cbd2)
-#define cbd3 MLKEM_NAMESPACE(cbd3)
 /* End of static namespacing */
 
 /*************************************************
@@ -35,44 +36,13 @@ static uint32_t load32_littleendian(const uint8_t x[4])
   return r;
 }
 
-#if MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif /* MLKEM_ETA1 == 3 */
-
-/*************************************************
- * Name:        cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
   {
     unsigned j;
@@ -82,7 +52,7 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
     {
       const int16_t a = (d >> (4 * j + 0)) & 0x3;
@@ -92,24 +62,34 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
   }
 }
 
-#if MLKEM_ETA1 == 3
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
 /*************************************************
- * Name:        cbd3
+ * Name:        load24_littleendian
  *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
  *              This function is only needed for ML-KEM-512
  *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
  **************************************************/
-static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
   {
     unsigned j;
@@ -120,7 +100,7 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 
     for (j = 0; j < 4; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(i <= MLKEM_N / 4 && j <= 4)
       invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
     {
       const int16_t a = (d >> (6 * j + 0)) & 0x7;
@@ -129,28 +109,12 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
     }
   }
 }
-#endif /* MLKEM_ETA1 == 3 */
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-{
-#if MLKEM_ETA1 == 2
-  cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-{
-#if MLKEM_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
+int empty_cu_cbd;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h
index 15db89570..54c1f5b90 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h
@@ -9,46 +9,35 @@
 #include "common.h"
 #include "poly.h"
 
-#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd2
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *              a centered binomial distribution with parameter eta=2
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-);
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd3
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
 
-#endif
+#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbmc.h
index baa0bfa9f..52b95bc3f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbmc.h
@@ -13,7 +13,7 @@
 
 #define __contract__(x)
 #define __loop__(x)
-#define cassert(x, y)
+#define cassert(x)
 
 #else /* CBMC _is_ defined, therefore we're doing proof */
 
@@ -30,7 +30,7 @@
 #define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
 #define decreases(...) __CPROVER_decreases(__VA_ARGS__)
 /* cassert to avoid confusion with in-built assert */
-#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
 #define assume(...) __CPROVER_assume(__VA_ARGS__)
 
 /***************************************************
@@ -119,13 +119,13 @@
   {                                                                    \
     unsigned qvar;                                                     \
     ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
-        (((value_lb) <= (array_var[(qvar)])) &&                        \
-        ((array_var[(qvar)]) < (value_ub)))                            \
+        (((int)(value_lb) <= ((array_var)[(qvar)])) &&		       \
+         (((array_var)[(qvar)]) < (int)(value_ub)))		       \
   }
 
 #define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
   array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb),      \
-                   (qvar_ub), (array_var), (value_lb), (value_ub))
+      (qvar_ub), (array_var), (value_lb), (value_ub))
 /* clang-format on */
 
 /* Wrapper around array_bound operating on absolute values.
@@ -134,6 +134,6 @@
  * bound in array_bound is inclusive, we have to raise it by 1.
  */
 #define array_abs_bound(arr, lb, ub, k) \
-  array_bound((arr), (lb), (ub), -(k) + 1, (k))
+  array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h
index da886780c..4f326333e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h
@@ -43,23 +43,30 @@
 #define MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2) x1##_##x2
 #define MLKEM_NATIVE_MAKE_NAMESPACE(x1, x2) MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2)
 
-#define FIPS202_NAMESPACE(s) \
-  MLKEM_NATIVE_MAKE_NAMESPACE(FIPS202_NAMESPACE_PREFIX, s)
-
 #define MLKEM_NAMESPACE(s) \
   MLKEM_NATIVE_MAKE_NAMESPACE(MLKEM_NAMESPACE_PREFIX, s)
 
+#if defined(MLKEM_NAMESPACE_PREFIX_ADD_LEVEL)
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3) x1##x2##_##x3
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K(x1, x2, x3) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3)
+#define MLKEM_NAMESPACE_K(s) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K(MLKEM_NAMESPACE_PREFIX, MLKEM_LVL, s)
+#else
+#define MLKEM_NAMESPACE_K(s) MLKEM_NAMESPACE(s)
+#endif
+
 /* On Apple platforms, we need to emit leading underscore
  * in front of assembly symbols. We thus introducee a separate
  * namespace wrapper for ASM symbols. */
 #if !defined(__APPLE__)
 #define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym)
-#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym)
+#define MLKEM_ASM_NAMESPACE_K(sym) MLKEM_NAMESPACE_K(sym)
 #else
 #define PREFIX_UNDERSCORE_(sym) _##sym
 #define PREFIX_UNDERSCORE(sym) PREFIX_UNDERSCORE_(sym)
 #define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym))
-#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym))
+#define MLKEM_ASM_NAMESPACE_K(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE_K(sym))
 #endif
 
 #endif /* MLKEM_NATIVE_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h
index d1441835b..fa89370ce 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h
@@ -40,10 +40,12 @@
 /* #define MLKEM_NATIVE_CONFIG_FILE "config.h" */
 
 /******************************************************************************
- * Name:        MLKEM_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/.
+ * Description: The prefix to use to namespace global symbols from mlkem/.
+ *
+ *              Level-dependent symbols will additionally be prefixed with the
+ *              security level if MLKEM_NAMESPACE_PREFIX_ADD_LEVEL is set.
  *
  *              This can also be set using CFLAGS.
  *
@@ -53,17 +55,71 @@
 #endif
 
 /******************************************************************************
- * Name:        FIPS202_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX_ADD_LEVEL
+ *
+ * Description: If set, the level (512, 768, 1024) is added to the namespace
+ *              prefix MLKEM_NAMESPACE_PREFIX for all functions which are
+ *              level-dependent. Level-independent functions will have their
+ *              symbol prefixed by MLKEM_NAMESPACE_PREFIX only.
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/fips202/.
+ *              This is intended to be used for multi-level builds where
+ *              level-independent code should be shared across levels.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(FIPS202_NAMESPACE_PREFIX)
-#define FIPS202_NAMESPACE_PREFIX FIPS202_DEFAULT_NAMESPACE_PREFIX
-#endif
+/* #define MLKEM_NAMESPACE_PREFIX_ADD_LEVEL */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, all MLKEM_K-independent code will be included
+ *              in the build, including code needed only for other security
+ *              levels.
+ *
+ *              Example: poly_cbd3 is only needed for MLKEM_K == 2. Yet, if
+ *              this option is set for a build with MLKEM_K==3/4, it would
+ *              be included.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, no MLKEM_K-independent code will be included
+ *              in the build.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
  * Name:        MLKEM_USE_NATIVE
@@ -112,25 +168,13 @@
 /* Default namespace
  *
  * Don't change this. If you need a different namespace, re-define
- * MLKEM_NAMESPACE above instead, and remove the following.
- */
-
-/*
- * The default FIPS202 namespace is
- *
- *   PQCP_MLKEM_NATIVE_FIPS202_<BACKEND>_
+ * MLKEM_NAMESPACE_PREFIX above instead, and remove the following.
  *
- * e.g., PQCP_MLKEM_NATIVE_FIPS202_C_
- */
-
-#define FIPS202_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_FIPS202
-
-/*
  * The default MLKEM namespace is
  *
- *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_<BACKEND>_
+ *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_
  *
- * e.g., PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_
+ * e.g., PQCP_MLKEM_NATIVE_MLKEM512_
  */
 
 #if MLKEM_K == 2
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug.c
new file mode 100644
index 000000000..4b4857cbc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/* NOTE: You can remove this file unless you compile with MLKEM_DEBUG. */
+
+#include "common.h"
+
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) && defined(MLKEM_DEBUG)
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+
+#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
+
+void mlkem_debug_assert(const char *file, int line, const int val)
+{
+  if (val == 0)
+  {
+    fprintf(stderr,
+            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
+            file, line, val);
+    exit(1);
+  }
+}
+
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{
+  int err = 0;
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int16_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    {
+      fprintf(
+          stderr,
+          MLKEM_NATIVE_DEBUG_ERROR_HEADER
+          "Bounds assertion failed: Index %u, value %d out of bounds (%d,%d)\n",
+          file, line, i, (int)val, lower_bound_exclusive,
+          upper_bound_exclusive);
+      err = 1;
+    }
+  }
+
+  if (err == 1)
+    exit(1);
+}
+
+#else /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
+
+#define empty_cu_debug MLKEM_NAMESPACE_K(empty_cu_debug)
+int empty_cu_debug;
+
+#endif /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug.h
new file mode 100644
index 000000000..1103124db
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+#include "common.h"
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check debug assertion
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if val is zero.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
+void mlkem_debug_assert(const char *file, int line, const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ */
+#define debug_assert(val) mlkem_debug_assert(__FILE__, __LINE__, (val))
+
+/* Check bounds in array of int16_t's
+ * ptr: Base of int16_t array; will be explicitly cast to int16_t*,
+ *      so you may pass a byte-compatible type such as poly or polyvec.
+ * len: Number of int16_t in array
+ * value_lb: Inclusive lower value bound
+ * value_ub: Exclusive upper value bound */
+#define debug_assert_bound(ptr, len, value_lb, value_ub)                      \
+  mlkem_debug_check_bounds(__FILE__, __LINE__, (const int16_t *)(ptr), (len), \
+                           (value_lb)-1, (value_ub))
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * value_abs_bd: Exclusive absolute upper bound */
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  debug_assert_bound((ptr), (len), (-(value_abs_bd) + 1), (value_abs_bd))
+
+/* Version of bounds assertions for 2-dimensional arrays */
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  debug_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  debug_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
+
+/* When running CBMC, convert debug assertions into proof obligations */
+#elif defined(CBMC)
+
+#include "cbmc.h"
+
+#define debug_assert(val) cassert(val)
+
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  cassert(array_bound(((int16_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
+
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  cassert(array_abs_bound(((int16_t *)(ptr)), 0, (len), (value_abs_bd)))
+
+/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
+ * just use a single flattened array_bound(...) here. */
+#define debug_assert_bound_2d(ptr, M, N, value_lb, value_ub)           \
+  cassert(forall(kN, 0, (M),                                           \
+                 array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                             (value_lb), (value_ub))))
+
+#define debug_assert_abs_bound_2d(ptr, M, N, value_abs_bd)                 \
+  cassert(forall(kN, 0, (M),                                               \
+                 array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                                 (value_abs_bd))))
+
+#else /* MLKEM_DEBUG */
+
+#define debug_assert(val) \
+  do                      \
+  {                       \
+  } while (0)
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  do                                                     \
+  {                                                      \
+  } while (0)
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  do                                                   \
+  {                                                    \
+  } while (0)
+
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  do                                                               \
+  {                                                                \
+  } while (0)
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  do                                                             \
+  {                                                              \
+  } while (0)
+
+
+#endif /* MLKEM_DEBUG */
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.c
deleted file mode 100644
index 64294ebe1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-
-#include <stdio.h>
-#include "debug.h"
-
-#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
-
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val)
-{
-  if (val == 0)
-  {
-    fprintf(stderr,
-            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed: %s (value %d)\n",
-            file, line, description, val);
-    exit(1);
-  }
-}
-
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive)
-{
-  int err = 0;
-  unsigned i;
-  for (i = 0; i < len; i++)
-  {
-    int16_t val = ptr[i];
-    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
-    {
-      fprintf(stderr,
-              MLKEM_NATIVE_DEBUG_ERROR_HEADER
-              "%s, index %u, value %d out of bounds (%d,%d)\n",
-              file, line, description, i, (int)val, lower_bound_exclusive,
-              upper_bound_exclusive);
-      err = 1;
-    }
-  }
-
-  if (err == 1)
-    exit(1);
-}
-
-#else /* MLKEM_DEBUG */
-
-#define empty_cu_debug MLKEM_NAMESPACE(empty_cu_debug)
-int empty_cu_debug;
-
-#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.h
deleted file mode 100644
index 5ce320ea2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/debug/debug.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_DEBUG_H
-#define MLKEM_DEBUG_H
-
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-/*************************************************
- * Name:        mlkem_debug_assert
- *
- * Description: Check debug assertion
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of assertion
- *              - val: Value asserted to be non-zero
- **************************************************/
-#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val);
-
-/*************************************************
- * Name:        mlkem_debug_check_bounds
- *
- * Description: Check whether values in an array of int16_t
- *              are within specified bounds.
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of check
- *              - ptr: Base of array to be checked
- *              - len: Number of int16_t in ptr
- *              - lower_bound_exclusive: Exclusive lower bound
- *              - upper_bound_exclusive: Exclusive upper bound
- **************************************************/
-#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive);
-
-/* Check assertion, calling exit() upon failure
- *
- * val: Value that's asserted to be non-zero
- * msg: Message to print on failure
- *
- * Currently called CASSERT to avoid clash with CBMC assert.
- */
-#define CASSERT(val, msg)                                 \
-  do                                                      \
-  {                                                       \
-    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
-  } while (0)
-
-/* Check absolute bounds of scalar
- * val: Scalar to be checked
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
-
-/* Check that all coefficients in array of int16_t's are non-negative
- * and below an exclusive upper bound.
- *
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * high_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define UBOUND(ptr, len, high_bound, msg)                                 \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -1, ((high_bound)));                  \
-  } while (0)
-
-/* Check absolute bounds in array of int16_t's
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define BOUND(ptr, len, abs_bound, msg)                                   \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -(abs_bound), (abs_bound));           \
-  } while (0)
-
-/* Check absolute bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
-  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
-        msg)
-
-/* Check unsigned bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- * msg: Message to print on failure */
-#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
-  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
-         msg)
-
-/* Check absolute bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLY_BOUND(ptr, abs_bound) \
-  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
-
-/* Check unsigned bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLY_UBOUND(ptr, ubound) \
-  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
-
-/* Check absolute bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLYVEC_BOUND(ptr, abs_bound)                                      \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
-                     "polyvec absolute bound for " #ptr ".vec[i]");        \
-  } while (0)
-
-/* Check unsigned bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLYVEC_UBOUND(ptr, ubound)                                        \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
-                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
-  } while (0)
-
-#define MLKEM_CONCAT_(left, right) left##right
-#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
-
-/* Following AWS-LC to define a C99-compliant static assert */
-#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
-  typedef struct                                                         \
-  {                                                                      \
-    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
-  } MLKEM_CONCAT(MLKEM_NAMESPACE(static_assertion_), msg)                \
-      __attribute__((unused));
-
-#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
-  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
-#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
-#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
-
-#else /* MLKEM_DEBUG */
-
-#define CASSERT(val, msg) \
-  do                      \
-  {                       \
-  } while (0)
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define BOUND(ptr, len, abs_bound, msg) \
-  do                                    \
-  {                                     \
-  } while (0)
-#define POLY_BOUND(ptr, abs_bound) \
-  do                               \
-  {                                \
-  } while (0)
-#define POLYVEC_BOUND(ptr, abs_bound) \
-  do                                  \
-  {                                   \
-  } while (0)
-#define POLY_BOUND_MSG(ptr, ubound, abs_bound) \
-  do                                           \
-  {                                            \
-  } while (0)
-#define UBOUND(ptr, len, high_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define POLY_UBOUND(ptr, ubound) \
-  do                             \
-  {                              \
-  } while (0)
-#define POLYVEC_UBOUND(ptr, ubound) \
-  do                                \
-  {                                 \
-  } while (0)
-#define POLY_UBOUND_MSG(ptr, ubound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define STATIC_ASSERT(cond, error)
-
-#endif /* MLKEM_DEBUG */
-
-#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c
index 4d3133e14..0cfcc3e9e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c
@@ -17,7 +17,7 @@
 #include "symmetric.h"
 
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 #include "cbmc.h"
 
@@ -25,15 +25,13 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define pack_pk MLKEM_NAMESPACE(pack_pk)
-#define unpack_pk MLKEM_NAMESPACE(unpack_pk)
-#define pack_sk MLKEM_NAMESPACE(pack_sk)
-#define unpack_sk MLKEM_NAMESPACE(unpack_sk)
-#define pack_ciphertext MLKEM_NAMESPACE(pack_ciphertext)
-#define unpack_ciphertext MLKEM_NAMESPACE(unpack_ciphertext)
-#define gen_matrix_entry_x4 MLKEM_NAMESPACE(gen_matrix_entry_x4)
-#define gen_matrix_entry MLKEM_NAMESPACE(gen_matrix_entry)
-#define matvec_mul MLKEM_NAMESPACE(matvec_mul)
+#define pack_pk MLKEM_NAMESPACE_K(pack_pk)
+#define unpack_pk MLKEM_NAMESPACE_K(unpack_pk)
+#define pack_sk MLKEM_NAMESPACE_K(pack_sk)
+#define unpack_sk MLKEM_NAMESPACE_K(unpack_sk)
+#define pack_ciphertext MLKEM_NAMESPACE_K(pack_ciphertext)
+#define unpack_ciphertext MLKEM_NAMESPACE_K(unpack_ciphertext)
+#define matvec_mul MLKEM_NAMESPACE_K(matvec_mul)
 /* End of static namespacing */
 
 /*************************************************
@@ -51,7 +49,7 @@
 static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
                     const uint8_t seed[MLKEM_SYMBYTES])
 {
-  POLYVEC_BOUND(pk, MLKEM_Q);
+  debug_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, pk);
   memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
 }
@@ -77,7 +75,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
   /* NOTE: If a modulus check was conducted on the PK, we know at this
    * point that the coefficients of `pk` are unsigned canonical. The
    * specifications and proofs, however, do _not_ assume this, and instead
-   * work with the easily provable bound by 4096. */
+   * work with the easily provable bound by UINT12_LIMIT. */
 }
 
 /*************************************************
@@ -91,7 +89,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
  **************************************************/
 static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
 {
-  POLYVEC_BOUND(sk, MLKEM_Q);
+  debug_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, sk);
 }
 
@@ -145,131 +143,11 @@ static void unpack_ciphertext(polyvec *b, poly *v,
   poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
 }
 
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-/*
- * Generate four A matrix entries from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-/*
- * Generate a single A matrix entry from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(0 <= ctr && ctr <= MLKEM_N)
-    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr,
-                                          0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
 #if !defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
 /* This namespacing is not done at the top to avoid a naming conflict
  * with native backends, which are currently not yet namespaced. */
 #define poly_permute_bitrev_to_custom \
-  MLKEM_NAMESPACE(poly_permute_bitrev_to_custom)
+  MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 __contract__(
@@ -332,7 +210,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
      * This call writes across polyvec boundaries for K=2 and K=3.
      * This is intentional and safe.
      */
-    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+    poly_rej_uniform_x4(&a[0].vec[0] + i, seedxy);
   }
 
   /* For left over polynomial, we use single keccak. */
@@ -353,12 +231,11 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
       seed0[MLKEM_SYMBYTES + 1] = x;
     }
 
-    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    poly_rej_uniform(&a[0].vec[0] + i, seed0);
     i++;
   }
 
-  cassert(i == MLKEM_K * MLKEM_K,
-          "gen_matrix: failed to generate whole matrix");
+  debug_assert(i == MLKEM_K * MLKEM_K);
 
   /*
    * The public matrix is generated in NTT domain. If the native backend
@@ -402,16 +279,12 @@ __contract__(
   for (i = 0; i < MLKEM_K; i++)
   __loop__(
     assigns(i, object_whole(out))
-    invariant(i >= 0 && i <= MLKEM_K))
+    invariant(i <= MLKEM_K))
   {
     polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
   }
 }
 
-
-
-STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_enc_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
                            uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
@@ -461,7 +334,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
   matvec_mul(&pkpv, a, &skpv, &skpv_cache);
   polyvec_tomont(&pkpv);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&pkpv, &e);
   polyvec_reduce(&pkpv);
   polyvec_reduce(&skpv);
@@ -471,11 +343,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
 }
 
 
-/* Check that the arithmetic in indcpa_enc() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
-              indcpa_enc_bound_1)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
                 const uint8_t m[MLKEM_INDCPA_MSGBYTES],
@@ -522,7 +389,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   polyvec_invntt_tomont(&b);
   poly_invntt_tomont(&v);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&b, &ep);
   poly_add(&v, &epp);
   poly_add(&v, &k);
@@ -533,9 +399,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   pack_ciphertext(c, &b, &v);
 }
 
-/* Check that the arithmetic in indcpa_dec() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
                 const uint8_t c[MLKEM_INDCPA_BYTES],
@@ -551,7 +414,6 @@ void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
   polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
   poly_invntt_tomont(&sb);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   poly_sub(&v, &sb);
   poly_reduce(&v);
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h
index 011f1aa4f..2c4fda3c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h
@@ -10,7 +10,7 @@
 #include "common.h"
 #include "polyvec.h"
 
-#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+#define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
  * Name:        gen_matrix
  *
@@ -34,7 +34,7 @@ __contract__(
   array_bound(a[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))));
 );
 
-#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+#define indcpa_keypair_derand MLKEM_NAMESPACE_K(indcpa_keypair_derand)
 /*************************************************
  * Name:        indcpa_keypair_derand
  *
@@ -60,7 +60,7 @@ __contract__(
   assigns(object_whole(sk))
 );
 
-#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+#define indcpa_enc MLKEM_NAMESPACE_K(indcpa_enc)
 /*************************************************
  * Name:        indcpa_enc
  *
@@ -89,7 +89,7 @@ __contract__(
   assigns(object_whole(c))
 );
 
-#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+#define indcpa_dec MLKEM_NAMESPACE_K(indcpa_dec)
 /*************************************************
  * Name:        indcpa_dec
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.c
index 5779d3273..88c3843be 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.c
@@ -16,8 +16,8 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define check_pk MLKEM_NAMESPACE(check_pk)
-#define check_sk MLKEM_NAMESPACE(check_sk)
+#define check_pk MLKEM_NAMESPACE_K(check_pk)
+#define check_sk MLKEM_NAMESPACE_K(check_sk)
 /* End of static namespacing */
 
 #if defined(CBMC)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.h
index 074e4771e..93caa796b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/kem.h
@@ -9,6 +9,7 @@
 #include "cbmc.h"
 #include "common.h"
 
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 /* Include to ensure consistency between internal kem.h
  * and external mlkem_native.h. */
 #include "mlkem_native.h"
@@ -25,6 +26,14 @@
 #error Mismatch for CIPHERTEXTBYTES between kem.h and mlkem_native.h
 #endif
 
+#else
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE_K(keypair_derand)
+#define crypto_kem_keypair MLKEM_NAMESPACE_K(keypair)
+#define crypto_kem_enc_derand MLKEM_NAMESPACE_K(enc_derand)
+#define crypto_kem_enc MLKEM_NAMESPACE_K(enc)
+#define crypto_kem_dec MLKEM_NAMESPACE_K(dec)
+#endif
+
 /*************************************************
  * Name:        crypto_kem_keypair_derand
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem_native.h
index 4aed4efbb..12d1d12e6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem_native.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem_native.h
@@ -59,9 +59,17 @@
 #error MLKEM_NAMESPACE_PREFIX not set by config file
 #endif
 
-#define BUILD_INFO_CONCAT_(x, y) x##_##y
-#define BUILD_INFO_CONCAT(x, y) BUILD_INFO_CONCAT_(x, y)
-#define BUILD_INFO_NAMESPACE(sym) BUILD_INFO_CONCAT(MLKEM_NAMESPACE_PREFIX, sym)
+#if defined(MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL)
+#define BUILD_INFO_CONCAT3_(x, y, z) x##y##_##z
+#define BUILD_INFO_CONCAT3(x, y, z) BUILD_INFO_CONCAT3_(x, y, z)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT3(MLKEM_NAMESPACE_PREFIX, BUILD_INFO_LVL, sym)
+#else
+#define BUILD_INFO_CONCAT2_(x, y) x##_##y
+#define BUILD_INFO_CONCAT2(x, y) BUILD_INFO_CONCAT2_(x, y)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT2(MLKEM_NAMESPACE_PREFIX, sym)
+#endif
 
 #endif /* BUILD_INFO_LVL */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c
index 02b45215c..3651c8da9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c
@@ -2,10 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include <stdint.h>
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
+#include <stdint.h>
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "ntt.h"
 #include "reduce.h"
 
@@ -45,10 +47,10 @@
  *          4 -- 6
  *             5 -- 7
  */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start,
-                                int len, int bound)
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
 __contract__(
-  requires(0 <= start && start < MLKEM_N)
+  requires(start < MLKEM_N)
   requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
   requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
   requires(-HALF_Q < zeta && zeta < HALF_Q)
@@ -60,7 +62,7 @@ __contract__(
   ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
 {
   /* `bound` is a ghost variable only needed in the CBMC specification */
-  int j;
+  unsigned j;
   ((void)bound);
   for (j = start; j < start + len; j++)
   __loop__(
@@ -93,7 +95,7 @@ __contract__(
  *   official Kyber implementation here, merely adding `layer` as
  *   a ghost variable for the specifications.
  */
-static void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
@@ -101,15 +103,15 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable only needed in the CBMC specification */
   ((void)layer);
   /* Twiddle factors for layer n start at index 2^(layer-1) */
   k = MLKEM_N / (2 * len);
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
-    invariant(0 <= start && start < MLKEM_N + 2 * len)
-    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
     invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
     invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
   {
@@ -130,9 +132,9 @@ __contract__(
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  int len, layer;
+  unsigned len, layer;
   int16_t *r;
-  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   r = p->coeffs;
 
   for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
@@ -144,30 +146,23 @@ void poly_ntt(poly *p)
   }
 
   /* Check the stronger bound */
-  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_NTT */
 
-/* Check that bound for native NTT implies contractual bound */
-STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   ntt_native(p);
-  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if !defined(MLKEM_USE_NATIVE_INTT)
 
-/* Check that bound for reference invNTT implies contractual bound */
-#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
-STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
-
 /* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, int len, int layer)
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
@@ -176,23 +171,23 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable used only in the specification */
   ((void)layer);
   k = MLKEM_N / len - 1;
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    invariant(start <= MLKEM_N && k <= 127)
     /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
     invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
   {
-    int j;
+    unsigned j;
     int16_t zeta = zetas[k--];
     for (j = start; j < start + len; j++)
     __loop__(
       invariant(start <= j && j <= start + len)
-      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(start <= MLKEM_N && k <= 127)
       invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
     {
       int16_t t = r[j];
@@ -211,13 +206,13 @@ void poly_invntt_tomont(poly *p)
    * and NTT twist. This also brings coefficients down to
    * absolute value < MLKEM_Q.
    */
-  int j, len, layer;
+  unsigned j, len, layer;
   const int16_t f = 1441;
   int16_t *r = p->coeffs;
 
   for (j = 0; j < MLKEM_N; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N)
+    invariant(j <= MLKEM_N)
     invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
   {
     r[j] = fqmul(r[j], f);
@@ -226,24 +221,21 @@ void poly_invntt_tomont(poly *p)
   /* Run the invNTT layers */
   for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
   __loop__(
-    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
   {
     invntt_layer(p->coeffs, len, layer);
   }
 
-  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_INTT */
 
-/* Check that bound for native invNTT implies contractual bound */
-STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_invntt_tomont(poly *p)
 {
   intt_native(p);
-  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_INTT */
 
@@ -252,8 +244,7 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
                     int16_t b_cached)
 {
   int32_t t0, t1;
-
-  BOUND(a, 2, 4096, "basemul input bound");
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
   t0 = (int32_t)a[1] * b_cached;
   t0 += (int32_t)a[0] * b[0];
@@ -264,5 +255,12 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
   r[0] = montgomery_reduce(t0);
   r[1] = montgomery_reduce(t1);
 
-  BOUND(r, 2, 2 * MLKEM_Q, "basemul output bound");
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
+int empty_cu_ntt;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h
index 5592bb9a2..4e80d3ab3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h
@@ -4,10 +4,10 @@
  */
 #ifndef NTT_H
 #define NTT_H
+#include "common.h"
 
 #include <stdint.h>
 #include "cbmc.h"
-#include "common.h"
 #include "poly.h"
 #include "reduce.h"
 
@@ -81,7 +81,7 @@ __contract__(
  *                   Upon return, coefficients are bound by
  *                   2*MLKEM_Q in absolute value.
  *            - a: Pointer to first input polynomial
- *                   Must be coefficient-wise < 4096 in absolute value.
+ *                   Every coefficient must be in [0..4095]
  *            - b: Pointer to second input polynomial
  *                   Can have arbitrary int16_t coefficients
  *            - b_cached: Some precomputed value, typically derived from
@@ -99,5 +99,4 @@ __contract__(
   ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
 );
 
-
-#endif
+#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h
index fa751f977..57ea4c8ba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h
@@ -25,23 +25,34 @@
 #define MLKEM_POLYBYTES 384
 #define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
 
+#define MLKEM_POLYCOMPRESSEDBYTES_D4 128
+#define MLKEM_POLYCOMPRESSEDBYTES_D5 160
+#define MLKEM_POLYCOMPRESSEDBYTES_D10 320
+#define MLKEM_POLYCOMPRESSEDBYTES_D11 352
+
 #if MLKEM_K == 2
 #define MLKEM_LVL 512
 #define MLKEM_ETA1 3
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 3
 #define MLKEM_LVL 768
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 4
 #define MLKEM_LVL 1024
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_DU 11
+#define MLKEM_DV 5
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D5
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D11
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c
index 5807879df..7483ebf6d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c
@@ -2,13 +2,15 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
 #include <stdint.h>
 #include <string.h>
-
 #include "arith_backend.h"
 #include "cbd.h"
 #include "cbmc.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "fips202x4.h"
 #include "ntt.h"
 #include "poly.h"
@@ -16,50 +18,46 @@
 #include "symmetric.h"
 #include "verify.h"
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
     __loop__(
-      invariant(k >= 0 && k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
     {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
     }
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
   }
+}
 
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  __loop__(invariant(j <= MLKEM_N / 4))
   {
     unsigned k;
     uint16_t t[4];
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(k >= 0 && k <= 4)
+      invariant(k <= 4)
       invariant(forall(r, 0, k, t[r] < (1u << 10))))
     {
       t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
@@ -75,51 +73,35 @@ void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
     r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
     r[5 * j + 4] = (t[3] >> 2);
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
 }
 
-
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
   {
-    int k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(0 <= k && k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
   for (j = 0; j < MLKEM_N / 4; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(j <= MLKEM_N / 4)
     invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
   {
-    int k;
+    unsigned k;
     uint16_t t[4];
     uint8_t const *base = &a[5 * j];
 
@@ -130,51 +112,33 @@ void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
 
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(0 <= k && k <= 4)
+      invariant(k <= 4)
       invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
     {
       r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     uint8_t t[8] = {0};
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_bound(t, 0, j, 0, 32)))
     {
       t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
@@ -191,33 +155,57 @@ void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
     r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
     r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 }
 
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
 {
-  unsigned i;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
   {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     uint8_t t[8];
-    const int offset = i * 5;
+    const unsigned offset = i * 5;
     /*
      * Explicitly truncate to avoid warning about
      * implicit truncation in CBMC and unwind loop for ease
@@ -240,29 +228,62 @@ void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
     /* and copy to the correct slice in r[] */
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(j <= 8 && i <= MLKEM_N / 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
 #if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
-
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  __loop__(invariant(i <= MLKEM_N / 2))
   {
     const uint16_t t0 = a->coeffs[2 * i];
     const uint16_t t1 = a->coeffs[2 * i + 1];
@@ -290,7 +311,7 @@ void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   poly_tobytes_native(r, a);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
@@ -302,7 +323,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   unsigned i;
   for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(i <= MLKEM_N / 2)
     invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
   {
     const uint8_t t0 = a[3 * i + 0];
@@ -313,7 +334,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   }
 
   /* Note that the coefficients are not canonical */
-  POLY_UBOUND(r, 4096);
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 MLKEM_NATIVE_INTERNAL_API
@@ -333,13 +354,13 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
 
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <  MLKEM_N / 8 && j <= 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       /* Prevent the compiler from recognizing this as a bit selection */
@@ -347,23 +368,23 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
       r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
     }
   }
-  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     msg[i] = 0;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+      invariant(i <= MLKEM_N / 8 && j <= 8))
     {
       uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
       msg[i] |= t << j;
@@ -371,104 +392,17 @@ void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
   }
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  POLY_BOUND_MSG(r, MLKEM_ETA1 + 1, "poly_getnoise_eta2 output");
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
-}
-#endif /* MLKEM_K == 2 */
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
                                     const poly_mulcache *b_cache)
 {
   unsigned i;
-  POLY_BOUND(b_cache, 4096);
+  debug_assert_bound(a, MLKEM_N, 0, UINT12_LIMIT);
 
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
     assigns(i, object_whole(r))
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 2 * MLKEM_Q)))
   {
     basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
@@ -476,6 +410,8 @@ void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
     basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
                    &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
   }
+
+  debug_assert_abs_bound(r, MLKEM_N, 2 * MLKEM_Q);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -486,20 +422,20 @@ void poly_tomont(poly *r)
   const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
-    invariant(array_abs_bound(r->coeffs ,0, i, MLKEM_Q)))
+    invariant(i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
   {
     r->coeffs[i] = fqmul(r->coeffs[i], f);
   }
 
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
   poly_tomont_native(r);
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
@@ -510,7 +446,7 @@ void poly_reduce(poly *r)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
   {
     /* Barrett reduction, giving signed canonical representative */
@@ -519,14 +455,14 @@ void poly_reduce(poly *r)
     r->coeffs[i] = scalar_signed_to_unsigned_q(t);
   }
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
   poly_reduce_native(r);
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
@@ -536,7 +472,7 @@ void poly_add(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
   {
@@ -550,7 +486,7 @@ void poly_sub(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
   {
@@ -564,20 +500,36 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
   {
     x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
     x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
   }
-  POLY_BOUND(x, MLKEM_Q);
+
+  /*
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
+   */
+  debug_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   poly_mulcache_compute_native(x, a);
-  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+  /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
+int empty_cu_poly;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h
index 1e8c109c6..6a14c785d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h
@@ -307,112 +307,164 @@ __contract__(
  ************************************************************/
 static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
 __contract__(
-  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
-  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
   ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
 {
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
   /* Add Q if c is negative, but in constant time */
   c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
 
-  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
-  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
-
   /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
   return (uint16_t)c;
 }
 
-#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
 /*************************************************
- * Name:        poly_compress_du
+ * Name:        poly_compress_d4
  *
- * Description: Compression (du bits) and subsequent serialization of a
- *polynomial
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-);
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
 
-#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
 /*************************************************
- * Name:        poly_decompress_du
+ * Name:        poly_decompress_d4
  *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *polynomial; approximate inverse of poly_compress_du
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
 
-#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
 /*************************************************
- * Name:        poly_compress_dv
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
  *
- * Description: Compression (dv bits) and subsequent serialization of a
- *polynomial
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
 
-#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
 /*************************************************
- * Name:        poly_decompress_dv
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
  *
  * Description: De-serialization and subsequent decompression (dv bits) of a
- *polynomial; approximate inverse of poly_compress
+ *              polynomial; approximate inverse of poly_compress_d5
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV
- *bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
 
 #define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
 /*************************************************
@@ -500,144 +552,6 @@ __contract__(
   assigns(object_whole(msg))
 );
 
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -649,8 +563,7 @@ __contract__(
  *              Bounds:
  *              - a is assumed to be coefficient-wise < q in absolute value.
  *
- *              The result is coefficient-wise bound by 3/2 q in absolute
- *              value.
+ *              The result is coefficient-wise bound by 2*q in absolute value.
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const poly *a: pointer to first input polynomial
@@ -802,4 +715,4 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#endif
+#endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c
index 7d2016773..50ea1c34a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c
@@ -4,18 +4,29 @@
  */
 #include "polyvec.h"
 #include <stdint.h>
+#include <string.h>
 #include "arith_backend.h"
+#include "cbd.h"
 #include "ntt.h"
 #include "poly.h"
+#include "symmetric.h"
 
-#include "debug/debug.h"
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
                          const polyvec *a)
 {
   unsigned i;
-  POLYVEC_UBOUND(a, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_K; i++)
   {
@@ -33,13 +44,15 @@ void polyvec_decompress_du(polyvec *r,
     poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
   }
 
-  POLYVEC_UBOUND(r, MLKEM_Q);
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
 {
   unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
   for (i = 0; i < MLKEM_K; i++)
   {
     poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
@@ -54,6 +67,8 @@ void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
   {
     poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -64,6 +79,8 @@ void polyvec_ntt(polyvec *r)
   {
     poly_ntt(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -74,6 +91,8 @@ void polyvec_invntt_tomont(polyvec *r)
   {
     poly_invntt_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -84,10 +103,7 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
 {
   unsigned i;
   poly t;
-
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  POLYVEC_BOUND(b_cache, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 
   poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
   for (i = 1; i < MLKEM_K; i++)
@@ -95,18 +111,15 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
     poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
                                    &b_cache->vec[i]);
     poly_add(r, &t);
-    /* abs bounds: < (i+1) * 3/2 * q */
   }
 
   /*
-   * Those bounds are true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus best to omit
-   * them from the spec to not unnecessarily constraint native implementations.
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
    */
-  cassert(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_K * 2 * MLKEM_Q),
-          "polyvec_basemul_acc_montgomery_cached output bounds");
-  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
-  POLY_BOUND(r, MLKEM_K * 2 * MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
 }
 #else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 MLKEM_NATIVE_INTERNAL_API
@@ -114,9 +127,8 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
                                            const polyvec *b,
                                            const polyvec_mulcache *b_cache)
 {
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
   polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
@@ -149,6 +161,8 @@ void polyvec_reduce(polyvec *r)
   {
     poly_reduce(&r->vec[i]);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -169,4 +183,148 @@ void polyvec_tomont(polyvec *r)
   {
     poly_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+  /* PRF input is the seed followed by the one-byte nonce. */
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+
+  poly_cbd_eta2(r, buf);
+  /* Check the ETA2 bound ensured by poly_cbd_eta2 (not the ETA1 one). */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
 }
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h
index 138724150..8be8579e0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h
@@ -9,19 +9,144 @@
 #include "common.h"
 #include "poly.h"
 
-#define polyvec MLKEM_NAMESPACE(polyvec)
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
 typedef struct
 {
   poly vec[MLKEM_K];
 } ALIGN polyvec;
 
-#define polyvec_mulcache MLKEM_NAMESPACE(polyvec_mulcache)
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
 typedef struct
 {
   poly_mulcache vec[MLKEM_K];
 } polyvec_mulcache;
 
-#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
 /*************************************************
  * Name:        polyvec_compress_du
  *
@@ -44,7 +169,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
 /*************************************************
  * Name:        polyvec_decompress_du
  *
@@ -67,7 +192,7 @@ __contract__(
          array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
 /*************************************************
  * Name:        polyvec_tobytes
  *
@@ -88,7 +213,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
 /*************************************************
  * Name:        polyvec_frombytes
  *
@@ -110,7 +235,7 @@ __contract__(
         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
 );
 
-#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
 /*************************************************
  * Name:        polyvec_ntt
  *
@@ -136,7 +261,7 @@ __contract__(
   array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
 );
 
-#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
 /*************************************************
  * Name:        polyvec_invntt_tomont
  *
@@ -162,7 +287,7 @@ __contract__(
 );
 
 #define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery
  *
@@ -186,7 +311,7 @@ __contract__(
 
 
 #define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery_cached
  *
@@ -194,7 +319,7 @@ __contract__(
  *              using mulcache for second operand.
  *
  *              Bounds:
- *              - a is assumed to be coefficient-wise < 4096 in absolute value.
+ *              - Every coefficient of a is assumed to be in [0..4095]
  *              - No bounds guarantees for the coefficients in the result.
  *
  * Arguments:   - poly *r: pointer to output polynomial
@@ -218,7 +343,7 @@ __contract__(
   assigns(memory_slice(r, sizeof(poly)))
 );
 
-#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
 /************************************************************
  * Name: polyvec_mulcache_compute
  *
@@ -252,7 +377,7 @@ __contract__(
   assigns(object_whole(x))
 );
 
-#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
 /*************************************************
  * Name:        polyvec_reduce
  *
@@ -278,7 +403,7 @@ __contract__(
     array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
 /*************************************************
  * Name:        polyvec_add
  *
@@ -309,7 +434,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
 /*************************************************
  * Name:        polyvec_tomont
  *
@@ -329,4 +454,142 @@ __contract__(
     array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
 );
 
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3)
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+);
+#endif /* MLKEM_K == 2 */
+
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h
index 1f502167e..b432a4201 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -109,13 +109,13 @@ static INLINE int16_t montgomery_reduce_generic(int32_t a)
  **************************************************/
 static INLINE int16_t montgomery_reduce(int32_t a)
 __contract__(
-  requires(a > -(2 * 4096 * 32768))
-  requires(a <  (2 * 4096 * 32768))
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
   ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
 )
 {
   int16_t res;
-  SCALAR_BOUND(a, 2 * UINT12_LIMIT * 32768, "montgomery_reduce input");
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
 
   res = montgomery_reduce_generic(a);
   /* Bounds:
@@ -124,7 +124,7 @@ __contract__(
    *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
    *        < 2 * MLKEM_Q */
 
-  SCALAR_BOUND(res, 2 * MLKEM_Q, "montgomery_reduce output");
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
   return res;
 }
 
@@ -150,7 +150,7 @@ __contract__(
 )
 {
   int16_t res;
-  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+  debug_assert_abs_bound(&b, 1, HALF_Q);
 
   res = montgomery_reduce((int32_t)a * (int32_t)b);
   /* Bounds:
@@ -160,7 +160,7 @@ __contract__(
    *        < MLKEM_Q
    */
 
-  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
   return res;
 }
 
@@ -200,7 +200,10 @@ __contract__(
    * t is in -10 .. +10, so we need 32-bit math to
    * evaluate t * MLKEM_Q and the subsequent subtraction
    */
-  return (int16_t)(a - t * MLKEM_Q);
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
+
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c
index 918986e9b..cbbe4407f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c
@@ -2,46 +2,24 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
-#include "rej_uniform.h"
 #include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
 /* End of static namespacing */
 
-/*************************************************
- * Name:        rej_uniform_scalar
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
                                        unsigned int offset, const uint8_t *buf,
                                        unsigned int buflen)
@@ -58,6 +36,8 @@ __contract__(
   unsigned int ctr, pos;
   uint16_t val0, val1;
 
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
   ctr = offset;
   pos = 0;
   /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
@@ -79,28 +59,183 @@ __contract__(
       r[ctr++] = val1;
     }
   }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
   return ctr;
 }
 
 #if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the whole input buffer
+ * is guaranteed to have been consumed. If it equals target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
 {
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
 {
   int ret;
 
   /* Sample from large buffer with full lane as much as possible. */
   ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
   if (ret != -1)
-    return offset + (unsigned)ret;
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
 
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze a heuristic number (MLKEM_GEN_MATRIX_NBLOCKS) of
+   * blocks. This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample a heuristic number of blocks (given by
+   * MLKEM_GEN_MATRIX_NBLOCKS). */
+  /* This should generate the matrix entry with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
+int empty_cu_rej_uniform;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h
index 13db836bc..801287259 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h
@@ -9,54 +9,55 @@
 #include <stdlib.h>
 #include "cbmc.h"
 #include "common.h"
+#include "poly.h"
 
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
- * Name:        rej_uniform
+ * Name:        poly_rej_uniform_x4
  *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
  *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
  *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
  **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
 __contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-);
-#endif
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric.h
index 55ebbbd53..3563e5505 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/symmetric.h
@@ -10,6 +10,7 @@
 #include "cbmc.h"
 #include "common.h"
 #include "fips202.h"
+#include "fips202x4.h"
 
 /* Macros denoting FIPS-203 specific Hash functions */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.c
index b7078fcc1..9f39dcd22 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.c
@@ -4,7 +4,8 @@
  */
 #include "verify.h"
 
-#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER) && \
+    !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 /*
  * Masking value used in constant-time functions from
  * verify.h to block the compiler's range analysis and
@@ -12,9 +13,11 @@
  */
 volatile uint64_t ct_opt_blocker_u64 = 0;
 
-#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+#else /* MLKEM_USE_ASM_VALUE_BARRIER || \
+         MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_verify MLKEM_NAMESPACE(empty_cu_verify)
+#define empty_cu_verify MLKEM_NAMESPACE_K(empty_cu_verify)
 int empty_cu_verify;
 
-#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER || \
+          MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.h
index 8c47155dc..f6ecf5eba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/verify.h
@@ -268,7 +268,7 @@ __contract__(
 
   for (i = 0; i < len; i++)
   __loop__(
-    invariant(i >= 0 && i <= len)
+    invariant(i <= len)
     invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))))
   {
     r |= a[i] ^ b[i];
@@ -314,4 +314,4 @@ __contract__(
   }
 }
 
-#endif
+#endif /* VERIFY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c
index 1a26e0dd5..4ef887c62 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c
@@ -8,6 +8,8 @@
  *          Do not modify it directly.
  */
 
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 #include "ntt.h"
 
 /*
@@ -28,3 +30,10 @@ ALIGN const int16_t zetas[128] = {
     -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
     -308,  996,   991,   958,   -1460, 1522,  1628,
 };
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_zetas MLKEM_NAMESPACE_K(empty_cu_zetas)
+int empty_cu_zetas;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h
index 09e30f207..0543b1bd1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h
@@ -16,7 +16,9 @@
  *
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 #include "api.h"
 #endif
+#endif
 
 #endif /* MLKEM_NATIVE_ARITH_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.c
index 433bdc954..1e6b7c5d1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.c
@@ -2,8 +2,11 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "cbd.h"
+#include "common.h"
+#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+
 #include <stdint.h>
+#include "cbd.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -11,8 +14,6 @@
  * within a single compilation unit. */
 #define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
 #define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-#define cbd2 MLKEM_NAMESPACE(cbd2)
-#define cbd3 MLKEM_NAMESPACE(cbd3)
 /* End of static namespacing */
 
 /*************************************************
@@ -35,44 +36,13 @@ static uint32_t load32_littleendian(const uint8_t x[4])
   return r;
 }
 
-#if MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif /* MLKEM_ETA1 == 3 */
-
-/*************************************************
- * Name:        cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
   {
     unsigned j;
@@ -82,7 +52,7 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
     {
       const int16_t a = (d >> (4 * j + 0)) & 0x3;
@@ -92,24 +62,34 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
   }
 }
 
-#if MLKEM_ETA1 == 3
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
 /*************************************************
- * Name:        cbd3
+ * Name:        load24_littleendian
  *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
  *              This function is only needed for ML-KEM-512
  *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
  **************************************************/
-static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
   {
     unsigned j;
@@ -120,7 +100,7 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 
     for (j = 0; j < 4; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(i <= MLKEM_N / 4 && j <= 4)
       invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
     {
       const int16_t a = (d >> (6 * j + 0)) & 0x7;
@@ -129,28 +109,12 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
     }
   }
 }
-#endif /* MLKEM_ETA1 == 3 */
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-{
-#if MLKEM_ETA1 == 2
-  cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-{
-#if MLKEM_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
+int empty_cu_cbd;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.h
index 15db89570..54c1f5b90 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.h
@@ -9,46 +9,35 @@
 #include "common.h"
 #include "poly.h"
 
-#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd2
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *              a centered binomial distribution with parameter eta=2
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-);
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd3
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
 
-#endif
+#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbmc.h
index baa0bfa9f..52b95bc3f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbmc.h
@@ -13,7 +13,7 @@
 
 #define __contract__(x)
 #define __loop__(x)
-#define cassert(x, y)
+#define cassert(x)
 
 #else /* CBMC _is_ defined, therefore we're doing proof */
 
@@ -30,7 +30,7 @@
 #define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
 #define decreases(...) __CPROVER_decreases(__VA_ARGS__)
 /* cassert to avoid confusion with in-built assert */
-#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
 #define assume(...) __CPROVER_assume(__VA_ARGS__)
 
 /***************************************************
@@ -119,13 +119,13 @@
   {                                                                    \
     unsigned qvar;                                                     \
     ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
-        (((value_lb) <= (array_var[(qvar)])) &&                        \
-        ((array_var[(qvar)]) < (value_ub)))                            \
+        (((int)(value_lb) <= ((array_var)[(qvar)])) &&		       \
+         (((array_var)[(qvar)]) < (int)(value_ub)))		       \
   }
 
 #define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
   array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb),      \
-                   (qvar_ub), (array_var), (value_lb), (value_ub))
+      (qvar_ub), (array_var), (value_lb), (value_ub))
 /* clang-format on */
 
 /* Wrapper around array_bound operating on absolute values.
@@ -134,6 +134,6 @@
  * bound in array_bound is inclusive, we have to raise it by 1.
  */
 #define array_abs_bound(arr, lb, ub, k) \
-  array_bound((arr), (lb), (ub), -(k) + 1, (k))
+  array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h
index da886780c..4f326333e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h
@@ -43,23 +43,30 @@
 #define MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2) x1##_##x2
 #define MLKEM_NATIVE_MAKE_NAMESPACE(x1, x2) MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2)
 
-#define FIPS202_NAMESPACE(s) \
-  MLKEM_NATIVE_MAKE_NAMESPACE(FIPS202_NAMESPACE_PREFIX, s)
-
 #define MLKEM_NAMESPACE(s) \
   MLKEM_NATIVE_MAKE_NAMESPACE(MLKEM_NAMESPACE_PREFIX, s)
 
+#if defined(MLKEM_NAMESPACE_PREFIX_ADD_LEVEL)
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3) x1##x2##_##x3
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K(x1, x2, x3) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3)
+#define MLKEM_NAMESPACE_K(s) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K(MLKEM_NAMESPACE_PREFIX, MLKEM_LVL, s)
+#else
+#define MLKEM_NAMESPACE_K(s) MLKEM_NAMESPACE(s)
+#endif
+
 /* On Apple platforms, we need to emit leading underscore
  * in front of assembly symbols. We thus introducee a separate
  * namespace wrapper for ASM symbols. */
 #if !defined(__APPLE__)
 #define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym)
-#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym)
+#define MLKEM_ASM_NAMESPACE_K(sym) MLKEM_NAMESPACE_K(sym)
 #else
 #define PREFIX_UNDERSCORE_(sym) _##sym
 #define PREFIX_UNDERSCORE(sym) PREFIX_UNDERSCORE_(sym)
 #define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym))
-#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym))
+#define MLKEM_ASM_NAMESPACE_K(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE_K(sym))
 #endif
 
 #endif /* MLKEM_NATIVE_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h
index d1441835b..fa89370ce 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h
@@ -40,10 +40,12 @@
 /* #define MLKEM_NATIVE_CONFIG_FILE "config.h" */
 
 /******************************************************************************
- * Name:        MLKEM_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/.
+ * Description: The prefix to use to namespace global symbols from mlkem/.
+ *
+ *              Level-dependent symbols will additionally be prefixed with the
+ *              security level if MLKEM_NAMESPACE_PREFIX_ADD_LEVEL is set.
  *
  *              This can also be set using CFLAGS.
  *
@@ -53,17 +55,71 @@
 #endif
 
 /******************************************************************************
- * Name:        FIPS202_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX_ADD_LEVEL
+ *
+ * Description: If set, the level (512, 768, 1024) is added to the namespace
+ *              prefix MLKEM_NAMESPACE_PREFIX for all functions which are
+ *              level-dependent. Level-independent functions will have their
+ *              symbol prefixed by MLKEM_NAMESPACE_PREFIX only.
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/fips202/.
+ *              This is intended to be used for multi-level builds where
+ *              level-independent code should be shared across levels.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(FIPS202_NAMESPACE_PREFIX)
-#define FIPS202_NAMESPACE_PREFIX FIPS202_DEFAULT_NAMESPACE_PREFIX
-#endif
+/* #define MLKEM_NAMESPACE_PREFIX_ADD_LEVEL */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, all MLKEM_K-independent code will be included
+ *              in the build, including code needed only for other security
+ *              levels.
+ *
+ *              Example: poly_cbd3 is only needed for MLKEM_K == 2. Yet, if
+ *              this option is set for a build with MLKEM_K==3/4, it would
+ *              be included.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, no MLKEM_K-independent code will be included
+ *              in the build.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
  * Name:        MLKEM_USE_NATIVE
@@ -112,25 +168,13 @@
 /* Default namespace
  *
  * Don't change this. If you need a different namespace, re-define
- * MLKEM_NAMESPACE above instead, and remove the following.
- */
-
-/*
- * The default FIPS202 namespace is
- *
- *   PQCP_MLKEM_NATIVE_FIPS202_<BACKEND>_
+ * MLKEM_NAMESPACE_PREFIX above instead, and remove the following.
  *
- * e.g., PQCP_MLKEM_NATIVE_FIPS202_C_
- */
-
-#define FIPS202_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_FIPS202
-
-/*
  * The default MLKEM namespace is
  *
- *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_<BACKEND>_
+ *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_
  *
- * e.g., PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_
+ * e.g., PQCP_MLKEM_NATIVE_MLKEM512_
  */
 
 #if MLKEM_K == 2
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug.c
new file mode 100644
index 000000000..4b4857cbc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/* NOTE: You can remove this file unless you compile with MLKEM_DEBUG. */
+
+#include "common.h"
+
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) && defined(MLKEM_DEBUG)
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+
+#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
+
+void mlkem_debug_assert(const char *file, int line, const int val)
+{ /* Debug assertion: report to stderr and exit(1) when val is zero. */
+  if (val == 0) /* zero means the asserted condition did not hold */
+  {
+    fprintf(stderr, /* the error header consumes file and line first */
+            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
+            file, line, val);
+    exit(1); /* terminate the process with status 1 */
+  }
+}
+
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{ /* Check ptr[0..len) against the open interval (lower, upper). */
+  int err = 0; /* becomes 1 once any element violates the bounds */
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int16_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    { /* both bounds are exclusive: equality counts as a violation */
+      fprintf(
+          stderr,
+          MLKEM_NATIVE_DEBUG_ERROR_HEADER
+          "Bounds assertion failed: Index %u, value %d out of bounds (%d,%d)\n",
+          file, line, i, (int)val, lower_bound_exclusive,
+          upper_bound_exclusive);
+      err = 1; /* keep scanning so every violating index is reported */
+    }
+  }
+
+  if (err == 1) /* exit only after the whole array has been checked */
+    exit(1);
+}
+
+#else /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
+
+#define empty_cu_debug MLKEM_NAMESPACE_K(empty_cu_debug)
+int empty_cu_debug;
+
+#endif /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug.h
new file mode 100644
index 000000000..1103124db
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+#include "common.h"
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check debug assertion
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if the asserted value is zero.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
+void mlkem_debug_assert(const char *file, int line, const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ */
+#define debug_assert(val) mlkem_debug_assert(__FILE__, __LINE__, (val))
+
+/* Check bounds in array of int16_t's
+ * ptr: Base of int16_t array; will be explicitly cast to int16_t*,
+ *      so you may pass a byte-compatible type such as poly or polyvec.
+ * len: Number of int16_t in array
+ * value_lb: Inclusive lower value bound
+ * value_ub: Exclusive upper value bound */
+#define debug_assert_bound(ptr, len, value_lb, value_ub)                      \
+  mlkem_debug_check_bounds(__FILE__, __LINE__, (const int16_t *)(ptr), (len), \
+                           (value_lb)-1, (value_ub))
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * value_abs_bd: Exclusive absolute upper bound */
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  debug_assert_bound((ptr), (len), (-(value_abs_bd) + 1), (value_abs_bd))
+
+/* Version of bounds assertions for 2-dimensional arrays */
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  debug_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  debug_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
+
+/* When running CBMC, convert debug assertions into proof obligations */
+#elif defined(CBMC)
+
+#include "cbmc.h" /* debug.h now sits next to cbmc.h, no longer in debug/ */
+
+#define debug_assert(val) cassert(val)
+
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  cassert(array_bound(((int16_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
+
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  cassert(array_abs_bound(((int16_t *)(ptr)), 0, (len), (value_abs_bd)))
+
+/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
+ * just use a single flattened array_bound(...) here. */
+#define debug_assert_bound_2d(ptr, M, N, value_lb, value_ub)           \
+  cassert(forall(kN, 0, (M),                                           \
+                 array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                             (value_lb), (value_ub))))
+
+#define debug_assert_abs_bound_2d(ptr, M, N, value_abs_bd)                 \
+  cassert(forall(kN, 0, (M),                                               \
+                 array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                                 (value_abs_bd))))
+
+#else /* MLKEM_DEBUG */
+
+#define debug_assert(val) \
+  do                      \
+  {                       \
+  } while (0)
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  do                                                     \
+  {                                                      \
+  } while (0)
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  do                                                   \
+  {                                                    \
+  } while (0)
+
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  do                                                               \
+  {                                                                \
+  } while (0)
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  do                                                             \
+  {                                                              \
+  } while (0)
+
+
+#endif /* MLKEM_DEBUG */
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug/debug.c
deleted file mode 100644
index 64294ebe1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug/debug.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-
-#include <stdio.h>
-#include "debug.h"
-
-#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
-
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val)
-{
-  if (val == 0)
-  {
-    fprintf(stderr,
-            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed: %s (value %d)\n",
-            file, line, description, val);
-    exit(1);
-  }
-}
-
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive)
-{
-  int err = 0;
-  unsigned i;
-  for (i = 0; i < len; i++)
-  {
-    int16_t val = ptr[i];
-    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
-    {
-      fprintf(stderr,
-              MLKEM_NATIVE_DEBUG_ERROR_HEADER
-              "%s, index %u, value %d out of bounds (%d,%d)\n",
-              file, line, description, i, (int)val, lower_bound_exclusive,
-              upper_bound_exclusive);
-      err = 1;
-    }
-  }
-
-  if (err == 1)
-    exit(1);
-}
-
-#else /* MLKEM_DEBUG */
-
-#define empty_cu_debug MLKEM_NAMESPACE(empty_cu_debug)
-int empty_cu_debug;
-
-#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug/debug.h
deleted file mode 100644
index 5ce320ea2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/debug/debug.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_DEBUG_H
-#define MLKEM_DEBUG_H
-
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-/*************************************************
- * Name:        mlkem_debug_assert
- *
- * Description: Check debug assertion
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of assertion
- *              - val: Value asserted to be non-zero
- **************************************************/
-#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val);
-
-/*************************************************
- * Name:        mlkem_debug_check_bounds
- *
- * Description: Check whether values in an array of int16_t
- *              are within specified bounds.
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of check
- *              - ptr: Base of array to be checked
- *              - len: Number of int16_t in ptr
- *              - lower_bound_exclusive: Exclusive lower bound
- *              - upper_bound_exclusive: Exclusive upper bound
- **************************************************/
-#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive);
-
-/* Check assertion, calling exit() upon failure
- *
- * val: Value that's asserted to be non-zero
- * msg: Message to print on failure
- *
- * Currently called CASSERT to avoid clash with CBMC assert.
- */
-#define CASSERT(val, msg)                                 \
-  do                                                      \
-  {                                                       \
-    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
-  } while (0)
-
-/* Check absolute bounds of scalar
- * val: Scalar to be checked
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
-
-/* Check that all coefficients in array of int16_t's are non-negative
- * and below an exclusive upper bound.
- *
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * high_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define UBOUND(ptr, len, high_bound, msg)                                 \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -1, ((high_bound)));                  \
-  } while (0)
-
-/* Check absolute bounds in array of int16_t's
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define BOUND(ptr, len, abs_bound, msg)                                   \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -(abs_bound), (abs_bound));           \
-  } while (0)
-
-/* Check absolute bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
-  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
-        msg)
-
-/* Check unsigned bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- * msg: Message to print on failure */
-#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
-  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
-         msg)
-
-/* Check absolute bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLY_BOUND(ptr, abs_bound) \
-  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
-
-/* Check unsigned bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLY_UBOUND(ptr, ubound) \
-  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
-
-/* Check absolute bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLYVEC_BOUND(ptr, abs_bound)                                      \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
-                     "polyvec absolute bound for " #ptr ".vec[i]");        \
-  } while (0)
-
-/* Check unsigned bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLYVEC_UBOUND(ptr, ubound)                                        \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
-                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
-  } while (0)
-
-#define MLKEM_CONCAT_(left, right) left##right
-#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
-
-/* Following AWS-LC to define a C99-compliant static assert */
-#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
-  typedef struct                                                         \
-  {                                                                      \
-    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
-  } MLKEM_CONCAT(MLKEM_NAMESPACE(static_assertion_), msg)                \
-      __attribute__((unused));
-
-#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
-  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
-#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
-#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
-
-#else /* MLKEM_DEBUG */
-
-#define CASSERT(val, msg) \
-  do                      \
-  {                       \
-  } while (0)
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define BOUND(ptr, len, abs_bound, msg) \
-  do                                    \
-  {                                     \
-  } while (0)
-#define POLY_BOUND(ptr, abs_bound) \
-  do                               \
-  {                                \
-  } while (0)
-#define POLYVEC_BOUND(ptr, abs_bound) \
-  do                                  \
-  {                                   \
-  } while (0)
-#define POLY_BOUND_MSG(ptr, ubound, abs_bound) \
-  do                                           \
-  {                                            \
-  } while (0)
-#define UBOUND(ptr, len, high_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define POLY_UBOUND(ptr, ubound) \
-  do                             \
-  {                              \
-  } while (0)
-#define POLYVEC_UBOUND(ptr, ubound) \
-  do                                \
-  {                                 \
-  } while (0)
-#define POLY_UBOUND_MSG(ptr, ubound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define STATIC_ASSERT(cond, error)
-
-#endif /* MLKEM_DEBUG */
-
-#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c
index 4d3133e14..0cfcc3e9e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c
@@ -17,7 +17,7 @@
 #include "symmetric.h"
 
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 #include "cbmc.h"
 
@@ -25,15 +25,13 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define pack_pk MLKEM_NAMESPACE(pack_pk)
-#define unpack_pk MLKEM_NAMESPACE(unpack_pk)
-#define pack_sk MLKEM_NAMESPACE(pack_sk)
-#define unpack_sk MLKEM_NAMESPACE(unpack_sk)
-#define pack_ciphertext MLKEM_NAMESPACE(pack_ciphertext)
-#define unpack_ciphertext MLKEM_NAMESPACE(unpack_ciphertext)
-#define gen_matrix_entry_x4 MLKEM_NAMESPACE(gen_matrix_entry_x4)
-#define gen_matrix_entry MLKEM_NAMESPACE(gen_matrix_entry)
-#define matvec_mul MLKEM_NAMESPACE(matvec_mul)
+#define pack_pk MLKEM_NAMESPACE_K(pack_pk)
+#define unpack_pk MLKEM_NAMESPACE_K(unpack_pk)
+#define pack_sk MLKEM_NAMESPACE_K(pack_sk)
+#define unpack_sk MLKEM_NAMESPACE_K(unpack_sk)
+#define pack_ciphertext MLKEM_NAMESPACE_K(pack_ciphertext)
+#define unpack_ciphertext MLKEM_NAMESPACE_K(unpack_ciphertext)
+#define matvec_mul MLKEM_NAMESPACE_K(matvec_mul)
 /* End of static namespacing */
 
 /*************************************************
@@ -51,7 +49,7 @@
 static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
                     const uint8_t seed[MLKEM_SYMBYTES])
 {
-  POLYVEC_BOUND(pk, MLKEM_Q);
+  debug_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, pk);
   memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
 }
@@ -77,7 +75,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
   /* NOTE: If a modulus check was conducted on the PK, we know at this
    * point that the coefficients of `pk` are unsigned canonical. The
    * specifications and proofs, however, do _not_ assume this, and instead
-   * work with the easily provable bound by 4096. */
+   * work with the easily provable bound by UINT12_LIMIT. */
 }
 
 /*************************************************
@@ -91,7 +89,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
  **************************************************/
 static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
 {
-  POLYVEC_BOUND(sk, MLKEM_Q);
+  debug_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, sk);
 }
 
@@ -145,131 +143,11 @@ static void unpack_ciphertext(polyvec *b, poly *v,
   poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
 }
 
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-/*
- * Generate four A matrix entries from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-/*
- * Generate a single A matrix entry from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(0 <= ctr && ctr <= MLKEM_N)
-    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr,
-                                          0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
 #if !defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
 /* This namespacing is not done at the top to avoid a naming conflict
  * with native backends, which are currently not yet namespaced. */
 #define poly_permute_bitrev_to_custom \
-  MLKEM_NAMESPACE(poly_permute_bitrev_to_custom)
+  MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 __contract__(
@@ -332,7 +210,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
      * This call writes across polyvec boundaries for K=2 and K=3.
      * This is intentional and safe.
      */
-    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+    poly_rej_uniform_x4(&a[0].vec[0] + i, seedxy);
   }
 
   /* For left over polynomial, we use single keccak. */
@@ -353,12 +231,11 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
       seed0[MLKEM_SYMBYTES + 1] = x;
     }
 
-    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    poly_rej_uniform(&a[0].vec[0] + i, seed0);
     i++;
   }
 
-  cassert(i == MLKEM_K * MLKEM_K,
-          "gen_matrix: failed to generate whole matrix");
+  debug_assert(i == MLKEM_K * MLKEM_K);
 
   /*
    * The public matrix is generated in NTT domain. If the native backend
@@ -402,16 +279,12 @@ __contract__(
   for (i = 0; i < MLKEM_K; i++)
   __loop__(
     assigns(i, object_whole(out))
-    invariant(i >= 0 && i <= MLKEM_K))
+    invariant(i <= MLKEM_K))
   {
     polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
   }
 }
 
-
-
-STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_enc_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
                            uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
@@ -461,7 +334,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
   matvec_mul(&pkpv, a, &skpv, &skpv_cache);
   polyvec_tomont(&pkpv);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&pkpv, &e);
   polyvec_reduce(&pkpv);
   polyvec_reduce(&skpv);
@@ -471,11 +343,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
 }
 
 
-/* Check that the arithmetic in indcpa_enc() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
-              indcpa_enc_bound_1)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
                 const uint8_t m[MLKEM_INDCPA_MSGBYTES],
@@ -522,7 +389,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   polyvec_invntt_tomont(&b);
   poly_invntt_tomont(&v);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&b, &ep);
   poly_add(&v, &epp);
   poly_add(&v, &k);
@@ -533,9 +399,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   pack_ciphertext(c, &b, &v);
 }
 
-/* Check that the arithmetic in indcpa_dec() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
                 const uint8_t c[MLKEM_INDCPA_BYTES],
@@ -551,7 +414,6 @@ void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
   polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
   poly_invntt_tomont(&sb);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   poly_sub(&v, &sb);
   poly_reduce(&v);
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h
index 011f1aa4f..2c4fda3c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h
@@ -10,7 +10,7 @@
 #include "common.h"
 #include "polyvec.h"
 
-#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+#define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
  * Name:        gen_matrix
  *
@@ -34,7 +34,7 @@ __contract__(
   array_bound(a[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))));
 );
 
-#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+#define indcpa_keypair_derand MLKEM_NAMESPACE_K(indcpa_keypair_derand)
 /*************************************************
  * Name:        indcpa_keypair_derand
  *
@@ -60,7 +60,7 @@ __contract__(
   assigns(object_whole(sk))
 );
 
-#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+#define indcpa_enc MLKEM_NAMESPACE_K(indcpa_enc)
 /*************************************************
  * Name:        indcpa_enc
  *
@@ -89,7 +89,7 @@ __contract__(
   assigns(object_whole(c))
 );
 
-#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+#define indcpa_dec MLKEM_NAMESPACE_K(indcpa_dec)
 /*************************************************
  * Name:        indcpa_dec
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/kem.c
index 5779d3273..88c3843be 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/kem.c
@@ -16,8 +16,8 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define check_pk MLKEM_NAMESPACE(check_pk)
-#define check_sk MLKEM_NAMESPACE(check_sk)
+#define check_pk MLKEM_NAMESPACE_K(check_pk)
+#define check_sk MLKEM_NAMESPACE_K(check_sk)
 /* End of static namespacing */
 
 #if defined(CBMC)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/kem.h
index 074e4771e..93caa796b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/kem.h
@@ -9,6 +9,7 @@
 #include "cbmc.h"
 #include "common.h"
 
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 /* Include to ensure consistency between internal kem.h
  * and external mlkem_native.h. */
 #include "mlkem_native.h"
@@ -25,6 +26,14 @@
 #error Mismatch for CIPHERTEXTBYTES between kem.h and mlkem_native.h
 #endif
 
+#else
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE_K(keypair_derand)
+#define crypto_kem_keypair MLKEM_NAMESPACE_K(keypair)
+#define crypto_kem_enc_derand MLKEM_NAMESPACE_K(enc_derand)
+#define crypto_kem_enc MLKEM_NAMESPACE_K(enc)
+#define crypto_kem_dec MLKEM_NAMESPACE_K(dec)
+#endif
+
 /*************************************************
  * Name:        crypto_kem_keypair_derand
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem_native.h
index 4aed4efbb..12d1d12e6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem_native.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem_native.h
@@ -59,9 +59,17 @@
 #error MLKEM_NAMESPACE_PREFIX not set by config file
 #endif
 
-#define BUILD_INFO_CONCAT_(x, y) x##_##y
-#define BUILD_INFO_CONCAT(x, y) BUILD_INFO_CONCAT_(x, y)
-#define BUILD_INFO_NAMESPACE(sym) BUILD_INFO_CONCAT(MLKEM_NAMESPACE_PREFIX, sym)
+#if defined(MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL)
+#define BUILD_INFO_CONCAT3_(x, y, z) x##y##_##z
+#define BUILD_INFO_CONCAT3(x, y, z) BUILD_INFO_CONCAT3_(x, y, z)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT3(MLKEM_NAMESPACE_PREFIX, BUILD_INFO_LVL, sym)
+#else
+#define BUILD_INFO_CONCAT2_(x, y) x##_##y
+#define BUILD_INFO_CONCAT2(x, y) BUILD_INFO_CONCAT2_(x, y)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT2(MLKEM_NAMESPACE_PREFIX, sym)
+#endif
 
 #endif /* BUILD_INFO_LVL */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.c
index 02b45215c..3651c8da9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.c
@@ -2,10 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include <stdint.h>
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
+#include <stdint.h>
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "ntt.h"
 #include "reduce.h"
 
@@ -45,10 +47,10 @@
  *          4 -- 6
  *             5 -- 7
  */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start,
-                                int len, int bound)
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
 __contract__(
-  requires(0 <= start && start < MLKEM_N)
+  requires(start < MLKEM_N)
   requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
   requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
   requires(-HALF_Q < zeta && zeta < HALF_Q)
@@ -60,7 +62,7 @@ __contract__(
   ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
 {
   /* `bound` is a ghost variable only needed in the CBMC specification */
-  int j;
+  unsigned j;
   ((void)bound);
   for (j = start; j < start + len; j++)
   __loop__(
@@ -93,7 +95,7 @@ __contract__(
  *   official Kyber implementation here, merely adding `layer` as
  *   a ghost variable for the specifications.
  */
-static void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
@@ -101,15 +103,15 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable only needed in the CBMC specification */
   ((void)layer);
   /* Twiddle factors for layer n start at index 2^(layer-1) */
   k = MLKEM_N / (2 * len);
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
-    invariant(0 <= start && start < MLKEM_N + 2 * len)
-    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
     invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
     invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
   {
@@ -130,9 +132,9 @@ __contract__(
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  int len, layer;
+  unsigned len, layer;
   int16_t *r;
-  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   r = p->coeffs;
 
   for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
@@ -144,30 +146,23 @@ void poly_ntt(poly *p)
   }
 
   /* Check the stronger bound */
-  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_NTT */
 
-/* Check that bound for native NTT implies contractual bound */
-STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   ntt_native(p);
-  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if !defined(MLKEM_USE_NATIVE_INTT)
 
-/* Check that bound for reference invNTT implies contractual bound */
-#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
-STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
-
 /* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, int len, int layer)
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
@@ -176,23 +171,23 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable used only in the specification */
   ((void)layer);
   k = MLKEM_N / len - 1;
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    invariant(start <= MLKEM_N && k <= 127)
     /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
     invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
   {
-    int j;
+    unsigned j;
     int16_t zeta = zetas[k--];
     for (j = start; j < start + len; j++)
     __loop__(
       invariant(start <= j && j <= start + len)
-      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(start <= MLKEM_N && k <= 127)
       invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
     {
       int16_t t = r[j];
@@ -211,13 +206,13 @@ void poly_invntt_tomont(poly *p)
    * and NTT twist. This also brings coefficients down to
    * absolute value < MLKEM_Q.
    */
-  int j, len, layer;
+  unsigned j, len, layer;
   const int16_t f = 1441;
   int16_t *r = p->coeffs;
 
   for (j = 0; j < MLKEM_N; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N)
+    invariant(j <= MLKEM_N)
     invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
   {
     r[j] = fqmul(r[j], f);
@@ -226,24 +221,21 @@ void poly_invntt_tomont(poly *p)
   /* Run the invNTT layers */
   for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
   __loop__(
-    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
   {
     invntt_layer(p->coeffs, len, layer);
   }
 
-  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_INTT */
 
-/* Check that bound for native invNTT implies contractual bound */
-STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_invntt_tomont(poly *p)
 {
   intt_native(p);
-  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_INTT */
 
@@ -252,8 +244,7 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
                     int16_t b_cached)
 {
   int32_t t0, t1;
-
-  BOUND(a, 2, 4096, "basemul input bound");
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
   t0 = (int32_t)a[1] * b_cached;
   t0 += (int32_t)a[0] * b[0];
@@ -264,5 +255,12 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
   r[0] = montgomery_reduce(t0);
   r[1] = montgomery_reduce(t1);
 
-  BOUND(r, 2, 2 * MLKEM_Q, "basemul output bound");
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
+int empty_cu_ntt;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.h
index 5592bb9a2..4e80d3ab3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.h
@@ -4,10 +4,10 @@
  */
 #ifndef NTT_H
 #define NTT_H
+#include "common.h"
 
 #include <stdint.h>
 #include "cbmc.h"
-#include "common.h"
 #include "poly.h"
 #include "reduce.h"
 
@@ -81,7 +81,7 @@ __contract__(
  *                   Upon return, coefficients are bound by
  *                   2*MLKEM_Q in absolute value.
  *            - a: Pointer to first input polynomial
- *                   Must be coefficient-wise < 4096 in absolute value.
+ *                   Every coefficient must be in [0..4095]
  *            - b: Pointer to second input polynomial
  *                   Can have arbitrary int16_t coefficients
  *            - b_cached: Some precomputed value, typically derived from
@@ -99,5 +99,4 @@ __contract__(
   ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
 );
 
-
-#endif
+#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h
index fa751f977..57ea4c8ba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h
@@ -25,23 +25,34 @@
 #define MLKEM_POLYBYTES 384
 #define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
 
+#define MLKEM_POLYCOMPRESSEDBYTES_D4 128
+#define MLKEM_POLYCOMPRESSEDBYTES_D5 160
+#define MLKEM_POLYCOMPRESSEDBYTES_D10 320
+#define MLKEM_POLYCOMPRESSEDBYTES_D11 352
+
 #if MLKEM_K == 2
 #define MLKEM_LVL 512
 #define MLKEM_ETA1 3
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 3
 #define MLKEM_LVL 768
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 4
 #define MLKEM_LVL 1024
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_DU 11
+#define MLKEM_DV 5
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D5
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D11
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c
index 5807879df..7483ebf6d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c
@@ -2,13 +2,15 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
 #include <stdint.h>
 #include <string.h>
-
 #include "arith_backend.h"
 #include "cbd.h"
 #include "cbmc.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "fips202x4.h"
 #include "ntt.h"
 #include "poly.h"
@@ -16,50 +18,46 @@
 #include "symmetric.h"
 #include "verify.h"
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
     __loop__(
-      invariant(k >= 0 && k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
     {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
     }
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
   }
+}
 
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  __loop__(invariant(j <= MLKEM_N / 4))
   {
     unsigned k;
     uint16_t t[4];
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(k >= 0 && k <= 4)
+      invariant(k <= 4)
       invariant(forall(r, 0, k, t[r] < (1u << 10))))
     {
       t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
@@ -75,51 +73,35 @@ void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
     r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
     r[5 * j + 4] = (t[3] >> 2);
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
 }
 
-
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
   {
-    int k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(0 <= k && k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
   for (j = 0; j < MLKEM_N / 4; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(j <= MLKEM_N / 4)
     invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
   {
-    int k;
+    unsigned k;
     uint16_t t[4];
     uint8_t const *base = &a[5 * j];
 
@@ -130,51 +112,33 @@ void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
 
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(0 <= k && k <= 4)
+      invariant(k <= 4)
       invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
     {
       r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     uint8_t t[8] = {0};
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_bound(t, 0, j, 0, 32)))
     {
       t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
@@ -191,33 +155,57 @@ void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
     r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
     r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 }
 
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
 {
-  unsigned i;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
   {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     uint8_t t[8];
-    const int offset = i * 5;
+    const unsigned offset = i * 5;
     /*
      * Explicitly truncate to avoid warning about
      * implicit truncation in CBMC and unwind loop for ease
@@ -240,29 +228,62 @@ void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
     /* and copy to the correct slice in r[] */
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(j <= 8 && i <= MLKEM_N / 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
 #if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
-
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  __loop__(invariant(i <= MLKEM_N / 2))
   {
     const uint16_t t0 = a->coeffs[2 * i];
     const uint16_t t1 = a->coeffs[2 * i + 1];
@@ -290,7 +311,7 @@ void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   poly_tobytes_native(r, a);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
@@ -302,7 +323,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   unsigned i;
   for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(i <= MLKEM_N / 2)
     invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
   {
     const uint8_t t0 = a[3 * i + 0];
@@ -313,7 +334,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   }
 
   /* Note that the coefficients are not canonical */
-  POLY_UBOUND(r, 4096);
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 MLKEM_NATIVE_INTERNAL_API
@@ -333,13 +354,13 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
 
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <  MLKEM_N / 8 && j <= 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       /* Prevent the compiler from recognizing this as a bit selection */
@@ -347,23 +368,23 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
       r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
     }
   }
-  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     msg[i] = 0;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+      invariant(i <= MLKEM_N / 8 && j <= 8))
     {
       uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
       msg[i] |= t << j;
@@ -371,104 +392,17 @@ void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
   }
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  POLY_BOUND_MSG(r, MLKEM_ETA1 + 1, "poly_getnoise_eta2 output");
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
-}
-#endif /* MLKEM_K == 2 */
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
                                     const poly_mulcache *b_cache)
 {
   unsigned i;
-  POLY_BOUND(b_cache, 4096);
+  debug_assert_bound(a, MLKEM_N, 0, UINT12_LIMIT);
 
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
     assigns(i, object_whole(r))
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 2 * MLKEM_Q)))
   {
     basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
@@ -476,6 +410,8 @@ void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
     basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
                    &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
   }
+
+  debug_assert_abs_bound(r, MLKEM_N, 2 * MLKEM_Q);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -486,20 +422,20 @@ void poly_tomont(poly *r)
   const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
-    invariant(array_abs_bound(r->coeffs ,0, i, MLKEM_Q)))
+    invariant(i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
   {
     r->coeffs[i] = fqmul(r->coeffs[i], f);
   }
 
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
   poly_tomont_native(r);
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
@@ -510,7 +446,7 @@ void poly_reduce(poly *r)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
   {
     /* Barrett reduction, giving signed canonical representative */
@@ -519,14 +455,14 @@ void poly_reduce(poly *r)
     r->coeffs[i] = scalar_signed_to_unsigned_q(t);
   }
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
   poly_reduce_native(r);
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
@@ -536,7 +472,7 @@ void poly_add(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
   {
@@ -550,7 +486,7 @@ void poly_sub(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
   {
@@ -564,20 +500,36 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
   {
     x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
     x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
   }
-  POLY_BOUND(x, MLKEM_Q);
+
+  /*
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
+   */
+  debug_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   poly_mulcache_compute_native(x, a);
-  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+  /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
+int empty_cu_poly;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h
index 1e8c109c6..6a14c785d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h
@@ -307,112 +307,164 @@ __contract__(
  ************************************************************/
 static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
 __contract__(
-  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
-  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
   ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
 {
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
   /* Add Q if c is negative, but in constant time */
   c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
 
-  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
-  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
-
   /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
   return (uint16_t)c;
 }
 
-#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
 /*************************************************
- * Name:        poly_compress_du
+ * Name:        poly_compress_d4
  *
- * Description: Compression (du bits) and subsequent serialization of a
- *polynomial
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-);
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
 
-#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
 /*************************************************
- * Name:        poly_decompress_du
+ * Name:        poly_decompress_d4
  *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *polynomial; approximate inverse of poly_compress_du
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
 
-#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
 /*************************************************
- * Name:        poly_compress_dv
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
  *
- * Description: Compression (dv bits) and subsequent serialization of a
- *polynomial
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
 
-#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
 /*************************************************
- * Name:        poly_decompress_dv
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
  *
  * Description: De-serialization and subsequent decompression (dv bits) of a
- *polynomial; approximate inverse of poly_compress
+ *              polynomial; approximate inverse of poly_compress_d5
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV
- *bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
 
 #define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
 /*************************************************
@@ -500,144 +552,6 @@ __contract__(
   assigns(object_whole(msg))
 );
 
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -649,8 +563,7 @@ __contract__(
  *              Bounds:
  *              - a is assumed to be coefficient-wise < q in absolute value.
  *
- *              The result is coefficient-wise bound by 3/2 q in absolute
- *              value.
+ *              The result is coefficient-wise bound by 2*q in absolute value.
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const poly *a: pointer to first input polynomial
@@ -802,4 +715,4 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#endif
+#endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.c
index 7d2016773..50ea1c34a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.c
@@ -4,18 +4,29 @@
  */
 #include "polyvec.h"
 #include <stdint.h>
+#include <string.h>
 #include "arith_backend.h"
+#include "cbd.h"
 #include "ntt.h"
 #include "poly.h"
+#include "symmetric.h"
 
-#include "debug/debug.h"
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
                          const polyvec *a)
 {
   unsigned i;
-  POLYVEC_UBOUND(a, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_K; i++)
   {
@@ -33,13 +44,15 @@ void polyvec_decompress_du(polyvec *r,
     poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
   }
 
-  POLYVEC_UBOUND(r, MLKEM_Q);
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
 {
   unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
   for (i = 0; i < MLKEM_K; i++)
   {
     poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
@@ -54,6 +67,8 @@ void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
   {
     poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -64,6 +79,8 @@ void polyvec_ntt(polyvec *r)
   {
     poly_ntt(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -74,6 +91,8 @@ void polyvec_invntt_tomont(polyvec *r)
   {
     poly_invntt_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -84,10 +103,7 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
 {
   unsigned i;
   poly t;
-
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  POLYVEC_BOUND(b_cache, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 
   poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
   for (i = 1; i < MLKEM_K; i++)
@@ -95,18 +111,15 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
     poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
                                    &b_cache->vec[i]);
     poly_add(r, &t);
-    /* abs bounds: < (i+1) * 3/2 * q */
   }
 
   /*
-   * Those bounds are true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus best to omit
-   * them from the spec to not unnecessarily constraint native implementations.
+   * The MLKEM_K * 2 * MLKEM_Q bound on all MLKEM_N coefficients of r holds
+   * for the C implementation, but is not needed in the higher level bounds
+   * reasoning. It is thus omitted from the spec to not unnecessarily
+   * constrain native implementations, but checked here nonetheless.
    */
-  cassert(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_K * 2 * MLKEM_Q),
-          "polyvec_basemul_acc_montgomery_cached output bounds");
-  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
-  POLY_BOUND(r, MLKEM_K * 2 * MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
 }
 #else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 MLKEM_NATIVE_INTERNAL_API
@@ -114,9 +127,8 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
                                            const polyvec *b,
                                            const polyvec_mulcache *b_cache)
 {
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
   polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
@@ -149,6 +161,8 @@ void polyvec_reduce(polyvec *r)
   {
     poly_reduce(&r->vec[i]);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -169,4 +183,148 @@ void polyvec_tomont(polyvec *r)
   {
     poly_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+  /* PRF input: the MLKEM_SYMBYTES-byte seed followed by the one-byte nonce */
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+
+  poly_cbd_eta2(r, buf);
+  /* Same bound as the poly_cbd_eta2 postcondition: parameter is ETA2 */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
 }
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.h
index 138724150..8be8579e0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.h
@@ -9,19 +9,144 @@
 #include "common.h"
 #include "poly.h"
 
-#define polyvec MLKEM_NAMESPACE(polyvec)
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
 typedef struct
 {
   poly vec[MLKEM_K];
 } ALIGN polyvec;
 
-#define polyvec_mulcache MLKEM_NAMESPACE(polyvec_mulcache)
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
 typedef struct
 {
   poly_mulcache vec[MLKEM_K];
 } polyvec_mulcache;
 
-#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
 /*************************************************
  * Name:        polyvec_compress_du
  *
@@ -44,7 +169,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
 /*************************************************
  * Name:        polyvec_decompress_du
  *
@@ -67,7 +192,7 @@ __contract__(
          array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
 /*************************************************
  * Name:        polyvec_tobytes
  *
@@ -88,7 +213,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
 /*************************************************
  * Name:        polyvec_frombytes
  *
@@ -110,7 +235,7 @@ __contract__(
         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
 );
 
-#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
 /*************************************************
  * Name:        polyvec_ntt
  *
@@ -136,7 +261,7 @@ __contract__(
   array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
 );
 
-#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
 /*************************************************
  * Name:        polyvec_invntt_tomont
  *
@@ -162,7 +287,7 @@ __contract__(
 );
 
 #define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery
  *
@@ -186,7 +311,7 @@ __contract__(
 
 
 #define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery_cached
  *
@@ -194,7 +319,7 @@ __contract__(
  *              using mulcache for second operand.
  *
  *              Bounds:
- *              - a is assumed to be coefficient-wise < 4096 in absolute value.
+ *              - Every coefficient of a is assumed to be in [0..4095]
  *              - No bounds guarantees for the coefficients in the result.
  *
  * Arguments:   - poly *r: pointer to output polynomial
@@ -218,7 +343,7 @@ __contract__(
   assigns(memory_slice(r, sizeof(poly)))
 );
 
-#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
 /************************************************************
  * Name: polyvec_mulcache_compute
  *
@@ -252,7 +377,7 @@ __contract__(
   assigns(object_whole(x))
 );
 
-#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
 /*************************************************
  * Name:        polyvec_reduce
  *
@@ -278,7 +403,7 @@ __contract__(
     array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
 /*************************************************
  * Name:        polyvec_add
  *
@@ -309,7 +434,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
 /*************************************************
  * Name:        polyvec_tomont
  *
@@ -329,4 +454,142 @@ __contract__(
     array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
 );
 
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+);
+#endif /* MLKEM_K == 2 */
+
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/reduce.h
index 1f502167e..b432a4201 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/reduce.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/reduce.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -109,13 +109,13 @@ static INLINE int16_t montgomery_reduce_generic(int32_t a)
  **************************************************/
 static INLINE int16_t montgomery_reduce(int32_t a)
 __contract__(
-  requires(a > -(2 * 4096 * 32768))
-  requires(a <  (2 * 4096 * 32768))
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
   ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
 )
 {
   int16_t res;
-  SCALAR_BOUND(a, 2 * UINT12_LIMIT * 32768, "montgomery_reduce input");
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
 
   res = montgomery_reduce_generic(a);
   /* Bounds:
@@ -124,7 +124,7 @@ __contract__(
    *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
    *        < 2 * MLKEM_Q */
 
-  SCALAR_BOUND(res, 2 * MLKEM_Q, "montgomery_reduce output");
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
   return res;
 }
 
@@ -150,7 +150,7 @@ __contract__(
 )
 {
   int16_t res;
-  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+  debug_assert_abs_bound(&b, 1, HALF_Q);
 
   res = montgomery_reduce((int32_t)a * (int32_t)b);
   /* Bounds:
@@ -160,7 +160,7 @@ __contract__(
    *        < MLKEM_Q
    */
 
-  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
   return res;
 }
 
@@ -200,7 +200,10 @@ __contract__(
    * t is in -10 .. +10, so we need 32-bit math to
    * evaluate t * MLKEM_Q and the subsequent subtraction
    */
-  return (int16_t)(a - t * MLKEM_Q);
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
+
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.c
index 918986e9b..cbbe4407f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.c
@@ -2,46 +2,24 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
-#include "rej_uniform.h"
 #include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
 /* End of static namespacing */
 
-/*************************************************
- * Name:        rej_uniform_scalar
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
                                        unsigned int offset, const uint8_t *buf,
                                        unsigned int buflen)
@@ -58,6 +36,8 @@ __contract__(
   unsigned int ctr, pos;
   uint16_t val0, val1;
 
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
   ctr = offset;
   pos = 0;
   /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
@@ -79,28 +59,183 @@ __contract__(
       r[ctr++] = val1;
     }
   }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
   return ctr;
 }
 
 #if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the input buffer is
+ * guaranteed to have been fully consumed. If it is equal to target, no
+ * information is provided on how many input bytes have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
 {
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
 {
   int ret;
 
   /* Sample from large buffer with full lane as much as possible. */
   ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
   if (ret != -1)
-    return offset + (unsigned)ret;
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
 
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze a heuristic number of MLKEM_GEN_MATRIX_NBLOCKS blocks.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample a heuristic number of blocks
+   * (MLKEM_GEN_MATRIX_NBLOCKS). This should generate the matrix entry
+   * with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
+int empty_cu_rej_uniform;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.h
index 13db836bc..801287259 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.h
@@ -9,54 +9,55 @@
 #include <stdlib.h>
 #include "cbmc.h"
 #include "common.h"
+#include "poly.h"
 
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
- * Name:        rej_uniform
+ * Name:        poly_rej_uniform_x4
  *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
  *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
  *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
  **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
 __contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-);
-#endif
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/symmetric.h
index 55ebbbd53..3563e5505 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/symmetric.h
@@ -10,6 +10,7 @@
 #include "cbmc.h"
 #include "common.h"
 #include "fips202.h"
+#include "fips202x4.h"
 
 /* Macros denoting FIPS-203 specific Hash functions */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/verify.c
index b7078fcc1..9f39dcd22 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/verify.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/verify.c
@@ -4,7 +4,8 @@
  */
 #include "verify.h"
 
-#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER) && \
+    !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 /*
  * Masking value used in constant-time functions from
  * verify.h to block the compiler's range analysis and
@@ -12,9 +13,11 @@
  */
 volatile uint64_t ct_opt_blocker_u64 = 0;
 
-#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+#else /* MLKEM_USE_ASM_VALUE_BARRIER || \
+         MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_verify MLKEM_NAMESPACE(empty_cu_verify)
+#define empty_cu_verify MLKEM_NAMESPACE_K(empty_cu_verify)
 int empty_cu_verify;
 
-#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER || \
+          MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/verify.h
index 8c47155dc..f6ecf5eba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/verify.h
@@ -268,7 +268,7 @@ __contract__(
 
   for (i = 0; i < len; i++)
   __loop__(
-    invariant(i >= 0 && i <= len)
+    invariant(i <= len)
     invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))))
   {
     r |= a[i] ^ b[i];
@@ -314,4 +314,4 @@ __contract__(
   }
 }
 
-#endif
+#endif /* VERIFY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/arith_native_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/arith_native_x86_64.h
index ce13e7911..25e00a930 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/arith_native_x86_64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/arith_native_x86_64.h
@@ -42,7 +42,7 @@ void basemul_avx2(__m256i *r, const __m256i *a, const __m256i *b,
                   const __m256i *qdata);
 
 #define polyvec_basemul_acc_montgomery_cached_avx2 \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_avx2)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_avx2)
 void polyvec_basemul_acc_montgomery_cached_avx2(
     poly *r, const polyvec *a, const polyvec *b,
     const polyvec_mulcache *b_cache);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/default_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/default_impl.h
index 66de8c85f..029111c17 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/default_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/default_impl.h
@@ -28,9 +28,6 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_POLY_FROMBYTES
 
-#define INVNTT_BOUND_NATIVE (8 * MLKEM_Q)
-#define NTT_BOUND_NATIVE (8 * MLKEM_Q)
-
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 {
   nttunpack_avx2((__m256i *)(data->coeffs), qdata.vec);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c
index 1a26e0dd5..4ef887c62 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c
@@ -8,6 +8,8 @@
  *          Do not modify it directly.
  */
 
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 #include "ntt.h"
 
 /*
@@ -28,3 +30,10 @@ ALIGN const int16_t zetas[128] = {
     -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
     -308,  996,   991,   958,   -1460, 1522,  1628,
 };
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_zetas MLKEM_NAMESPACE_K(empty_cu_zetas)
+int empty_cu_zetas;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h
index 6a5ee8a7d..fc4e7dd38 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h
@@ -75,14 +75,14 @@ void poly_tobytes_asm_clean(uint8_t *r, const int16_t *a);
 void poly_tobytes_asm_opt(uint8_t *r, const int16_t *a);
 
 #define polyvec_basemul_acc_montgomery_cached_asm_clean \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 void polyvec_basemul_acc_montgomery_cached_asm_clean(int16_t *r,
                                                      const int16_t *a,
                                                      const int16_t *b,
                                                      const int16_t *b_cache);
 
 #define polyvec_basemul_acc_montgomery_cached_asm_opt \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 void polyvec_basemul_acc_montgomery_cached_asm_opt(int16_t *r, const int16_t *a,
                                                    const int16_t *b,
                                                    const int16_t *b_cache);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h
index b0ff3d597..548b1eebb 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h
@@ -31,7 +31,6 @@ static INLINE void ntt_native(poly *data)
                 aarch64_ntt_zetas_layer56);
 }
 
-#define INVNTT_BOUND_NATIVE (8 * MLKEM_Q)
 static INLINE void intt_native(poly *data)
 {
   intt_asm_clean(data->coeffs, aarch64_invntt_zetas_layer01234,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S
index 623a82ae9..b243a569d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S
@@ -149,7 +149,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -193,40 +193,20 @@
         t3  .req v28
 
         ninv             .req v29
-        q_ninv           .req q29
         ninv_tw          .req v30
-        q_ninv_tw        .req q30
-
-/* Literal pool */
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_consts:         .short 3329
-                  .short 20159
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-c_ninv:           dup8h 512
-c_ninv_tw:        dup8h 5040
 
 MLKEM_ASM_NAMESPACE(intt_asm_clean):
         push_stack
 
-        ldr q_consts,  c_consts
-        ldr q_ninv,    c_ninv
-        ldr q_ninv_tw, c_ninv_tw
+        // Setup constants
+        mov wtmp, #3329
+        mov consts.h[0], wtmp
+        mov wtmp, #20159
+        mov consts.h[1], wtmp
+        mov wtmp, #512
+        dup ninv.8h, wtmp
+        mov wtmp, #5040
+        dup ninv_tw.8h, wtmp
 
         mov inp, in
         mov count, #8
@@ -361,4 +341,49 @@ layer012_start:
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq consts
+    .unreq q_consts
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+    .unreq ninv
+    .unreq ninv_tw
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S
index e332efef8..c94746e17 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S
@@ -149,7 +149,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -193,40 +193,20 @@
         t3  .req v28
 
         ninv             .req v29
-        q_ninv           .req q29
         ninv_tw          .req v30
-        q_ninv_tw        .req q30
-
-/* Literal pool */
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_consts:         .short 3329
-                  .short 20159
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-c_ninv:           dup8h 512
-c_ninv_tw:        dup8h 5040
 
 MLKEM_ASM_NAMESPACE(intt_asm_opt):
         push_stack
 
-        ldr q_consts,  c_consts
-        ldr q_ninv,    c_ninv
-        ldr q_ninv_tw, c_ninv_tw
+        // Setup constants
+        mov wtmp, #3329
+        mov consts.h[0], wtmp
+        mov wtmp, #20159
+        mov consts.h[1], wtmp
+        mov wtmp, #512
+        dup ninv.8h, wtmp
+        mov wtmp, #5040
+        dup ninv_tw.8h, wtmp
 
         mov inp, in
         mov count, #8
@@ -1017,4 +997,49 @@ layer012_start:
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq consts
+    .unreq q_consts
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+    .unreq ninv
+    .unreq ninv_tw
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S
index 877a5f689..cd63cc4d6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S
@@ -121,7 +121,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -156,7 +156,6 @@
         q_root2_tw .req q6
 
         consts    .req v7
-        q_consts  .req q7
 
         tmp .req v24
         t0  .req v25
@@ -167,21 +166,13 @@
         .text
         .global MLKEM_ASM_NAMESPACE(ntt_asm_clean)
 
-/* Literal pool */
-.p2align 4
-c_consts:
-        .short 3329
-        .short 20159
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-
 MLKEM_ASM_NAMESPACE(ntt_asm_clean):
         push_stack
-        ldr q_consts, c_consts
+
+        mov wtmp, #3329
+        mov consts.h[0], wtmp
+        mov wtmp, #20159
+        mov consts.h[1], wtmp
 
         mov inp, in
         mov count, #4
@@ -280,4 +271,46 @@ layer3456_start:
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq consts
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S
index 15103a595..8705615b7 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S
@@ -121,7 +121,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5
 
         data0  .req v8
         data1  .req v9
@@ -167,21 +167,13 @@
         .text
         .global MLKEM_ASM_NAMESPACE(ntt_asm_opt)
 
-/* Literal pool */
-.p2align 4
-c_consts:
-        .short 3329
-        .short 20159
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-
 MLKEM_ASM_NAMESPACE(ntt_asm_opt):
         push_stack
-        ldr q_consts, c_consts
+
+        mov wtmp, #3329
+        mov consts.h[0], wtmp
+        mov wtmp, #20159
+        mov consts.h[1], wtmp
 
         mov inp, in
         mov count, #4
@@ -916,4 +908,47 @@ MLKEM_ASM_NAMESPACE(ntt_asm_opt):
         pop_stack
         ret
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq in
+    .unreq r01234_ptr
+    .unreq r56_ptr
+    .unreq inp
+    .unreq count
+    .unreq wtmp
+    .unreq data0
+    .unreq data1
+    .unreq data2
+    .unreq data3
+    .unreq data4
+    .unreq data5
+    .unreq data6
+    .unreq data7
+    .unreq q_data0
+    .unreq q_data1
+    .unreq q_data2
+    .unreq q_data3
+    .unreq q_data4
+    .unreq q_data5
+    .unreq q_data6
+    .unreq q_data7
+    .unreq root0
+    .unreq root1
+    .unreq root2
+    .unreq root0_tw
+    .unreq root1_tw
+    .unreq root2_tw
+    .unreq q_root0
+    .unreq q_root1
+    .unreq q_root2
+    .unreq q_root0_tw
+    .unreq q_root1_tw
+    .unreq q_root2_tw
+    .unreq consts
+    .unreq q_consts
+    .unreq tmp
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h
index b22674026..ec1bf6587 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h
@@ -25,14 +25,12 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_REJ_UNIFORM
 
-#define NTT_BOUND_NATIVE (6 * MLKEM_Q)
 static INLINE void ntt_native(poly *data)
 {
   ntt_asm_opt(data->coeffs, aarch64_ntt_zetas_layer01234,
               aarch64_ntt_zetas_layer56);
 }
 
-#define INVNTT_BOUND_NATIVE (8 * MLKEM_Q)
 static INLINE void intt_native(poly *data)
 {
   intt_asm_opt(data->coeffs, aarch64_invntt_zetas_layer01234,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S
index f70a40221..809f9667e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S
@@ -6,33 +6,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 20159  // Barrett twist of 1 wrt 2^27
-c_mont_constant:   dup8h -1044  // 2^16 % 3329
-c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
-
 /*
  * Some modular arithmetic macros
  */
@@ -70,6 +43,7 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
 
         ptr               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -77,14 +51,15 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
         tmp               .req v1
         mask              .req v2
         modulus           .req v3
-        q_modulus         .req q3
         modulus_twisted   .req v4
-        q_modulus_twisted .req q4
 
 MLKEM_ASM_NAMESPACE(poly_reduce_asm_clean):
 
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
 
         mov count, #8
 loop_start:
@@ -115,6 +90,7 @@ loop_start:
 
         .unreq ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -122,9 +98,7 @@ loop_start:
         .unreq tmp
         .unreq mask
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *          poly_mulcache_compute()         *
@@ -137,6 +111,7 @@ loop_start:
         zeta_ptr          .req x2
         zeta_twisted_ptr  .req x3
         count             .req x4
+        wtmp              .req w5
 
         data_odd          .req v0
         zeta              .req v1
@@ -152,13 +127,14 @@ loop_start:
         q_dst             .req q5
 
         modulus           .req v6
-        q_modulus         .req q6
         modulus_twisted   .req v7
-        q_modulus_twisted .req q7
 
 MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_clean):
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159
+        dup modulus_twisted.8h, wtmp
 
         mov count, #16
 mulcache_compute_loop_start:
@@ -185,6 +161,7 @@ mulcache_compute_loop_start:
         .unreq zeta_ptr
         .unreq zeta_twisted_ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data_odd
         .unreq zeta
@@ -200,9 +177,7 @@ mulcache_compute_loop_start:
         .unreq q_dst
 
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *             poly_tobytes()               *
@@ -261,6 +236,7 @@ poly_tobytes_asm_clean_asm_loop_start:
 
         src               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -268,22 +244,25 @@ poly_tobytes_asm_clean_asm_loop_start:
         q_res             .req q1
 
         factor            .req v2
-        q_factor          .req q2
         factor_t          .req v3
-        q_factor_t        .req q3
         modulus           .req v4
-        q_modulus         .req q4
         modulus_twisted   .req v5
-        q_modulus_twisted .req q5
 
         tmp0              .req v6
 
 MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean):
 
-        ldr q_modulus,         c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
-        ldr q_factor,          c_mont_constant
-        ldr q_factor_t,        c_barrett_twist
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
+
+        mov wtmp, #-1044 // 2^16 mod 3329, signed representative (2285 - 3329)
+        dup factor.8h, wtmp
+
+        mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
+        dup factor_t.8h, wtmp
 
         mov count, #8
 poly_tomont_asm_loop:
@@ -311,6 +290,7 @@ poly_tomont_asm_loop:
 
         .unreq src
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -318,13 +298,9 @@ poly_tomont_asm_loop:
         .unreq q_res
 
         .unreq factor
-        .unreq q_factor
         .unreq factor_t
-        .unreq q_factor_t
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
         .unreq tmp0
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S
index e58ee77c4..815a9dd1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S
@@ -6,33 +6,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 20159  // Barrett twist of 1 wrt 2^27
-c_mont_constant:   dup8h -1044  // 2^16 % 3329
-c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
-
 /*
  * Some modular arithmetic macros
  */
@@ -70,6 +43,7 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
 
         ptr               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -77,14 +51,15 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
         tmp               .req v1
         mask              .req v2
         modulus           .req v3
-        q_modulus         .req q3
         modulus_twisted   .req v4
-        q_modulus_twisted .req q4
 
 MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
 
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
 
         mov count, #8
                                                // Instructions:    15
@@ -278,6 +253,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
 
         .unreq ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -285,9 +261,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         .unreq tmp
         .unreq mask
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *          poly_mulcache_compute()         *
@@ -300,6 +274,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         zeta_ptr          .req x2
         zeta_twisted_ptr  .req x3
         count             .req x4
+        wtmp              .req w5
 
         data_odd          .req v0
         zeta              .req v1
@@ -315,13 +290,14 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         q_dst             .req q5
 
         modulus           .req v6
-        q_modulus         .req q6
         modulus_twisted   .req v7
-        q_modulus_twisted .req q7
 
 MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159
+        dup modulus_twisted.8h, wtmp
 
         mov count, #16
                                               // Instructions:    7
@@ -426,6 +402,7 @@ MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
         .unreq zeta_ptr
         .unreq zeta_twisted_ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data_odd
         .unreq zeta
@@ -441,9 +418,7 @@ MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
         .unreq q_dst
 
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *             poly_tobytes()               *
@@ -502,6 +477,7 @@ poly_tobytes_asm_opt_asm_loop_start:
 
         src               .req x0
         count             .req x1
+        wtmp              .req w2
 
         data              .req v0
         q_data            .req q0
@@ -509,22 +485,25 @@ poly_tobytes_asm_opt_asm_loop_start:
         q_res             .req q1
 
         factor            .req v2
-        q_factor          .req q2
         factor_t          .req v3
-        q_factor_t        .req q3
         modulus           .req v4
-        q_modulus         .req q4
         modulus_twisted   .req v5
-        q_modulus_twisted .req q5
 
         tmp0              .req v6
 
 MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
 
-        ldr q_modulus,         c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
-        ldr q_factor,          c_mont_constant
-        ldr q_factor_t,        c_barrett_twist
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp
+
+        mov wtmp, #-1044 // 2^16 mod 3329, signed representative (2285 - 3329)
+        dup factor.8h, wtmp
+
+        mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
+        dup factor_t.8h, wtmp
 
         mov count, #8
                                              // Instructions:    5
@@ -670,6 +649,7 @@ MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
 
         .unreq src
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -677,13 +657,9 @@ MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
         .unreq q_res
 
         .unreq factor
-        .unreq q_factor
         .unreq factor_t
-        .unreq q_factor_t
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
         .unreq tmp0
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S
index 99fb05de5..c91675b44 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S
@@ -12,31 +12,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 3327
-
 // Input:
 // - Vectors al, ah of 32-bit entries
 // Output:
@@ -136,11 +111,10 @@ c_modulus_twisted: dup8h 3327
         b3_ptr       .req x11
         b3_cache_ptr .req x12
         count        .req x13
+        wtmp         .req w14
 
         modulus           .req v0
-        q_modulus         .req q0
         modulus_twisted   .req v2
-        q_modulus_twisted .req q2
 
         aa0      .req v3
         aa1      .req v4
@@ -164,12 +138,16 @@ c_modulus_twisted: dup8h 3327
         t0   .req v28
 
 #if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -198,12 +176,15 @@ k2_loop_start:
 #endif /* MLKEM_K == 2 */
 
 #if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -237,12 +218,15 @@ k3_loop_start:
 #endif /* MLKEM_K == 3 */
 
 #if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -285,4 +269,39 @@ k4_loop_start:
         ret
 #endif /* MLKEM_K == 4 */
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq out
+    .unreq a0_ptr
+    .unreq b0_ptr
+    .unreq b0_cache_ptr
+    .unreq a1_ptr
+    .unreq b1_ptr
+    .unreq b1_cache_ptr
+    .unreq a2_ptr
+    .unreq b2_ptr
+    .unreq b2_cache_ptr
+    .unreq a3_ptr
+    .unreq b3_ptr
+    .unreq b3_cache_ptr
+    .unreq count
+    .unreq modulus
+    .unreq modulus_twisted
+    .unreq aa0
+    .unreq aa1
+    .unreq bb0
+    .unreq bb1
+    .unreq bb1t
+    .unreq res0l
+    .unreq res1l
+    .unreq res0h
+    .unreq wtmp
+    .unreq res1h
+    .unreq tmp0
+    .unreq tmp1
+    .unreq q_tmp0
+    .unreq q_tmp1
+    .unreq out0
+    .unreq out1
+    .unreq t0
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S
index 16ed77c3f..8300b682c 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S
@@ -12,31 +12,6 @@
 #include "common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 3327
-
 // Input:
 // - Vectors al, ah of 32-bit entries
 // Output:
@@ -136,11 +111,10 @@ c_modulus_twisted: dup8h 3327
         b3_ptr       .req x11
         b3_cache_ptr .req x12
         count        .req x13
+        wtmp         .req w14
 
         modulus           .req v0
-        q_modulus         .req q0
         modulus_twisted   .req v2
-        q_modulus_twisted .req q2
 
         aa0      .req v3
         aa1      .req v4
@@ -164,12 +138,16 @@ c_modulus_twisted: dup8h 3327
         t0   .req v28
 
 #if MLKEM_K == 2
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -530,12 +508,15 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
 #endif /* MLKEM_K == 2 */
 
 #if MLKEM_K == 3
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -1001,12 +982,15 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
 #endif /* MLKEM_K == 3 */
 
 #if MLKEM_K == 4
-.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt)
+.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt)
 
-MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
+MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329
+        dup modulus.8h, wtmp
+
+        mov wtmp, #3327
+        dup modulus_twisted.8h, wtmp
 
         // Computed bases of vector entries
 
@@ -1581,4 +1565,39 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
         ret
 #endif /* MLKEM_K == 4 */
 
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq out
+    .unreq a0_ptr
+    .unreq b0_ptr
+    .unreq b0_cache_ptr
+    .unreq a1_ptr
+    .unreq b1_ptr
+    .unreq b1_cache_ptr
+    .unreq a2_ptr
+    .unreq b2_ptr
+    .unreq b2_cache_ptr
+    .unreq a3_ptr
+    .unreq b3_ptr
+    .unreq b3_cache_ptr
+    .unreq count
+    .unreq modulus
+    .unreq modulus_twisted
+    .unreq wtmp
+    .unreq aa0
+    .unreq aa1
+    .unreq bb0
+    .unreq bb1
+    .unreq bb1t
+    .unreq res0l
+    .unreq res1l
+    .unreq res0h
+    .unreq res1h
+    .unreq tmp0
+    .unreq tmp1
+    .unreq q_tmp0
+    .unreq q_tmp1
+    .unreq out0
+    .unreq out1
+    .unreq t0
+
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S
index 722dc0f49..5151a05d0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S
@@ -45,6 +45,7 @@
     len                         .req w4
 
     /* Temporary output on the stack */
+    xtmp                        .req x7
     output_tmp                  .req x7
     output_tmp_base             .req x8
 
@@ -110,20 +111,26 @@
 
     mlkem_q                     .req v30
     bits                        .req v31
-    bits_q                      .req q31
 
 .text
-/* Literal pool */
-.p2align 4
-c_bit_table:
-    .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-
 .align 4
 .global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean)
 MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean):
     push_stack
 
-    ldr  bits_q, c_bit_table
+    // Build bits = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80} (16-bit lanes 0..7) from immediates
+    movz xtmp, 0x1
+    movk xtmp, 0x2, lsl 16
+    movk xtmp, 0x4, lsl 32
+    movk xtmp, 0x8, lsl 48
+    mov bits.d[0], xtmp
+
+    movz xtmp, 0x10
+    movk xtmp, 0x20, lsl 16
+    movk xtmp, 0x40, lsl 32
+    movk xtmp, 0x80, lsl 48
+    mov bits.d[1], xtmp
+
     movz tmp, #MLKEM_Q
     dup  mlkem_q.8h, tmp
 
@@ -337,5 +344,63 @@ return:
     pop_stack
     ret
 
+
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq output
+    .unreq buf
+    .unreq buflen
+    .unreq table_idx
+    .unreq len
+    .unreq output_tmp
+    .unreq output_tmp_base
+    .unreq count
+    .unreq buf_consumed
+    .unreq tmp
+    .unreq xtmp
+    .unreq final_copy_count
+    .unreq rec_idx_0
+    .unreq rec_idx_1
+    .unreq rec_idx_2
+    .unreq rec_idx_3
+    .unreq ctr0
+    .unreq ctr1
+    .unreq ctr2
+    .unreq ctr3
+    .unreq ctr01
+    .unreq ctr23
+    .unreq buf0
+    .unreq buf1
+    .unreq buf2
+    .unreq tmp0
+    .unreq tmp1
+    .unreq tmp2
+    .unreq tmp3
+    .unreq sign0
+    .unreq sign1
+    .unreq sign2
+    .unreq sign3
+    .unreq val0
+    .unreq val0q
+    .unreq val1
+    .unreq val1q
+    .unreq val2
+    .unreq val2q
+    .unreq val3
+    .unreq val3q
+    .unreq t0
+    .unreq t1
+    .unreq t2
+    .unreq t3
+    .unreq table0
+    .unreq table0q
+    .unreq table1
+    .unreq table1q
+    .unreq table2
+    .unreq table2q
+    .unreq table3
+    .unreq table3q
+    .unreq mlkem_q
+    .unreq bits
+
 #endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) ||
           defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h
index 09e30f207..0543b1bd1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h
@@ -16,7 +16,9 @@
  *
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 #include "api.h"
 #endif
+#endif
 
 #endif /* MLKEM_NATIVE_ARITH_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.c
index 433bdc954..1e6b7c5d1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.c
@@ -2,8 +2,11 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "cbd.h"
+#include "common.h"
+#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+
 #include <stdint.h>
+#include "cbd.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -11,8 +14,6 @@
  * within a single compilation unit. */
 #define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
 #define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-#define cbd2 MLKEM_NAMESPACE(cbd2)
-#define cbd3 MLKEM_NAMESPACE(cbd3)
 /* End of static namespacing */
 
 /*************************************************
@@ -35,44 +36,13 @@ static uint32_t load32_littleendian(const uint8_t x[4])
   return r;
 }
 
-#if MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif /* MLKEM_ETA1 == 3 */
-
-/*************************************************
- * Name:        cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
   {
     unsigned j;
@@ -82,7 +52,7 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
     {
       const int16_t a = (d >> (4 * j + 0)) & 0x3;
@@ -92,24 +62,34 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
   }
 }
 
-#if MLKEM_ETA1 == 3
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
 /*************************************************
- * Name:        cbd3
+ * Name:        load24_littleendian
  *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
  *              This function is only needed for ML-KEM-512
  *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
  **************************************************/
-static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
   {
     unsigned j;
@@ -120,7 +100,7 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 
     for (j = 0; j < 4; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(i <= MLKEM_N / 4 && j <= 4)
       invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
     {
       const int16_t a = (d >> (6 * j + 0)) & 0x7;
@@ -129,28 +109,12 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
     }
   }
 }
-#endif /* MLKEM_ETA1 == 3 */
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-{
-#if MLKEM_ETA1 == 2
-  cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-{
-#if MLKEM_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
+int empty_cu_cbd;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.h
index 15db89570..54c1f5b90 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.h
@@ -9,46 +9,35 @@
 #include "common.h"
 #include "poly.h"
 
-#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd2
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *              a centered binomial distribution with parameter eta=2
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-);
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd3
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
 
-#endif
+#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbmc.h
index baa0bfa9f..52b95bc3f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbmc.h
@@ -13,7 +13,7 @@
 
 #define __contract__(x)
 #define __loop__(x)
-#define cassert(x, y)
+#define cassert(x)
 
 #else /* CBMC _is_ defined, therefore we're doing proof */
 
@@ -30,7 +30,7 @@
 #define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
 #define decreases(...) __CPROVER_decreases(__VA_ARGS__)
 /* cassert to avoid confusion with in-built assert */
-#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
 #define assume(...) __CPROVER_assume(__VA_ARGS__)
 
 /***************************************************
@@ -119,13 +119,13 @@
   {                                                                    \
     unsigned qvar;                                                     \
     ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
-        (((value_lb) <= (array_var[(qvar)])) &&                        \
-        ((array_var[(qvar)]) < (value_ub)))                            \
+        (((int)(value_lb) <= ((array_var)[(qvar)])) &&		       \
+         (((array_var)[(qvar)]) < (int)(value_ub)))		       \
   }
 
 #define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
   array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb),      \
-                   (qvar_ub), (array_var), (value_lb), (value_ub))
+      (qvar_ub), (array_var), (value_lb), (value_ub))
 /* clang-format on */
 
 /* Wrapper around array_bound operating on absolute values.
@@ -134,6 +134,6 @@
  * bound in array_bound is inclusive, we have to raise it by 1.
  */
 #define array_abs_bound(arr, lb, ub, k) \
-  array_bound((arr), (lb), (ub), -(k) + 1, (k))
+  array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h
index da886780c..4f326333e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h
@@ -43,23 +43,30 @@
 #define MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2) x1##_##x2
 #define MLKEM_NATIVE_MAKE_NAMESPACE(x1, x2) MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2)
 
-#define FIPS202_NAMESPACE(s) \
-  MLKEM_NATIVE_MAKE_NAMESPACE(FIPS202_NAMESPACE_PREFIX, s)
-
 #define MLKEM_NAMESPACE(s) \
   MLKEM_NATIVE_MAKE_NAMESPACE(MLKEM_NAMESPACE_PREFIX, s)
 
+#if defined(MLKEM_NAMESPACE_PREFIX_ADD_LEVEL)
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3) x1##x2##_##x3
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K(x1, x2, x3) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3)
+#define MLKEM_NAMESPACE_K(s) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K(MLKEM_NAMESPACE_PREFIX, MLKEM_LVL, s)
+#else
+#define MLKEM_NAMESPACE_K(s) MLKEM_NAMESPACE(s)
+#endif
+
 /* On Apple platforms, we need to emit leading underscore
  * in front of assembly symbols. We thus introducee a separate
  * namespace wrapper for ASM symbols. */
 #if !defined(__APPLE__)
 #define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym)
-#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym)
+#define MLKEM_ASM_NAMESPACE_K(sym) MLKEM_NAMESPACE_K(sym)
 #else
 #define PREFIX_UNDERSCORE_(sym) _##sym
 #define PREFIX_UNDERSCORE(sym) PREFIX_UNDERSCORE_(sym)
 #define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym))
-#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym))
+#define MLKEM_ASM_NAMESPACE_K(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE_K(sym))
 #endif
 
 #endif /* MLKEM_NATIVE_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h
index d1441835b..fa89370ce 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h
@@ -40,10 +40,12 @@
 /* #define MLKEM_NATIVE_CONFIG_FILE "config.h" */
 
 /******************************************************************************
- * Name:        MLKEM_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/.
+ * Description: The prefix to use to namespace global symbols from mlkem/.
+ *
+ *              Level-dependent symbols will additionally be prefixed with the
+ *              security level if MLKEM_NAMESPACE_PREFIX_ADD_LEVEL is set.
  *
  *              This can also be set using CFLAGS.
  *
@@ -53,17 +55,71 @@
 #endif
 
 /******************************************************************************
- * Name:        FIPS202_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX_ADD_LEVEL
+ *
+ * Description: If set, the level (512, 768, 1024) is added to the namespace
+ *              prefix MLKEM_NAMESPACE_PREFIX for all functions which are
+ *              level-dependent. Level-independent functions will have their
+ *              symbols prefixed by MLKEM_NAMESPACE_PREFIX only.
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/fips202/.
+ *              This is intended to be used for multi-level builds where
+ *              level-independent code should be shared across levels.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(FIPS202_NAMESPACE_PREFIX)
-#define FIPS202_NAMESPACE_PREFIX FIPS202_DEFAULT_NAMESPACE_PREFIX
-#endif
+/* #define MLKEM_NAMESPACE_PREFIX_ADD_LEVEL */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, all MLKEM_K-independent code will be included
+ *              in the build, including code needed only for other security
+ *              levels.
+ *
+ *              Example: poly_cbd3 is only needed for MLKEM_K == 2. Yet, if
+ *              this option is set for a build with MLKEM_K==3/4, it would
+ *              be included.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, no MLKEM_K-independent code will be included
+ *              in the build.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
  * Name:        MLKEM_USE_NATIVE
@@ -112,25 +168,13 @@
 /* Default namespace
  *
  * Don't change this. If you need a different namespace, re-define
- * MLKEM_NAMESPACE above instead, and remove the following.
- */
-
-/*
- * The default FIPS202 namespace is
- *
- *   PQCP_MLKEM_NATIVE_FIPS202_<BACKEND>_
+ * MLKEM_NAMESPACE_PREFIX above instead, and remove the following.
  *
- * e.g., PQCP_MLKEM_NATIVE_FIPS202_C_
- */
-
-#define FIPS202_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_FIPS202
-
-/*
  * The default MLKEM namespace is
  *
- *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_<BACKEND>_
+ *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_
  *
- * e.g., PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_
+ * e.g., PQCP_MLKEM_NATIVE_MLKEM512_
  */
 
 #if MLKEM_K == 2
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug.c
new file mode 100644
index 000000000..4b4857cbc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/* NOTE: You can remove this file unless you compile with MLKEM_DEBUG. */
+
+#include "common.h"
+
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) && defined(MLKEM_DEBUG)
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+
+#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
+
+void mlkem_debug_assert(const char *file, int line, const int val)
+{
+  if (val == 0) /* zero means the asserted condition did not hold */
+  {
+    fprintf(stderr,
+            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
+            file, line, val);
+    exit(1); /* terminate right after reporting the failing location */
+  }
+}
+
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{
+  int err = 0; /* becomes 1 once any element is out of bounds */
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int16_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    {
+      fprintf(
+          stderr,
+          MLKEM_NATIVE_DEBUG_ERROR_HEADER
+          "Bounds assertion failed: Index %u, value %d out of bounds (%d,%d)\n",
+          file, line, i, (int)val, lower_bound_exclusive,
+          upper_bound_exclusive);
+      err = 1; /* keep scanning so every violation gets reported */
+    }
+  }
+
+  if (err == 1) /* exit only after the whole array has been checked */
+    exit(1);
+}
+
+#else /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
+
+#define empty_cu_debug MLKEM_NAMESPACE_K(empty_cu_debug)
+int empty_cu_debug;
+
+#endif /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug.h
new file mode 100644
index 000000000..1103124db
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+#include "common.h"
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check debug assertion
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if the assertion fails (val is zero).
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
+void mlkem_debug_assert(const char *file, int line, const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ */
+#define debug_assert(val) mlkem_debug_assert(__FILE__, __LINE__, (val))
+
+/* Check bounds in array of int16_t's
+ * ptr: Base of int16_t array; will be explicitly cast to int16_t*,
+ *      so you may pass a byte-compatible type such as poly or polyvec.
+ * len: Number of int16_t in array
+ * value_lb: Inclusive lower value bound
+ * value_ub: Exclusive upper value bound */
+#define debug_assert_bound(ptr, len, value_lb, value_ub)                      \
+  mlkem_debug_check_bounds(__FILE__, __LINE__, (const int16_t *)(ptr), (len), \
+                           (value_lb)-1, (value_ub))
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * value_abs_bd: Exclusive absolute upper bound */
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  debug_assert_bound((ptr), (len), (-(value_abs_bd) + 1), (value_abs_bd))
+
+/* Version of bounds assertions for 2-dimensional arrays */
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  debug_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  debug_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
+
+/* When running CBMC, convert debug assertions into proof obligations */
+#elif defined(CBMC)
+
+#include "cbmc.h"
+
+#define debug_assert(val) cassert(val)
+
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  cassert(array_bound(((int16_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
+
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  cassert(array_abs_bound(((int16_t *)(ptr)), 0, (len), (value_abs_bd)))
+
+/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
+ * just use a single flattened array_bound(...) here. */
+#define debug_assert_bound_2d(ptr, M, N, value_lb, value_ub)           \
+  cassert(forall(kN, 0, (M),                                           \
+                 array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                             (value_lb), (value_ub))))
+
+#define debug_assert_abs_bound_2d(ptr, M, N, value_abs_bd)                 \
+  cassert(forall(kN, 0, (M),                                               \
+                 array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                                 (value_abs_bd))))
+
+#else /* !MLKEM_DEBUG && !CBMC */
+
+#define debug_assert(val) \
+  do                      \
+  {                       \
+  } while (0)
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  do                                                     \
+  {                                                      \
+  } while (0)
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  do                                                   \
+  {                                                    \
+  } while (0)
+
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  do                                                               \
+  {                                                                \
+  } while (0)
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  do                                                             \
+  {                                                              \
+  } while (0)
+
+
+#endif /* MLKEM_DEBUG */
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug/debug.c
deleted file mode 100644
index 64294ebe1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug/debug.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-
-#include <stdio.h>
-#include "debug.h"
-
-#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
-
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val)
-{
-  if (val == 0)
-  {
-    fprintf(stderr,
-            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed: %s (value %d)\n",
-            file, line, description, val);
-    exit(1);
-  }
-}
-
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive)
-{
-  int err = 0;
-  unsigned i;
-  for (i = 0; i < len; i++)
-  {
-    int16_t val = ptr[i];
-    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
-    {
-      fprintf(stderr,
-              MLKEM_NATIVE_DEBUG_ERROR_HEADER
-              "%s, index %u, value %d out of bounds (%d,%d)\n",
-              file, line, description, i, (int)val, lower_bound_exclusive,
-              upper_bound_exclusive);
-      err = 1;
-    }
-  }
-
-  if (err == 1)
-    exit(1);
-}
-
-#else /* MLKEM_DEBUG */
-
-#define empty_cu_debug MLKEM_NAMESPACE(empty_cu_debug)
-int empty_cu_debug;
-
-#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug/debug.h
deleted file mode 100644
index 5ce320ea2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/debug/debug.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_DEBUG_H
-#define MLKEM_DEBUG_H
-
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-/*************************************************
- * Name:        mlkem_debug_assert
- *
- * Description: Check debug assertion
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of assertion
- *              - val: Value asserted to be non-zero
- **************************************************/
-#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val);
-
-/*************************************************
- * Name:        mlkem_debug_check_bounds
- *
- * Description: Check whether values in an array of int16_t
- *              are within specified bounds.
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of check
- *              - ptr: Base of array to be checked
- *              - len: Number of int16_t in ptr
- *              - lower_bound_exclusive: Exclusive lower bound
- *              - upper_bound_exclusive: Exclusive upper bound
- **************************************************/
-#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive);
-
-/* Check assertion, calling exit() upon failure
- *
- * val: Value that's asserted to be non-zero
- * msg: Message to print on failure
- *
- * Currently called CASSERT to avoid clash with CBMC assert.
- */
-#define CASSERT(val, msg)                                 \
-  do                                                      \
-  {                                                       \
-    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
-  } while (0)
-
-/* Check absolute bounds of scalar
- * val: Scalar to be checked
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
-
-/* Check that all coefficients in array of int16_t's are non-negative
- * and below an exclusive upper bound.
- *
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * high_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define UBOUND(ptr, len, high_bound, msg)                                 \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -1, ((high_bound)));                  \
-  } while (0)
-
-/* Check absolute bounds in array of int16_t's
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define BOUND(ptr, len, abs_bound, msg)                                   \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -(abs_bound), (abs_bound));           \
-  } while (0)
-
-/* Check absolute bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
-  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
-        msg)
-
-/* Check unsigned bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- * msg: Message to print on failure */
-#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
-  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
-         msg)
-
-/* Check absolute bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLY_BOUND(ptr, abs_bound) \
-  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
-
-/* Check unsigned bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLY_UBOUND(ptr, ubound) \
-  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
-
-/* Check absolute bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLYVEC_BOUND(ptr, abs_bound)                                      \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
-                     "polyvec absolute bound for " #ptr ".vec[i]");        \
-  } while (0)
-
-/* Check unsigned bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLYVEC_UBOUND(ptr, ubound)                                        \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
-                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
-  } while (0)
-
-#define MLKEM_CONCAT_(left, right) left##right
-#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
-
-/* Following AWS-LC to define a C99-compliant static assert */
-#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
-  typedef struct                                                         \
-  {                                                                      \
-    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
-  } MLKEM_CONCAT(MLKEM_NAMESPACE(static_assertion_), msg)                \
-      __attribute__((unused));
-
-#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
-  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
-#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
-#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
-
-#else /* MLKEM_DEBUG */
-
-#define CASSERT(val, msg) \
-  do                      \
-  {                       \
-  } while (0)
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define BOUND(ptr, len, abs_bound, msg) \
-  do                                    \
-  {                                     \
-  } while (0)
-#define POLY_BOUND(ptr, abs_bound) \
-  do                               \
-  {                                \
-  } while (0)
-#define POLYVEC_BOUND(ptr, abs_bound) \
-  do                                  \
-  {                                   \
-  } while (0)
-#define POLY_BOUND_MSG(ptr, ubound, abs_bound) \
-  do                                           \
-  {                                            \
-  } while (0)
-#define UBOUND(ptr, len, high_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define POLY_UBOUND(ptr, ubound) \
-  do                             \
-  {                              \
-  } while (0)
-#define POLYVEC_UBOUND(ptr, ubound) \
-  do                                \
-  {                                 \
-  } while (0)
-#define POLY_UBOUND_MSG(ptr, ubound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define STATIC_ASSERT(cond, error)
-
-#endif /* MLKEM_DEBUG */
-
-#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c
index 4d3133e14..0cfcc3e9e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c
@@ -17,7 +17,7 @@
 #include "symmetric.h"
 
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 #include "cbmc.h"
 
@@ -25,15 +25,13 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define pack_pk MLKEM_NAMESPACE(pack_pk)
-#define unpack_pk MLKEM_NAMESPACE(unpack_pk)
-#define pack_sk MLKEM_NAMESPACE(pack_sk)
-#define unpack_sk MLKEM_NAMESPACE(unpack_sk)
-#define pack_ciphertext MLKEM_NAMESPACE(pack_ciphertext)
-#define unpack_ciphertext MLKEM_NAMESPACE(unpack_ciphertext)
-#define gen_matrix_entry_x4 MLKEM_NAMESPACE(gen_matrix_entry_x4)
-#define gen_matrix_entry MLKEM_NAMESPACE(gen_matrix_entry)
-#define matvec_mul MLKEM_NAMESPACE(matvec_mul)
+#define pack_pk MLKEM_NAMESPACE_K(pack_pk)
+#define unpack_pk MLKEM_NAMESPACE_K(unpack_pk)
+#define pack_sk MLKEM_NAMESPACE_K(pack_sk)
+#define unpack_sk MLKEM_NAMESPACE_K(unpack_sk)
+#define pack_ciphertext MLKEM_NAMESPACE_K(pack_ciphertext)
+#define unpack_ciphertext MLKEM_NAMESPACE_K(unpack_ciphertext)
+#define matvec_mul MLKEM_NAMESPACE_K(matvec_mul)
 /* End of static namespacing */
 
 /*************************************************
@@ -51,7 +49,7 @@
 static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
                     const uint8_t seed[MLKEM_SYMBYTES])
 {
-  POLYVEC_BOUND(pk, MLKEM_Q);
+  debug_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, pk);
   memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
 }
@@ -77,7 +75,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
   /* NOTE: If a modulus check was conducted on the PK, we know at this
    * point that the coefficients of `pk` are unsigned canonical. The
    * specifications and proofs, however, do _not_ assume this, and instead
-   * work with the easily provable bound by 4096. */
+   * work with the easily provable bound by UINT12_LIMIT. */
 }
 
 /*************************************************
@@ -91,7 +89,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
  **************************************************/
 static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
 {
-  POLYVEC_BOUND(sk, MLKEM_Q);
+  debug_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, sk);
 }
 
@@ -145,131 +143,11 @@ static void unpack_ciphertext(polyvec *b, poly *v,
   poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
 }
 
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-/*
- * Generate four A matrix entries from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-/*
- * Generate a single A matrix entry from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(0 <= ctr && ctr <= MLKEM_N)
-    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr,
-                                          0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
 #if !defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
 /* This namespacing is not done at the top to avoid a naming conflict
  * with native backends, which are currently not yet namespaced. */
 #define poly_permute_bitrev_to_custom \
-  MLKEM_NAMESPACE(poly_permute_bitrev_to_custom)
+  MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 __contract__(
@@ -332,7 +210,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
      * This call writes across polyvec boundaries for K=2 and K=3.
      * This is intentional and safe.
      */
-    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+    poly_rej_uniform_x4(&a[0].vec[0] + i, seedxy);
   }
 
   /* For left over polynomial, we use single keccak. */
@@ -353,12 +231,11 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
       seed0[MLKEM_SYMBYTES + 1] = x;
     }
 
-    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    poly_rej_uniform(&a[0].vec[0] + i, seed0);
     i++;
   }
 
-  cassert(i == MLKEM_K * MLKEM_K,
-          "gen_matrix: failed to generate whole matrix");
+  debug_assert(i == MLKEM_K * MLKEM_K);
 
   /*
    * The public matrix is generated in NTT domain. If the native backend
@@ -402,16 +279,12 @@ __contract__(
   for (i = 0; i < MLKEM_K; i++)
   __loop__(
     assigns(i, object_whole(out))
-    invariant(i >= 0 && i <= MLKEM_K))
+    invariant(i <= MLKEM_K))
   {
     polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
   }
 }
 
-
-
-STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_enc_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
                            uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
@@ -461,7 +334,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
   matvec_mul(&pkpv, a, &skpv, &skpv_cache);
   polyvec_tomont(&pkpv);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&pkpv, &e);
   polyvec_reduce(&pkpv);
   polyvec_reduce(&skpv);
@@ -471,11 +343,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
 }
 
 
-/* Check that the arithmetic in indcpa_enc() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
-              indcpa_enc_bound_1)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
                 const uint8_t m[MLKEM_INDCPA_MSGBYTES],
@@ -522,7 +389,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   polyvec_invntt_tomont(&b);
   poly_invntt_tomont(&v);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&b, &ep);
   poly_add(&v, &epp);
   poly_add(&v, &k);
@@ -533,9 +399,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   pack_ciphertext(c, &b, &v);
 }
 
-/* Check that the arithmetic in indcpa_dec() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
                 const uint8_t c[MLKEM_INDCPA_BYTES],
@@ -551,7 +414,6 @@ void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
   polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
   poly_invntt_tomont(&sb);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   poly_sub(&v, &sb);
   poly_reduce(&v);
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h
index 011f1aa4f..2c4fda3c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h
@@ -10,7 +10,7 @@
 #include "common.h"
 #include "polyvec.h"
 
-#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+#define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
  * Name:        gen_matrix
  *
@@ -34,7 +34,7 @@ __contract__(
   array_bound(a[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))));
 );
 
-#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+#define indcpa_keypair_derand MLKEM_NAMESPACE_K(indcpa_keypair_derand)
 /*************************************************
  * Name:        indcpa_keypair_derand
  *
@@ -60,7 +60,7 @@ __contract__(
   assigns(object_whole(sk))
 );
 
-#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+#define indcpa_enc MLKEM_NAMESPACE_K(indcpa_enc)
 /*************************************************
  * Name:        indcpa_enc
  *
@@ -89,7 +89,7 @@ __contract__(
   assigns(object_whole(c))
 );
 
-#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+#define indcpa_dec MLKEM_NAMESPACE_K(indcpa_dec)
 /*************************************************
  * Name:        indcpa_dec
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/kem.c
index 5779d3273..88c3843be 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/kem.c
@@ -16,8 +16,8 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define check_pk MLKEM_NAMESPACE(check_pk)
-#define check_sk MLKEM_NAMESPACE(check_sk)
+#define check_pk MLKEM_NAMESPACE_K(check_pk)
+#define check_sk MLKEM_NAMESPACE_K(check_sk)
 /* End of static namespacing */
 
 #if defined(CBMC)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/kem.h
index 074e4771e..93caa796b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/kem.h
@@ -9,6 +9,7 @@
 #include "cbmc.h"
 #include "common.h"
 
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 /* Include to ensure consistency between internal kem.h
  * and external mlkem_native.h. */
 #include "mlkem_native.h"
@@ -25,6 +26,14 @@
 #error Mismatch for CIPHERTEXTBYTES between kem.h and mlkem_native.h
 #endif
 
+#else
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE_K(keypair_derand)
+#define crypto_kem_keypair MLKEM_NAMESPACE_K(keypair)
+#define crypto_kem_enc_derand MLKEM_NAMESPACE_K(enc_derand)
+#define crypto_kem_enc MLKEM_NAMESPACE_K(enc)
+#define crypto_kem_dec MLKEM_NAMESPACE_K(dec)
+#endif
+
 /*************************************************
  * Name:        crypto_kem_keypair_derand
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/mlkem_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/mlkem_native.h
index 4aed4efbb..12d1d12e6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/mlkem_native.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/mlkem_native.h
@@ -59,9 +59,17 @@
 #error MLKEM_NAMESPACE_PREFIX not set by config file
 #endif
 
-#define BUILD_INFO_CONCAT_(x, y) x##_##y
-#define BUILD_INFO_CONCAT(x, y) BUILD_INFO_CONCAT_(x, y)
-#define BUILD_INFO_NAMESPACE(sym) BUILD_INFO_CONCAT(MLKEM_NAMESPACE_PREFIX, sym)
+#if defined(MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL)
+#define BUILD_INFO_CONCAT3_(x, y, z) x##y##_##z
+#define BUILD_INFO_CONCAT3(x, y, z) BUILD_INFO_CONCAT3_(x, y, z)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT3(MLKEM_NAMESPACE_PREFIX, BUILD_INFO_LVL, sym)
+#else
+#define BUILD_INFO_CONCAT2_(x, y) x##_##y
+#define BUILD_INFO_CONCAT2(x, y) BUILD_INFO_CONCAT2_(x, y)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT2(MLKEM_NAMESPACE_PREFIX, sym)
+#endif
 
 #endif /* BUILD_INFO_LVL */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.c
index 02b45215c..3651c8da9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.c
@@ -2,10 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include <stdint.h>
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
+#include <stdint.h>
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "ntt.h"
 #include "reduce.h"
 
@@ -45,10 +47,10 @@
  *          4 -- 6
  *             5 -- 7
  */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start,
-                                int len, int bound)
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
 __contract__(
-  requires(0 <= start && start < MLKEM_N)
+  requires(start < MLKEM_N)
   requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
   requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
   requires(-HALF_Q < zeta && zeta < HALF_Q)
@@ -60,7 +62,7 @@ __contract__(
   ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
 {
   /* `bound` is a ghost variable only needed in the CBMC specification */
-  int j;
+  unsigned j;
   ((void)bound);
   for (j = start; j < start + len; j++)
   __loop__(
@@ -93,7 +95,7 @@ __contract__(
  *   official Kyber implementation here, merely adding `layer` as
  *   a ghost variable for the specifications.
  */
-static void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
@@ -101,15 +103,15 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable only needed in the CBMC specification */
   ((void)layer);
   /* Twiddle factors for layer n start at index 2^(layer-1) */
   k = MLKEM_N / (2 * len);
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
-    invariant(0 <= start && start < MLKEM_N + 2 * len)
-    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
     invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
     invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
   {
@@ -130,9 +132,9 @@ __contract__(
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  int len, layer;
+  unsigned len, layer;
   int16_t *r;
-  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   r = p->coeffs;
 
   for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
@@ -144,30 +146,23 @@ void poly_ntt(poly *p)
   }
 
   /* Check the stronger bound */
-  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_NTT */
 
-/* Check that bound for native NTT implies contractual bound */
-STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   ntt_native(p);
-  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if !defined(MLKEM_USE_NATIVE_INTT)
 
-/* Check that bound for reference invNTT implies contractual bound */
-#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
-STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
-
 /* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, int len, int layer)
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
@@ -176,23 +171,23 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable used only in the specification */
   ((void)layer);
   k = MLKEM_N / len - 1;
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    invariant(start <= MLKEM_N && k <= 127)
     /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
     invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
   {
-    int j;
+    unsigned j;
     int16_t zeta = zetas[k--];
     for (j = start; j < start + len; j++)
     __loop__(
       invariant(start <= j && j <= start + len)
-      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(start <= MLKEM_N && k <= 127)
       invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
     {
       int16_t t = r[j];
@@ -211,13 +206,13 @@ void poly_invntt_tomont(poly *p)
    * and NTT twist. This also brings coefficients down to
    * absolute value < MLKEM_Q.
    */
-  int j, len, layer;
+  unsigned j, len, layer;
   const int16_t f = 1441;
   int16_t *r = p->coeffs;
 
   for (j = 0; j < MLKEM_N; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N)
+    invariant(j <= MLKEM_N)
     invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
   {
     r[j] = fqmul(r[j], f);
@@ -226,24 +221,21 @@ void poly_invntt_tomont(poly *p)
   /* Run the invNTT layers */
   for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
   __loop__(
-    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
   {
     invntt_layer(p->coeffs, len, layer);
   }
 
-  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_INTT */
 
-/* Check that bound for native invNTT implies contractual bound */
-STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_invntt_tomont(poly *p)
 {
   intt_native(p);
-  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_INTT */
 
@@ -252,8 +244,7 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
                     int16_t b_cached)
 {
   int32_t t0, t1;
-
-  BOUND(a, 2, 4096, "basemul input bound");
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
   t0 = (int32_t)a[1] * b_cached;
   t0 += (int32_t)a[0] * b[0];
@@ -264,5 +255,12 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
   r[0] = montgomery_reduce(t0);
   r[1] = montgomery_reduce(t1);
 
-  BOUND(r, 2, 2 * MLKEM_Q, "basemul output bound");
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
+int empty_cu_ntt;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.h
index 5592bb9a2..4e80d3ab3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.h
@@ -4,10 +4,10 @@
  */
 #ifndef NTT_H
 #define NTT_H
+#include "common.h"
 
 #include <stdint.h>
 #include "cbmc.h"
-#include "common.h"
 #include "poly.h"
 #include "reduce.h"
 
@@ -81,7 +81,7 @@ __contract__(
  *                   Upon return, coefficients are bound by
  *                   2*MLKEM_Q in absolute value.
  *            - a: Pointer to first input polynomial
- *                   Must be coefficient-wise < 4096 in absolute value.
+ *                   Every coefficient must be in [0..4095]
  *            - b: Pointer to second input polynomial
  *                   Can have arbitrary int16_t coefficients
  *            - b_cached: Some precomputed value, typically derived from
@@ -99,5 +99,4 @@ __contract__(
   ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
 );
 
-
-#endif
+#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h
index fa751f977..57ea4c8ba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h
@@ -25,23 +25,34 @@
 #define MLKEM_POLYBYTES 384
 #define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
 
+#define MLKEM_POLYCOMPRESSEDBYTES_D4 128
+#define MLKEM_POLYCOMPRESSEDBYTES_D5 160
+#define MLKEM_POLYCOMPRESSEDBYTES_D10 320
+#define MLKEM_POLYCOMPRESSEDBYTES_D11 352
+
 #if MLKEM_K == 2
 #define MLKEM_LVL 512
 #define MLKEM_ETA1 3
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 3
 #define MLKEM_LVL 768
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 4
 #define MLKEM_LVL 1024
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_DU 11
+#define MLKEM_DV 5
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D5
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D11
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c
index 5807879df..7483ebf6d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c
@@ -2,13 +2,15 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
 #include <stdint.h>
 #include <string.h>
-
 #include "arith_backend.h"
 #include "cbd.h"
 #include "cbmc.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "fips202x4.h"
 #include "ntt.h"
 #include "poly.h"
@@ -16,50 +18,46 @@
 #include "symmetric.h"
 #include "verify.h"
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
     __loop__(
-      invariant(k >= 0 && k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
     {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
     }
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
   }
+}
 
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  __loop__(invariant(j <= MLKEM_N / 4))
   {
     unsigned k;
     uint16_t t[4];
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(k >= 0 && k <= 4)
+      invariant(k <= 4)
       invariant(forall(r, 0, k, t[r] < (1u << 10))))
     {
       t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
@@ -75,51 +73,35 @@ void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
     r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
     r[5 * j + 4] = (t[3] >> 2);
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
 }
 
-
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
   {
-    int k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(0 <= k && k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
   for (j = 0; j < MLKEM_N / 4; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(j <= MLKEM_N / 4)
     invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
   {
-    int k;
+    unsigned k;
     uint16_t t[4];
     uint8_t const *base = &a[5 * j];
 
@@ -130,51 +112,33 @@ void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
 
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(0 <= k && k <= 4)
+      invariant(k <= 4)
       invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
     {
       r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     uint8_t t[8] = {0};
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_bound(t, 0, j, 0, 32)))
     {
       t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
@@ -191,33 +155,57 @@ void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
     r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
     r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 }
 
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
 {
-  unsigned i;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
   {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     uint8_t t[8];
-    const int offset = i * 5;
+    const unsigned offset = i * 5;
     /*
      * Explicitly truncate to avoid warning about
      * implicit truncation in CBMC and unwind loop for ease
@@ -240,29 +228,62 @@ void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
     /* and copy to the correct slice in r[] */
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(j <= 8 && i <= MLKEM_N / 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
 #if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
-
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  __loop__(invariant(i <= MLKEM_N / 2))
   {
     const uint16_t t0 = a->coeffs[2 * i];
     const uint16_t t1 = a->coeffs[2 * i + 1];
@@ -290,7 +311,7 @@ void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   poly_tobytes_native(r, a);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
@@ -302,7 +323,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   unsigned i;
   for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(i <= MLKEM_N / 2)
     invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
   {
     const uint8_t t0 = a[3 * i + 0];
@@ -313,7 +334,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   }
 
   /* Note that the coefficients are not canonical */
-  POLY_UBOUND(r, 4096);
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 MLKEM_NATIVE_INTERNAL_API
@@ -333,13 +354,13 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
 
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <  MLKEM_N / 8 && j <= 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       /* Prevent the compiler from recognizing this as a bit selection */
@@ -347,23 +368,23 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
       r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
     }
   }
-  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     msg[i] = 0;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+      invariant(i <= MLKEM_N / 8 && j <= 8))
     {
       uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
       msg[i] |= t << j;
@@ -371,104 +392,17 @@ void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
   }
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  POLY_BOUND_MSG(r, MLKEM_ETA1 + 1, "poly_getnoise_eta2 output");
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
-}
-#endif /* MLKEM_K == 2 */
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
                                     const poly_mulcache *b_cache)
 {
   unsigned i;
-  POLY_BOUND(b_cache, 4096);
+  debug_assert_bound(a, MLKEM_N, 0, UINT12_LIMIT);
 
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
     assigns(i, object_whole(r))
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 2 * MLKEM_Q)))
   {
     basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
@@ -476,6 +410,8 @@ void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
     basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
                    &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
   }
+
+  debug_assert_abs_bound(r, MLKEM_N, 2 * MLKEM_Q);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -486,20 +422,20 @@ void poly_tomont(poly *r)
   const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
-    invariant(array_abs_bound(r->coeffs ,0, i, MLKEM_Q)))
+    invariant(i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
   {
     r->coeffs[i] = fqmul(r->coeffs[i], f);
   }
 
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
   poly_tomont_native(r);
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
@@ -510,7 +446,7 @@ void poly_reduce(poly *r)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
   {
     /* Barrett reduction, giving signed canonical representative */
@@ -519,14 +455,14 @@ void poly_reduce(poly *r)
     r->coeffs[i] = scalar_signed_to_unsigned_q(t);
   }
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
   poly_reduce_native(r);
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
@@ -536,7 +472,7 @@ void poly_add(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
   {
@@ -550,7 +486,7 @@ void poly_sub(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
   {
@@ -564,20 +500,36 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
   {
     x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
     x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
   }
-  POLY_BOUND(x, MLKEM_Q);
+
+  /*
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec so as not to unnecessarily constrain native
+   * implementations, but it is checked here nonetheless.
+   */
+  debug_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   poly_mulcache_compute_native(x, a);
-  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+  /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
+int empty_cu_poly;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h
index 1e8c109c6..6a14c785d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h
@@ -307,112 +307,164 @@ __contract__(
  ************************************************************/
 static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
 __contract__(
-  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
-  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
   ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
 {
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
   /* Add Q if c is negative, but in constant time */
   c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
 
-  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
-  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
-
   /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
   return (uint16_t)c;
 }
 
-#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
 /*************************************************
- * Name:        poly_compress_du
+ * Name:        poly_compress_d4
  *
- * Description: Compression (du bits) and subsequent serialization of a
- *polynomial
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-);
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
 
-#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
 /*************************************************
- * Name:        poly_decompress_du
+ * Name:        poly_decompress_d4
  *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *polynomial; approximate inverse of poly_compress_du
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
 
-#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
 /*************************************************
- * Name:        poly_compress_dv
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
  *
- * Description: Compression (dv bits) and subsequent serialization of a
- *polynomial
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
 
-#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
 /*************************************************
- * Name:        poly_decompress_dv
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
  *
  * Description: De-serialization and subsequent decompression (dv bits) of a
- *polynomial; approximate inverse of poly_compress
+ *              polynomial; approximate inverse of poly_compress_d5
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV
- *bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
 
 #define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
 /*************************************************
@@ -500,144 +552,6 @@ __contract__(
   assigns(object_whole(msg))
 );
 
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -649,8 +563,7 @@ __contract__(
  *              Bounds:
  *              - a is assumed to be coefficient-wise < q in absolute value.
  *
- *              The result is coefficient-wise bound by 3/2 q in absolute
- *              value.
+ *              The result is coefficient-wise bound by 2*q in absolute value.
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const poly *a: pointer to first input polynomial
@@ -802,4 +715,4 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#endif
+#endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.c
index 7d2016773..50ea1c34a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.c
@@ -4,18 +4,29 @@
  */
 #include "polyvec.h"
 #include <stdint.h>
+#include <string.h>
 #include "arith_backend.h"
+#include "cbd.h"
 #include "ntt.h"
 #include "poly.h"
+#include "symmetric.h"
 
-#include "debug/debug.h"
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
                          const polyvec *a)
 {
   unsigned i;
-  POLYVEC_UBOUND(a, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_K; i++)
   {
@@ -33,13 +44,15 @@ void polyvec_decompress_du(polyvec *r,
     poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
   }
 
-  POLYVEC_UBOUND(r, MLKEM_Q);
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
 {
   unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
   for (i = 0; i < MLKEM_K; i++)
   {
     poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
@@ -54,6 +67,8 @@ void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
   {
     poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -64,6 +79,8 @@ void polyvec_ntt(polyvec *r)
   {
     poly_ntt(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -74,6 +91,8 @@ void polyvec_invntt_tomont(polyvec *r)
   {
     poly_invntt_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -84,10 +103,7 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
 {
   unsigned i;
   poly t;
-
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  POLYVEC_BOUND(b_cache, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 
   poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
   for (i = 1; i < MLKEM_K; i++)
@@ -95,18 +111,15 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
     poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
                                    &b_cache->vec[i]);
     poly_add(r, &t);
-    /* abs bounds: < (i+1) * 3/2 * q */
   }
 
   /*
-   * Those bounds are true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus best to omit
-   * them from the spec to not unnecessarily constraint native implementations.
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
    */
-  cassert(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_K * 2 * MLKEM_Q),
-          "polyvec_basemul_acc_montgomery_cached output bounds");
-  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
-  POLY_BOUND(r, MLKEM_K * 2 * MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
 }
 #else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 MLKEM_NATIVE_INTERNAL_API
@@ -114,9 +127,8 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
                                            const polyvec *b,
                                            const polyvec_mulcache *b_cache)
 {
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
   polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
@@ -149,6 +161,8 @@ void polyvec_reduce(polyvec *r)
   {
     poly_reduce(&r->vec[i]);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -169,4 +183,148 @@ void polyvec_tomont(polyvec *r)
   {
     poly_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+
+  poly_cbd_eta2(r, buf);
+
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
 }
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.h
index 138724150..8be8579e0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.h
@@ -9,19 +9,144 @@
 #include "common.h"
 #include "poly.h"
 
-#define polyvec MLKEM_NAMESPACE(polyvec)
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
 typedef struct
 {
   poly vec[MLKEM_K];
 } ALIGN polyvec;
 
-#define polyvec_mulcache MLKEM_NAMESPACE(polyvec_mulcache)
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
 typedef struct
 {
   poly_mulcache vec[MLKEM_K];
 } polyvec_mulcache;
 
-#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
 /*************************************************
  * Name:        polyvec_compress_du
  *
@@ -44,7 +169,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
 /*************************************************
  * Name:        polyvec_decompress_du
  *
@@ -67,7 +192,7 @@ __contract__(
          array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
 /*************************************************
  * Name:        polyvec_tobytes
  *
@@ -88,7 +213,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
 /*************************************************
  * Name:        polyvec_frombytes
  *
@@ -110,7 +235,7 @@ __contract__(
         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
 );
 
-#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
 /*************************************************
  * Name:        polyvec_ntt
  *
@@ -136,7 +261,7 @@ __contract__(
   array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
 );
 
-#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
 /*************************************************
  * Name:        polyvec_invntt_tomont
  *
@@ -162,7 +287,7 @@ __contract__(
 );
 
 #define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery
  *
@@ -186,7 +311,7 @@ __contract__(
 
 
 #define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery_cached
  *
@@ -194,7 +319,7 @@ __contract__(
  *              using mulcache for second operand.
  *
  *              Bounds:
- *              - a is assumed to be coefficient-wise < 4096 in absolute value.
+ *              - Every coefficient of a is assumed to be in [0..4095]
  *              - No bounds guarantees for the coefficients in the result.
  *
  * Arguments:   - poly *r: pointer to output polynomial
@@ -218,7 +343,7 @@ __contract__(
   assigns(memory_slice(r, sizeof(poly)))
 );
 
-#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
 /************************************************************
  * Name: polyvec_mulcache_compute
  *
@@ -252,7 +377,7 @@ __contract__(
   assigns(object_whole(x))
 );
 
-#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
 /*************************************************
  * Name:        polyvec_reduce
  *
@@ -278,7 +403,7 @@ __contract__(
     array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
 /*************************************************
  * Name:        polyvec_add
  *
@@ -309,7 +434,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
 /*************************************************
  * Name:        polyvec_tomont
  *
@@ -329,4 +454,142 @@ __contract__(
     array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
 );
 
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+);
+#endif /* MLKEM_K == 2 */
+
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/reduce.h
index 1f502167e..b432a4201 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/reduce.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/reduce.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -109,13 +109,13 @@ static INLINE int16_t montgomery_reduce_generic(int32_t a)
  **************************************************/
 static INLINE int16_t montgomery_reduce(int32_t a)
 __contract__(
-  requires(a > -(2 * 4096 * 32768))
-  requires(a <  (2 * 4096 * 32768))
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
   ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
 )
 {
   int16_t res;
-  SCALAR_BOUND(a, 2 * UINT12_LIMIT * 32768, "montgomery_reduce input");
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
 
   res = montgomery_reduce_generic(a);
   /* Bounds:
@@ -124,7 +124,7 @@ __contract__(
    *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
    *        < 2 * MLKEM_Q */
 
-  SCALAR_BOUND(res, 2 * MLKEM_Q, "montgomery_reduce output");
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
   return res;
 }
 
@@ -150,7 +150,7 @@ __contract__(
 )
 {
   int16_t res;
-  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+  debug_assert_abs_bound(&b, 1, HALF_Q);
 
   res = montgomery_reduce((int32_t)a * (int32_t)b);
   /* Bounds:
@@ -160,7 +160,7 @@ __contract__(
    *        < MLKEM_Q
    */
 
-  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
   return res;
 }
 
@@ -200,7 +200,10 @@ __contract__(
    * t is in -10 .. +10, so we need 32-bit math to
    * evaluate t * MLKEM_Q and the subsequent subtraction
    */
-  return (int16_t)(a - t * MLKEM_Q);
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
+
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.c
index 918986e9b..cbbe4407f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.c
@@ -2,46 +2,24 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
-#include "rej_uniform.h"
 #include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
 /* End of static namespacing */
 
-/*************************************************
- * Name:        rej_uniform_scalar
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
                                        unsigned int offset, const uint8_t *buf,
                                        unsigned int buflen)
@@ -58,6 +36,8 @@ __contract__(
   unsigned int ctr, pos;
   uint16_t val0, val1;
 
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
   ctr = offset;
   pos = 0;
   /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
@@ -79,28 +59,183 @@ __contract__(
       r[ctr++] = val1;
     }
   }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
   return ctr;
 }
 
 #if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the whole input buffer
+ * is guaranteed to have been consumed. If it equals target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
 {
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
 {
   int ret;
 
   /* Sample from large buffer with full lane as much as possible. */
   ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
   if (ret != -1)
-    return offset + (unsigned)ret;
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
 
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   */
+  /* This should generate the matrix entry with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
+int empty_cu_rej_uniform;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.h
index 13db836bc..801287259 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.h
@@ -9,54 +9,55 @@
 #include <stdlib.h>
 #include "cbmc.h"
 #include "common.h"
+#include "poly.h"
 
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
- * Name:        rej_uniform
+ * Name:        poly_rej_uniform_x4
  *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
  *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
  *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
  **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
 __contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-);
-#endif
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/symmetric.h
index 55ebbbd53..3563e5505 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/symmetric.h
@@ -10,6 +10,7 @@
 #include "cbmc.h"
 #include "common.h"
 #include "fips202.h"
+#include "fips202x4.h"
 
 /* Macros denoting FIPS-203 specific Hash functions */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/verify.c
index b7078fcc1..9f39dcd22 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/verify.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/verify.c
@@ -4,7 +4,8 @@
  */
 #include "verify.h"
 
-#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER) && \
+    !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 /*
  * Masking value used in constant-time functions from
  * verify.h to block the compiler's range analysis and
@@ -12,9 +13,11 @@
  */
 volatile uint64_t ct_opt_blocker_u64 = 0;
 
-#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+#else /* MLKEM_USE_ASM_VALUE_BARRIER && \
+         !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_verify MLKEM_NAMESPACE(empty_cu_verify)
+#define empty_cu_verify MLKEM_NAMESPACE_K(empty_cu_verify)
 int empty_cu_verify;
 
-#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER && \
+          !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/verify.h
index 8c47155dc..f6ecf5eba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/verify.h
@@ -268,7 +268,7 @@ __contract__(
 
   for (i = 0; i < len; i++)
   __loop__(
-    invariant(i >= 0 && i <= len)
+    invariant(i <= len)
     invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))))
   {
     r |= a[i] ^ b[i];
@@ -314,4 +314,4 @@ __contract__(
   }
 }
 
-#endif
+#endif /* VERIFY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c
index 1a26e0dd5..4ef887c62 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c
@@ -8,6 +8,8 @@
  *          Do not modify it directly.
  */
 
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 #include "ntt.h"
 
 /*
@@ -28,3 +30,10 @@ ALIGN const int16_t zetas[128] = {
     -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
     -308,  996,   991,   958,   -1460, 1522,  1628,
 };
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_zetas MLKEM_NAMESPACE_K(empty_cu_zetas)
+int empty_cu_zetas;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h
index 09e30f207..0543b1bd1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h
@@ -16,7 +16,9 @@
  *
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 #include "api.h"
 #endif
+#endif
 
 #endif /* MLKEM_NATIVE_ARITH_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c
index 433bdc954..1e6b7c5d1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c
@@ -2,8 +2,11 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "cbd.h"
+#include "common.h"
+#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+
 #include <stdint.h>
+#include "cbd.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -11,8 +14,6 @@
  * within a single compilation unit. */
 #define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
 #define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-#define cbd2 MLKEM_NAMESPACE(cbd2)
-#define cbd3 MLKEM_NAMESPACE(cbd3)
 /* End of static namespacing */
 
 /*************************************************
@@ -35,44 +36,13 @@ static uint32_t load32_littleendian(const uint8_t x[4])
   return r;
 }
 
-#if MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif /* MLKEM_ETA1 == 3 */
-
-/*************************************************
- * Name:        cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
   {
     unsigned j;
@@ -82,7 +52,7 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
     {
       const int16_t a = (d >> (4 * j + 0)) & 0x3;
@@ -92,24 +62,34 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
   }
 }
 
-#if MLKEM_ETA1 == 3
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
 /*************************************************
- * Name:        cbd3
+ * Name:        load24_littleendian
  *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
  *              This function is only needed for ML-KEM-512
  *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
  **************************************************/
-static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
   {
     unsigned j;
@@ -120,7 +100,7 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 
     for (j = 0; j < 4; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(i <= MLKEM_N / 4 && j <= 4)
       invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
     {
       const int16_t a = (d >> (6 * j + 0)) & 0x7;
@@ -129,28 +109,12 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
     }
   }
 }
-#endif /* MLKEM_ETA1 == 3 */
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-{
-#if MLKEM_ETA1 == 2
-  cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-{
-#if MLKEM_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
+int empty_cu_cbd;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h
index 15db89570..54c1f5b90 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h
@@ -9,46 +9,35 @@
 #include "common.h"
 #include "poly.h"
 
-#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd2
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *              a centered binomial distribution with parameter eta=2
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-);
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd3
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
 
-#endif
+#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbmc.h
index baa0bfa9f..52b95bc3f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbmc.h
@@ -13,7 +13,7 @@
 
 #define __contract__(x)
 #define __loop__(x)
-#define cassert(x, y)
+#define cassert(x)
 
 #else /* CBMC _is_ defined, therefore we're doing proof */
 
@@ -30,7 +30,7 @@
 #define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
 #define decreases(...) __CPROVER_decreases(__VA_ARGS__)
 /* cassert to avoid confusion with in-built assert */
-#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
 #define assume(...) __CPROVER_assume(__VA_ARGS__)
 
 /***************************************************
@@ -119,13 +119,13 @@
   {                                                                    \
     unsigned qvar;                                                     \
     ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
-        (((value_lb) <= (array_var[(qvar)])) &&                        \
-        ((array_var[(qvar)]) < (value_ub)))                            \
+        (((int)(value_lb) <= ((array_var)[(qvar)])) &&		       \
+         (((array_var)[(qvar)]) < (int)(value_ub)))		       \
   }
 
 #define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
   array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb),      \
-                   (qvar_ub), (array_var), (value_lb), (value_ub))
+      (qvar_ub), (array_var), (value_lb), (value_ub))
 /* clang-format on */
 
 /* Wrapper around array_bound operating on absolute values.
@@ -134,6 +134,6 @@
  * bound in array_bound is inclusive, we have to raise it by 1.
  */
 #define array_abs_bound(arr, lb, ub, k) \
-  array_bound((arr), (lb), (ub), -(k) + 1, (k))
+  array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h
index da886780c..4f326333e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h
@@ -43,23 +43,30 @@
 #define MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2) x1##_##x2
 #define MLKEM_NATIVE_MAKE_NAMESPACE(x1, x2) MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2)
 
-#define FIPS202_NAMESPACE(s) \
-  MLKEM_NATIVE_MAKE_NAMESPACE(FIPS202_NAMESPACE_PREFIX, s)
-
 #define MLKEM_NAMESPACE(s) \
   MLKEM_NATIVE_MAKE_NAMESPACE(MLKEM_NAMESPACE_PREFIX, s)
 
+#if defined(MLKEM_NAMESPACE_PREFIX_ADD_LEVEL)
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3) x1##x2##_##x3
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K(x1, x2, x3) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3)
+#define MLKEM_NAMESPACE_K(s) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K(MLKEM_NAMESPACE_PREFIX, MLKEM_LVL, s)
+#else
+#define MLKEM_NAMESPACE_K(s) MLKEM_NAMESPACE(s)
+#endif
+
 /* On Apple platforms, we need to emit leading underscore
  * in front of assembly symbols. We thus introducee a separate
  * namespace wrapper for ASM symbols. */
 #if !defined(__APPLE__)
 #define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym)
-#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym)
+#define MLKEM_ASM_NAMESPACE_K(sym) MLKEM_NAMESPACE_K(sym)
 #else
 #define PREFIX_UNDERSCORE_(sym) _##sym
 #define PREFIX_UNDERSCORE(sym) PREFIX_UNDERSCORE_(sym)
 #define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym))
-#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym))
+#define MLKEM_ASM_NAMESPACE_K(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE_K(sym))
 #endif
 
 #endif /* MLKEM_NATIVE_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h
index d1441835b..fa89370ce 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h
@@ -40,10 +40,12 @@
 /* #define MLKEM_NATIVE_CONFIG_FILE "config.h" */
 
 /******************************************************************************
- * Name:        MLKEM_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/.
+ * Description: The prefix to use to namespace global symbols from mlkem/.
+ *
+ *              Level-dependent symbols will additionally be prefixed with the
+ *              security level if MLKEM_NAMESPACE_PREFIX_ADD_LEVEL is set.
  *
  *              This can also be set using CFLAGS.
  *
@@ -53,17 +55,71 @@
 #endif
 
 /******************************************************************************
- * Name:        FIPS202_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX_ADD_LEVEL
+ *
+ * Description: If set, the level (512, 768, 1024) is added to the namespace
+ *              prefix MLKEM_NAMESPACE_PREFIX for all functions which are
+ *              level-dependent. Level-independent functions will have their
+ *              symbol prefixed by MLKEM_NAMESPACE_PREFIX only.
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/fips202/.
+ *              This is intended to be used for multi-level builds where
+ *              level-independent code should be shared across levels.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(FIPS202_NAMESPACE_PREFIX)
-#define FIPS202_NAMESPACE_PREFIX FIPS202_DEFAULT_NAMESPACE_PREFIX
-#endif
+/* #define MLKEM_NAMESPACE_PREFIX_ADD_LEVEL */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, all MLKEM_K-independent code will be included
+ *              in the build, including code needed only for other security
+ *              levels.
+ *
+ *              Example: poly_cbd3 is only needed for MLKEM_K == 2. Yet, if
+ *              this option is set for a build with MLKEM_K==3/4, it would
+ *              be included.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, no MLKEM_K-independent code will be included
+ *              in the build.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
  * Name:        MLKEM_USE_NATIVE
@@ -112,25 +168,13 @@
 /* Default namespace
  *
  * Don't change this. If you need a different namespace, re-define
- * MLKEM_NAMESPACE above instead, and remove the following.
- */
-
-/*
- * The default FIPS202 namespace is
- *
- *   PQCP_MLKEM_NATIVE_FIPS202_<BACKEND>_
+ * MLKEM_NAMESPACE_PREFIX above instead, and remove the following.
  *
- * e.g., PQCP_MLKEM_NATIVE_FIPS202_C_
- */
-
-#define FIPS202_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_FIPS202
-
-/*
  * The default MLKEM namespace is
  *
- *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_<BACKEND>_
+ *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_
  *
- * e.g., PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_
+ * e.g., PQCP_MLKEM_NATIVE_MLKEM512_
  */
 
 #if MLKEM_K == 2
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug.c
new file mode 100644
index 000000000..4b4857cbc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/* NOTE: You can remove this file unless you compile with MLKEM_DEBUG. */
+
+#include "common.h"
+
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) && defined(MLKEM_DEBUG)
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+
+#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
+
+void mlkem_debug_assert(const char *file, int line, const int val)
+{
+  if (val == 0)
+  {
+    fprintf(stderr,
+            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
+            file, line, val);
+    exit(1);
+  }
+}
+
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{
+  int err = 0;
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int16_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    {
+      fprintf(
+          stderr,
+          MLKEM_NATIVE_DEBUG_ERROR_HEADER
+          "Bounds assertion failed: Index %u, value %d out of bounds (%d,%d)\n",
+          file, line, i, (int)val, lower_bound_exclusive,
+          upper_bound_exclusive);
+      err = 1;
+    }
+  }
+
+  if (err == 1)
+    exit(1);
+}
+
+#else /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
+
+#define empty_cu_debug MLKEM_NAMESPACE_K(empty_cu_debug)
+int empty_cu_debug;
+
+#endif /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug.h
new file mode 100644
index 000000000..1103124db
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+#include "common.h"
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check debug assertion
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if the asserted value is zero.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
+void mlkem_debug_assert(const char *file, int line, const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ */
+#define debug_assert(val) mlkem_debug_assert(__FILE__, __LINE__, (val))
+
+/* Check bounds in array of int16_t's
+ * ptr: Base of int16_t array; will be explicitly cast to int16_t*,
+ *      so you may pass a byte-compatible type such as poly or polyvec.
+ * len: Number of int16_t in array
+ * value_lb: Inclusive lower value bound
+ * value_ub: Exclusive upper value bound */
+#define debug_assert_bound(ptr, len, value_lb, value_ub)                      \
+  mlkem_debug_check_bounds(__FILE__, __LINE__, (const int16_t *)(ptr), (len), \
+                           (value_lb)-1, (value_ub))
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of int16_t array; cast to int16_t* as in debug_assert_bound
+ * len: Number of int16_t in array
+ * value_abs_bd: Exclusive absolute upper bound */
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  debug_assert_bound((ptr), (len), (-(value_abs_bd) + 1), (value_abs_bd))
+
+/* Version of bounds assertions for 2-dimensional arrays */
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  debug_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  debug_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
+
+/* When running CBMC, convert debug assertions into proof obligations */
+#elif defined(CBMC)
+/* cbmc.h sits next to this header now that the debug/ subdirectory is gone */
+#include "cbmc.h"
+
+#define debug_assert(val) cassert(val)
+
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  cassert(array_bound(((int16_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
+
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  cassert(array_abs_bound(((int16_t *)(ptr)), 0, (len), (value_abs_bd)))
+
+/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
+ * just use a single flattened array_bound(...) here. */
+#define debug_assert_bound_2d(ptr, M, N, value_lb, value_ub)           \
+  cassert(forall(kN, 0, (M),                                           \
+                 array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                             (value_lb), (value_ub))))
+
+#define debug_assert_abs_bound_2d(ptr, M, N, value_abs_bd)                 \
+  cassert(forall(kN, 0, (M),                                               \
+                 array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                                 (value_abs_bd))))
+
+#else /* MLKEM_DEBUG */
+
+#define debug_assert(val) \
+  do                      \
+  {                       \
+  } while (0)
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  do                                                     \
+  {                                                      \
+  } while (0)
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  do                                                   \
+  {                                                    \
+  } while (0)
+
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  do                                                               \
+  {                                                                \
+  } while (0)
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  do                                                             \
+  {                                                              \
+  } while (0)
+
+
+#endif /* MLKEM_DEBUG */
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.c
deleted file mode 100644
index 64294ebe1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-
-#include <stdio.h>
-#include "debug.h"
-
-#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
-
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val)
-{
-  if (val == 0)
-  {
-    fprintf(stderr,
-            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed: %s (value %d)\n",
-            file, line, description, val);
-    exit(1);
-  }
-}
-
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive)
-{
-  int err = 0;
-  unsigned i;
-  for (i = 0; i < len; i++)
-  {
-    int16_t val = ptr[i];
-    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
-    {
-      fprintf(stderr,
-              MLKEM_NATIVE_DEBUG_ERROR_HEADER
-              "%s, index %u, value %d out of bounds (%d,%d)\n",
-              file, line, description, i, (int)val, lower_bound_exclusive,
-              upper_bound_exclusive);
-      err = 1;
-    }
-  }
-
-  if (err == 1)
-    exit(1);
-}
-
-#else /* MLKEM_DEBUG */
-
-#define empty_cu_debug MLKEM_NAMESPACE(empty_cu_debug)
-int empty_cu_debug;
-
-#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.h
deleted file mode 100644
index 5ce320ea2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/debug/debug.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_DEBUG_H
-#define MLKEM_DEBUG_H
-
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-/*************************************************
- * Name:        mlkem_debug_assert
- *
- * Description: Check debug assertion
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of assertion
- *              - val: Value asserted to be non-zero
- **************************************************/
-#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val);
-
-/*************************************************
- * Name:        mlkem_debug_check_bounds
- *
- * Description: Check whether values in an array of int16_t
- *              are within specified bounds.
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of check
- *              - ptr: Base of array to be checked
- *              - len: Number of int16_t in ptr
- *              - lower_bound_exclusive: Exclusive lower bound
- *              - upper_bound_exclusive: Exclusive upper bound
- **************************************************/
-#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive);
-
-/* Check assertion, calling exit() upon failure
- *
- * val: Value that's asserted to be non-zero
- * msg: Message to print on failure
- *
- * Currently called CASSERT to avoid clash with CBMC assert.
- */
-#define CASSERT(val, msg)                                 \
-  do                                                      \
-  {                                                       \
-    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
-  } while (0)
-
-/* Check absolute bounds of scalar
- * val: Scalar to be checked
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
-
-/* Check that all coefficients in array of int16_t's are non-negative
- * and below an exclusive upper bound.
- *
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * high_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define UBOUND(ptr, len, high_bound, msg)                                 \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -1, ((high_bound)));                  \
-  } while (0)
-
-/* Check absolute bounds in array of int16_t's
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define BOUND(ptr, len, abs_bound, msg)                                   \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -(abs_bound), (abs_bound));           \
-  } while (0)
-
-/* Check absolute bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
-  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
-        msg)
-
-/* Check unsigned bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- * msg: Message to print on failure */
-#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
-  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
-         msg)
-
-/* Check absolute bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLY_BOUND(ptr, abs_bound) \
-  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
-
-/* Check unsigned bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLY_UBOUND(ptr, ubound) \
-  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
-
-/* Check absolute bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLYVEC_BOUND(ptr, abs_bound)                                      \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
-                     "polyvec absolute bound for " #ptr ".vec[i]");        \
-  } while (0)
-
-/* Check unsigned bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLYVEC_UBOUND(ptr, ubound)                                        \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
-                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
-  } while (0)
-
-#define MLKEM_CONCAT_(left, right) left##right
-#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
-
-/* Following AWS-LC to define a C99-compliant static assert */
-#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
-  typedef struct                                                         \
-  {                                                                      \
-    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
-  } MLKEM_CONCAT(MLKEM_NAMESPACE(static_assertion_), msg)                \
-      __attribute__((unused));
-
-#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
-  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
-#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
-#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
-
-#else /* MLKEM_DEBUG */
-
-#define CASSERT(val, msg) \
-  do                      \
-  {                       \
-  } while (0)
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define BOUND(ptr, len, abs_bound, msg) \
-  do                                    \
-  {                                     \
-  } while (0)
-#define POLY_BOUND(ptr, abs_bound) \
-  do                               \
-  {                                \
-  } while (0)
-#define POLYVEC_BOUND(ptr, abs_bound) \
-  do                                  \
-  {                                   \
-  } while (0)
-#define POLY_BOUND_MSG(ptr, ubound, abs_bound) \
-  do                                           \
-  {                                            \
-  } while (0)
-#define UBOUND(ptr, len, high_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define POLY_UBOUND(ptr, ubound) \
-  do                             \
-  {                              \
-  } while (0)
-#define POLYVEC_UBOUND(ptr, ubound) \
-  do                                \
-  {                                 \
-  } while (0)
-#define POLY_UBOUND_MSG(ptr, ubound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define STATIC_ASSERT(cond, error)
-
-#endif /* MLKEM_DEBUG */
-
-#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c
index 4d3133e14..0cfcc3e9e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c
@@ -17,7 +17,7 @@
 #include "symmetric.h"
 
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 #include "cbmc.h"
 
@@ -25,15 +25,13 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define pack_pk MLKEM_NAMESPACE(pack_pk)
-#define unpack_pk MLKEM_NAMESPACE(unpack_pk)
-#define pack_sk MLKEM_NAMESPACE(pack_sk)
-#define unpack_sk MLKEM_NAMESPACE(unpack_sk)
-#define pack_ciphertext MLKEM_NAMESPACE(pack_ciphertext)
-#define unpack_ciphertext MLKEM_NAMESPACE(unpack_ciphertext)
-#define gen_matrix_entry_x4 MLKEM_NAMESPACE(gen_matrix_entry_x4)
-#define gen_matrix_entry MLKEM_NAMESPACE(gen_matrix_entry)
-#define matvec_mul MLKEM_NAMESPACE(matvec_mul)
+#define pack_pk MLKEM_NAMESPACE_K(pack_pk)
+#define unpack_pk MLKEM_NAMESPACE_K(unpack_pk)
+#define pack_sk MLKEM_NAMESPACE_K(pack_sk)
+#define unpack_sk MLKEM_NAMESPACE_K(unpack_sk)
+#define pack_ciphertext MLKEM_NAMESPACE_K(pack_ciphertext)
+#define unpack_ciphertext MLKEM_NAMESPACE_K(unpack_ciphertext)
+#define matvec_mul MLKEM_NAMESPACE_K(matvec_mul)
 /* End of static namespacing */
 
 /*************************************************
@@ -51,7 +49,7 @@
 static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
                     const uint8_t seed[MLKEM_SYMBYTES])
 {
-  POLYVEC_BOUND(pk, MLKEM_Q);
+  debug_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, pk);
   memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
 }
@@ -77,7 +75,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
   /* NOTE: If a modulus check was conducted on the PK, we know at this
    * point that the coefficients of `pk` are unsigned canonical. The
    * specifications and proofs, however, do _not_ assume this, and instead
-   * work with the easily provable bound by 4096. */
+   * work with the easily provable bound by UINT12_LIMIT. */
 }
 
 /*************************************************
@@ -91,7 +89,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
  **************************************************/
 static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
 {
-  POLYVEC_BOUND(sk, MLKEM_Q);
+  debug_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, sk);
 }
 
@@ -145,131 +143,11 @@ static void unpack_ciphertext(polyvec *b, poly *v,
   poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
 }
 
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-/*
- * Generate four A matrix entries from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-/*
- * Generate a single A matrix entry from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(0 <= ctr && ctr <= MLKEM_N)
-    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr,
-                                          0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
 #if !defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
 /* This namespacing is not done at the top to avoid a naming conflict
  * with native backends, which are currently not yet namespaced. */
 #define poly_permute_bitrev_to_custom \
-  MLKEM_NAMESPACE(poly_permute_bitrev_to_custom)
+  MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 __contract__(
@@ -332,7 +210,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
      * This call writes across polyvec boundaries for K=2 and K=3.
      * This is intentional and safe.
      */
-    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+    poly_rej_uniform_x4(&a[0].vec[0] + i, seedxy);
   }
 
   /* For left over polynomial, we use single keccak. */
@@ -353,12 +231,11 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
       seed0[MLKEM_SYMBYTES + 1] = x;
     }
 
-    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    poly_rej_uniform(&a[0].vec[0] + i, seed0);
     i++;
   }
 
-  cassert(i == MLKEM_K * MLKEM_K,
-          "gen_matrix: failed to generate whole matrix");
+  debug_assert(i == MLKEM_K * MLKEM_K);
 
   /*
    * The public matrix is generated in NTT domain. If the native backend
@@ -402,16 +279,12 @@ __contract__(
   for (i = 0; i < MLKEM_K; i++)
   __loop__(
     assigns(i, object_whole(out))
-    invariant(i >= 0 && i <= MLKEM_K))
+    invariant(i <= MLKEM_K))
   {
     polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
   }
 }
 
-
-
-STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_enc_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
                            uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
@@ -461,7 +334,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
   matvec_mul(&pkpv, a, &skpv, &skpv_cache);
   polyvec_tomont(&pkpv);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&pkpv, &e);
   polyvec_reduce(&pkpv);
   polyvec_reduce(&skpv);
@@ -471,11 +343,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
 }
 
 
-/* Check that the arithmetic in indcpa_enc() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
-              indcpa_enc_bound_1)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
                 const uint8_t m[MLKEM_INDCPA_MSGBYTES],
@@ -522,7 +389,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   polyvec_invntt_tomont(&b);
   poly_invntt_tomont(&v);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&b, &ep);
   poly_add(&v, &epp);
   poly_add(&v, &k);
@@ -533,9 +399,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   pack_ciphertext(c, &b, &v);
 }
 
-/* Check that the arithmetic in indcpa_dec() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
                 const uint8_t c[MLKEM_INDCPA_BYTES],
@@ -551,7 +414,6 @@ void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
   polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
   poly_invntt_tomont(&sb);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   poly_sub(&v, &sb);
   poly_reduce(&v);
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h
index 011f1aa4f..2c4fda3c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h
@@ -10,7 +10,7 @@
 #include "common.h"
 #include "polyvec.h"
 
-#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+#define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
  * Name:        gen_matrix
  *
@@ -34,7 +34,7 @@ __contract__(
   array_bound(a[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))));
 );
 
-#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+#define indcpa_keypair_derand MLKEM_NAMESPACE_K(indcpa_keypair_derand)
 /*************************************************
  * Name:        indcpa_keypair_derand
  *
@@ -60,7 +60,7 @@ __contract__(
   assigns(object_whole(sk))
 );
 
-#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+#define indcpa_enc MLKEM_NAMESPACE_K(indcpa_enc)
 /*************************************************
  * Name:        indcpa_enc
  *
@@ -89,7 +89,7 @@ __contract__(
   assigns(object_whole(c))
 );
 
-#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+#define indcpa_dec MLKEM_NAMESPACE_K(indcpa_dec)
 /*************************************************
  * Name:        indcpa_dec
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.c
index 5779d3273..88c3843be 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.c
@@ -16,8 +16,8 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define check_pk MLKEM_NAMESPACE(check_pk)
-#define check_sk MLKEM_NAMESPACE(check_sk)
+#define check_pk MLKEM_NAMESPACE_K(check_pk)
+#define check_sk MLKEM_NAMESPACE_K(check_sk)
 /* End of static namespacing */
 
 #if defined(CBMC)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.h
index 074e4771e..93caa796b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/kem.h
@@ -9,6 +9,7 @@
 #include "cbmc.h"
 #include "common.h"
 
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 /* Include to ensure consistency between internal kem.h
  * and external mlkem_native.h. */
 #include "mlkem_native.h"
@@ -25,6 +26,14 @@
 #error Mismatch for CIPHERTEXTBYTES between kem.h and mlkem_native.h
 #endif
 
+#else
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE_K(keypair_derand)
+#define crypto_kem_keypair MLKEM_NAMESPACE_K(keypair)
+#define crypto_kem_enc_derand MLKEM_NAMESPACE_K(enc_derand)
+#define crypto_kem_enc MLKEM_NAMESPACE_K(enc)
+#define crypto_kem_dec MLKEM_NAMESPACE_K(dec)
+#endif
+
 /*************************************************
  * Name:        crypto_kem_keypair_derand
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/mlkem_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/mlkem_native.h
index 4aed4efbb..12d1d12e6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/mlkem_native.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/mlkem_native.h
@@ -59,9 +59,17 @@
 #error MLKEM_NAMESPACE_PREFIX not set by config file
 #endif
 
-#define BUILD_INFO_CONCAT_(x, y) x##_##y
-#define BUILD_INFO_CONCAT(x, y) BUILD_INFO_CONCAT_(x, y)
-#define BUILD_INFO_NAMESPACE(sym) BUILD_INFO_CONCAT(MLKEM_NAMESPACE_PREFIX, sym)
+#if defined(MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL)
+#define BUILD_INFO_CONCAT3_(x, y, z) x##y##_##z
+#define BUILD_INFO_CONCAT3(x, y, z) BUILD_INFO_CONCAT3_(x, y, z)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT3(MLKEM_NAMESPACE_PREFIX, BUILD_INFO_LVL, sym)
+#else /* !MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL */
+#define BUILD_INFO_CONCAT2_(x, y) x##_##y
+#define BUILD_INFO_CONCAT2(x, y) BUILD_INFO_CONCAT2_(x, y)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT2(MLKEM_NAMESPACE_PREFIX, sym)
+#endif /* MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL */
 
 #endif /* BUILD_INFO_LVL */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c
index 02b45215c..3651c8da9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c
@@ -2,10 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include <stdint.h>
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
+#include <stdint.h>
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "ntt.h"
 #include "reduce.h"
 
@@ -45,10 +47,10 @@
  *          4 -- 6
  *             5 -- 7
  */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start,
-                                int len, int bound)
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
 __contract__(
-  requires(0 <= start && start < MLKEM_N)
+  requires(start < MLKEM_N)
   requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
   requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
   requires(-HALF_Q < zeta && zeta < HALF_Q)
@@ -60,7 +62,7 @@ __contract__(
   ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
 {
   /* `bound` is a ghost variable only needed in the CBMC specification */
-  int j;
+  unsigned j;
   ((void)bound);
   for (j = start; j < start + len; j++)
   __loop__(
@@ -93,7 +95,7 @@ __contract__(
  *   official Kyber implementation here, merely adding `layer` as
  *   a ghost variable for the specifications.
  */
-static void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
@@ -101,15 +103,15 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable only needed in the CBMC specification */
   ((void)layer);
   /* Twiddle factors for layer n start at index 2^(layer-1) */
   k = MLKEM_N / (2 * len);
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
-    invariant(0 <= start && start < MLKEM_N + 2 * len)
-    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
     invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
     invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
   {
@@ -130,9 +132,9 @@ __contract__(
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  int len, layer;
+  unsigned len, layer;
   int16_t *r;
-  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   r = p->coeffs;
 
   for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
@@ -144,30 +146,23 @@ void poly_ntt(poly *p)
   }
 
   /* Check the stronger bound */
-  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_NTT */
 
-/* Check that bound for native NTT implies contractual bound */
-STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   ntt_native(p);
-  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if !defined(MLKEM_USE_NATIVE_INTT)
 
-/* Check that bound for reference invNTT implies contractual bound */
-#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
-STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
-
 /* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, int len, int layer)
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
@@ -176,23 +171,23 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable used only in the specification */
   ((void)layer);
   k = MLKEM_N / len - 1;
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    invariant(start <= MLKEM_N && k <= 127)
     /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
     invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
   {
-    int j;
+    unsigned j;
     int16_t zeta = zetas[k--];
     for (j = start; j < start + len; j++)
     __loop__(
       invariant(start <= j && j <= start + len)
-      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(start <= MLKEM_N && k <= 127)
       invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
     {
       int16_t t = r[j];
@@ -211,13 +206,13 @@ void poly_invntt_tomont(poly *p)
    * and NTT twist. This also brings coefficients down to
    * absolute value < MLKEM_Q.
    */
-  int j, len, layer;
+  unsigned j, len, layer;
   const int16_t f = 1441;
   int16_t *r = p->coeffs;
 
   for (j = 0; j < MLKEM_N; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N)
+    invariant(j <= MLKEM_N)
     invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
   {
     r[j] = fqmul(r[j], f);
@@ -226,24 +221,21 @@ void poly_invntt_tomont(poly *p)
   /* Run the invNTT layers */
   for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
   __loop__(
-    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
   {
     invntt_layer(p->coeffs, len, layer);
   }
 
-  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_INTT */
 
-/* Check that bound for native invNTT implies contractual bound */
-STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_invntt_tomont(poly *p)
 {
   intt_native(p);
-  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_INTT */
 
@@ -252,8 +244,7 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
                     int16_t b_cached)
 {
   int32_t t0, t1;
-
-  BOUND(a, 2, 4096, "basemul input bound");
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
   t0 = (int32_t)a[1] * b_cached;
   t0 += (int32_t)a[0] * b[0];
@@ -264,5 +255,12 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
   r[0] = montgomery_reduce(t0);
   r[1] = montgomery_reduce(t1);
 
-  BOUND(r, 2, 2 * MLKEM_Q, "basemul output bound");
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
+int empty_cu_ntt;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h
index 5592bb9a2..4e80d3ab3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h
@@ -4,10 +4,10 @@
  */
 #ifndef NTT_H
 #define NTT_H
+#include "common.h"
 
 #include <stdint.h>
 #include "cbmc.h"
-#include "common.h"
 #include "poly.h"
 #include "reduce.h"
 
@@ -81,7 +81,7 @@ __contract__(
  *                   Upon return, coefficients are bound by
  *                   2*MLKEM_Q in absolute value.
  *            - a: Pointer to first input polynomial
- *                   Must be coefficient-wise < 4096 in absolute value.
+ *                   Every coefficient must be in [0..4095]
  *            - b: Pointer to second input polynomial
  *                   Can have arbitrary int16_t coefficients
  *            - b_cached: Some precomputed value, typically derived from
@@ -99,5 +99,4 @@ __contract__(
   ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
 );
 
-
-#endif
+#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h
index fa751f977..57ea4c8ba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h
@@ -25,23 +25,34 @@
 #define MLKEM_POLYBYTES 384
 #define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
 
+#define MLKEM_POLYCOMPRESSEDBYTES_D4 128
+#define MLKEM_POLYCOMPRESSEDBYTES_D5 160
+#define MLKEM_POLYCOMPRESSEDBYTES_D10 320
+#define MLKEM_POLYCOMPRESSEDBYTES_D11 352
+
 #if MLKEM_K == 2
 #define MLKEM_LVL 512
 #define MLKEM_ETA1 3
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 3
 #define MLKEM_LVL 768
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 4
 #define MLKEM_LVL 1024
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_DU 11
+#define MLKEM_DV 5
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D5
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D11
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c
index 5807879df..7483ebf6d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c
@@ -2,13 +2,15 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
 #include <stdint.h>
 #include <string.h>
-
 #include "arith_backend.h"
 #include "cbd.h"
 #include "cbmc.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "fips202x4.h"
 #include "ntt.h"
 #include "poly.h"
@@ -16,50 +18,46 @@
 #include "symmetric.h"
 #include "verify.h"
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
     __loop__(
-      invariant(k >= 0 && k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
     {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
     }
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
   }
+}
 
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  __loop__(invariant(j <= MLKEM_N / 4))
   {
     unsigned k;
     uint16_t t[4];
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(k >= 0 && k <= 4)
+      invariant(k <= 4)
       invariant(forall(r, 0, k, t[r] < (1u << 10))))
     {
       t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
@@ -75,51 +73,35 @@ void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
     r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
     r[5 * j + 4] = (t[3] >> 2);
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
 }
 
-
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
   {
-    int k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(0 <= k && k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
   for (j = 0; j < MLKEM_N / 4; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(j <= MLKEM_N / 4)
     invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
   {
-    int k;
+    unsigned k;
     uint16_t t[4];
     uint8_t const *base = &a[5 * j];
 
@@ -130,51 +112,33 @@ void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
 
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(0 <= k && k <= 4)
+      invariant(k <= 4)
       invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
     {
       r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     uint8_t t[8] = {0};
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_bound(t, 0, j, 0, 32)))
     {
       t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
@@ -191,33 +155,57 @@ void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
     r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
     r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 }
 
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
 {
-  unsigned i;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
   {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     uint8_t t[8];
-    const int offset = i * 5;
+    const unsigned offset = i * 5;
     /*
      * Explicitly truncate to avoid warning about
      * implicit truncation in CBMC and unwind loop for ease
@@ -240,29 +228,62 @@ void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
     /* and copy to the correct slice in r[] */
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(j <= 8 && i <= MLKEM_N / 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
 #if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
-
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  __loop__(invariant(i <= MLKEM_N / 2))
   {
     const uint16_t t0 = a->coeffs[2 * i];
     const uint16_t t1 = a->coeffs[2 * i + 1];
@@ -290,7 +311,7 @@ void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   poly_tobytes_native(r, a);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
@@ -302,7 +323,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   unsigned i;
   for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(i <= MLKEM_N / 2)
     invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
   {
     const uint8_t t0 = a[3 * i + 0];
@@ -313,7 +334,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   }
 
   /* Note that the coefficients are not canonical */
-  POLY_UBOUND(r, 4096);
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 MLKEM_NATIVE_INTERNAL_API
@@ -333,13 +354,13 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
 
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <  MLKEM_N / 8 && j <= 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       /* Prevent the compiler from recognizing this as a bit selection */
@@ -347,23 +368,23 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
       r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
     }
   }
-  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     msg[i] = 0;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+      invariant(i <= MLKEM_N / 8 && j <= 8))
     {
       uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
       msg[i] |= t << j;
@@ -371,104 +392,17 @@ void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
   }
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  POLY_BOUND_MSG(r, MLKEM_ETA1 + 1, "poly_getnoise_eta2 output");
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
-}
-#endif /* MLKEM_K == 2 */
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
                                     const poly_mulcache *b_cache)
 {
   unsigned i;
-  POLY_BOUND(b_cache, 4096);
+  debug_assert_bound(a, MLKEM_N, 0, UINT12_LIMIT);
 
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
     assigns(i, object_whole(r))
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 2 * MLKEM_Q)))
   {
     basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
@@ -476,6 +410,8 @@ void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
     basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
                    &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
   }
+
+  debug_assert_abs_bound(r, MLKEM_N, 2 * MLKEM_Q);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -486,20 +422,20 @@ void poly_tomont(poly *r)
   const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
-    invariant(array_abs_bound(r->coeffs ,0, i, MLKEM_Q)))
+    invariant(i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
   {
     r->coeffs[i] = fqmul(r->coeffs[i], f);
   }
 
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
   poly_tomont_native(r);
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
@@ -510,7 +446,7 @@ void poly_reduce(poly *r)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
   {
     /* Barrett reduction, giving signed canonical representative */
@@ -519,14 +455,14 @@ void poly_reduce(poly *r)
     r->coeffs[i] = scalar_signed_to_unsigned_q(t);
   }
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
   poly_reduce_native(r);
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
@@ -536,7 +472,7 @@ void poly_add(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
   {
@@ -550,7 +486,7 @@ void poly_sub(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
   {
@@ -564,20 +500,36 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
   {
     x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
     x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
   }
-  POLY_BOUND(x, MLKEM_Q);
+
+  /*
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
+   */
+  debug_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   poly_mulcache_compute_native(x, a);
-  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+  /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
+int empty_cu_poly;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h
index 1e8c109c6..6a14c785d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h
@@ -307,112 +307,164 @@ __contract__(
  ************************************************************/
 static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
 __contract__(
-  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
-  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
   ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
 {
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
   /* Add Q if c is negative, but in constant time */
   c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
 
-  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
-  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
-
   /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
   return (uint16_t)c;
 }
 
-#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
 /*************************************************
- * Name:        poly_compress_du
+ * Name:        poly_compress_d4
  *
- * Description: Compression (du bits) and subsequent serialization of a
- *polynomial
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-);
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
 
-#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
 /*************************************************
- * Name:        poly_decompress_du
+ * Name:        poly_decompress_d4
  *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *polynomial; approximate inverse of poly_compress_du
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
 
-#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
 /*************************************************
- * Name:        poly_compress_dv
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
  *
- * Description: Compression (dv bits) and subsequent serialization of a
- *polynomial
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
 
-#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
 /*************************************************
- * Name:        poly_decompress_dv
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
  *
  * Description: De-serialization and subsequent decompression (dv bits) of a
- *polynomial; approximate inverse of poly_compress
+ *              polynomial; approximate inverse of poly_compress_d5
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV
- *bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
 
 #define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
 /*************************************************
@@ -500,144 +552,6 @@ __contract__(
   assigns(object_whole(msg))
 );
 
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -649,8 +563,7 @@ __contract__(
  *              Bounds:
  *              - a is assumed to be coefficient-wise < q in absolute value.
  *
- *              The result is coefficient-wise bound by 3/2 q in absolute
- *              value.
+ *              The result is coefficient-wise bound by 2*q in absolute value.
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const poly *a: pointer to first input polynomial
@@ -802,4 +715,4 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#endif
+#endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c
index 7d2016773..50ea1c34a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c
@@ -4,18 +4,29 @@
  */
 #include "polyvec.h"
 #include <stdint.h>
+#include <string.h>
 #include "arith_backend.h"
+#include "cbd.h"
 #include "ntt.h"
 #include "poly.h"
+#include "symmetric.h"
 
-#include "debug/debug.h"
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
                          const polyvec *a)
 {
   unsigned i;
-  POLYVEC_UBOUND(a, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_K; i++)
   {
@@ -33,13 +44,15 @@ void polyvec_decompress_du(polyvec *r,
     poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
   }
 
-  POLYVEC_UBOUND(r, MLKEM_Q);
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
 {
   unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
   for (i = 0; i < MLKEM_K; i++)
   {
     poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
@@ -54,6 +67,8 @@ void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
   {
     poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -64,6 +79,8 @@ void polyvec_ntt(polyvec *r)
   {
     poly_ntt(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -74,6 +91,8 @@ void polyvec_invntt_tomont(polyvec *r)
   {
     poly_invntt_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -84,10 +103,7 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
 {
   unsigned i;
   poly t;
-
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  POLYVEC_BOUND(b_cache, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 
   poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
   for (i = 1; i < MLKEM_K; i++)
@@ -95,18 +111,15 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
     poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
                                    &b_cache->vec[i]);
     poly_add(r, &t);
-    /* abs bounds: < (i+1) * 3/2 * q */
   }
 
   /*
-   * Those bounds are true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus best to omit
-   * them from the spec to not unnecessarily constraint native implementations.
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
    */
-  cassert(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_K * 2 * MLKEM_Q),
-          "polyvec_basemul_acc_montgomery_cached output bounds");
-  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
-  POLY_BOUND(r, MLKEM_K * 2 * MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
 }
 #else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 MLKEM_NATIVE_INTERNAL_API
@@ -114,9 +127,8 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
                                            const polyvec *b,
                                            const polyvec_mulcache *b_cache)
 {
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
   polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
@@ -149,6 +161,8 @@ void polyvec_reduce(polyvec *r)
   {
     poly_reduce(&r->vec[i]);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -169,4 +183,148 @@ void polyvec_tomont(polyvec *r)
   {
     poly_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+
+  poly_cbd_eta2(r, buf);
+
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
 }
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h
index 138724150..8be8579e0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h
@@ -9,19 +9,144 @@
 #include "common.h"
 #include "poly.h"
 
-#define polyvec MLKEM_NAMESPACE(polyvec)
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
 typedef struct
 {
   poly vec[MLKEM_K];
 } ALIGN polyvec;
 
-#define polyvec_mulcache MLKEM_NAMESPACE(polyvec_mulcache)
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
 typedef struct
 {
   poly_mulcache vec[MLKEM_K];
 } polyvec_mulcache;
 
-#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
 /*************************************************
  * Name:        polyvec_compress_du
  *
@@ -44,7 +169,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
 /*************************************************
  * Name:        polyvec_decompress_du
  *
@@ -67,7 +192,7 @@ __contract__(
          array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
 /*************************************************
  * Name:        polyvec_tobytes
  *
@@ -88,7 +213,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
 /*************************************************
  * Name:        polyvec_frombytes
  *
@@ -110,7 +235,7 @@ __contract__(
         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
 );
 
-#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
 /*************************************************
  * Name:        polyvec_ntt
  *
@@ -136,7 +261,7 @@ __contract__(
   array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
 );
 
-#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
 /*************************************************
  * Name:        polyvec_invntt_tomont
  *
@@ -162,7 +287,7 @@ __contract__(
 );
 
 #define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery
  *
@@ -186,7 +311,7 @@ __contract__(
 
 
 #define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery_cached
  *
@@ -194,7 +319,7 @@ __contract__(
  *              using mulcache for second operand.
  *
  *              Bounds:
- *              - a is assumed to be coefficient-wise < 4096 in absolute value.
+ *              - Every coefficient of a is assumed to be in [0..4095]
  *              - No bounds guarantees for the coefficients in the result.
  *
  * Arguments:   - poly *r: pointer to output polynomial
@@ -218,7 +343,7 @@ __contract__(
   assigns(memory_slice(r, sizeof(poly)))
 );
 
-#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
 /************************************************************
  * Name: polyvec_mulcache_compute
  *
@@ -252,7 +377,7 @@ __contract__(
   assigns(object_whole(x))
 );
 
-#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
 /*************************************************
  * Name:        polyvec_reduce
  *
@@ -278,7 +403,7 @@ __contract__(
     array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
 /*************************************************
  * Name:        polyvec_add
  *
@@ -309,7 +434,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
 /*************************************************
  * Name:        polyvec_tomont
  *
@@ -329,4 +454,142 @@ __contract__(
     array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
 );
 
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameters MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3)
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+);
+#endif /* MLKEM_K == 2 */
+
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h
index 1f502167e..b432a4201 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -109,13 +109,13 @@ static INLINE int16_t montgomery_reduce_generic(int32_t a)
  **************************************************/
 static INLINE int16_t montgomery_reduce(int32_t a)
 __contract__(
-  requires(a > -(2 * 4096 * 32768))
-  requires(a <  (2 * 4096 * 32768))
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
   ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
 )
 {
   int16_t res;
-  SCALAR_BOUND(a, 2 * UINT12_LIMIT * 32768, "montgomery_reduce input");
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
 
   res = montgomery_reduce_generic(a);
   /* Bounds:
@@ -124,7 +124,7 @@ __contract__(
    *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
    *        < 2 * MLKEM_Q */
 
-  SCALAR_BOUND(res, 2 * MLKEM_Q, "montgomery_reduce output");
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
   return res;
 }
 
@@ -150,7 +150,7 @@ __contract__(
 )
 {
   int16_t res;
-  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+  debug_assert_abs_bound(&b, 1, HALF_Q);
 
   res = montgomery_reduce((int32_t)a * (int32_t)b);
   /* Bounds:
@@ -160,7 +160,7 @@ __contract__(
    *        < MLKEM_Q
    */
 
-  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
   return res;
 }
 
@@ -200,7 +200,10 @@ __contract__(
    * t is in -10 .. +10, so we need 32-bit math to
    * evaluate t * MLKEM_Q and the subsequent subtraction
    */
-  return (int16_t)(a - t * MLKEM_Q);
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
+
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c
index 918986e9b..cbbe4407f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c
@@ -2,46 +2,24 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
-#include "rej_uniform.h"
 #include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
 /* End of static namespacing */
 
-/*************************************************
- * Name:        rej_uniform_scalar
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
                                        unsigned int offset, const uint8_t *buf,
                                        unsigned int buflen)
@@ -58,6 +36,8 @@ __contract__(
   unsigned int ctr, pos;
   uint16_t val0, val1;
 
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
   ctr = offset;
   pos = 0;
   /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
@@ -79,28 +59,183 @@ __contract__(
       r[ctr++] = val1;
     }
   }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
   return ctr;
 }
 
 #if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, all of the input buffer
+ * is guaranteed to have been consumed. If it is equal to target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
 {
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
 {
   int ret;
 
   /* Sample from large buffer with full lane as much as possible. */
   ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
   if (ret != -1)
-    return offset + (unsigned)ret;
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
 
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample a heuristic number of blocks
+   * (MLKEM_GEN_MATRIX_NBLOCKS). This should generate the matrix entry
+   * with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
+int empty_cu_rej_uniform;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h
index 13db836bc..801287259 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h
@@ -9,54 +9,55 @@
 #include <stdlib.h>
 #include "cbmc.h"
 #include "common.h"
+#include "poly.h"
 
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
- * Name:        rej_uniform
+ * Name:        poly_rej_uniform_x4
  *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
  *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
  *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
  **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
 __contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-);
-#endif
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric.h
index 55ebbbd53..3563e5505 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/symmetric.h
@@ -10,6 +10,7 @@
 #include "cbmc.h"
 #include "common.h"
 #include "fips202.h"
+#include "fips202x4.h"
 
 /* Macros denoting FIPS-203 specific Hash functions */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.c
index b7078fcc1..9f39dcd22 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.c
@@ -4,7 +4,8 @@
  */
 #include "verify.h"
 
-#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER) && \
+    !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 /*
  * Masking value used in constant-time functions from
  * verify.h to block the compiler's range analysis and
@@ -12,9 +13,11 @@
  */
 volatile uint64_t ct_opt_blocker_u64 = 0;
 
-#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+#else /* MLKEM_USE_ASM_VALUE_BARRIER || \
+         MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_verify MLKEM_NAMESPACE(empty_cu_verify)
+#define empty_cu_verify MLKEM_NAMESPACE_K(empty_cu_verify)
 int empty_cu_verify;
 
-#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER || \
+          MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.h
index 8c47155dc..f6ecf5eba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/verify.h
@@ -268,7 +268,7 @@ __contract__(
 
   for (i = 0; i < len; i++)
   __loop__(
-    invariant(i >= 0 && i <= len)
+    invariant(i <= len)
     invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))))
   {
     r |= a[i] ^ b[i];
@@ -314,4 +314,4 @@ __contract__(
   }
 }
 
-#endif
+#endif /* VERIFY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c
index 1a26e0dd5..4ef887c62 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c
@@ -8,6 +8,8 @@
  *          Do not modify it directly.
  */
 
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 #include "ntt.h"
 
 /*
@@ -28,3 +30,10 @@ ALIGN const int16_t zetas[128] = {
     -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
     -308,  996,   991,   958,   -1460, 1522,  1628,
 };
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_zetas MLKEM_NAMESPACE_K(empty_cu_zetas)
+int empty_cu_zetas;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h
index 09e30f207..0543b1bd1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h
@@ -16,7 +16,9 @@
  *
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 #include "api.h"
 #endif
+#endif
 
 #endif /* MLKEM_NATIVE_ARITH_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.c
index 433bdc954..1e6b7c5d1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.c
@@ -2,8 +2,11 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "cbd.h"
+#include "common.h"
+#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+
 #include <stdint.h>
+#include "cbd.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -11,8 +14,6 @@
  * within a single compilation unit. */
 #define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
 #define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-#define cbd2 MLKEM_NAMESPACE(cbd2)
-#define cbd3 MLKEM_NAMESPACE(cbd3)
 /* End of static namespacing */
 
 /*************************************************
@@ -35,44 +36,13 @@ static uint32_t load32_littleendian(const uint8_t x[4])
   return r;
 }
 
-#if MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-#endif /* MLKEM_ETA1 == 3 */
-
-/*************************************************
- * Name:        cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
   {
     unsigned j;
@@ -82,7 +52,7 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
 
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
     {
       const int16_t a = (d >> (4 * j + 0)) & 0x3;
@@ -92,24 +62,34 @@ static void cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
   }
 }
 
-#if MLKEM_ETA1 == 3
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
 /*************************************************
- * Name:        cbd3
+ * Name:        load24_littleendian
  *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
  *              This function is only needed for ML-KEM-512
  *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
  **************************************************/
-static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
   {
     unsigned j;
@@ -120,7 +100,7 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
 
     for (j = 0; j < 4; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 4 && j >= 0 && j <= 4)
+      invariant(i <= MLKEM_N / 4 && j <= 4)
       invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
     {
       const int16_t a = (d >> (6 * j + 0)) & 0x7;
@@ -129,28 +109,12 @@ static void cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
     }
   }
 }
-#endif /* MLKEM_ETA1 == 3 */
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-{
-#if MLKEM_ETA1 == 2
-  cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  cbd3(r, buf);
-#else
-#error "This implementation requires eta1 in {2,3}"
-#endif
-}
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-{
-#if MLKEM_ETA2 == 2
-  cbd2(r, buf);
-#else
-#error "This implementation requires eta2 = 2"
-#endif
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
+int empty_cu_cbd;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.h
index 15db89570..54c1f5b90 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.h
@@ -9,46 +9,35 @@
 #include "common.h"
 #include "poly.h"
 
-#define poly_cbd_eta1 MLKEM_NAMESPACE(poly_cbd_eta1)
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd2
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *              a centered binomial distribution with parameter eta=2
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta1(poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-);
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
 
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_cbd_eta2 MLKEM_NAMESPACE(poly_cbd_eta2)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
 /*************************************************
- * Name:        poly_cbd_eta1
+ * Name:        poly_cbd3
  *
  * Description: Given an array of uniformly random bytes, compute
  *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *buf: pointer to input byte array
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_cbd_eta2(poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
 
-#endif
+#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbmc.h
index baa0bfa9f..52b95bc3f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbmc.h
@@ -13,7 +13,7 @@
 
 #define __contract__(x)
 #define __loop__(x)
-#define cassert(x, y)
+#define cassert(x)
 
 #else /* CBMC _is_ defined, therefore we're doing proof */
 
@@ -30,7 +30,7 @@
 #define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
 #define decreases(...) __CPROVER_decreases(__VA_ARGS__)
 /* cassert to avoid confusion with in-built assert */
-#define cassert(...) __CPROVER_assert(__VA_ARGS__)
+#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
 #define assume(...) __CPROVER_assume(__VA_ARGS__)
 
 /***************************************************
@@ -119,13 +119,13 @@
   {                                                                    \
     unsigned qvar;                                                     \
     ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
-        (((value_lb) <= (array_var[(qvar)])) &&                        \
-        ((array_var[(qvar)]) < (value_ub)))                            \
+        (((int)(value_lb) <= ((array_var)[(qvar)])) &&		       \
+         (((array_var)[(qvar)]) < (int)(value_ub)))		       \
   }
 
 #define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
   array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb),      \
-                   (qvar_ub), (array_var), (value_lb), (value_ub))
+      (qvar_ub), (array_var), (value_lb), (value_ub))
 /* clang-format on */
 
 /* Wrapper around array_bound operating on absolute values.
@@ -134,6 +134,6 @@
  * bound in array_bound is inclusive, we have to raise it by 1.
  */
 #define array_abs_bound(arr, lb, ub, k) \
-  array_bound((arr), (lb), (ub), -(k) + 1, (k))
+  array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h
index da886780c..4f326333e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h
@@ -43,23 +43,30 @@
 #define MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2) x1##_##x2
 #define MLKEM_NATIVE_MAKE_NAMESPACE(x1, x2) MLKEM_NATIVE_MAKE_NAMESPACE_(x1, x2)
 
-#define FIPS202_NAMESPACE(s) \
-  MLKEM_NATIVE_MAKE_NAMESPACE(FIPS202_NAMESPACE_PREFIX, s)
-
 #define MLKEM_NAMESPACE(s) \
   MLKEM_NATIVE_MAKE_NAMESPACE(MLKEM_NAMESPACE_PREFIX, s)
 
+#if defined(MLKEM_NAMESPACE_PREFIX_ADD_LEVEL)
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3) x1##x2##_##x3
+#define MLKEM_NATIVE_MAKE_NAMESPACE_K(x1, x2, x3) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K_(x1, x2, x3)
+#define MLKEM_NAMESPACE_K(s) \
+  MLKEM_NATIVE_MAKE_NAMESPACE_K(MLKEM_NAMESPACE_PREFIX, MLKEM_LVL, s)
+#else
+#define MLKEM_NAMESPACE_K(s) MLKEM_NAMESPACE(s)
+#endif
+
 /* On Apple platforms, we need to emit leading underscore
  * in front of assembly symbols. We thus introducee a separate
  * namespace wrapper for ASM symbols. */
 #if !defined(__APPLE__)
 #define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym)
-#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym)
+#define MLKEM_ASM_NAMESPACE_K(sym) MLKEM_NAMESPACE_K(sym)
 #else
 #define PREFIX_UNDERSCORE_(sym) _##sym
 #define PREFIX_UNDERSCORE(sym) PREFIX_UNDERSCORE_(sym)
 #define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym))
-#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym))
+#define MLKEM_ASM_NAMESPACE_K(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE_K(sym))
 #endif
 
 #endif /* MLKEM_NATIVE_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h
index d1441835b..fa89370ce 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h
@@ -40,10 +40,12 @@
 /* #define MLKEM_NATIVE_CONFIG_FILE "config.h" */
 
 /******************************************************************************
- * Name:        MLKEM_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/.
+ * Description: The prefix to use to namespace global symbols from mlkem/.
+ *
+ *              Level-dependent symbols will additionally be prefixed with the
+ *              security level if MLKEM_NAMESPACE_PREFIX_ADD_LEVEL is set.
  *
  *              This can also be set using CFLAGS.
  *
@@ -53,17 +55,71 @@
 #endif
 
 /******************************************************************************
- * Name:        FIPS202_NAMESPACE
+ * Name:        MLKEM_NAMESPACE_PREFIX_ADD_LEVEL
+ *
+ * Description: If set, the level (512, 768, 1024) is added to the namespace
+ *              prefix MLKEM_NAMESPACE_PREFIX for all functions which are
+ *              level-dependent. Level-independent functions will have their
+ *              symbol prefixed by MLKEM_NAMESPACE_PREFIX only.
  *
- * Description: The prefix to use to namespace global symbols
- *              from mlkem/fips202/.
+ *              This is intended to be used for multi-level builds where
+ *              level-independent code should be shared across levels.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(FIPS202_NAMESPACE_PREFIX)
-#define FIPS202_NAMESPACE_PREFIX FIPS202_DEFAULT_NAMESPACE_PREFIX
-#endif
+/* #define MLKEM_NAMESPACE_PREFIX_ADD_LEVEL */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, all MLKEM_K-independent code will be included
+ *              in the build, including code needed only for other security
+ *              levels.
+ *
+ *              Example: poly_cbd3 is only needed for MLKEM_K == 2. Yet, if
+ *              this option is set for a build with MLKEM_K==3/4, it would
+ *              be included.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mlkem-native only. If you
+ *              need only a single security level build of mlkem-native,
+ *              keep this unset.
+ *
+ *              If this is set, no MLKEM_K-independent code will be included
+ *              in the build.
+ *
+ *              To build mlkem-native with support for all security levels,
+ *              build it three times -- once per level -- and set the option
+ *              MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED for exactly one of
+ *              them, and MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED for the
+ *              others.
+ *
+ *              See examples/multilevel_build for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
  * Name:        MLKEM_USE_NATIVE
@@ -112,25 +168,13 @@
 /* Default namespace
  *
  * Don't change this. If you need a different namespace, re-define
- * MLKEM_NAMESPACE above instead, and remove the following.
- */
-
-/*
- * The default FIPS202 namespace is
- *
- *   PQCP_MLKEM_NATIVE_FIPS202_<BACKEND>_
+ * MLKEM_NAMESPACE_PREFIX above instead, and remove the following.
  *
- * e.g., PQCP_MLKEM_NATIVE_FIPS202_C_
- */
-
-#define FIPS202_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_FIPS202
-
-/*
  * The default MLKEM namespace is
  *
- *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_<BACKEND>_
+ *   PQCP_MLKEM_NATIVE_MLKEM<LEVEL>_
  *
- * e.g., PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_
+ * e.g., PQCP_MLKEM_NATIVE_MLKEM512_
  */
 
 #if MLKEM_K == 2
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug.c
new file mode 100644
index 000000000..4b4857cbc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/* NOTE: You can remove this file unless you compile with MLKEM_DEBUG. */
+
+#include "common.h"
+
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) && defined(MLKEM_DEBUG)
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+
+#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
+
+void mlkem_debug_assert(const char *file, int line, const int val)
+{
+  if (val == 0)
+  {
+    fprintf(stderr,
+            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
+            file, line, val);
+    exit(1);
+  }
+}
+
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive)
+{
+  int err = 0;
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int16_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    {
+      fprintf(
+          stderr,
+          MLKEM_NATIVE_DEBUG_ERROR_HEADER
+          "Bounds assertion failed: Index %u, value %d out of bounds (%d,%d)\n",
+          file, line, i, (int)val, lower_bound_exclusive,
+          upper_bound_exclusive);
+      err = 1;
+    }
+  }
+
+  if (err == 1)
+    exit(1);
+}
+
+#else /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
+
+#define empty_cu_debug MLKEM_NAMESPACE_K(empty_cu_debug)
+int empty_cu_debug;
+
+#endif /* !MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED && MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug.h
new file mode 100644
index 000000000..1103124db
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_DEBUG_H
+#define MLKEM_DEBUG_H
+#include "common.h"
+
+#if defined(MLKEM_DEBUG)
+#include <stdint.h>
+
+/*************************************************
+ * Name:        mlkem_debug_assert
+ *
+ * Description: Check debug assertion
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if the assertion fails.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
+void mlkem_debug_assert(const char *file, int line, const int val);
+
+/*************************************************
+ * Name:        mlkem_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int16_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int16_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
+void mlkem_debug_check_bounds(const char *file, int line, const int16_t *ptr,
+                              unsigned len, int lower_bound_exclusive,
+                              int upper_bound_exclusive);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ */
+#define debug_assert(val) mlkem_debug_assert(__FILE__, __LINE__, (val))
+
+/* Check bounds in array of int16_t's
+ * ptr: Base of int16_t array; will be explicitly cast to int16_t*,
+ *      so you may pass a byte-compatible type such as poly or polyvec.
+ * len: Number of int16_t in array
+ * value_lb: Inclusive lower value bound
+ * value_ub: Exclusive upper value bound */
+#define debug_assert_bound(ptr, len, value_lb, value_ub)                      \
+  mlkem_debug_check_bounds(__FILE__, __LINE__, (const int16_t *)(ptr), (len), \
+                           (value_lb)-1, (value_ub))
+
+/* Check absolute bounds in array of int16_t's
+ * ptr: Base of array, expression of type int16_t*
+ * len: Number of int16_t in array
+ * value_abs_bd: Exclusive absolute upper bound */
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  debug_assert_bound((ptr), (len), (-(value_abs_bd) + 1), (value_abs_bd))
+
+/* Version of bounds assertions for 2-dimensional arrays */
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  debug_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  debug_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
+
+/* When running CBMC, convert debug assertions into proof obligations */
+#elif defined(CBMC)
+
+#include "cbmc.h"
+
+#define debug_assert(val) cassert(val)
+
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  cassert(array_bound(((int16_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
+
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  cassert(array_abs_bound(((int16_t *)(ptr)), 0, (len), (value_abs_bd)))
+
+/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
+ * just use a single flattened array_bound(...) here. */
+#define debug_assert_bound_2d(ptr, M, N, value_lb, value_ub)           \
+  cassert(forall(kN, 0, (M),                                           \
+                 array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                             (value_lb), (value_ub))))
+
+#define debug_assert_abs_bound_2d(ptr, M, N, value_abs_bd)                 \
+  cassert(forall(kN, 0, (M),                                               \
+                 array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                                 (value_abs_bd))))
+
+#else /* MLKEM_DEBUG */
+
+#define debug_assert(val) \
+  do                      \
+  {                       \
+  } while (0)
+#define debug_assert_bound(ptr, len, value_lb, value_ub) \
+  do                                                     \
+  {                                                      \
+  } while (0)
+#define debug_assert_abs_bound(ptr, len, value_abs_bd) \
+  do                                                   \
+  {                                                    \
+  } while (0)
+
+#define debug_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  do                                                               \
+  {                                                                \
+  } while (0)
+
+#define debug_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  do                                                             \
+  {                                                              \
+  } while (0)
+
+
+#endif /* MLKEM_DEBUG */
+#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug/debug.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug/debug.c
deleted file mode 100644
index 64294ebe1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug/debug.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-
-#include <stdio.h>
-#include "debug.h"
-
-#define MLKEM_NATIVE_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
-
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val)
-{
-  if (val == 0)
-  {
-    fprintf(stderr,
-            MLKEM_NATIVE_DEBUG_ERROR_HEADER "Assertion failed: %s (value %d)\n",
-            file, line, description, val);
-    exit(1);
-  }
-}
-
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive)
-{
-  int err = 0;
-  unsigned i;
-  for (i = 0; i < len; i++)
-  {
-    int16_t val = ptr[i];
-    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
-    {
-      fprintf(stderr,
-              MLKEM_NATIVE_DEBUG_ERROR_HEADER
-              "%s, index %u, value %d out of bounds (%d,%d)\n",
-              file, line, description, i, (int)val, lower_bound_exclusive,
-              upper_bound_exclusive);
-      err = 1;
-    }
-  }
-
-  if (err == 1)
-    exit(1);
-}
-
-#else /* MLKEM_DEBUG */
-
-#define empty_cu_debug MLKEM_NAMESPACE(empty_cu_debug)
-int empty_cu_debug;
-
-#endif /* MLKEM_DEBUG */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug/debug.h
deleted file mode 100644
index 5ce320ea2..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/debug/debug.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_DEBUG_H
-#define MLKEM_DEBUG_H
-
-#include "../common.h"
-
-#if defined(MLKEM_DEBUG)
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-/*************************************************
- * Name:        mlkem_debug_assert
- *
- * Description: Check debug assertion
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of assertion
- *              - val: Value asserted to be non-zero
- **************************************************/
-#define mlkem_debug_assert MLKEM_NAMESPACE(mlkem_debug_assert)
-void mlkem_debug_assert(const char *file, int line, const char *description,
-                        const int val);
-
-/*************************************************
- * Name:        mlkem_debug_check_bounds
- *
- * Description: Check whether values in an array of int16_t
- *              are within specified bounds.
- *
- *              Prints an error message to stderr and calls
- *              exit(1) if not.
- *
- * Arguments:   - file: filename
- *              - line: line number
- *              - description: Textual description of check
- *              - ptr: Base of array to be checked
- *              - len: Number of int16_t in ptr
- *              - lower_bound_exclusive: Exclusive lower bound
- *              - upper_bound_exclusive: Exclusive upper bound
- **************************************************/
-#define mlkem_debug_check_bounds MLKEM_NAMESPACE(mlkem_debug_check_bounds)
-void mlkem_debug_check_bounds(const char *file, int line,
-                              const char *description, const int16_t *ptr,
-                              unsigned len, int lower_bound_exclusive,
-                              int upper_bound_exclusive);
-
-/* Check assertion, calling exit() upon failure
- *
- * val: Value that's asserted to be non-zero
- * msg: Message to print on failure
- *
- * Currently called CASSERT to avoid clash with CBMC assert.
- */
-#define CASSERT(val, msg)                                 \
-  do                                                      \
-  {                                                       \
-    mlkem_debug_assert(__FILE__, __LINE__, (msg), (val)); \
-  } while (0)
-
-/* Check absolute bounds of scalar
- * val: Scalar to be checked
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  CASSERT((val) > -(abs_bound) && (val) < (abs_bound), msg)
-
-/* Check that all coefficients in array of int16_t's are non-negative
- * and below an exclusive upper bound.
- *
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * high_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define UBOUND(ptr, len, high_bound, msg)                                 \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -1, ((high_bound)));                  \
-  } while (0)
-
-/* Check absolute bounds in array of int16_t's
- * ptr: Base of array, expression of type int16_t*
- * len: Number of int16_t in array
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define BOUND(ptr, len, abs_bound, msg)                                   \
-  do                                                                      \
-  {                                                                       \
-    mlkem_debug_check_bounds(__FILE__, __LINE__, (msg), (int16_t *)(ptr), \
-                             (len), -(abs_bound), (abs_bound));           \
-  } while (0)
-
-/* Check absolute bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check
- * msg: Message to print on failure */
-#define POLY_BOUND_MSG(ptr, abs_bound, msg)                                    \
-  BOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (abs_bound), \
-        msg)
-
-/* Check unsigned bounds on coefficients in polynomial or mulcache
- * ptr: poly* or poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- * msg: Message to print on failure */
-#define POLY_UBOUND_MSG(ptr, ubound, msg)                                    \
-  UBOUND((ptr)->coeffs, (sizeof((ptr)->coeffs) / sizeof(int16_t)), (ubound), \
-         msg)
-
-/* Check absolute bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLY_BOUND(ptr, abs_bound) \
-  POLY_BOUND_MSG((ptr), (abs_bound), "poly absolute bound for " #ptr)
-
-/* Check unsigned bounds on coefficients in polynomial
- * ptr: poly* of poly_mulcache* pointer to polynomial (cache) to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLY_UBOUND(ptr, ubound) \
-  POLY_UBOUND_MSG((ptr), (ubound), "poly unsigned bound for " #ptr)
-
-/* Check absolute bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * abs_bound: Exclusive upper bound on absolute value to check */
-#define POLYVEC_BOUND(ptr, abs_bound)                                      \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_BOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (abs_bound),   \
-                     "polyvec absolute bound for " #ptr ".vec[i]");        \
-  } while (0)
-
-/* Check unsigned bounds on coefficients in vector of polynomials
- * ptr: polyvec* or polyvec_mulcache* pointer to vector of polynomials to check
- * ubound: Exclusive upper bound on value to check. Inclusive lower bound is 0.
- */
-#define POLYVEC_UBOUND(ptr, ubound)                                        \
-  do                                                                       \
-  {                                                                        \
-    unsigned _debug_polyvec_bound_idx;                                     \
-    for (_debug_polyvec_bound_idx = 0; _debug_polyvec_bound_idx < MLKEM_K; \
-         _debug_polyvec_bound_idx++)                                       \
-      POLY_UBOUND_MSG(&(ptr)->vec[_debug_polyvec_bound_idx], (ubound),     \
-                      "polyvec unsigned bound for " #ptr ".vec[i]");       \
-  } while (0)
-
-#define MLKEM_CONCAT_(left, right) left##right
-#define MLKEM_CONCAT(left, right) MLKEM_CONCAT_(left, right)
-
-/* Following AWS-LC to define a C99-compliant static assert */
-#define MLKEM_STATIC_ASSERT_DEFINE(cond, msg)                            \
-  typedef struct                                                         \
-  {                                                                      \
-    unsigned int MLKEM_CONCAT(static_assertion_, msg) : (cond) ? 1 : -1; \
-  } MLKEM_CONCAT(MLKEM_NAMESPACE(static_assertion_), msg)                \
-      __attribute__((unused));
-
-#define MLKEM_STATIC_ASSERT_ADD_LINE0(cond, suffix) \
-  MLKEM_STATIC_ASSERT_DEFINE(cond, MLKEM_CONCAT(at_line_, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE1(cond, line, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE0(cond, MLKEM_CONCAT(line, suffix))
-#define MLKEM_STATIC_ASSERT_ADD_LINE2(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE1(cond, __LINE__, suffix)
-#define MLKEM_STATIC_ASSERT_ADD_ERROR(cond, suffix) \
-  MLKEM_STATIC_ASSERT_ADD_LINE2(cond, MLKEM_CONCAT(_error_is_, suffix))
-#define STATIC_ASSERT(cond, error) MLKEM_STATIC_ASSERT_ADD_ERROR(cond, error)
-
-#else /* MLKEM_DEBUG */
-
-#define CASSERT(val, msg) \
-  do                      \
-  {                       \
-  } while (0)
-#define SCALAR_BOUND(val, abs_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define BOUND(ptr, len, abs_bound, msg) \
-  do                                    \
-  {                                     \
-  } while (0)
-#define POLY_BOUND(ptr, abs_bound) \
-  do                               \
-  {                                \
-  } while (0)
-#define POLYVEC_BOUND(ptr, abs_bound) \
-  do                                  \
-  {                                   \
-  } while (0)
-#define POLY_BOUND_MSG(ptr, ubound, abs_bound) \
-  do                                           \
-  {                                            \
-  } while (0)
-#define UBOUND(ptr, len, high_bound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define POLY_UBOUND(ptr, ubound) \
-  do                             \
-  {                              \
-  } while (0)
-#define POLYVEC_UBOUND(ptr, ubound) \
-  do                                \
-  {                                 \
-  } while (0)
-#define POLY_UBOUND_MSG(ptr, ubound, msg) \
-  do                                      \
-  {                                       \
-  } while (0)
-#define STATIC_ASSERT(cond, error)
-
-#endif /* MLKEM_DEBUG */
-
-#endif /* MLKEM_DEBUG_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c
index 4d3133e14..0cfcc3e9e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c
@@ -17,7 +17,7 @@
 #include "symmetric.h"
 
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 #include "cbmc.h"
 
@@ -25,15 +25,13 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define pack_pk MLKEM_NAMESPACE(pack_pk)
-#define unpack_pk MLKEM_NAMESPACE(unpack_pk)
-#define pack_sk MLKEM_NAMESPACE(pack_sk)
-#define unpack_sk MLKEM_NAMESPACE(unpack_sk)
-#define pack_ciphertext MLKEM_NAMESPACE(pack_ciphertext)
-#define unpack_ciphertext MLKEM_NAMESPACE(unpack_ciphertext)
-#define gen_matrix_entry_x4 MLKEM_NAMESPACE(gen_matrix_entry_x4)
-#define gen_matrix_entry MLKEM_NAMESPACE(gen_matrix_entry)
-#define matvec_mul MLKEM_NAMESPACE(matvec_mul)
+#define pack_pk MLKEM_NAMESPACE_K(pack_pk)
+#define unpack_pk MLKEM_NAMESPACE_K(unpack_pk)
+#define pack_sk MLKEM_NAMESPACE_K(pack_sk)
+#define unpack_sk MLKEM_NAMESPACE_K(unpack_sk)
+#define pack_ciphertext MLKEM_NAMESPACE_K(pack_ciphertext)
+#define unpack_ciphertext MLKEM_NAMESPACE_K(unpack_ciphertext)
+#define matvec_mul MLKEM_NAMESPACE_K(matvec_mul)
 /* End of static namespacing */
 
 /*************************************************
@@ -51,7 +49,7 @@
 static void pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], polyvec *pk,
                     const uint8_t seed[MLKEM_SYMBYTES])
 {
-  POLYVEC_BOUND(pk, MLKEM_Q);
+  debug_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, pk);
   memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
 }
@@ -77,7 +75,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
   /* NOTE: If a modulus check was conducted on the PK, we know at this
    * point that the coefficients of `pk` are unsigned canonical. The
    * specifications and proofs, however, do _not_ assume this, and instead
-   * work with the easily provable bound by 4096. */
+   * work with the easily provable bound by UINT12_LIMIT. */
 }
 
 /*************************************************
@@ -91,7 +89,7 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
  **************************************************/
 static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk)
 {
-  POLYVEC_BOUND(sk, MLKEM_Q);
+  debug_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
   polyvec_tobytes(r, sk);
 }
 
@@ -145,131 +143,11 @@ static void unpack_ciphertext(polyvec *b, poly *v,
   poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
 }
 
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-/*
- * Generate four A matrix entries from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-/*
- * Generate a single A matrix entry from a seed, using rejection
- * sampling on the output of a XOF.
- */
-static void gen_matrix_entry(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(0 <= ctr && ctr <= MLKEM_N)
-    invariant(ctr > 0 ==> array_bound(entry->coeffs, 0, ctr,
-                                          0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
 #if !defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
 /* This namespacing is not done at the top to avoid a naming conflict
  * with native backends, which are currently not yet namespaced. */
 #define poly_permute_bitrev_to_custom \
-  MLKEM_NAMESPACE(poly_permute_bitrev_to_custom)
+  MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 __contract__(
@@ -332,7 +210,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
      * This call writes across polyvec boundaries for K=2 and K=3.
      * This is intentional and safe.
      */
-    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
+    poly_rej_uniform_x4(&a[0].vec[0] + i, seedxy);
   }
 
   /* For left over polynomial, we use single keccak. */
@@ -353,12 +231,11 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
       seed0[MLKEM_SYMBYTES + 1] = x;
     }
 
-    gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    poly_rej_uniform(&a[0].vec[0] + i, seed0);
     i++;
   }
 
-  cassert(i == MLKEM_K * MLKEM_K,
-          "gen_matrix: failed to generate whole matrix");
+  debug_assert(i == MLKEM_K * MLKEM_K);
 
   /*
    * The public matrix is generated in NTT domain. If the native backend
@@ -402,16 +279,12 @@ __contract__(
   for (i = 0; i < MLKEM_K; i++)
   __loop__(
     assigns(i, object_whole(out))
-    invariant(i >= 0 && i <= MLKEM_K))
+    invariant(i <= MLKEM_K))
   {
     polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
   }
 }
 
-
-
-STATIC_ASSERT(NTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_enc_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
                            uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
@@ -461,7 +334,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
   matvec_mul(&pkpv, a, &skpv, &skpv_cache);
   polyvec_tomont(&pkpv);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&pkpv, &e);
   polyvec_reduce(&pkpv);
   polyvec_reduce(&skpv);
@@ -471,11 +343,6 @@ void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
 }
 
 
-/* Check that the arithmetic in indcpa_enc() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA1 < INT16_MAX, indcpa_enc_bound_0)
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_ETA2 + MLKEM_Q < INT16_MAX,
-              indcpa_enc_bound_1)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
                 const uint8_t m[MLKEM_INDCPA_MSGBYTES],
@@ -522,7 +389,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   polyvec_invntt_tomont(&b);
   poly_invntt_tomont(&v);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   polyvec_add(&b, &ep);
   poly_add(&v, &epp);
   poly_add(&v, &k);
@@ -533,9 +399,6 @@ void indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
   pack_ciphertext(c, &b, &v);
 }
 
-/* Check that the arithmetic in indcpa_dec() does not overflow */
-STATIC_ASSERT(INVNTT_BOUND + MLKEM_Q < INT16_MAX, indcpa_dec_bound_0)
-
 MLKEM_NATIVE_INTERNAL_API
 void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
                 const uint8_t c[MLKEM_INDCPA_BYTES],
@@ -551,7 +414,6 @@ void indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
   polyvec_basemul_acc_montgomery(&sb, &skpv, &b);
   poly_invntt_tomont(&sb);
 
-  /* Arithmetic cannot overflow, see static assertion at the top */
   poly_sub(&v, &sb);
   poly_reduce(&v);
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h
index 011f1aa4f..2c4fda3c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h
@@ -10,7 +10,7 @@
 #include "common.h"
 #include "polyvec.h"
 
-#define gen_matrix MLKEM_NAMESPACE(gen_matrix)
+#define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
  * Name:        gen_matrix
  *
@@ -34,7 +34,7 @@ __contract__(
   array_bound(a[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))));
 );
 
-#define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
+#define indcpa_keypair_derand MLKEM_NAMESPACE_K(indcpa_keypair_derand)
 /*************************************************
  * Name:        indcpa_keypair_derand
  *
@@ -60,7 +60,7 @@ __contract__(
   assigns(object_whole(sk))
 );
 
-#define indcpa_enc MLKEM_NAMESPACE(indcpa_enc)
+#define indcpa_enc MLKEM_NAMESPACE_K(indcpa_enc)
 /*************************************************
  * Name:        indcpa_enc
  *
@@ -89,7 +89,7 @@ __contract__(
   assigns(object_whole(c))
 );
 
-#define indcpa_dec MLKEM_NAMESPACE(indcpa_dec)
+#define indcpa_dec MLKEM_NAMESPACE_K(indcpa_dec)
 /*************************************************
  * Name:        indcpa_dec
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/kem.c
index 5779d3273..88c3843be 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/kem.c
@@ -16,8 +16,8 @@
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
-#define check_pk MLKEM_NAMESPACE(check_pk)
-#define check_sk MLKEM_NAMESPACE(check_sk)
+#define check_pk MLKEM_NAMESPACE_K(check_pk)
+#define check_sk MLKEM_NAMESPACE_K(check_sk)
 /* End of static namespacing */
 
 #if defined(CBMC)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/kem.h
index 074e4771e..93caa796b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/kem.h
@@ -9,6 +9,7 @@
 #include "cbmc.h"
 #include "common.h"
 
+#if defined(MLKEM_NATIVE_CHECK_APIS)
 /* Include to ensure consistency between internal kem.h
  * and external mlkem_native.h. */
 #include "mlkem_native.h"
@@ -25,6 +26,14 @@
 #error Mismatch for CIPHERTEXTBYTES between kem.h and mlkem_native.h
 #endif
 
+#else
+#define crypto_kem_keypair_derand MLKEM_NAMESPACE_K(keypair_derand)
+#define crypto_kem_keypair MLKEM_NAMESPACE_K(keypair)
+#define crypto_kem_enc_derand MLKEM_NAMESPACE_K(enc_derand)
+#define crypto_kem_enc MLKEM_NAMESPACE_K(enc)
+#define crypto_kem_dec MLKEM_NAMESPACE_K(dec)
+#endif
+
 /*************************************************
  * Name:        crypto_kem_keypair_derand
  *
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/mlkem_native.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/mlkem_native.h
index 4aed4efbb..12d1d12e6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/mlkem_native.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/mlkem_native.h
@@ -59,9 +59,17 @@
 #error MLKEM_NAMESPACE_PREFIX not set by config file
 #endif
 
-#define BUILD_INFO_CONCAT_(x, y) x##_##y
-#define BUILD_INFO_CONCAT(x, y) BUILD_INFO_CONCAT_(x, y)
-#define BUILD_INFO_NAMESPACE(sym) BUILD_INFO_CONCAT(MLKEM_NAMESPACE_PREFIX, sym)
+#if defined(MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL)
+#define BUILD_INFO_CONCAT3_(x, y, z) x##y##_##z
+#define BUILD_INFO_CONCAT3(x, y, z) BUILD_INFO_CONCAT3_(x, y, z)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT3(MLKEM_NAMESPACE_PREFIX, BUILD_INFO_LVL, sym)
+#else /* !MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL */
+#define BUILD_INFO_CONCAT2_(x, y) x##_##y
+#define BUILD_INFO_CONCAT2(x, y) BUILD_INFO_CONCAT2_(x, y)
+#define BUILD_INFO_NAMESPACE(sym) \
+  BUILD_INFO_CONCAT2(MLKEM_NAMESPACE_PREFIX, sym)
+#endif /* MLKEM_NATIVE_NAMESPACE_PREFIX_ADD_LEVEL */
 
 #endif /* BUILD_INFO_LVL */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.c
index 02b45215c..3651c8da9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.c
@@ -2,10 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include <stdint.h>
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
+#include <stdint.h>
 #include "arith_backend.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "ntt.h"
 #include "reduce.h"
 
@@ -45,10 +47,10 @@
  *          4 -- 6
  *             5 -- 7
  */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, int start,
-                                int len, int bound)
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
 __contract__(
-  requires(0 <= start && start < MLKEM_N)
+  requires(start < MLKEM_N)
   requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
   requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
   requires(-HALF_Q < zeta && zeta < HALF_Q)
@@ -60,7 +62,7 @@ __contract__(
   ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
 {
   /* `bound` is a ghost variable only needed in the CBMC specification */
-  int j;
+  unsigned j;
   ((void)bound);
   for (j = start; j < start + len; j++)
   __loop__(
@@ -93,7 +95,7 @@ __contract__(
  *   official Kyber implementation here, merely adding `layer` as
  *   a ghost variable for the specifications.
  */
-static void ntt_layer(int16_t r[MLKEM_N], int len, int layer)
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
@@ -101,15 +103,15 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable only needed in the CBMC specification */
   ((void)layer);
   /* Twiddle factors for layer n start at index 2^(layer-1) */
   k = MLKEM_N / (2 * len);
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
-    invariant(0 <= start && start < MLKEM_N + 2 * len)
-    invariant(0 <= k && k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
     invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
     invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
   {
@@ -130,9 +132,9 @@ __contract__(
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  int len, layer;
+  unsigned len, layer;
   int16_t *r;
-  POLY_BOUND_MSG(p, MLKEM_Q, "ref ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   r = p->coeffs;
 
   for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
@@ -144,30 +146,23 @@ void poly_ntt(poly *p)
   }
 
   /* Check the stronger bound */
-  POLY_BOUND_MSG(p, NTT_BOUND, "ref ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_NTT */
 
-/* Check that bound for native NTT implies contractual bound */
-STATIC_ASSERT(NTT_BOUND_NATIVE <= NTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_ntt(poly *p)
 {
-  POLY_BOUND_MSG(p, MLKEM_Q, "native ntt input");
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
   ntt_native(p);
-  POLY_BOUND_MSG(p, NTT_BOUND_NATIVE, "native ntt output");
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if !defined(MLKEM_USE_NATIVE_INTT)
 
-/* Check that bound for reference invNTT implies contractual bound */
-#define INVNTT_BOUND_REF (3 * MLKEM_Q / 4)
-STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
-
 /* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, int len, int layer)
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
 __contract__(
   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
   requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
@@ -176,23 +171,23 @@ __contract__(
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
 {
-  int start, k;
+  unsigned start, k;
   /* `layer` is a ghost variable used only in the specification */
   ((void)layer);
   k = MLKEM_N / len - 1;
   for (start = 0; start < MLKEM_N; start += 2 * len)
   __loop__(
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+    invariant(start <= MLKEM_N && k <= 127)
     /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
     invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
   {
-    int j;
+    unsigned j;
     int16_t zeta = zetas[k--];
     for (j = start; j < start + len; j++)
     __loop__(
       invariant(start <= j && j <= start + len)
-      invariant(0 <= start && start <= MLKEM_N && 0 <= k && k <= 127)
+      invariant(start <= MLKEM_N && k <= 127)
       invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
     {
       int16_t t = r[j];
@@ -211,13 +206,13 @@ void poly_invntt_tomont(poly *p)
    * and NTT twist. This also brings coefficients down to
    * absolute value < MLKEM_Q.
    */
-  int j, len, layer;
+  unsigned j, len, layer;
   const int16_t f = 1441;
   int16_t *r = p->coeffs;
 
   for (j = 0; j < MLKEM_N; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N)
+    invariant(j <= MLKEM_N)
     invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
   {
     r[j] = fqmul(r[j], f);
@@ -226,24 +221,21 @@ void poly_invntt_tomont(poly *p)
   /* Run the invNTT layers */
   for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
   __loop__(
-    invariant(2 <= len && len <= 256 && 0 <= layer && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
     invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
   {
     invntt_layer(p->coeffs, len, layer);
   }
 
-  POLY_BOUND_MSG(p, INVNTT_BOUND_REF, "ref intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #else  /* MLKEM_USE_NATIVE_INTT */
 
-/* Check that bound for native invNTT implies contractual bound */
-STATIC_ASSERT(INVNTT_BOUND_NATIVE <= INVNTT_BOUND, invntt_bound)
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_invntt_tomont(poly *p)
 {
   intt_native(p);
-  POLY_BOUND_MSG(p, INVNTT_BOUND_NATIVE, "native intt output");
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
 }
 #endif /* MLKEM_USE_NATIVE_INTT */
 
@@ -252,8 +244,7 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
                     int16_t b_cached)
 {
   int32_t t0, t1;
-
-  BOUND(a, 2, 4096, "basemul input bound");
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
   t0 = (int32_t)a[1] * b_cached;
   t0 += (int32_t)a[0] * b[0];
@@ -264,5 +255,12 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
   r[0] = montgomery_reduce(t0);
   r[1] = montgomery_reduce(t1);
 
-  BOUND(r, 2, 2 * MLKEM_Q, "basemul output bound");
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
+int empty_cu_ntt;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.h
index 5592bb9a2..4e80d3ab3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.h
@@ -4,10 +4,10 @@
  */
 #ifndef NTT_H
 #define NTT_H
+#include "common.h"
 
 #include <stdint.h>
 #include "cbmc.h"
-#include "common.h"
 #include "poly.h"
 #include "reduce.h"
 
@@ -81,7 +81,7 @@ __contract__(
  *                   Upon return, coefficients are bound by
  *                   2*MLKEM_Q in absolute value.
  *            - a: Pointer to first input polynomial
- *                   Must be coefficient-wise < 4096 in absolute value.
+ *                   Every coefficient must be in [0..4095]
  *            - b: Pointer to second input polynomial
  *                   Can have arbitrary int16_t coefficients
  *            - b_cached: Some precomputed value, typically derived from
@@ -99,5 +99,4 @@ __contract__(
   ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
 );
 
-
-#endif
+#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h
index fa751f977..57ea4c8ba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h
@@ -25,23 +25,34 @@
 #define MLKEM_POLYBYTES 384
 #define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
 
+#define MLKEM_POLYCOMPRESSEDBYTES_D4 128
+#define MLKEM_POLYCOMPRESSEDBYTES_D5 160
+#define MLKEM_POLYCOMPRESSEDBYTES_D10 320
+#define MLKEM_POLYCOMPRESSEDBYTES_D11 352
+
 #if MLKEM_K == 2
 #define MLKEM_LVL 512
 #define MLKEM_ETA1 3
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 3
 #define MLKEM_LVL 768
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 128
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 320
+#define MLKEM_DU 10
+#define MLKEM_DV 4
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #elif MLKEM_K == 4
 #define MLKEM_LVL 1024
 #define MLKEM_ETA1 2
-#define MLKEM_POLYCOMPRESSEDBYTES_DV 160
-#define MLKEM_POLYCOMPRESSEDBYTES_DU 352
+#define MLKEM_DU 11
+#define MLKEM_DV 5
+#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D5
+#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D11
 #define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c
index 5807879df..7483ebf6d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c
@@ -2,13 +2,15 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
 #include <stdint.h>
 #include <string.h>
-
 #include "arith_backend.h"
 #include "cbd.h"
 #include "cbmc.h"
-#include "debug/debug.h"
+#include "debug.h"
 #include "fips202x4.h"
 #include "ntt.h"
 #include "poly.h"
@@ -16,50 +18,46 @@
 #include "symmetric.h"
 #include "verify.h"
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 8))
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
     __loop__(
-      invariant(k >= 0 && k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
     {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
     }
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
   }
+}
 
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j >= 0 && j <= MLKEM_N / 4))
+  __loop__(invariant(j <= MLKEM_N / 4))
   {
     unsigned k;
     uint16_t t[4];
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(k >= 0 && k <= 4)
+      invariant(k <= 4)
       invariant(forall(r, 0, k, t[r] < (1u << 10))))
     {
       t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
@@ -75,51 +73,35 @@ void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
     r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
     r[5 * j + 4] = (t[3] >> 2);
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
 }
 
-
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
 {
-  unsigned j;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DU == 352)
-  for (j = 0; j < MLKEM_N / 8; j++)
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
   {
-    int k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(0 <= k && k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DU == 320)
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
   for (j = 0; j < MLKEM_N / 4; j++)
   __loop__(
-    invariant(0 <= j && j <= MLKEM_N / 4)
+    invariant(j <= MLKEM_N / 4)
     invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
   {
-    int k;
+    unsigned k;
     uint16_t t[4];
     uint8_t const *base = &a[5 * j];
 
@@ -130,51 +112,33 @@ void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
 
     for (k = 0; k < 4; k++)
     __loop__(
-      invariant(0 <= k && k <= 4)
+      invariant(k <= 4)
       invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
     {
       r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DU needs to be in {320,352}"
-#endif
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
 
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     uint8_t t[8] = {0};
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <= MLKEM_N / 8 && j <= 8)
       invariant(array_bound(t, 0, j, 0, 32)))
     {
       t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
@@ -191,33 +155,57 @@ void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
     r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
     r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 }
 
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
 {
-  unsigned i;
-#if (MLKEM_POLYCOMPRESSEDBYTES_DV == 128)
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
   {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
   }
-#elif (MLKEM_POLYCOMPRESSEDBYTES_DV == 160)
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     uint8_t t[8];
-    const int offset = i * 5;
+    const unsigned offset = i * 5;
     /*
      * Explicitly truncate to avoid warning about
      * implicit truncation in CBMC and unwind loop for ease
@@ -240,29 +228,62 @@ void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
     /* and copy to the correct slice in r[] */
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(j >= 0 && j <= 8 && i >= 0 && i <= MLKEM_N / 8)
+      invariant(j <= 8 && i <= MLKEM_N / 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
     }
   }
-#else
-#error "MLKEM_POLYCOMPRESSEDBYTES_DV needs to be in {128, 160}"
-#endif
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
 #if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
-
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 2))
+  __loop__(invariant(i <= MLKEM_N / 2))
   {
     const uint16_t t0 = a->coeffs[2 * i];
     const uint16_t t1 = a->coeffs[2 * i + 1];
@@ -290,7 +311,7 @@ void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
 {
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
   poly_tobytes_native(r, a);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
@@ -302,7 +323,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   unsigned i;
   for (i = 0; i < MLKEM_N / 2; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 2)
+    invariant(i <= MLKEM_N / 2)
     invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
   {
     const uint8_t t0 = a[3 * i + 0];
@@ -313,7 +334,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
   }
 
   /* Note that the coefficients are not canonical */
-  POLY_UBOUND(r, 4096);
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 MLKEM_NATIVE_INTERNAL_API
@@ -333,13 +354,13 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
 
   for (i = 0; i < MLKEM_N / 8; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N / 8)
+    invariant(i <= MLKEM_N / 8)
     invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
   {
     unsigned j;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <  MLKEM_N / 8 && j >= 0 && j <= 8)
+      invariant(i <  MLKEM_N / 8 && j <= 8)
       invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
     {
       /* Prevent the compiler from recognizing this as a bit selection */
@@ -347,23 +368,23 @@ void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
       r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
     }
   }
-  POLY_BOUND_MSG(r, MLKEM_Q, "poly_frommsg output");
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
 {
   unsigned i;
-  POLY_UBOUND(a, MLKEM_Q);
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 8))
+  __loop__(invariant(i <= MLKEM_N / 8))
   {
     unsigned j;
     msg[i] = 0;
     for (j = 0; j < 8; j++)
     __loop__(
-      invariant(i >= 0 && i <= MLKEM_N / 8 && j >= 0 && j <= 8))
+      invariant(i <= MLKEM_N / 8 && j <= 8))
     {
       uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
       msg[i] |= t << j;
@@ -371,104 +392,17 @@ void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
   }
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA1 + 1, "poly_getnoise_eta1_4x output 3");
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  POLY_BOUND_MSG(r, MLKEM_ETA1 + 1, "poly_getnoise_eta2 output");
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  POLY_BOUND_MSG(r0, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 0");
-  POLY_BOUND_MSG(r1, MLKEM_ETA1 + 1, "poly_getnoise_eta1122_4x output 1");
-  POLY_BOUND_MSG(r2, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 2");
-  POLY_BOUND_MSG(r3, MLKEM_ETA2 + 1, "poly_getnoise_eta1122_4x output 3");
-}
-#endif /* MLKEM_K == 2 */
-
 MLKEM_NATIVE_INTERNAL_API
 void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
                                     const poly_mulcache *b_cache)
 {
   unsigned i;
-  POLY_BOUND(b_cache, 4096);
+  debug_assert_bound(a, MLKEM_N, 0, UINT12_LIMIT);
 
   for (i = 0; i < MLKEM_N / 4; i++)
   __loop__(
     assigns(i, object_whole(r))
-    invariant(i >= 0 && i <= MLKEM_N / 4)
+    invariant(i <= MLKEM_N / 4)
     invariant(array_abs_bound(r->coeffs, 0, 4 * i, 2 * MLKEM_Q)))
   {
     basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i],
@@ -476,6 +410,8 @@ void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b,
     basemul_cached(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2],
                    &b->coeffs[4 * i + 2], b_cache->coeffs[2 * i + 1]);
   }
+
+  debug_assert_abs_bound(r, MLKEM_N, 2 * MLKEM_Q);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -486,20 +422,20 @@ void poly_tomont(poly *r)
   const int16_t f = (1ULL << 32) % MLKEM_Q; /* 1353 */
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
-    invariant(array_abs_bound(r->coeffs ,0, i, MLKEM_Q)))
+    invariant(i <= MLKEM_N)
+    invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
   {
     r->coeffs[i] = fqmul(r->coeffs[i], f);
   }
 
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_TOMONT */
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
   poly_tomont_native(r);
-  POLY_BOUND(r, MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
@@ -510,7 +446,7 @@ void poly_reduce(poly *r)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
   {
     /* Barrett reduction, giving signed canonical representative */
@@ -519,14 +455,14 @@ void poly_reduce(poly *r)
     r->coeffs[i] = scalar_signed_to_unsigned_q(t);
   }
 
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_REDUCE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
   poly_reduce_native(r);
-  POLY_UBOUND(r, MLKEM_Q);
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
@@ -536,7 +472,7 @@ void poly_add(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
   {
@@ -550,7 +486,7 @@ void poly_sub(poly *r, const poly *b)
   unsigned i;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
-    invariant(i >= 0 && i <= MLKEM_N)
+    invariant(i <= MLKEM_N)
     invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
     invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
   {
@@ -564,20 +500,36 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   unsigned i;
   for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(invariant(i >= 0 && i <= MLKEM_N / 4))
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
   {
     x->coeffs[2 * i + 0] = fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
     x->coeffs[2 * i + 1] = fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
   }
-  POLY_BOUND(x, MLKEM_Q);
+
+  /*
+   * This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning. It is thus omitted
+   * from the spec to not unnecessarily constrain native
+   * implementations, but checked here nonetheless.
+   */
+  debug_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
 }
 #else  /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
   poly_mulcache_compute_native(x, a);
-  /* Omitting POLY_BOUND(x, MLKEM_Q) since native implementations may
+  /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
+int empty_cu_poly;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h
index 1e8c109c6..6a14c785d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h
@@ -307,112 +307,164 @@ __contract__(
  ************************************************************/
 static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
 __contract__(
-  requires(c >= -(MLKEM_Q - 1) && c <= (MLKEM_Q - 1))
-  ensures(return_value >= 0 && return_value <= (MLKEM_Q - 1))
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
   ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
 {
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
   /* Add Q if c is negative, but in constant time */
   c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
 
-  cassert(c >= 0, "scalar_signed_to_unsigned_q result lower bound");
-  cassert(c < MLKEM_Q, "scalar_signed_to_unsigned_q result upper bound");
-
   /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
   return (uint16_t)c;
 }
 
-#define poly_compress_du MLKEM_NAMESPACE(poly_compress_du)
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
 /*************************************************
- * Name:        poly_compress_du
+ * Name:        poly_compress_d4
  *
- * Description: Compression (du bits) and subsequent serialization of a
- *polynomial
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-);
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
 
-#define poly_decompress_du MLKEM_NAMESPACE(poly_decompress_du)
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
 /*************************************************
- * Name:        poly_decompress_du
+ * Name:        poly_decompress_d4
  *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *polynomial; approximate inverse of poly_compress_du
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_du(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
 
-#define poly_compress_dv MLKEM_NAMESPACE(poly_compress_dv)
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
 /*************************************************
- * Name:        poly_compress_dv
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
  *
- * Description: Compression (dv bits) and subsequent serialization of a
- *polynomial
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
  *
  * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (of length MLKEM_POLYCOMPRESSEDBYTES_DV)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *              - const poly *a: pointer to input polynomial
  *                  Coefficients must be unsigned canonical,
  *                  i.e. in [0,1,..,MLKEM_Q-1].
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
 
-#define poly_decompress_dv MLKEM_NAMESPACE(poly_decompress_dv)
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
 /*************************************************
- * Name:        poly_decompress_dv
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
  *
  * Description: De-serialization and subsequent decompression (dv bits) of a
- *polynomial; approximate inverse of poly_compress
+ *              polynomial; approximate inverse of poly_compress_d5
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV
- *bytes)
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
  *
  * Upon return, the coefficients of the output polynomial are unsigned-canonical
  * (non-negative and smaller than MLKEM_Q).
  *
  **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_dv(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
 
 #define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
 /*************************************************
@@ -500,144 +552,6 @@ __contract__(
   assigns(object_whole(msg))
 );
 
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -649,8 +563,7 @@ __contract__(
  *              Bounds:
  *              - a is assumed to be coefficient-wise < q in absolute value.
  *
- *              The result is coefficient-wise bound by 3/2 q in absolute
- *              value.
+ *              The result is coefficient-wise bound by 2*q in absolute value.
  *
  * Arguments:   - poly *r: pointer to output polynomial
  *              - const poly *a: pointer to first input polynomial
@@ -802,4 +715,4 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#endif
+#endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.c
index 7d2016773..50ea1c34a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.c
@@ -4,18 +4,29 @@
  */
 #include "polyvec.h"
 #include <stdint.h>
+#include <string.h>
 #include "arith_backend.h"
+#include "cbd.h"
 #include "ntt.h"
 #include "poly.h"
+#include "symmetric.h"
 
-#include "debug/debug.h"
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
                          const polyvec *a)
 {
   unsigned i;
-  POLYVEC_UBOUND(a, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 
   for (i = 0; i < MLKEM_K; i++)
   {
@@ -33,13 +44,15 @@ void polyvec_decompress_du(polyvec *r,
     poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
   }
 
-  POLYVEC_UBOUND(r, MLKEM_Q);
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
 void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
 {
   unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
   for (i = 0; i < MLKEM_K; i++)
   {
     poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
@@ -54,6 +67,8 @@ void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
   {
     poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -64,6 +79,8 @@ void polyvec_ntt(polyvec *r)
   {
     poly_ntt(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -74,6 +91,8 @@ void polyvec_invntt_tomont(polyvec *r)
   {
     poly_invntt_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
 }
 
 #if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -84,10 +103,7 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
 {
   unsigned i;
   poly t;
-
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  POLYVEC_BOUND(b_cache, MLKEM_Q);
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
 
   poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
   for (i = 1; i < MLKEM_K; i++)
@@ -95,18 +111,15 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
     poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
                                    &b_cache->vec[i]);
     poly_add(r, &t);
-    /* abs bounds: < (i+1) * 3/2 * q */
   }
 
   /*
-   * Those bounds are true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus best to omit
-   * them from the spec to not unnecessarily constraint native implementations.
+   * Each of the MLKEM_K summands is bound by 2*q in absolute value,
+   * hence the sum by MLKEM_K*2*q. This holds for the C implementation
+   * but is omitted from the spec so as not to unnecessarily constrain
+   * native implementations; it is checked here nonetheless.
    */
-  cassert(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_K * 2 * MLKEM_Q),
-          "polyvec_basemul_acc_montgomery_cached output bounds");
-  /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */
-  POLY_BOUND(r, MLKEM_K * 2 * MLKEM_Q);
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
 }
 #else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 MLKEM_NATIVE_INTERNAL_API
@@ -114,9 +127,8 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
                                            const polyvec *b,
                                            const polyvec_mulcache *b_cache)
 {
-  POLYVEC_BOUND(a, 4096);
-  POLYVEC_BOUND(b, NTT_BOUND);
-  /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
   polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
@@ -149,6 +161,8 @@ void polyvec_reduce(polyvec *r)
   {
     poly_reduce(&r->vec[i]);
   }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -169,4 +183,148 @@ void polyvec_tomont(polyvec *r)
   {
     poly_tomont(&r->vec[i]);
   }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+  /* PRF input is the seed followed by the one-byte nonce. */
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+
+  poly_cbd_eta2(r, buf);
+  /* Same ETA2-based bound as ensured by the poly_cbd_eta2 contract. */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
 }
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.h
index 138724150..8be8579e0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.h
@@ -9,19 +9,144 @@
 #include "common.h"
 #include "poly.h"
 
-#define polyvec MLKEM_NAMESPACE(polyvec)
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
 typedef struct
 {
   poly vec[MLKEM_K];
 } ALIGN polyvec;
 
-#define polyvec_mulcache MLKEM_NAMESPACE(polyvec_mulcache)
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
 typedef struct
 {
   poly_mulcache vec[MLKEM_K];
 } polyvec_mulcache;
 
-#define polyvec_compress_du MLKEM_NAMESPACE(polyvec_compress_du)
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
 /*************************************************
  * Name:        polyvec_compress_du
  *
@@ -44,7 +169,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_decompress_du MLKEM_NAMESPACE(polyvec_decompress_du)
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
 /*************************************************
  * Name:        polyvec_decompress_du
  *
@@ -67,7 +192,7 @@ __contract__(
          array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_tobytes MLKEM_NAMESPACE(polyvec_tobytes)
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
 /*************************************************
  * Name:        polyvec_tobytes
  *
@@ -88,7 +213,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_frombytes MLKEM_NAMESPACE(polyvec_frombytes)
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
 /*************************************************
  * Name:        polyvec_frombytes
  *
@@ -110,7 +235,7 @@ __contract__(
         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
 );
 
-#define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt)
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
 /*************************************************
  * Name:        polyvec_ntt
  *
@@ -136,7 +261,7 @@ __contract__(
   array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
 );
 
-#define polyvec_invntt_tomont MLKEM_NAMESPACE(polyvec_invntt_tomont)
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
 /*************************************************
  * Name:        polyvec_invntt_tomont
  *
@@ -162,7 +287,7 @@ __contract__(
 );
 
 #define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery
  *
@@ -186,7 +311,7 @@ __contract__(
 
 
 #define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
 /*************************************************
  * Name:        polyvec_basemul_acc_montgomery_cached
  *
@@ -194,7 +319,7 @@ __contract__(
  *              using mulcache for second operand.
  *
  *              Bounds:
- *              - a is assumed to be coefficient-wise < 4096 in absolute value.
+ *              - Every coefficient of a is assumed to be in [0..4095]
  *              - No bounds guarantees for the coefficients in the result.
  *
  * Arguments:   - poly *r: pointer to output polynomial
@@ -218,7 +343,7 @@ __contract__(
   assigns(memory_slice(r, sizeof(poly)))
 );
 
-#define polyvec_mulcache_compute MLKEM_NAMESPACE(polyvec_mulcache_compute)
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
 /************************************************************
  * Name: polyvec_mulcache_compute
  *
@@ -252,7 +377,7 @@ __contract__(
   assigns(object_whole(x))
 );
 
-#define polyvec_reduce MLKEM_NAMESPACE(polyvec_reduce)
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
 /*************************************************
  * Name:        polyvec_reduce
  *
@@ -278,7 +403,7 @@ __contract__(
     array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
 );
 
-#define polyvec_add MLKEM_NAMESPACE(polyvec_add)
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
 /*************************************************
  * Name:        polyvec_add
  *
@@ -309,7 +434,7 @@ __contract__(
   assigns(object_whole(r))
 );
 
-#define polyvec_tomont MLKEM_NAMESPACE(polyvec_tomont)
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
 /*************************************************
  * Name:        polyvec_tomont
  *
@@ -329,4 +454,142 @@ __contract__(
     array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
 );
 
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+   Each MLKEM_K gets its own contract below (cases A, B and C).
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3)
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 */
+
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/reduce.h
index 1f502167e..b432a4201 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/reduce.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/reduce.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "debug/debug.h"
+#include "debug.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
@@ -109,13 +109,13 @@ static INLINE int16_t montgomery_reduce_generic(int32_t a)
  **************************************************/
 static INLINE int16_t montgomery_reduce(int32_t a)
 __contract__(
-  requires(a > -(2 * 4096 * 32768))
-  requires(a <  (2 * 4096 * 32768))
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
   ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
 )
 {
   int16_t res;
-  SCALAR_BOUND(a, 2 * UINT12_LIMIT * 32768, "montgomery_reduce input");
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
 
   res = montgomery_reduce_generic(a);
   /* Bounds:
@@ -124,7 +124,7 @@ __contract__(
    *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
    *        < 2 * MLKEM_Q */
 
-  SCALAR_BOUND(res, 2 * MLKEM_Q, "montgomery_reduce output");
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
   return res;
 }
 
@@ -150,7 +150,7 @@ __contract__(
 )
 {
   int16_t res;
-  SCALAR_BOUND(b, HALF_Q, "fqmul input");
+  debug_assert_abs_bound(&b, 1, HALF_Q);
 
   res = montgomery_reduce((int32_t)a * (int32_t)b);
   /* Bounds:
@@ -160,7 +160,7 @@ __contract__(
    *        < MLKEM_Q
    */
 
-  SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
   return res;
 }
 
@@ -200,7 +200,10 @@ __contract__(
    * t is in -10 .. +10, so we need 32-bit math to
    * evaluate t * MLKEM_Q and the subsequent subtraction
    */
-  return (int16_t)(a - t * MLKEM_Q);
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
+
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
 
 #endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.c
index 918986e9b..cbbe4407f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.c
@@ -2,46 +2,24 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 
-#include "rej_uniform.h"
 #include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "rej_uniform.h"
+#include "symmetric.h"
 
 /* Static namespacing
  * This is to facilitate building multiple instances
  * of mlkem-native (e.g. with varying security levels)
  * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
 /* End of static namespacing */
 
-/*************************************************
- * Name:        rej_uniform_scalar
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
                                        unsigned int offset, const uint8_t *buf,
                                        unsigned int buflen)
@@ -58,6 +36,8 @@ __contract__(
   unsigned int ctr, pos;
   uint16_t val0, val1;
 
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
   ctr = offset;
   pos = 0;
   /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
@@ -79,28 +59,183 @@ __contract__(
       r[ctr++] = val1;
     }
   }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
   return ctr;
 }
 
 #if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the entire input buffer
+ * is guaranteed to have been consumed. If it is equal to target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
 {
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
 {
   int ret;
 
   /* Sample from large buffer with full lane as much as possible. */
   ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
   if (ret != -1)
-    return offset + (unsigned)ret;
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
 
   return rej_uniform_scalar(r, target, offset, buf, buflen);
 }
 #endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze a heuristic number of blocks (MLKEM_GEN_MATRIX_NBLOCKS).
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze one
+   * more block at a time; ctr[i] is passed as the offset to resume from.
+   */
+  buflen = XOF_RATE; /* each refill below is a single block */
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample a heuristic number of blocks
+   * (MLKEM_GEN_MATRIX_NBLOCKS). This should generate the matrix entry
+   * with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE; /* each refill below is a single block */
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
+int empty_cu_rej_uniform;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.h
index 13db836bc..801287259 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.h
@@ -9,54 +9,55 @@
 #include <stdlib.h>
 #include "cbmc.h"
 #include "common.h"
+#include "poly.h"
 
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
- * Name:        rej_uniform
+ * Name:        poly_rej_uniform_x4
  *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
  *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
  *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
  **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
 MLKEM_NATIVE_INTERNAL_API
-unsigned int rej_uniform(int16_t *r, unsigned int target, unsigned int offset,
-                         const uint8_t *buf, unsigned int buflen)
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
 __contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-);
-#endif
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/symmetric.h
index 55ebbbd53..3563e5505 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/symmetric.h
@@ -10,6 +10,7 @@
 #include "cbmc.h"
 #include "common.h"
 #include "fips202.h"
+#include "fips202x4.h"
 
 /* Macros denoting FIPS-203 specific Hash functions */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/verify.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/verify.c
index b7078fcc1..9f39dcd22 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/verify.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/verify.c
@@ -4,7 +4,8 @@
  */
 #include "verify.h"
 
-#if !defined(MLKEM_USE_ASM_VALUE_BARRIER)
+#if !defined(MLKEM_USE_ASM_VALUE_BARRIER) && \
+    !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 /*
  * Masking value used in constant-time functions from
  * verify.h to block the compiler's range analysis and
@@ -12,9 +13,11 @@
  */
 volatile uint64_t ct_opt_blocker_u64 = 0;
 
-#else /* MLKEM_USE_ASM_VALUE_BARRIER */
+#else /* MLKEM_USE_ASM_VALUE_BARRIER || \
+         MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_verify MLKEM_NAMESPACE(empty_cu_verify)
+#define empty_cu_verify MLKEM_NAMESPACE_K(empty_cu_verify)
 int empty_cu_verify;
 
-#endif /* MLKEM_USE_ASM_VALUE_BARRIER */
+#endif /* MLKEM_USE_ASM_VALUE_BARRIER || \
+          MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/verify.h
index 8c47155dc..f6ecf5eba 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/verify.h
@@ -268,7 +268,7 @@ __contract__(
 
   for (i = 0; i < len; i++)
   __loop__(
-    invariant(i >= 0 && i <= len)
+    invariant(i <= len)
     invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))))
   {
     r |= a[i] ^ b[i];
@@ -314,4 +314,4 @@ __contract__(
   }
 }
 
-#endif
+#endif /* VERIFY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/arith_native_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/arith_native_x86_64.h
index ce13e7911..25e00a930 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/arith_native_x86_64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/arith_native_x86_64.h
@@ -42,7 +42,7 @@ void basemul_avx2(__m256i *r, const __m256i *a, const __m256i *b,
                   const __m256i *qdata);
 
 #define polyvec_basemul_acc_montgomery_cached_avx2 \
-  MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_avx2)
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_avx2)
 void polyvec_basemul_acc_montgomery_cached_avx2(
     poly *r, const polyvec *a, const polyvec *b,
     const polyvec_mulcache *b_cache);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/default_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/default_impl.h
index 66de8c85f..029111c17 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/default_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/default_impl.h
@@ -28,9 +28,6 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_POLY_FROMBYTES
 
-#define INVNTT_BOUND_NATIVE (8 * MLKEM_Q)
-#define NTT_BOUND_NATIVE (8 * MLKEM_Q)
-
 static INLINE void poly_permute_bitrev_to_custom(poly *data)
 {
   nttunpack_avx2((__m256i *)(data->coeffs), qdata.vec);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c
index 1a26e0dd5..4ef887c62 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c
@@ -8,6 +8,8 @@
  *          Do not modify it directly.
  */
 
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
 #include "ntt.h"
 
 /*
@@ -28,3 +30,10 @@ ALIGN const int16_t zetas[128] = {
     -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
     -308,  996,   991,   958,   -1460, 1522,  1628,
 };
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_zetas MLKEM_NAMESPACE_K(empty_cu_zetas)
+int empty_cu_zetas;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */