diff --git a/.github/changed-files.yml b/.github/changed-files.yml
index da113c09f..ae83dd29c 100644
--- a/.github/changed-files.yml
+++ b/.github/changed-files.yml
@@ -16,10 +16,8 @@ cpp:
- icicle/**/*.c
- icicle/**/*.h
- icicle/CMakeLists.txt
- - .github/workflows/cpp_cuda.yml
- - icicle/cmake/Common.cmake
- - icicle/cmake/CurvesCommon.cmake
- - icicle/cmake/FieldsCommon.cmake
+ - icicle/cmake/**/*.cmake
+ - .github/workflows/cpp.yml
examples:
- examples/**/*
- .github/workflows/examples.yml
diff --git a/.github/workflows/check-changed-files.yml b/.github/workflows/check-changed-files.yml
index db0dac450..12677daa9 100644
--- a/.github/workflows/check-changed-files.yml
+++ b/.github/workflows/check-changed-files.yml
@@ -9,9 +9,9 @@ on:
rust:
description: "Flag for if Rust files changed"
value: ${{ jobs.check-changed-files.outputs.rust }}
- cpp_cuda:
- description: "Flag for if C++/CUDA files changed"
- value: ${{ jobs.check-changed-files.outputs.cpp_cuda }}
+ cpp:
+ description: "Flag for if C++ files changed"
+ value: ${{ jobs.check-changed-files.outputs.cpp }}
examples:
description: "Flag for if example files changed"
value: ${{ jobs.check-changed-files.outputs.examples }}
@@ -23,7 +23,7 @@ jobs:
outputs:
golang: ${{ steps.changed_files.outputs.golang }}
rust: ${{ steps.changed_files.outputs.rust }}
- cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
+ cpp: ${{ steps.changed_files.outputs.cpp }}
examples: ${{ steps.changed_files.outputs.examples }}
steps:
- name: Checkout Repo
@@ -40,5 +40,5 @@ jobs:
run: |
echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
- echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
+ echo "cpp=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
echo "examples=${{ steps.changed-files-yaml.outputs.examples_any_modified }}" >> "$GITHUB_OUTPUT"
diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
index ff9738e1e..1db2f868e 100644
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@@ -4,7 +4,8 @@ on:
pull_request:
branches:
- main
- - V2
+ - V3
+ - yshekel/V3
jobs:
spelling-checker:
@@ -15,6 +16,6 @@ jobs:
- uses: codespell-project/actions-codespell@v2
with:
# https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-skip
- skip: ./**/target,./**/build,./docs/*.js,./docs/*.json
+ skip: ./**/target,./**/build,./docs/*.js,./docs/*.json,./*.svg
# https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-ignore_words_file
ignore_words_file: .codespellignore
diff --git a/.github/workflows/cpp_cuda.yml b/.github/workflows/cpp.yml
similarity index 59%
rename from .github/workflows/cpp_cuda.yml
rename to .github/workflows/cpp.yml
index bb57823af..fbd72c101 100644
--- a/.github/workflows/cpp_cuda.yml
+++ b/.github/workflows/cpp.yml
@@ -3,12 +3,11 @@ name: C++/CUDA
on:
pull_request:
branches:
- - main
- - V2
+ - V3
+ - yshekel/V3 # TODO remove when merged to V3
push:
branches:
- - main
- - V2
+ - V3
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -26,13 +25,18 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Check clang-format
- if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+ if: needs.check-changed-files.outputs.cpp == 'true'
run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi
+ extract-cuda-backend-branch:
+ uses: ./.github/workflows/extract-backends.yml
+ with:
+ pr-number: ${{ github.event.pull_request.number }}
+
test-linux-curve:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
- needs: [check-changed-files, check-format]
+ needs: [check-changed-files, check-format, extract-cuda-backend-branch]
strategy:
matrix:
curve:
@@ -50,25 +54,33 @@ jobs:
steps:
- name: Checkout Repo
uses: actions/checkout@v4
+ - name: Checkout CUDA Backend
+ uses: actions/checkout@v4
+ with:
+ repository: ingonyama-zk/icicle-cuda-backend
+ path: ./icicle/backend/cuda
+ token: ${{ secrets.GITHUB_TOKEN }}
+ ssh-key: ${{ secrets.CUDA_PULL_KEY }}
+ ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }}
- name: Build curve
working-directory: ./icicle
- if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+ if: needs.check-changed-files.outputs.cpp == 'true'
run: |
mkdir -p build && rm -rf build/*
- cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} -S . -B build
+ cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} -DCUDA_BACKEND=local -S . -B build
cmake --build build -j
- name: Run C++ curve Tests
working-directory: ./icicle/build/tests
- if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+ if: needs.check-changed-files.outputs.cpp == 'true'
run: ctest
test-linux-field:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
- needs: [check-changed-files, check-format]
+ needs: [check-changed-files, check-format, extract-cuda-backend-branch]
strategy:
matrix:
- field:
+ field:
- name: babybear
build_args: -DEXT_FIELD=ON
- name: stark252
@@ -78,14 +90,22 @@ jobs:
steps:
- name: Checkout Repo
uses: actions/checkout@v4
+ - name: Checkout CUDA Backend
+ uses: actions/checkout@v4
+ with:
+ repository: ingonyama-zk/icicle-cuda-backend
+ path: ./icicle/backend/cuda
+ token: ${{ secrets.GITHUB_TOKEN }}
+ ssh-key: ${{ secrets.CUDA_PULL_KEY }}
+ ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }}
- name: Build field
working-directory: ./icicle
- if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+ if: needs.check-changed-files.outputs.cpp == 'true'
run: |
mkdir -p build && rm -rf build/*
- cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DFIELD=${{ matrix.field.name }} ${{ matrix.field.build_args }} -S . -B build
+ cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DFIELD=${{ matrix.field.name }} ${{ matrix.field.build_args }} -DCUDA_BACKEND=local -S . -B build
cmake --build build -j
- name: Run C++ field Tests
working-directory: ./icicle/build/tests
- if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+ if: needs.check-changed-files.outputs.cpp == 'true'
run: ctest
\ No newline at end of file
diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml
index c6fe2c24f..c5240e6db 100644
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@@ -1,8 +1,7 @@
# This workflow is a demo of how to run all examples in the Icicle repository.
-# For each language directory (c++, Rust, etc.) the workflow
-# (1) loops over all examples (msm, ntt, etc.) and
-# (2) runs ./compile.sh and ./run.sh in each directory.
-# The script ./compile.sh should compile the example and ./run.sh should run it.
+# For each language directory (c++, Rust, etc.) the workflow
+# (1) loops over all examples (msm, ntt, etc.) and
+# (2) runs ./run.sh in each directory.
# Each script should return 0 for success and 1 otherwise.
name: Examples
@@ -10,12 +9,11 @@ name: Examples
on:
pull_request:
branches:
- - main
- - V2
+ - V3
+ - yshekel/V3 # TODO remove when merged to V3
push:
branches:
- - main
- - V2
+ - V3
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -25,26 +23,38 @@ jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
+ extract-cuda-backend-branch:
+ uses: ./.github/workflows/extract-backends.yml
+ with:
+ pr-number: ${{ github.event.pull_request.number }}
+
run-examples:
runs-on: [self-hosted, Linux, X64, icicle, examples]
- needs: check-changed-files
+ needs: [check-changed-files, extract-cuda-backend-branch]
steps:
- name: Checkout
uses: actions/checkout@v4
+ - name: Checkout CUDA Backend
+ uses: actions/checkout@v4
+ with:
+ repository: ingonyama-zk/icicle-cuda-backend
+ path: ./icicle/backend/cuda
+ token: ${{ secrets.GITHUB_TOKEN }}
+ ssh-key: ${{ secrets.CUDA_PULL_KEY }}
+ ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }}
- name: c++ examples
working-directory: ./examples/c++
- if: needs.check-changed-files.outputs.cpp_cuda == 'true' || needs.check-changed-files.outputs.examples == 'true'
+ if: needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.examples == 'true'
run: |
# loop over all directories in the current directory
for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
if [ -d "$dir" ]; then
echo "Running command in $dir"
cd $dir
- ./compile.sh
- ./run.sh
+ ./run.sh -d CUDA
cd -
fi
- done
+ done
- name: Rust examples
working-directory: ./examples/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.examples == 'true'
@@ -54,7 +64,7 @@ jobs:
if [ -d "$dir" ]; then
echo "Running command in $dir"
cd $dir
- cargo run --release
+ ./run.sh -d CUDA
cd -
fi
- done
\ No newline at end of file
+ done
\ No newline at end of file
diff --git a/.github/workflows/extract-backends.yml b/.github/workflows/extract-backends.yml
new file mode 100644
index 000000000..80aac40d9
--- /dev/null
+++ b/.github/workflows/extract-backends.yml
@@ -0,0 +1,36 @@
+name: Extract Icicle Backend Branch
+
+on:
+ workflow_call:
+ inputs:
+ pr-number:
+ description: 'The PR number to fetch the description for'
+ required: true
+ type: number
+ outputs:
+ cuda-backend-branch:
+ description: "Branch name for cuda backend"
+ value: ${{ jobs.extract-cuda-backend-branch.outputs.cuda-backend-branch }}
+
+jobs:
+ extract-cuda-backend-branch:
+ name: Extract cuda branch name
+ runs-on: ubuntu-22.04
+ outputs:
+ cuda-backend-branch: ${{ steps.extract.outputs.cuda-backend-branch }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ - name: Extract Private Branch from PR Description
+ id: extract
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: |
+ DESCRIPTION=$(gh pr view ${{ inputs.pr-number }} --json body -q '.body')
+ echo "PR Description: $DESCRIPTION"
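+          # Example (hypothetical) PR description line this pattern matches: "cuda-backend-branch: feat/my-branch"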
+ CUDA_BE_BRANCH=$(echo "$DESCRIPTION" | grep -oP 'cuda-backend-branch:\s*\K[^\s]+') || true
+ if [ -z "$CUDA_BE_BRANCH" ]; then
+ CUDA_BE_BRANCH="main" # Default branch if not specified
+ fi
+ echo "Extracted CUDA Backend Branch: $CUDA_BE_BRANCH"
+ echo "cuda-backend-branch=$CUDA_BE_BRANCH" >> "$GITHUB_OUTPUT"
\ No newline at end of file
diff --git a/.github/workflows/golang.yml b/.github/workflows/golang.yml
index d6d76b832..997728a0c 100644
--- a/.github/workflows/golang.yml
+++ b/.github/workflows/golang.yml
@@ -3,12 +3,11 @@ name: GoLang
on:
pull_request:
branches:
- - main
- - V2
+ - V3
+ - yshekel/V3 # TODO remove when merged to V3
push:
branches:
- - main
- - V2
+ - V3
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -33,26 +32,39 @@ jobs:
if: needs.check-changed-files.outputs.golang == 'true'
run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
+ extract-cuda-backend-branch:
+ uses: ./.github/workflows/extract-backends.yml
+ with:
+ pr-number: ${{ github.event.pull_request.number }}
+
build-curves-linux:
name: Build and test curves on Linux
runs-on: [self-hosted, Linux, X64, icicle]
- needs: [check-changed-files, check-format]
+ needs: [check-changed-files, check-format, extract-cuda-backend-branch]
strategy:
matrix:
curve:
- name: bn254
- build_args: -g2 -ecntt
+ build_args:
- name: bls12_381
- build_args: -g2 -ecntt
+ build_args:
- name: bls12_377
- build_args: -g2 -ecntt
+ build_args:
- name: bw6_761
- build_args: -g2 -ecntt
+ build_args:
- name: grumpkin
build_args:
steps:
- name: Checkout Repo
uses: actions/checkout@v4
+ - name: Checkout CUDA Backend
+ uses: actions/checkout@v4
+ with:
+ repository: ingonyama-zk/icicle-cuda-backend
+ path: ./icicle/backend/cuda
+ token: ${{ secrets.GITHUB_TOKEN }}
+ ssh-key: ${{ secrets.CUDA_PULL_KEY }}
+ ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }}
- name: Setup go
uses: actions/setup-go@v5
with:
@@ -61,27 +73,36 @@ jobs:
working-directory: ./wrappers/golang
-        if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+        if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true'
# builds a single curve with the curve's specified build args
- run: ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }}
+ run: |
+ ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} -cuda_backend=local
- name: Test
working-directory: ./wrappers/golang/curves
-        if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+        if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true'
run: |
CURVE=$(echo ${{ matrix.curve.name }} | sed -e 's/_//g')
- export CPATH=$CPATH:/usr/local/cuda/include
+ export ICICLE_BACKEND_INSTALL_DIR=/usr/local/lib
go test ./$CURVE/tests -count=1 -failfast -p 2 -timeout 60m -v
build-fields-linux:
name: Build and test fields on Linux
runs-on: [self-hosted, Linux, X64, icicle]
- needs: [check-changed-files, check-format]
+ needs: [check-changed-files, check-format, extract-cuda-backend-branch]
strategy:
matrix:
field:
- name: babybear
- build_args: -field-ext
+ build_args:
steps:
- name: Checkout Repo
uses: actions/checkout@v4
+ - name: Checkout CUDA Backend
+ uses: actions/checkout@v4
+ with:
+ repository: ingonyama-zk/icicle-cuda-backend
+ path: ./icicle/backend/cuda
+ token: ${{ secrets.GITHUB_TOKEN }}
+ ssh-key: ${{ secrets.CUDA_PULL_KEY }}
+ ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }}
- name: Setup go
uses: actions/setup-go@v5
with:
@@ -90,73 +111,12 @@ jobs:
working-directory: ./wrappers/golang
-        if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+        if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true'
# builds a single field with the fields specified build args
- run: ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }}
+ run: |
+ ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }} -cuda_backend=local
- name: Test
working-directory: ./wrappers/golang/fields
-        if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+        if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true'
run: |
FIELD=$(echo ${{ matrix.field.name }} | sed -e 's/_//g')
- export CPATH=$CPATH:/usr/local/cuda/include
+ export ICICLE_BACKEND_INSTALL_DIR=/usr/local/lib
go test ./$FIELD/tests -count=1 -failfast -p 2 -timeout 60m -v
-
- build-hashes-linux:
- name: Build and test hashes on Linux
- runs-on: [self-hosted, Linux, X64, icicle]
- needs: [check-changed-files, check-format]
- strategy:
- matrix:
- hash:
- - name: keccak
- build_args:
- steps:
- - name: Checkout Repo
- uses: actions/checkout@v4
- - name: Setup go
- uses: actions/setup-go@v5
- with:
- go-version: '1.20.0'
- - name: Build
- working-directory: ./wrappers/golang
- if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
- # builds a single hash algorithm with the hash's specified build args
- run: ./build.sh -hash=${{ matrix.hash.name }} ${{ matrix.hash.build_args }}
- - name: Test
- working-directory: ./wrappers/golang/hash
- if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
- run: |
- HASH=$(echo ${{ matrix.hash.name }} | sed -e 's/_//g')
- export CPATH=$CPATH:/usr/local/cuda/include
- go test ./$HASH/tests -count=1 -failfast -p 2 -timeout 60m -v
-
- # TODO: bw6 on windows requires more memory than the standard runner has
- # Add a large runner and then enable this job
- # build-windows:
- # name: Build on Windows
- # runs-on: windows-2022
- # needs: [check-changed-files, check-format]
- # strategy:
- # matrix:
- # curve: [bn254, bls12_381, bls12_377, bw6_761]
- # steps:
- # - name: Checkout Repo
- # uses: actions/checkout@v4
- # - name: Setup go
- # uses: actions/setup-go@v5
- # with:
- # go-version: '1.20.0'
- # - name: Download and Install Cuda
- # if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
- # id: cuda-toolkit
- # uses: Jimver/cuda-toolkit@v0.2.11
- # with:
- # cuda: '12.0.0'
- # method: 'network'
- # # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
- # sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
- # - name: Build libs
- # if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
- # working-directory: ./wrappers/golang
- # env:
- # CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
- # shell: pwsh
- # run: ./build.ps1 ${{ matrix.curve }} ON # builds a single curve with G2 enabled
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index af8f024b4..da1332470 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -22,6 +22,14 @@ jobs:
uses: actions/checkout@v4
with:
ssh-key: ${{ secrets.DEPLOY_KEY }}
+ - name: Checkout CUDA Backend
+ uses: actions/checkout@v4
+ with:
+ repository: ingonyama-zk/icicle-cuda-backend
+ path: ./icicle/backend/cuda
+ token: ${{ secrets.GITHUB_TOKEN }}
+ ssh-key: ${{ secrets.CUDA_PULL_KEY }}
+ ref: main
- name: Setup Cache
id: cache
uses: actions/cache@v4
@@ -46,5 +54,15 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
+ mkdir -p release_output && rm -rf ./release_output/*
+ ./scripts/release/build_all.sh ./release_output
LATEST_TAG=$(git describe --tags --abbrev=0)
gh release create $LATEST_TAG --generate-notes -d --verify-tag -t "Release $LATEST_TAG"
+ - name: Upload release tars
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: |
+ LATEST_TAG=$(git describe --tags --abbrev=0)
+ for file in ./release_output/*.tar.gz; do
+ gh release upload $LATEST_TAG "$file"
+ done
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 09b3d2061..fd96fc9b9 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -3,12 +3,11 @@ name: Rust
on:
pull_request:
branches:
- - main
- - V2
+ - V3
+ - yshekel/V3 # TODO remove when merged to V3
push:
branches:
- - main
- - V2
+ - V3
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -26,7 +25,7 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Check rustfmt
- if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+ if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp == 'true'
working-directory: ./wrappers/rust
# "-name target -prune" removes searching in any directory named "target"
# Formatting by single file is necessary due to generated files not being present
@@ -35,78 +34,32 @@ jobs:
# causing rustfmt to fail.
run: if [[ $(find . -path ./icicle-curves/icicle-curve-template -prune -o -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
- build-linux:
- name: Build on Linux
- runs-on: [self-hosted, Linux, X64, icicle]
- needs: [check-changed-files, check-format]
- steps:
- - name: Checkout Repo
- uses: actions/checkout@v4
- - name: Build
- working-directory: ./wrappers/rust
- if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
- # Building from the root workspace will build all members of the workspace by default
- run: cargo build --release --verbose
-
+ extract-cuda-backend-branch:
+ uses: ./.github/workflows/extract-backends.yml
+ with:
+ pr-number: ${{ github.event.pull_request.number }}
+
test-linux:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
- needs: [check-changed-files, build-linux]
+ needs: [check-changed-files, check-format, extract-cuda-backend-branch]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
+ - name: Checkout CUDA Backend
+ uses: actions/checkout@v4
+ with:
+ repository: ingonyama-zk/icicle-cuda-backend
+ path: ./icicle/backend/cuda
+ token: ${{ secrets.GITHUB_TOKEN }}
+ ssh-key: ${{ secrets.CUDA_PULL_KEY }}
+ ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }}
- name: Run tests
working-directory: ./wrappers/rust
- if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
- # Running tests from the root workspace will run all workspace members' tests by default
- # We need to limit the number of threads to avoid running out of memory on weaker machines
- # ignored tests are polynomial tests. Since they conflict with NTT tests, they are executed separately
+ if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp == 'true'
+      # tests are split into phases because the NTT domain is global and some tests have conflicting domain requirements
run: |
- cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --exclude icicle-m31 --release --verbose --features=g2 -- --test-threads=2 --ignored
- cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --exclude icicle-m31 --release --verbose --features=g2 -- --test-threads=2
-
- - name: Run baby bear tests
- working-directory: ./wrappers/rust/icicle-fields/icicle-babybear
- if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
- run: |
- cargo test --release --verbose -- --ignored
- cargo test --release --verbose
-
- - name: Run stark252 tests
- working-directory: ./wrappers/rust/icicle-fields/icicle-stark252
- if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
- run: |
- cargo test --release --verbose -- --ignored
- cargo test --release --verbose
-
- - name: Run m31 tests
- working-directory: ./wrappers/rust/icicle-fields/icicle-m31
- if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
- run: |
- cargo test --release --verbose -- --ignored
- cargo test --release --verbose
-
- # build-windows:
- # name: Build on Windows
- # runs-on: windows-2022
- # needs: check-changed-files
- # steps:
- # - name: Checkout Repo
- # uses: actions/checkout@v4
- # - name: Download and Install Cuda
- # if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
- # id: cuda-toolkit
- # uses: Jimver/cuda-toolkit@v0.2.11
- # with:
- # cuda: '12.0.0'
- # method: 'network'
- # # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
- # sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
- # - name: Build targets
- # working-directory: ./wrappers/rust
- # if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
- # env:
- # CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
- # CUDA_ARCH: 50 # Using CUDA_ARCH=50 env variable since the CI machines have no GPUs
- # # Building from the root workspace will build all members of the workspace by default
- # run: cargo build --release --verbose
+ cargo build --workspace --release --features=cuda_backend
+ cargo test --workspace --release --verbose --features=cuda_backend -- --skip phase
+ cargo test phase2 --workspace --release --features=cuda_backend
+ cargo test phase3 --workspace --release --features=cuda_backend
diff --git a/.gitignore b/.gitignore
index efe26533f..fd8086c28 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,4 +17,6 @@
**/Cargo.lock
**/icicle/build/
**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
-**/build*
+**/build/*
+**tar.gz
+icicle/backend/cuda
diff --git a/Frostythelion.gif b/Frostythelion.gif
new file mode 100644
index 000000000..ad184dec5
Binary files /dev/null and b/Frostythelion.gif differ
diff --git a/INGOLOGO.svg b/INGOLOGO.svg
new file mode 100644
index 000000000..2c01fcd3b
--- /dev/null
+++ b/INGOLOGO.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/McPaw.gif b/McPaw.gif
new file mode 100644
index 000000000..d688da6ab
Binary files /dev/null and b/McPaw.gif differ
diff --git a/README.md b/README.md
index f94fac89c..a2bf0c134 100644
--- a/README.md
+++ b/README.md
@@ -1,97 +1,287 @@
# ICICLE
-
ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.
+
+ ICICLE is a high-performance cryptographic acceleration library designed to optimize cryptographic computations across various hardware platforms, including CPUs, GPUs, and other accelerators.
+
+
+
## Background
Zero Knowledge Proofs (ZKPs) are considered one of the greatest achievements of modern cryptography. Accordingly, ZKPs are expected to disrupt a number of industries and will usher in an era of trustless and privacy preserving services and infrastructure.
-We believe GPUs are as important for ZK as for AI.
+We believe that ICICLE will be a cornerstone in the acceleration of ZKPs:
-- GPUs are a perfect match for ZK compute - around 97% of ZK protocol runtime is parallel by nature.
-- GPUs are simple for developers to use and scale compared to other hardware platforms.
-- GPUs are extremely competitive in terms of power / performance and price (3x cheaper).
-- GPUs are popular and readily available.
+- **Versatility:** Supports multiple hardware platforms, making it adaptable to various computational environments.
+- **Efficiency:** Designed to leverage the parallel nature of ZK computations, whether on GPUs, CPUs, or other accelerators.
+- **Scalability:** Provides an easy-to-use and scalable solution for developers, allowing them to optimize cryptographic operations with minimal effort.
## Getting Started
-ICICLE is a CUDA implementation of general functions widely used in ZKP.
+This guide will help you get started with ICICLE in C++, Rust, and Go.
> [!NOTE]
-> Developers: We highly recommend reading our [documentation]
+> **Developers**: We highly recommend reading our [documentation](https://dev.ingonyama.com/) for a comprehensive explanation of ICICLE’s capabilities.
> [!TIP]
-> Try out ICICLE by running some [examples] using ICICLE in C++ and our Rust bindings
+> Try out ICICLE by running some [examples] available in C++, Rust, and Go bindings. Check out our install-and-use examples in [C++](https://github.com/ingonyama-zk/icicle/tree/yshekel/V3/examples/c%2B%2B/install-and-use-icicle), [Rust](https://github.com/ingonyama-zk/icicle/tree/yshekel/V3/examples/rust/install-and-use-icicle) and [Go](TODO)
### Prerequisites
-- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 12.0 or newer.
-- [CMake]((https://cmake.org/files/)), version 3.18 and above. Latest version is recommended.
-- [GCC](https://gcc.gnu.org/install/download.html) version 9, latest version is recommended.
-- Any Nvidia GPU (which supports CUDA Toolkit version 12.0 or above).
+- Any compatible hardware: ICICLE supports various hardware, including CPUs, Nvidia GPUs, and other accelerators.
+- [CMake](https://cmake.org/files/), version 3.18 or above (latest version recommended). Required only if building from source.
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads), required only if using NVIDIA GPUs (version 12.0 or newer).
> [!NOTE]
-> It is possible to use CUDA 11 for cards which don't support CUDA 12, however we don't officially support this version and in the future there may be issues.
+> For older GPUs that only support CUDA 11, ICICLE may still function, but official support is for CUDA 12 onwards.
-### Accessing Hardware
-If you don't have access to an Nvidia GPU we have some options for you.
+### Accessing Hardware
-Checkout [Google Colab](https://colab.google/). Google Colab offers a free [T4 GPU](https://www.nvidia.com/en-us/data-center/tesla-t4/) instance and ICICLE can be used with it, reference this guide for setting up your [Google Colab workplace][GOOGLE-COLAB-ICICLE].
+If you don't have access to an Nvidia GPU we have some options for you.
+
+[Google Colab](https://colab.google/) offers a free [T4 GPU](https://www.nvidia.com/en-us/data-center/tesla-t4/) instance and ICICLE can be used with it; reference this guide for setting up your [Google Colab workspace][GOOGLE-COLAB-ICICLE].
If you require more compute and have an interesting research project, we have [bounty and grant programs][GRANT_PROGRAM].
+## Building ICICLE from source
+
+ICICLE provides build systems for C++, Rust, and Go. Each build system incorporates the core ICICLE library, which contains the essential cryptographic primitives. Refer to the [Getting started page](https://dev.ingonyama.com/icicle/introduction) for full details about building and using ICICLE.
+
+> [!WARNING]
+> Ensure ICICLE libraries are installed correctly when building or installing a library/application that depends on ICICLE so that they can be located at runtime.
+
+### Rust
+
+In cargo.toml, specify the ICICLE libs to use:
+
+```toml
+[dependencies]
+icicle-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", branch="main" }
+icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", branch="main" }
+icicle-babybear = { git = "https://github.com/ingonyama-zk/icicle.git", branch="main" }
+# add other ICICLE crates here if you need additional fields/curves
+```
+
+You can specify `branch=branch-name`, `tag=tag-name`, or `rev=commit-id`.
+
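+For example, a pin to a release tag (the tag name here is hypothetical):
+
+```toml
+# pinning to a tag instead of a branch; tag name is hypothetical
+icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.0.0" }
+```
+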
+Build the Rust project:
+
+```bash
+cargo build --release
+```
+
+### Go
+
+There are two ways to build from source in Go:
+
+1. Clone the repo, update your go.mod to point to the local clone, and build ICICLE within the clone
+
+```sh
+git clone https://github.com/ingonyama-zk/icicle.git
+```
+
+Add ICICLE v3 to your go.mod file:
+
+```go
+require github.com/ingonyama-zk/icicle/v3 v3.0.0
+
+replace github.com/ingonyama-zk/icicle/v3 => ../path/to/cloned/icicle
+```
+
+Navigate to the cloned repo's golang bindings and build the library using the supplied [build script][ICICLE-GO-BUILD-SCRIPT]
+
+```sh
+cd icicle/wrappers/golang
+chmod +x build.sh
+./build.sh -curve=bn254
+```
+
+2. Update your go.mod to include ICICLE as a dependency, navigate to the dependency in your GOMODCACHE and build ICICLE there
+
+```sh
+go get github.com/ingonyama-zk/icicle/v3
+cd $(go env GOMODCACHE)/github.com/ingonyama-zk/icicle/v3@<version>/wrappers/golang
+chmod +x build.sh
+./build.sh -curve=bn254
+```
+
+> [!NOTE]
+> To specify the field, use the flag `-field=<field>`, where `<field>` can be one of the following: babybear, stark252, m31.
+> To specify a curve, use the flag `-curve=<curve>`, where `<curve>` can be one of the following: bn254, bls12_377, bls12_381, bw6_761, grumpkin.
+
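+For example, building a field instead of a curve:
+
+```sh
+./build.sh -field=babybear
+```
+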
+Once ICICLE has been built, you can add specific packages when you need them in your application:
+
+```go
+import (
+ runtime "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
+ core "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ bn254 "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+ bn254MSM "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/msm"
+)
+```
+
+### C++
+
+ICICLE can be built and tested in C++ using CMake. The build process is straightforward, but there are several flags you can use to customize the build for your needs.
+
+**Clone the ICICLE repository:**
-### Build systems
+```bash
+git clone https://github.com/ingonyama-zk/icicle.git
+cd icicle
+```
+
+**Configure the build:**
-ICICLE has three build systems.
+```bash
+mkdir -p build && rm -rf build/*
+cmake -S icicle -B build -DFIELD=babybear
+```
-- [ICICLE core][ICICLE-CORE], C++ and CUDA
-- [ICICLE Rust][ICICLE-RUST] bindings, requires [Rust](https://www.rust-lang.org/) version 1.70 and above
-- [ICICLE Golang][ICICLE-GO] bindings, requires [Go](https://go.dev/) version 1.20 and above
+> [!NOTE]
+> To specify the field, use the flag -DFIELD=field, where field can be one of the following: babybear, stark252, m31.
+> To specify a curve, use the flag -DCURVE=curve, where curve can be one of the following: bn254, bls12_377, bls12_381, bw6_761, grumpkin.
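+
+For example, configuring for a curve instead of a field:
+
+```bash
+cmake -S icicle -B build -DCURVE=bn254
+```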
-ICICLE core always needs to be built as part of the other build systems as it contains the core ICICLE primitives implemented in CUDA. Reference these guides for the different build systems, [ICICLE core guide][ICICLE-CORE-README], [ICICLE Rust guide][ICICLE-RUST-README] and [ICICLE Golang guide][ICICLE-GO-README].
+**Build the project:**
-### Compiling ICICLE
+```bash
+cmake --build build -j # -j is for multi-core compilation
+```
-Running ICICLE via Rust bindings is highly recommended and simple:
-- Clone this repo
- - go to our [Rust bindings][ICICLE-RUST]
- - Enter a [curve](./wrappers/rust/icicle-curves) implementation
- - run `cargo build --release` to build or `cargo test` to build and execute tests
+**Link your application (or library) to ICICLE:**
-In any case you would want to compile and run core icicle c++ tests, just follow these setps:
-- Clone this repo
- - go to [ICICLE core][ICICLE-CORE]
- - execute the small [script](https://github.com/ingonyama-zk/icicle/tree/main/icicle#running-tests) to compile via cmake and run c++ and cuda tests
+```cmake
+target_link_libraries(yourApp PRIVATE icicle_field_babybear icicle_device)
+```
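+
+A minimal CMakeLists.txt sketch for a consuming application (the install paths are placeholders to adjust):
+
+```cmake
+cmake_minimum_required(VERSION 3.18)
+project(yourApp CXX)
+add_executable(yourApp main.cpp)
+# point CMake at your ICICLE install (placeholder path)
+target_include_directories(yourApp PRIVATE /path/to/install/dir/include)
+target_link_directories(yourApp PRIVATE /path/to/install/dir/lib)
+target_link_libraries(yourApp PRIVATE icicle_field_babybear icicle_device)
+```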
-## Docker
+**Install (optional):**
-We offer a simple Docker container so you can simply run ICICLE without setting everything up locally.
+To install the libs, specify the install prefix `-DCMAKE_INSTALL_PREFIX=/install/dir/`. Then after building, use cmake to install the libraries:
+```sh
+cmake -S icicle -B build -DFIELD=babybear -DCMAKE_INSTALL_PREFIX=/path/to/install/dir/
+cmake --build build -j # build
+cmake --install build # install icicle to /path/to/install/dir/
```
-docker build -t .
-docker run --gpus all -it /bin/bash
+
+**Run tests (optional):**
+
+Add `-DBUILD_TESTS=ON` to the cmake command, build and execute tests:
+
+```bash
+cmake -S icicle -B build -DFIELD=babybear -DBUILD_TESTS=ON
+cmake --build build -j
+cd build/tests
+ctest
```
+or choose a specific test suite:
+
+```bash
+./build/tests/test_field_api # or another test suite
+# can specify tests using regex. For example for tests with ntt in the name:
+./build/tests/test_field_api --gtest_filter="*ntt*"
+```
+
+> [!NOTE]
+> Most tests assume a CUDA backend is installed and will fail if a CUDA device is not found.
+
+**Build Flags:**
+
+You can customize your ICICLE build with the following flags:
+
+- `-DCPU_BACKEND=ON/OFF`: Enable or disable built-in CPU backend. `default=ON`.
+- `-DCMAKE_INSTALL_PREFIX=/install/dir`: Specify install directory. `default=/usr/local`.
+- `-DBUILD_TESTS=ON/OFF`: Enable or disable tests. `default=OFF`.
+- `-DBUILD_BENCHMARKS=ON/OFF`: Enable or disable benchmarks. `default=OFF`.
+
+## Install CUDA backend
+
+To install the CUDA backend
+
+1. [Download the release binaries](https://github.com/ingonyama-zk/icicle/releases/).
+2. Install it by extracting the binaries to `/opt/` or any other custom install path.
+3. In your application, load the CUDA backend and select a CUDA device.
+4. All subsequent API calls will now execute on the selected device.
+
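+For example, one way to make the backend discoverable is the environment variable used by this repo's CI workflows (the install path below is a placeholder):
+
+```bash
+export ICICLE_BACKEND_INSTALL_DIR=/opt/icicle/lib/backend
+```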
+
+Rust:
+
+```rust
+use icicle_runtime::{runtime, Device};
+
+runtime::load_backend_from_env_or_default().unwrap();
+// or load programmatically
+runtime::load_backend("/path/to/backend/installdir").unwrap();
+// Select CUDA device
+let device = Device::new("CUDA", 1 /*gpu-id*/);
+icicle_runtime::set_device(&device).unwrap();
+
+// Any call will now execute on GPU-1
+```
+
+Go:
+
+```go
+import (
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
+)
+
+result := runtime.LoadBackendFromEnvOrDefault()
+// or load from a custom install dir
+result = runtime.LoadBackend("/path/to/backend/installdir", true)
+// Select CUDA device
+device := runtime.CreateDevice("CUDA", 0) // or other
+result = runtime.SetDevice(device)
+
+// Any call will now execute on GPU-0
+```
+C++:
+
+```cpp
+#include "icicle/runtime.h"
+
+// Load the installed backend
+eIcicleError result = icicle_load_backend_from_env_or_default();
+// or load it programmatically
+result = icicle_load_backend("/path/to/backend/installdir", true);
+
+// Select CUDA device
+icicle::Device device = {"CUDA", 0 /*gpu-id*/};
+result = icicle_set_device(device);
+
+// Any call will now execute on GPU-0
+```
+
+Full details can be found in our [getting started docs](https://dev.ingonyama.com/icicle/introduction)
+
## Contributions
-Join our [Discord Server][DISCORD] and find us on the icicle channel. We will be happy to work together to support your use case and talk features, bugs and design.
+Join our [Discord Server][DISCORD] and find us on the ICICLE channel. We will be happy to work together to support your use case and talk features, bugs, and design.
### Development Contributions
@@ -123,12 +313,14 @@ This will ensure our custom hooks are run and will make it easier to follow our
## Help & Support
-For help and support talk to our devs in our discord channel ["ICICLE"](https://discord.gg/EVVXTdt6DF)
-
+For help and support, talk to our devs in our discord channel [#ICICLE](https://discord.gg/EVVXTdt6DF) or contact us at .
## License
-ICICLE is distributed under the terms of the MIT License.
+ICICLE frontend is distributed under the terms of the MIT License.
+
+> [!NOTE]
+> ICICLE backends, excluding the CPU backend, are distributed under a special license and are not covered by the MIT license.
See [LICENSE-MIT][LMIT] for details.
@@ -137,18 +329,21 @@ See [LICENSE-MIT][LMIT] for details.
[BLS12-377]: ./icicle/curves/
[BN254]: ./icicle/curves/
[BW6-671]: ./icicle/curves/
-[NVCC]: https://docs.nvidia.com/cuda/#installation-guides
+[Grumpkin]: ./icicle/curves/
+[babybear]: ./icicle/fields/
+[stark252]: ./icicle/fields/
+[m31]: ./icicle/fields/
[LMIT]: ./LICENSE
[DISCORD]: https://discord.gg/Y4SkbDf2Ff
[googletest]: https://github.com/google/googletest/
[HOOKS_DOCS]: https://git-scm.com/docs/githooks
[HOOKS_PATH]: ./scripts/hooks/
-[CMAKELISTS]: https://github.com/ingonyama-zk/icicle/blob/f0e6b465611227b858ec4590f4de5432e892748d/icicle/CMakeLists.txt#L28
[GOOGLE-COLAB-ICICLE]: https://dev.ingonyama.com/icicle/colab-instructions
[GRANT_PROGRAM]: https://medium.com/@ingonyama/icicle-for-researchers-grants-challenges-9be1f040998e
[ICICLE-CORE]: ./icicle/
[ICICLE-RUST]: ./wrappers/rust/
[ICICLE-GO]: ./wrappers/golang/
+[ICICLE-GO-BUILD-SCRIPT]: ./wrappers/golang/build.sh
[ICICLE-CORE-README]: ./icicle/README.md
[ICICLE-RUST-README]: ./wrappers/rust/README.md
[ICICLE-GO-README]: ./wrappers/golang/README.md
diff --git a/docs/docs/icicle/arch_overview.md b/docs/docs/icicle/arch_overview.md
new file mode 100644
index 000000000..5529ba4cb
--- /dev/null
+++ b/docs/docs/icicle/arch_overview.md
@@ -0,0 +1,34 @@
+# Architecture Overview
+
+## Introduction
+
+ICICLE V3 is designed with flexibility and extensibility in mind, offering a robust framework that supports multiple compute backends and accommodates various cryptographic needs. This section provides an overview of ICICLE's architecture, highlighting its open and closed components, multi-device support, and extensibility.
+
+## Open Frontend and CPU Backend
+
+- **Frontend (FE):** The ICICLE frontend is open-source and designed to provide a unified API across different programming languages, including C++, Rust, and Go. This frontend abstracts the complexity of working with different backends, allowing developers to write backend-agnostic code that can be deployed across various platforms.
+- **CPU Backend:** ICICLE includes an open-source CPU backend that allows for development and testing on standard hardware. This backend is ideal for prototyping and for environments where specialized hardware is not available.
+
+## Closed CUDA Backend
+
+- **CUDA Backend:** ICICLE also includes a high-performance CUDA backend that is closed-source. This backend is optimized for NVIDIA GPUs and provides significant acceleration for cryptographic operations.
+- **Installation and Licensing:** The CUDA backend needs to be downloaded and installed. Refer to the [installation guide](./install_cuda_backend.md) for detailed instructions.
+
+## Extensible Design
+
+ICICLE is designed to be extensible, allowing developers to integrate new backends or customize existing ones to suit their specific needs. The architecture supports:
+
+- **Custom Backends:** Developers can create their own backends to leverage different hardware or optimize for specific use cases. The process of building and integrating a custom backend is documented in the [Build Your Own Backend](./build_your_own_backend.md) section.
+- **Pluggable Components:** ICICLE's architecture allows for easy integration of additional cryptographic primitives or enhancements, ensuring that the framework can evolve with the latest advancements in cryptography and hardware acceleration.
+
+## Multi-Device Support
+
+- **Scalability:** ICICLE supports multi-device configurations, enabling the distribution of workloads across multiple GPUs or other hardware accelerators. This feature allows for scaling ZK proofs and other cryptographic operations across larger data centers or high-performance computing environments.
+
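+As a minimal sketch (using the C++ runtime API shown in the README; the device IDs and the assumed per-thread device binding are illustrative), each thread can bind to its own device:
+
+```cpp
+#include <thread>
+#include "icicle/runtime.h"
+
+void worker(int gpu_id)
+{
+  // bind this thread to its own device; subsequent calls in this thread target it
+  icicle::Device device = {"CUDA", gpu_id};
+  icicle_set_device(device);
+  // ... dispatch work for this device ...
+}
+
+int main()
+{
+  icicle_load_backend_from_env_or_default();
+  std::thread t0(worker, 0), t1(worker, 1);
+  t0.join();
+  t1.join();
+}
+```
+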
+---
+
+### Conclusion
+
+The architecture of ICICLE V3 is built to be flexible, scalable, and extensible, making it a powerful tool for developers working with zero-knowledge proofs and other cryptographic operations. Whether you're working with open-source CPU backends or closed-source CUDA backends, ICICLE provides the tools and flexibility needed to achieve high performance and scalability in cryptographic computations.
+
+Explore the following sections to learn more about building your own backend, using ICICLE across multiple devices, and integrating it into your projects.
diff --git a/docs/docs/icicle/benchmarks.md b/docs/docs/icicle/benchmarks.md
new file mode 100644
index 000000000..4a9ee6a08
--- /dev/null
+++ b/docs/docs/icicle/benchmarks.md
@@ -0,0 +1,3 @@
+# Benchmarks
+
+TODO
\ No newline at end of file
diff --git a/docs/docs/icicle/build_from_source.md b/docs/docs/icicle/build_from_source.md
new file mode 100644
index 000000000..8e0f3fcbf
--- /dev/null
+++ b/docs/docs/icicle/build_from_source.md
@@ -0,0 +1,180 @@
+
+# Build ICICLE from source
+
+This guide will help you get started with building, testing, and installing ICICLE, whether you're using C++, Rust, or Go. It also covers installation of the CUDA backend and important build options.
+
+## Building and Testing ICICLE frontend
+
+### C++: Build, Test, and Install (Frontend)
+
+ICICLE can be built and tested in C++ using CMake. The build process is straightforward, but there are several flags you can use to customize the build for your needs.
+
+#### Build Commands
+
+1. **Clone the ICICLE repository:**
+ ```bash
+ git clone https://github.com/ingonyama-zk/icicle.git
+ cd icicle
+ ```
+
+2. **Configure the build:**
+ ```bash
+ mkdir -p build && rm -rf build/*
+ cmake -S icicle -B build -DFIELD=babybear
+ ```
+
+:::info
+To specify the field, use the flag -DFIELD=field, where field can be one of the following: babybear, stark252, m31.
+
+To specify a curve, use the flag -DCURVE=curve, where curve can be one of the following: bn254, bls12_377, bls12_381, bw6_761, grumpkin.
+:::
+
+:::tip
+If you have access to the CUDA backend repo, it can be built alongside the ICICLE frontend by adding one of the following to the cmake command:
+- `-DCUDA_BACKEND=local` # if you have it locally
+- `-DCUDA_BACKEND=<branch>` # to pull and build the CUDA backend from that branch, given you have access
+:::
+
+3. **Build the project:**
+ ```bash
+ cmake --build build -j
+ ```
+   This builds the [libicicle_device](./libraries.md#icicle-device) library and the [libicicle_field_babybear](./libraries.md#icicle-core) frontend library that corresponds to the chosen field or curve.
+
+4. **Link:**
+Link your application (or library) to ICICLE:
+```cmake
+target_link_libraries(yourApp PRIVATE icicle_field_babybear icicle_device)
+```
+
+5. **Installation (optional):**
+To install the libs, specify the install prefix in the [cmake command](./build_from_source.md#build-commands)
+`-DCMAKE_INSTALL_PREFIX=/install/dir/`. The default install path on Linux is `/usr/local` if not specified; on other systems it may differ. The cmake command prints it to the log:
+```
+-- CMAKE_INSTALL_PREFIX=/install/dir/for/cmake/install
+```
+Then after building, use cmake to install the libraries:
+```
+cmake -S icicle -B build -DFIELD=babybear -DCMAKE_INSTALL_PREFIX=/path/to/install/dir/
+cmake --build build -j # build
+cmake --install build # install icicle to /path/to/install/dir/
+```
+
+6. **Run tests (optional):**
+Add `-DBUILD_TESTS=ON` to the [cmake command](./build_from_source.md#build-commands) and build.
+Execute all tests
+```bash
+cmake -S icicle -B build -DFIELD=babybear -DBUILD_TESTS=ON
+cmake --build build -j
+cd build/tests
+ctest
+```
+or choose a specific test suite:
+```bash
+./build/tests/test_field_api # or another test suite
+# can specify tests using regex. For example for tests with ntt in the name:
+./build/tests/test_field_api --gtest_filter="*ntt*"
+```
+:::note
+Most tests assume a CUDA backend is installed and will fail if a CUDA device cannot be found.
+:::
+
+#### Build Flags
+
+You can customize your ICICLE build with the following flags:
+
+- `-DCPU_BACKEND=ON/OFF`: Enable or disable built-in CPU backend. `default=ON`.
+- `-DCMAKE_INSTALL_PREFIX=/install/dir`: Specify install directory. `default=/usr/local`.
+- `-DBUILD_TESTS=ON/OFF`: Enable or disable tests. `default=OFF`.
+- `-DBUILD_BENCHMARKS=ON/OFF`: Enable or disable benchmarks. `default=OFF`.
+
+#### Features
+
+By default, all [features](./libraries.md#supported-curves-and-operations) are enabled.
+This is because installed backends may implement and register all APIs; missing APIs in the frontend would cause linkage to fail due to missing symbols, so by default they are all included in the frontend as well.
+
+To disable features, add the following to the cmake command.
+- ntt: `-DNTT=OFF`
+- msm: `-DMSM=OFF`
+- g2 msm: `-DG2=OFF`
+- ecntt: `-DECNTT=OFF`
+- extension field: `-DEXT_FIELD=OFF`
+
+:::tip
+Disabling features is useful when developing with a backend that is slow to compile (e.g. the CUDA backend).
+:::
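+
+For example, a configure line for a curve build that skips G2 and ECNTT, using the flags listed above:
+
+```bash
+cmake -S icicle -B build -DCURVE=bn254 -DG2=OFF -DECNTT=OFF
+```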
+
+### Rust: Build, Test, and Install
+
+To build and test ICICLE in Rust, follow these steps:
+
+1. **Navigate to the Rust bindings directory:**
+```bash
+cd wrappers/rust # or go to a specific field/curve 'cd wrappers/rust/icicle-fields/icicle-babybear'
+```
+
+2. **Build the Rust project:**
+```bash
+cargo build --release
+```
+By default, all [supported features are enabled](#features).
+Cargo features are used to disable features, rather than enable them, for the reason explained [here](#features):
+- `no_g2` to disable G2 MSM
+- `no_ecntt` to disable ECNTT
+
+They can be disabled as follows:
+```bash
+cargo build --release --features=no_ecntt,no_g2
+```
+
+:::note
+If you have access to the CUDA backend repo, it can be built alongside the ICICLE frontend by using the following cargo features:
+- `cuda_backend` : if the CUDA backend resides in `icicle/backend/cuda`
+- `pull_cuda_backend` : to pull the main branch and build it
+:::
+
+
+3. **Run tests:**
+```bash
+cargo test # optional: --features=no_ecntt,no_g2,cuda_backend
+```
+:::note
+Most tests assume a CUDA backend is installed and fail otherwise.
+:::
+
+4. **Install the library:**
+
+By default, the libraries are installed to the `target/<buildmode>/deps/icicle` dir. If you want them installed elsewhere, define the env variable:
+```bash
+export ICICLE_INSTALL_DIR=/path/to/install/dir
+```
+
+#### Use as cargo dependency
+In cargo.toml, specify the ICICLE libs to use:
+
+```toml
+[dependencies]
+icicle-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", branch="main" }
+icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", branch="main" }
+icicle-babybear = { git = "https://github.com/ingonyama-zk/icicle.git", branch="main" }
+# add other ICICLE crates here if you need additional fields/curves
+```
+
+You can specify `branch = <branch-name>`, `tag = <tag-name>`, or `rev = <commit-id>`.
+
+To disable features:
+```bash
+icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", features = ["no_g2"] }
+```
+
+As explained above, the libs will be built and installed to `target/<buildmode>/deps/icicle` so you can easily link to them. Alternatively you can set the `ICICLE_INSTALL_DIR` env variable for a custom install directory.
+
+:::warning
+Make sure to install the ICICLE libs when installing a library/application that depends on ICICLE, so that they can be located at runtime.
+:::
+
+### Go: Build, Test, and Install (TODO)
+
+## Install CUDA backend
+
+[Install CUDA Backend (and License)](./install_cuda_backend.md#installation)
\ No newline at end of file
diff --git a/docs/docs/icicle/build_your_own_backend.md b/docs/docs/icicle/build_your_own_backend.md
new file mode 100644
index 000000000..5cb2fc52a
--- /dev/null
+++ b/docs/docs/icicle/build_your_own_backend.md
@@ -0,0 +1,3 @@
+# Build Your Own Backend
+
+TODO
\ No newline at end of file
diff --git a/docs/docs/icicle/core.md b/docs/docs/icicle/core.md
deleted file mode 100644
index e887bae3b..000000000
--- a/docs/docs/icicle/core.md
+++ /dev/null
@@ -1,196 +0,0 @@
-# ICICLE Core
-
-ICICLE Core is a library written in C++/CUDA. All the ICICLE primitives are implemented within ICICLE Core.
-
-The Core is split into logical modules that can be compiled into static libraries using different [strategies](#compilation-strategies). You can then [link](#linking) these libraries with your C++ project or write your own [bindings](#writing-new-bindings-for-icicle) for other programming languages. If you want to use ICICLE with existing bindings please refer to the [Rust](/icicle/rust-bindings) or [Golang](/icicle/golang-bindings) bindings documentation.
-
-## Supported curves, fields and operations
-
-### Supported curves and operations
-
-| Operation\Curve | [bn254](https://neuromancer.sk/std/bn/bn254) | [bls12-377](https://neuromancer.sk/std/bls/BLS12-377) | [bls12-381](https://neuromancer.sk/std/bls/BLS12-381) | [bw6-761](https://eprint.iacr.org/2020/351) | grumpkin |
-| --- | :---: | :---: | :---: | :---: | :---: |
-| [MSM][MSM_DOCS] | ✅ | ✅ | ✅ | ✅ | ✅ |
-| G2 | ✅ | ✅ | ✅ | ✅ | ❌ |
-| [NTT][NTT_DOCS] | ✅ | ✅ | ✅ | ✅ | ❌ |
-| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
-| [VecOps][VECOPS_CODE] | ✅ | ✅ | ✅ | ✅ | ✅ |
-| [Polynomials][POLY_DOCS] | ✅ | ✅ | ✅ | ✅ | ❌ |
-| [Poseidon](primitives/poseidon) | ✅ | ✅ | ✅ | ✅ | ✅ |
-| [Merkle Tree](primitives/poseidon#the-tree-builder) | ✅ | ✅ | ✅ | ✅ | ✅ |
-
-### Supported fields and operations
-
-| Operation\Field | [babybear](https://eprint.iacr.org/2023/824.pdf) | [Stark252](https://docs.starknet.io/documentation/architecture_and_concepts/Cryptography/p-value/) |
-| --- | :---: | :---: |
-| [VecOps][VECOPS_CODE] | ✅ | ✅ |
-| [Polynomials][POLY_DOCS] | ✅ | ✅ |
-| [NTT][NTT_DOCS] | ✅ | ✅ |
-| Extension Field | ✅ | ❌ |
-
-### Supported hashes
-
-| Hash | Sizes |
-| --- | :---: |
-| Keccak | 256, 512 |
-
-## Compilation strategies
-
-Most of the codebase is curve/field agnostic, which means it can be compiled for different curves and fields. When you build ICICLE Core you choose a single curve or field. If you need multiple curves or fields, you compile ICICLE once per curve or field that is needed. It's that simple. Currently, the following choices are supported:
-
-- [Field mode][COMPILE_FIELD_MODE] - used for STARK fields like BabyBear / Mersenne / Goldilocks. Includes field arithmetic, NTT, Poseidon, Extension fields and other primitives.
-- [Curve mode][COMPILE_CURVE_MODE] - used for SNARK curves like BN254 / BLS curves / Grumpkin / etc. Curve mode is built upon field mode, so it includes everything that field does It also includes curve operations / MSM / ECNTT / G2 and other curve-related primitives.
-
-:::info
-
-If you only want to use a curve's scalar or base field, you still need to use curve mode. You can disable MSM with [options](#compilation-options)
-
-:::
-
-### Compiling for a field
-
-You can compile ICICLE for a field using this command:
-
-```sh
-cd icicle
-mkdir -p build
-cmake -DFIELD= -S . -B build
-cmake --build build -j
-```
-
-This command will output `libingo_field_.a` into `build/lib`.
-
-### Compiling for a curve
-
-:::note
-
-Field related primitives will be compiled for the scalar field of the curve
-
-:::
-
-You can compile ICICLE for a SNARK curve using this command:
-
-```sh
-cd icicle
-mkdir -p build
-cmake -DCURVE= -S . -B build
-cmake --build build -j
-```
-
-Where `` can be one of `bn254`/`bls12_377`/`bls12_381`/`bw6_761`/`grumpkin`.
-
-This command will output both `libingo_curve_.a` and `libingo_field_.a` into `build/lib`.
-
-### Compilation options
-
-There exist multiple options that allow you to customize your build or enable additional functionality.
-
-#### EXT_FIELD
-
-Used only in [field mode][COMPILE_FIELD_MODE] to add an Extension field. Adds all supported field operations for the extension field.
-
-Default: `OFF`
-
-Usage: `-DEXT_FIELD=ON`
-
-#### G2
-
-Used only in [curve mode][COMPILE_CURVE_MODE] to add G2 definitions. Also adds G2 MSM.
-
-Default: `OFF`
-
-Usage: `-DG2=ON`
-
-#### ECNTT
-
-Used only in [curve mode][COMPILE_CURVE_MODE] to add ECNTT function.
-
-Default: `OFF`
-
-Usage: `-DECNTT=ON`
-
-#### MSM
-
-Used only in [curve mode][COMPILE_CURVE_MODE] to add MSM function. As MSM takes a lot of time to build, you can disable it with this option to reduce compilation time.
-
-Default: `ON`
-
-Usage: `-DMSM=OFF`
-
-#### BUILD_HASH
-
-Can be used in any mode to build a hash library. Currently it only includes Keccak hash function, but more are coming.
-
-Default: `OFF`
-
-Usage: `-DBUILD_HASH=ON`
-
-#### BUILD_TESTS
-
-Can be used in any mode to include tests runner binary.
-
-Default: `OFF`
-
-USAGE: `-DBUILD_TESTS=ON`
-
-#### BUILD_BENCHMARKS
-
-Can be used in any mode to include benchmarks runner binary.
-
-Default: `OFF`
-
-USAGE: `-DBUILD_BENCHMARKS=ON`
-
-#### DEVMODE
-
-Can be used in any mode to include debug symbols in the build.
-
-Default: `OFF`
-
-USAGE: `-DEVMODE=ON`
-
-## Linking
-
-To link ICICLE with your project you first need to compile ICICLE with options of your choice. After that you can use CMake `target_link_libraries` to link with the generated static libraries and `target_include_directories` to include ICICLE headers (located in `icicle/include`).
-
-Refer to our [c++ examples](https://github.com/ingonyama-zk/icicle/tree/main/examples/c%2B%2B) for more info. Take a look at this [CMakeLists.txt](https://github.com/ingonyama-zk/icicle/blob/main/examples/c%2B%2B/msm/CMakeLists.txt#L22)
-
-## Writing new bindings for ICICLE
-
-Since ICICLE Core is written in CUDA / C++ its really simple to generate static libraries. These static libraries can be installed on any system and called by higher level languages such as Golang.
-
-Static libraries can be loaded into memory once and used by multiple programs, reducing memory usage and potentially improving performance. They also allow you to separate functionality into distinct modules so your static library may need to compile only specific features that you want to use.
-
-Let's review the [Golang bindings][GOLANG_BINDINGS] since its a pretty verbose example (compared to rust which hides it pretty well) of using static libraries. Golang has a library named `CGO` which can be used to link static libraries. Here's a basic example on how you can use cgo to link these libraries:
-
-```go
-/*
-#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6_671
-#include "icicle.h" // make sure you use the correct header file(s)
-*/
-import "C"
-
-func main() {
- // Now you can call the C functions from the ICICLE libraries.
- // Note that C function calls are prefixed with 'C.' in Go code.
-
- out := (*C.BN254_projective_t)(unsafe.Pointer(p))
- in := (*C.BN254_affine_t)(unsafe.Pointer(affine))
-
- C.projective_from_affine_bn254(out, in)
-}
-```
-
-The comments on the first line tell `CGO` which libraries to import as well as which header files to include. You can then call methods which are part of the static library and defined in the header file, `C.projective_from_affine_bn254` is an example.
-
-If you wish to create your own bindings for a language of your choice we suggest you start by investigating how you can call static libraries.
-
-
-[GOLANG_BINDINGS]: golang-bindings.md
-[COMPILE_CURVE_MODE]: #compiling-for-a-curve
-[COMPILE_FIELD_MODE]: #compiling-for-a-field
-[NTT_DOCS]: primitives/ntt
-[MSM_DOCS]: primitives/msm
-[POLY_DOCS]: polynomials/overview
-[VECOPS_CODE]: https://github.com/ingonyama-zk/icicle/blob/main/icicle/include/vec_ops/vec_ops.cuh
-
diff --git a/docs/docs/icicle/faq_and_troubleshooting.md b/docs/docs/icicle/faq_and_troubleshooting.md
new file mode 100644
index 000000000..79d07f5e6
--- /dev/null
+++ b/docs/docs/icicle/faq_and_troubleshooting.md
@@ -0,0 +1,9 @@
+# FAQ and troubleshooting
+
+## Frequently asked questions
+
+TODO
+
+## Troubleshooting
+
+TODO
\ No newline at end of file
diff --git a/docs/docs/icicle/getting_started.md b/docs/docs/icicle/getting_started.md
new file mode 100644
index 000000000..4476ef08b
--- /dev/null
+++ b/docs/docs/icicle/getting_started.md
@@ -0,0 +1,170 @@
+# Getting Started Guide
+
+## Overview
+
+This guide will walk you through the entire process of building, testing, and installing ICICLE using your preferred programming language: C++, Rust, or Go. Whether you're deploying on a CPU or leveraging CUDA for accelerated performance, this guide provides comprehensive instructions to get you started. It also outlines the typical workflow for a user, including key installation steps:
+
+
+1. **Install ICICLE or build it from source**: This is explained in this guide. For building from source, refer to the [Build from Source page](./build_from_source.md).
+2. **Follow the [Programmer’s Guide](./programmers_guide/general.md)**: Learn how to use ICICLE APIs.
+3. **Start using ICICLE APIs on your CPU**: Your application will now use ICICLE on the CPU.
+4. **Accelerate your application on a GPU**: [install the CUDA backend](./install_cuda_backend.md), load it, and select it in your application ([C++](./programmers_guide/cpp.md#loading-a-backend), [Rust](./programmers_guide/rust.md#loading-a-backend), [Go](./programmers_guide/go.md#loading-a-backend)).
+5. **Run on the GPU**: Once the GPU backend is selected, all subsequent API calls will execute on the GPU.
+6. **Optimize for multi-GPU environments**: Refer to the [Multi-GPU](./multi-device.md) Guide to fully utilize your system’s capabilities.
+7. **Review memory management**: Revisit the [Memory Management section](./programmers_guide/general.md#device-abstraction) to allocate memory on the device efficiently and try to keep data on the GPU as long as possible.
+
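+For the Go bindings, steps 4 and 5 come down to a few `runtime` calls. Below is a minimal sketch based on the Go examples later in these docs; it assumes the CUDA backend is already installed:
+
+```go
+package main
+
+import "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
+
+func main() {
+	// Load the backend from ICICLE_BACKEND_INSTALL_DIR or the default /opt/icicle/lib/backend.
+	runtime.LoadBackendFromEnvOrDefault()
+
+	// Select a CUDA device; subsequent ICICLE calls on this thread run on it.
+	device := runtime.CreateDevice("CUDA", 0)
+	if err := runtime.SetDevice(&device); err != runtime.Success {
+		panic(err)
+	}
+}
+```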
+
+The rest of this page details the content of a release, how to install it, and how to use it. ICICLE binaries are released for multiple Linux distributions, including Ubuntu 20.04, Ubuntu 22.04, RHEL 8, and RHEL 9.
+
+:::note
+Future releases will also include support for macOS and other systems.
+:::
+
+## Content of a Release
+
+Each ICICLE release includes a tar file named `icicle30-<distribution>.tar.gz`, where `icicle30` indicates version 3.0. This tar file contains ICICLE frontend build artifacts and headers for a specific distribution. The tar file structure includes:
+
+- **`./icicle/include/`**: This directory contains all the necessary header files for using the Icicle library from C++.
+- **`./icicle/lib/`**:
+ - **Icicle Libraries**: All the core Icicle libraries are located in this directory. Applications linking to Icicle will use these libraries.
+ - **Backends**: The `./icicle/lib/backend/` directory houses backend libraries, including the CUDA backend (not included in this tar).
+
+- **CUDA backend** comes as a separate tar, `icicle30-<distribution>-cuda122.tar.gz`, per distribution, for icicle-frontend V3.0 and CUDA 12.2.
+
+## Installing and using ICICLE
+
+- [Full C++ example](https://github.com/ingonyama-zk/icicle/tree/yshekel/V3/examples/c++/install-and-use-icicle)
+- [Full Rust example](https://github.com/ingonyama-zk/icicle/tree/yshekel/V3/examples/rust/install-and-use-icicle)
+- [Full Go example](https://github.com/ingonyama-zk/icicle/tree/yshekel/V3/examples/golang/install-and-use-icicle)
+
+*(TODO: update links to main branch when merged)*
+
+1. **Extract and install the Tar Files**:
+ - [Download](https://github.com/ingonyama-zk/icicle/releases) the appropriate tar files for your distribution (Ubuntu 20.04, Ubuntu 22.04, or UBI 8,9 for RHEL compatible binaries).
+   - **Frontend libs and headers** should be installed in default search paths (such as `/usr/lib` and `/usr/local/include`) for the compiler and linker to find them.
+ - **Backend libs** should be installed in `/opt`
+   - Extract them to your desired locations:
+ ```bash
+ # install the frontend part (Can skip for Rust)
+ tar xzvf icicle30-ubuntu22.tar.gz
+ cp -r ./icicle/lib/* /usr/lib/
+ cp -r ./icicle/include/icicle/ /usr/local/include/ # copy C++ headers
+ # extract CUDA backend (OPTIONAL)
+ tar xzvf icicle30-ubuntu22-cuda122.tar.gz -C /opt
+ ```
+
+ :::note
+ Installing the frontend is optional for Rust. Rust does not use it.
+ :::
+
+ :::tip
+  You may install to any directory, but you need to ensure it can be found by the linker at compile time and at runtime.
+ You can install anywhere and use a symlink to ensure it can be easily found as if it were in the default directory.
+ :::
+
+2. **Linking Your Application**:
+
+ Applications need to link to the ICICLE device library and to every field and/or curve library. The backend libraries are dynamically loaded at runtime, so there is no need to link to them.
+
+ **C++**
+ - When compiling your C++ application, link against the ICICLE libraries:
+ ```bash
+ g++ -o myapp myapp.cpp -licicle_device -licicle_field_bn254 -licicle_curve_bn254
+ # if not installed in standard dirs, for example /custom/path/, need to specify it
+ g++ -o myapp myapp.cpp -I/custom/path/icicle/include -L/custom/path/icicle/lib -licicle_device -licicle_field_bn254 -licicle_curve_bn254 -Wl,-rpath,/custom/path/icicle/lib/
+ ```
+
+  - Or via CMake:
+  ```cmake
+ # Add the executable
+ add_executable(example example.cpp)
+ # Link the libraries
+ target_link_libraries(example icicle_device icicle_field_bn254 icicle_curve_bn254)
+
+ # OPTIONAL (if not installed in default location)
+
+ # The following is setting compile and runtime paths for headers and libs assuming
+ # - headers in /custom/path/icicle/include
+  # - libs in /custom/path/icicle/lib
+
+ # Include directories
+ target_include_directories(example PUBLIC /custom/path/icicle/include)
+ # Library directories
+ target_link_directories(example PUBLIC /custom/path/icicle/lib/)
+ # Set the RPATH so linker finds icicle libs at runtime
+ set_target_properties(example PROPERTIES
+ BUILD_RPATH /custom/path/icicle/lib/
+ INSTALL_RPATH /custom/path/icicle/lib/)
+ ```
+
+ :::tip
+  If you face linkage issues, try `ldd myapp` to see the runtime dependencies. If the ICICLE libs are not found, add the install directory to the linker's search path. In a development environment, you can do that with `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/custom/path/icicle/lib` (or the equivalent on non-Linux systems). For deployment, make sure the libs can be found without relying on `LD_LIBRARY_PATH`.
+
+ Alternatively, you can embed the search path in the app as an rpath by adding `-Wl,-rpath,/custom/path/icicle/lib/`. This is demonstrated above.
+ :::
+
+ **Rust**
+  - When building the ICICLE crates, the ICICLE frontend libs are built from source along with the Rust bindings. They are installed to `target/<buildmode>/deps/icicle`, and Cargo will link them correctly. Note that you still need to install the CUDA backend if you have a CUDA GPU.
+  - Simply use `cargo build` or `cargo run` and it will link to the ICICLE libs.
+
+ **Go** - TODO
+
+:::warning
+When deploying an application (whether in C++, Rust, or Go), you must make sure to either deploy the ICICLE libs (that you download or build from source) along with the application binaries (as tar, Docker image, package manager installer, or otherwise) or make sure to install ICICLE (and the backend) on the target machine. Otherwise, the target machine will have linkage issues.
+:::
+
+## Backend Loading
+
+The ICICLE library dynamically loads backend libraries at runtime. By default, it searches for backends in the following order:
+
+1. **Environment Variable**: If the `ICICLE_BACKEND_INSTALL_DIR` environment variable is defined, ICICLE will prioritize this location.
+2. **Default Directory**: If the environment variable is not set, ICICLE will search in the default directory `/opt/icicle/lib/backend`.
+
+:::warning
+If building ICICLE frontend from source, make sure to load a backend that is compatible with the frontend version. CUDA backend libs are forward compatible with newer frontends (e.g., CUDA-backend-3.0 works with ICICLE-3.2). The opposite is not guaranteed.
+:::
+
+If you install in a custom dir, make sure to set `ICICLE_BACKEND_INSTALL_DIR`:
+```bash
+ICICLE_BACKEND_INSTALL_DIR=path/to/icicle/lib/backend/ myapp # for an executable myapp
+ICICLE_BACKEND_INSTALL_DIR=path/to/icicle/lib/backend/ cargo run # when using cargo
+```
+
+Then, to load the backend from `ICICLE_BACKEND_INSTALL_DIR` or `/opt/icicle/lib/backend` in your application:
+
+**C++**
+```cpp
+extern "C" eIcicleError icicle_load_backend_from_env_or_default();
+```
+**Rust**
+```rust
+pub fn load_backend_from_env_or_default() -> Result<(), eIcicleError>;
+```
+**Go**
+```go
+// Assumed from usage in the Go examples in these docs:
+runtime.LoadBackendFromEnvOrDefault()
+```
+
+### Custom Backend Loading
+
+If you need to load a backend from a custom location at any point during runtime, you can call the following function:
+
+**C++**
+```cpp
+extern "C" eIcicleError icicle_load_backend(const char* path, bool is_recursive);
+```
+- **`path`**: The directory where the backend libraries are located.
+- **`is_recursive`**: If `true`, the function will search for backend libraries recursively within the specified path.
+
+**Rust**
+```rust
+ pub fn load_backend(path: &str) -> Result<(), eIcicleError>; // OR
+ pub fn load_backend_non_recursive(path: &str) -> Result<(), eIcicleError>;
+```
+- **`path`**: The directory where the backend libraries are located.
+
+**Go**
+```go
+TODO
+```
diff --git a/docs/docs/icicle/golang-bindings.md b/docs/docs/icicle/golang-bindings.md
index c2122b97f..ea2647c45 100644
--- a/docs/docs/icicle/golang-bindings.md
+++ b/docs/docs/icicle/golang-bindings.md
@@ -7,57 +7,60 @@ The Golang bindings are comprised of multiple packages.
[`core`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/core) which defines all shared methods and structures, such as configuration structures, or memory slices.
-[`cuda-runtime`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/cuda_runtime) which defines abstractions for CUDA methods for allocating memory, initializing and managing streams, and `DeviceContext` which enables users to define and keep track of devices.
+[`runtime`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/runtime) which defines abstractions for ICICLE methods for allocating memory, initializing and managing streams, and `Device` which enables users to define and keep track of devices.
-Each supported curve, field, and hash has its own package which you can find in the respective directories [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang). If your project uses BN254 you only need to import that single package named [`bn254`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves/bn254).
+Each supported curve and field has its own package which you can find in the respective directories [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang). If your project uses BN254 you only need to import that single package named [`bn254`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves/bn254).
## Using ICICLE Golang bindings in your project
To add ICICLE to your `go.mod` file.
```bash
-go get github.com/ingonyama-zk/icicle
+go get github.com/ingonyama-zk/icicle/v3
```
If you want to specify a specific branch
```bash
-go get github.com/ingonyama-zk/icicle@
+go get github.com/ingonyama-zk/icicle/v3@<branch>
```
For a specific commit
```bash
-go get github.com/ingonyama-zk/icicle@
+go get github.com/ingonyama-zk/icicle/v3@<commit-id>
```
-To build the shared libraries you can run [this](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/build.sh) script:
+### Building from source
+
+To build the shared libraries you can run [this](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/build.sh) script inside the downloaded go dependency:
```sh
-./build.sh [-curve=] [-field=] [-hash=] [-cuda_version=] [-g2] [-ecntt] [-devmode]
+./build.sh [-curve=<curve>] [-field=<field>] [-cuda_version=<version>] [-skip_msm] [-skip_ntt] [-skip_g2] [-skip_ecntt] [-skip_fieldext]
curve - The name of the curve to build or "all" to build all supported curves
field - The name of the field to build or "all" to build all supported fields
-hash - The name of the hash to build or "all" to build all supported hashes
--g2 - Optional - build with G2 enabled
--ecntt - Optional - build with ECNTT enabled
--devmode - Optional - build in devmode
+-skip_msm - Optional - build with MSM disabled
+-skip_ntt - Optional - build with NTT disabled
+-skip_g2 - Optional - build with G2 disabled
+-skip_ecntt - Optional - build with ECNTT disabled
+-skip_fieldext - Optional - build without field extension
-help - Optional - Displays usage information
```
:::note
-If more than one curve or more than one field or more than one hash is supplied, the last one supplied will be built
+If more than one curve or more than one field is supplied, the last one supplied will be built
:::
-To build ICICLE libraries for all supported curves with G2 and ECNTT enabled.
+To build ICICLE libraries for all supported curves without certain features, use the corresponding `-skip_<feature>` flags. For example, to disable G2 and ECNTT:
```bash
-./build.sh -curve=all -g2 -ecntt
+./build.sh -curve=all -skip_g2 -skip_ecntt
```
-If you wish to build for a specific curve, for example bn254, without G2 or ECNTT enabled.
+By default, all features are enabled. To build for a specific field or curve, you can pass the `-field=<field>` or `-curve=<curve>` flags:
``` bash
./build.sh -curve=bn254
@@ -67,15 +70,24 @@ Now you can import ICICLE into your project
```go
import (
- "github.com/stretchr/testify/assert"
- "testing"
-
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
)
...
```
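+
+A minimal end-to-end sketch, mirroring the VecOps example later in these docs (it assumes the bn254 libraries were built as shown above):
+
+```go
+package main
+
+import (
+	"github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+	"github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+	"github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/vecOps"
+	"github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
+)
+
+func main() {
+	// Load a backend (CPU by default; CUDA if installed and selected).
+	runtime.LoadBackendFromEnvOrDefault()
+
+	size := 1 << 10
+	a := bn254.GenerateScalars(size)
+	b := bn254.GenerateScalars(size)
+	out := make(core.HostSlice[bn254.ScalarField], size)
+
+	cfg := core.DefaultVecOpsConfig()
+	if err := vecOps.VecOp(a, b, out, cfg, core.Add); err != runtime.Success {
+		panic("vector addition failed")
+	}
+}
+```
+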
+### Building with precompiled libs
+
+Download the frontend release binaries from our [GitHub release page](https://github.com/ingonyama-zk/icicle/releases), for example `icicle30-ubuntu22.tar.gz` for ICICLE v3 on Ubuntu 22.04.
+
+Extract the libs and move them to the downloaded Go dependency in your `GOMODCACHE`:
+
+```sh
+# extract frontend part
+tar xzvf icicle30-ubuntu22.tar.gz
+cp -r ./icicle/lib/* $(go env GOMODCACHE)/github.com/ingonyama-zk/icicle/v3@<version>/build/lib/
+```
+
## Running tests
To run all tests, for all curves:
@@ -84,23 +96,23 @@ To run all tests, for all curves:
go test ./... -count=1
```
-If you wish to run test for a specific curve:
+If you wish to run tests for a specific curve or field:
```bash
-go test -count=1
+go test <path_to_curve_or_field> -count=1
```
## How do Golang bindings work?
-The libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code.
+The Golang packages are bound to the libraries produced from compiling ICICLE, using cgo.
-1. These libraries (named `libingo_curve_.a` and `libingo_field_.a`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE.
+1. These libraries (named `libicicle_curve_<curve>.a` and `libicicle_field_<field>.a`) can be imported in your Go project to leverage the accelerated functionalities provided by ICICLE.
2. In your Go project, you can use `cgo` to link these libraries. Here's a basic example on how you can use `cgo` to link these libraries:
```go
/*
-#cgo LDFLAGS: -L/path/to/shared/libs -lingo_curve_bn254 -L$/path/to/shared/libs -lingo_field_bn254 -lstdc++ -lm
+#cgo LDFLAGS: -L/path/to/shared/libs -licicle_device -lstdc++ -lm -Wl,-rpath=/path/to/shared/libs
#include "icicle.h" // make sure you use the correct header file(s)
*/
import "C"
@@ -118,19 +130,19 @@ Replace `/path/to/shared/libs` with the actual path where the shared libraries a
### Supported curves and operations
| Operation\Curve | bn254 | bls12_377 | bls12_381 | bw6-761 | grumpkin |
-| --- | :---: | :---: | :---: | :---: | :---: |
-| MSM | ✅ | ✅ | ✅ | ✅ | ✅ |
-| G2 | ✅ | ✅ | ✅ | ✅ | ❌ |
-| NTT | ✅ | ✅ | ✅ | ✅ | ❌ |
-| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
-| VecOps | ✅ | ✅ | ✅ | ✅ | ✅ |
-| Polynomials | ✅ | ✅ | ✅ | ✅ | ❌ |
+| --------------- | :---: | :-------: | :-------: | :-----: | :------: |
+| MSM | ✅ | ✅ | ✅ | ✅ | ✅ |
+| G2 | ✅ | ✅ | ✅ | ✅ | ❌ |
+| NTT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| VecOps | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Polynomials | ✅ | ✅ | ✅ | ✅ | ❌ |
### Supported fields and operations
| Operation\Field | babybear |
-| --- | :---: |
-| VecOps | ✅ |
-| Polynomials | ✅ |
-| NTT | ✅ |
-| Extension Field | ✅ |
+| --------------- | :------: |
+| VecOps | ✅ |
+| Polynomials | ✅ |
+| NTT | ✅ |
+| Extension Field | ✅ |
diff --git a/docs/docs/icicle/golang-bindings/ecntt.md b/docs/docs/icicle/golang-bindings/ecntt.md
index 3947995ed..4fd1acf8e 100644
--- a/docs/docs/icicle/golang-bindings/ecntt.md
+++ b/docs/docs/icicle/golang-bindings/ecntt.md
@@ -5,7 +5,7 @@
The `ECNtt[T any]()` function performs the Elliptic Curve Number Theoretic Transform (EC-NTT) on the input points slice, using the provided dir (direction), cfg (configuration), and stores the results in the results slice.
```go
-func ECNtt[T any](points core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) core.IcicleError
+func ECNtt[T any](points core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) runtime.EIcicleError
```
### Parameters
@@ -17,7 +17,7 @@ func ECNtt[T any](points core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTC
### Return Value
-- **`CudaError`**: A `core.IcicleError` value, which will be `core.IcicleErrorCode(0)` if the EC-NTT operation was successful, or an error if something went wrong.
+- **`EIcicleError`**: A `runtime.EIcicleError` value, which will be `runtime.Success` if the EC-NTT operation was successful, or an error if something went wrong.
## NTT Configuration (NTTConfig)
@@ -25,29 +25,29 @@ The `NTTConfig` structure holds configuration parameters for the NTT operation,
```go
type NTTConfig[T any] struct {
- Ctx cr.DeviceContext
- CosetGen T
- BatchSize int32
- ColumnsBatch bool
- Ordering Ordering
- areInputsOnDevice bool
- areOutputsOnDevice bool
- IsAsync bool
- NttAlgorithm NttAlgorithm
+ StreamHandle runtime.Stream
+ CosetGen T
+ BatchSize int32
+ ColumnsBatch bool
+ Ordering Ordering
+ areInputsOnDevice bool
+ areOutputsOnDevice bool
+ IsAsync bool
+ Ext config_extension.ConfigExtensionHandler
}
```
### Fields
-- **`Ctx`**: Device context containing details like device ID and stream ID.
-- **`CosetGen`**: Coset generator used for coset (i)NTTs, defaulting to no coset being used.
+- **`StreamHandle`**: Specifies the stream (queue) to use for async execution.
+- **`CosetGen`**: Coset generator. Used to perform coset (i)NTTs.
- **`BatchSize`**: The number of NTTs to compute in one operation, defaulting to 1.
-- **`ColumnsBatch`**: If true the function will compute the NTTs over the columns of the input matrix and not over the rows. Defaults to `false`.
+- **`ColumnsBatch`**: If true the function will compute the NTTs over the columns of the input matrix and not over the rows.
- **`Ordering`**: Ordering of inputs and outputs (`KNN`, `KNR`, `KRN`, `KRR`), affecting how data is arranged.
- **`areInputsOnDevice`**: Indicates if input scalars are located on the device.
- **`areOutputsOnDevice`**: Indicates if results are stored on the device.
- **`IsAsync`**: Controls whether the NTT operation runs asynchronously.
-- **`NttAlgorithm`**: Explicitly select the NTT algorithm. ECNTT supports running on `Radix2` algoruithm.
+- **`Ext`**: Extended configuration for backend.
### Default Configuration
@@ -63,30 +63,38 @@ func GetDefaultNTTConfig[T any](cosetGen T) NTTConfig[T]
package main
import (
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/ecntt"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/ntt"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
)
func Main() {
- // Obtain the default NTT configuration with a predefined coset generator.
- cfg := GetDefaultNttConfig()
-
- // Define the size of the input scalars.
- size := 1 << 18
-
- // Generate Points for the ECNTT operation.
- points := GenerateProjectivePoints(size)
-
- // Set the direction of the NTT (forward or inverse).
- dir := core.KForward
-
- // Allocate memory for the results of the NTT operation.
- results := make(core.HostSlice[Projective], size)
-
- // Perform the NTT operation.
- err := ECNtt(points, dir, &cfg, results)
- if err != cr.CudaSuccess {
- panic("ECNTT operation failed")
- }
+ // Load backend using env path
+ runtime.LoadBackendFromEnvOrDefault()
+	// Set the CUDA device to use
+ device := runtime.CreateDevice("CUDA", 0)
+ runtime.SetDevice(&device)
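+
+	// Note: as with scalar NTT, the scalar-field NTT domain is assumed to be
+	// initialized via ntt.InitDomain (see the NTT docs) before calling ECNtt.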
+ // Obtain the default NTT configuration with a predefined coset generator.
+ cfg := ntt.GetDefaultNttConfig()
+
+ // Define the size of the input scalars.
+ size := 1 << 18
+
+ // Generate Points for the ECNTT operation.
+ points := bn254.GenerateProjectivePoints(size)
+
+ // Set the direction of the NTT (forward or inverse).
+ dir := core.KForward
+
+ // Allocate memory for the results of the NTT operation.
+ results := make(core.HostSlice[bn254.Projective], size)
+
+ // Perform the NTT operation.
+ err := ecntt.ECNtt(points, dir, &cfg, results)
+ if err != runtime.Success {
+ panic("ECNTT operation failed")
+ }
}
```
diff --git a/docs/docs/icicle/golang-bindings/keccak.md b/docs/docs/icicle/golang-bindings/keccak.md
deleted file mode 100644
index 1ecc80de1..000000000
--- a/docs/docs/icicle/golang-bindings/keccak.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# Keccak
-
-## Keccak Example
-
-```go
-package main
-
-import (
- "encoding/hex"
-
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/hash/keccak"
-)
-
-func createHostSliceFromHexString(hexString string) core.HostSlice[uint8] {
- byteArray, err := hex.DecodeString(hexString)
- if err != nil {
- panic("Not a hex string")
- }
- return core.HostSliceFromElements([]uint8(byteArray))
-}
-
-func main() {
- input := createHostSliceFromHexString("1725b6")
- outHost256 := make(core.HostSlice[uint8], 32)
-
- cfg := keccak.GetDefaultHashConfig()
- e := keccak.Keccak256(input, int32(input.Len()), 1, outHost256, &cfg)
- if e.CudaErrorCode != cr.CudaSuccess {
- panic("Keccak256 hashing failed")
- }
-
- outHost512 := make(core.HostSlice[uint8], 64)
- e = keccak.Keccak512(input, int32(input.Len()), 1, outHost512, &cfg)
- if e.CudaErrorCode != cr.CudaSuccess {
- panic("Keccak512 hashing failed")
- }
-
- numberOfBlocks := 3
- outHostBatch256 := make(core.HostSlice[uint8], 32*numberOfBlocks)
- e = keccak.Keccak256(input, int32(input.Len()/numberOfBlocks), int32(numberOfBlocks), outHostBatch256, &cfg)
- if e.CudaErrorCode != cr.CudaSuccess {
- panic("Keccak256 batch hashing failed")
- }
-}
-```
-
-## Keccak Methods
-
-```go
-func Keccak256(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *HashConfig) core.IcicleError
-func Keccak512(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *HashConfig) core.IcicleError
-```
-
-### Parameters
-
-- **`input`**: A slice containing the input data for the Keccak256 hash function. It can reside in either host memory or device memory.
-- **`inputBlockSize`**: An integer specifying the size of the input data for a single hash.
-- **`numberOfBlocks`**: An integer specifying the number of results in the hash batch.
-- **`output`**: A slice where the resulting hash will be stored. This slice can be in host or device memory.
-- **`config`**: A pointer to a `HashConfig` object, which contains various configuration options for the Keccak256 operation.
-
-### Return Value
-
-- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the Keccak256/Keccak512 operation.
-
-## HashConfig
-
-The `HashConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
-
-```go
-type HashConfig struct {
- Ctx cr.DeviceContext
- areInputsOnDevice bool
- areOutputsOnDevice bool
- IsAsync bool
-}
-```
-
-### Fields
-
-- **`Ctx`**: Device context containing details like device id and stream.
-- **`areInputsOnDevice`**: Indicates if input data is located on the device.
-- **`areOutputsOnDevice`**: Indicates if output hash is stored on the device.
-- **`IsAsync`**: If true, runs the Keccak256/Keccak512 operation asynchronously.
-
-### Default Configuration
-
-Use `GetDefaultHashConfig` to obtain a default configuration, which can then be customized as needed.
-
-```go
-func GetDefaultHashConfig() HashConfig
-```
\ No newline at end of file
diff --git a/docs/docs/icicle/golang-bindings/msm-pre-computation.md b/docs/docs/icicle/golang-bindings/msm-pre-computation.md
index 888a01dd5..f13998d3e 100644
--- a/docs/docs/icicle/golang-bindings/msm-pre-computation.md
+++ b/docs/docs/icicle/golang-bindings/msm-pre-computation.md
@@ -4,9 +4,9 @@ To understand the theory behind MSM pre computation technique refer to Niall Emm
## Core package
-### MSM PrecomputePoints
+### MSM PrecomputeBases
-`PrecomputePoints` and `G2PrecomputePoints` exists for all supported curves.
+`PrecomputeBases` and `G2PrecomputeBases` exist for all supported curves.
#### Description
@@ -14,18 +14,17 @@ This function extends each provided base point $(P)$ with its multiples $(2^lP,
The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.
-#### `PrecomputePoints`
+#### `PrecomputeBases`
Precomputes points for MSM by extending each base point with its multiples.
```go
-func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError
+func PrecomputeBases(bases core.HostOrDeviceSlice, cfg *core.MSMConfig, outputBases core.DeviceSlice) runtime.EIcicleError
```
##### Parameters
-- **`points`**: A slice of the original affine points to be extended with their multiples.
-- **`msmSize`**: The size of a single msm in order to determine optimal parameters.
+- **`bases`**: A slice of the original affine points to be extended with their multiples.
- **`cfg`**: The MSM configuration parameters.
- **`outputBases`**: The device slice allocated for storing the extended points.
@@ -37,37 +36,43 @@ package main
import (
"log"
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
- bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/msm"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
)
func main() {
- cfg := bn254.GetDefaultMSMConfig()
+ // Load backend using env path
+ runtime.LoadBackendFromEnvOrDefault()
+	// Set the CUDA device to use
+ device := runtime.CreateDevice("CUDA", 0)
+ runtime.SetDevice(&device)
+
+ cfg := core.GetDefaultMSMConfig()
points := bn254.GenerateAffinePoints(1024)
- var precomputeFactor int32 = 8
+ cfg.PrecomputeFactor = 8
var precomputeOut core.DeviceSlice
- precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
+ precomputeOut.Malloc(points[0].Size(), points.Len()*int(cfg.PrecomputeFactor))
- err := bn254.PrecomputePoints(points, 1024, &cfg, precomputeOut)
- if err != cr.CudaSuccess {
+ err := msm.PrecomputeBases(points, &cfg, precomputeOut)
+ if err != runtime.Success {
log.Fatalf("PrecomputeBases failed: %v", err)
}
}
```
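+
+To actually use the extended bases, pass the precomputed device slice to `msm.Msm` in place of the original points, keeping `cfg.PrecomputeFactor` set. A sketch continuing the example above (the scalar count matching the 1024 points is an assumption):
+
+```go
+	scalars := bn254.GenerateScalars(1024)
+
+	var p bn254.Projective
+	var out core.DeviceSlice
+	out.Malloc(p.Size(), 1) // a single MSM result
+
+	err = msm.Msm(scalars, precomputeOut, &cfg, out)
+	if err != runtime.Success {
+		log.Fatalf("Msm with precomputed bases failed: %v", err)
+	}
+```
+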
-#### `G2PrecomputePoints`
+#### `G2PrecomputeBases`
-This method is the same as `PrecomputePoints` but for G2 points. Extends each G2 curve base point with its multiples for optimized MSM computations.
+This method is the same as `PrecomputeBases` but for G2 points. Extends each G2 curve base point with its multiples for optimized MSM computations.
```go
-func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError
+func G2PrecomputeBases(bases core.HostOrDeviceSlice, cfg *core.MSMConfig, outputBases core.DeviceSlice) runtime.EIcicleError
```
##### Parameters
-- **`points`**: A slice of the original affine points to be extended with their multiples.
-- **`msmSize`**: The size of a single msm in order to determine optimal parameters.
+- **`bases`**: A slice of the original affine points to be extended with their multiples.
- **`cfg`**: The MSM configuration parameters.
- **`outputBases`**: The device slice allocated for storing the extended points.
@@ -79,20 +84,26 @@ package main
import (
"log"
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
- g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/g2"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
)
func main() {
- cfg := g2.G2GetDefaultMSMConfig()
+ // Load backend using env path
+ runtime.LoadBackendFromEnvOrDefault()
+	// Set the CUDA device to use
+ device := runtime.CreateDevice("CUDA", 0)
+ runtime.SetDevice(&device)
+
+ cfg := core.GetDefaultMSMConfig()
points := g2.G2GenerateAffinePoints(1024)
- var precomputeFactor int32 = 8
+ cfg.PrecomputeFactor = 8
var precomputeOut core.DeviceSlice
- precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
+ precomputeOut.Malloc(points[0].Size(), points.Len()*int(cfg.PrecomputeFactor))
- err := g2.G2PrecomputePoints(points, 1024, 0, &cfg, precomputeOut)
- if err != cr.CudaSuccess {
+ err := g2.G2PrecomputeBases(points, &cfg, precomputeOut)
+ if err != runtime.Success {
log.Fatalf("PrecomputeBases failed: %v", err)
}
}
diff --git a/docs/docs/icicle/golang-bindings/msm.md b/docs/docs/icicle/golang-bindings/msm.md
index 72710c551..7209eca47 100644
--- a/docs/docs/icicle/golang-bindings/msm.md
+++ b/docs/docs/icicle/golang-bindings/msm.md
@@ -6,13 +6,19 @@
package main
import (
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
- bn254_msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/msm"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
)
func main() {
+ // Load backend using env path
+ runtime.LoadBackendFromEnvOrDefault()
+	// Set the CUDA device to use
+ device := runtime.CreateDevice("CUDA", 0)
+ runtime.SetDevice(&device)
+
// Obtain the default MSM configuration.
cfg := core.GetDefaultMSMConfig()
@@ -24,43 +30,43 @@ func main() {
points := bn254.GenerateAffinePoints(size)
// Create a CUDA stream for asynchronous operations.
- stream, _ := cr.CreateStream()
+ stream, _ := runtime.CreateStream()
var p bn254.Projective
// Allocate memory on the device for the result of the MSM operation.
var out core.DeviceSlice
- _, e := out.MallocAsync(p.Size(), p.Size(), stream)
+ _, e := out.MallocAsync(p.Size(), 1, stream)
- if e != cr.CudaSuccess {
+ if e != runtime.Success {
panic(e)
}
// Set the CUDA stream in the MSM configuration.
- cfg.Ctx.Stream = &stream
+ cfg.StreamHandle = stream
cfg.IsAsync = true
// Perform the MSM operation.
- e = bn254_msm.Msm(scalars, points, &cfg, out)
+ e = msm.Msm(scalars, points, &cfg, out)
- if e != cr.CudaSuccess {
+ if e != runtime.Success {
panic(e)
}
// Allocate host memory for the results and copy the results from the device.
outHost := make(core.HostSlice[bn254.Projective], 1)
- cr.SynchronizeStream(&stream)
+ runtime.SynchronizeStream(stream)
+ runtime.DestroyStream(stream)
outHost.CopyFromDevice(&out)
// Free the device memory allocated for the results.
out.Free()
}
-
```
## MSM Method
```go
-func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *core.MSMConfig, results core.HostOrDeviceSlice) cr.CudaError
+func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *core.MSMConfig, results core.HostOrDeviceSlice) runtime.EIcicleError
```
### Parameters
@@ -72,7 +78,7 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
### Return Value
-- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the MSM operation.
+- **`EIcicleError`**: A `runtime.EIcicleError` value, which will be `runtime.Success` if the operation was successful, or an error if something went wrong.
## MSMConfig
@@ -80,37 +86,37 @@ The `MSMConfig` structure holds configuration parameters for the MSM operation,
```go
type MSMConfig struct {
- Ctx cr.DeviceContext
- PrecomputeFactor int32
- C int32
- Bitsize int32
- LargeBucketFactor int32
- batchSize int32
- areScalarsOnDevice bool
- AreScalarsMontgomeryForm bool
- arePointsOnDevice bool
- ArePointsMontgomeryForm bool
- areResultsOnDevice bool
- IsBigTriangle bool
- IsAsync bool
+ StreamHandle runtime.Stream
+ PrecomputeFactor int32
+ C int32
+ Bitsize int32
+ BatchSize int32
+ ArePointsSharedInBatch bool
+ areScalarsOnDevice bool
+ AreScalarsMontgomeryForm bool
+ areBasesOnDevice bool
+ AreBasesMontgomeryForm bool
+ areResultsOnDevice bool
+ IsAsync bool
+ Ext config_extension.ConfigExtensionHandler
}
```
### Fields
-- **`Ctx`**: Device context containing details like device id and stream.
+- **`StreamHandle`**: Specifies the stream (queue) to use for async execution.
- **`PrecomputeFactor`**: Controls the number of extra points to pre-compute.
- **`C`**: Window bitsize, a key parameter in the "bucket method" for MSM.
- **`Bitsize`**: Number of bits of the largest scalar.
-- **`LargeBucketFactor`**: Sensitivity to frequently occurring buckets.
-- **`batchSize`**: Number of results to compute in one batch.
+- **`BatchSize`**: Number of results to compute in one batch.
+- **`ArePointsSharedInBatch`**: Bases are shared for batch. Set to true if all MSMs use the same bases. Otherwise, the number of bases and number of scalars are expected to be equal.
- **`areScalarsOnDevice`**: Indicates if scalars are located on the device.
- **`AreScalarsMontgomeryForm`**: True if scalars are in Montgomery form.
-- **`arePointsOnDevice`**: Indicates if points are located on the device.
-- **`ArePointsMontgomeryForm`**: True if point coordinates are in Montgomery form.
+- **`areBasesOnDevice`**: Indicates if bases are located on the device.
+- **`AreBasesMontgomeryForm`**: True if point coordinates are in Montgomery form.
- **`areResultsOnDevice`**: Indicates if results are stored on the device.
-- **`IsBigTriangle`**: If `true` MSM will run in Large triangle accumulation if `false` Bucket accumulation will be chosen. Default value: false.
- **`IsAsync`**: If true, runs MSM asynchronously.
+- **`Ext`**: Extended configuration for backend.
### Default Configuration
@@ -120,9 +126,9 @@ Use `GetDefaultMSMConfig` to obtain a default configuration, which can then be c
func GetDefaultMSMConfig() MSMConfig
```
-## How do I toggle between the supported algorithms?
+## Batched MSM
-When creating your MSM Config you may state which algorithm you wish to use. `cfg.Ctx.IsBigTriangle = true` will activate Large triangle reduction and `cfg.Ctx.IsBigTriangle = false` will activate iterative reduction.
+For batched MSM, simply allocate the results array with a size corresponding to the batch size and set the `ArePointsSharedInBatch` flag in the config struct accordingly; a complete sketch follows the snippet below.
```go
...
@@ -147,7 +153,7 @@ The number of results is interpreted from the size of `var out core.DeviceSlice`
batchSize := 3
var p G2Projective
var out core.DeviceSlice
-out.Malloc(batchSize*p.Size(), p.Size())
+out.Malloc(p.Size(), batchSize)
...
```
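+
+Putting it together, a complete batched sketch, assuming the bases are shared across the batch (sizes are illustrative):
+
+```go
+cfg := core.GetDefaultMSMConfig()
+cfg.ArePointsSharedInBatch = true
+
+size := 1 << 12
+batchSize := 3
+scalars := bn254.GenerateScalars(size * batchSize)
+points := bn254.GenerateAffinePoints(size) // shared by all MSMs in the batch
+
+var p bn254.Projective
+var out core.DeviceSlice
+out.Malloc(p.Size(), batchSize) // one result per MSM in the batch
+
+err := msm.Msm(scalars, points, &cfg, out)
+if err != runtime.Success {
+	panic("batched MSM failed")
+}
+```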
@@ -164,7 +170,7 @@ Now you may import `g2` package of the specified curve.
```go
import (
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/g2"
)
```
@@ -174,25 +180,26 @@ This package include `G2Projective` and `G2Affine` points as well as a `G2Msm` m
package main
import (
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
- g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
+ "log"
+
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+	"github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/g2"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
)
func main() {
cfg := core.GetDefaultMSMConfig()
- size := 1 << 12
- batchSize := 3
- totalSize := size * batchSize
- scalars := bn254.GenerateScalars(totalSize)
- points := g2.G2GenerateAffinePoints(totalSize)
-
- var p g2.G2Projective
- var out core.DeviceSlice
- out.Malloc(batchSize*p.Size(), p.Size())
- g2.G2Msm(scalars, points, &cfg, out)
+	size := 1 << 12
+	batchSize := 3
+	totalSize := size * batchSize
+	scalars := bn254.GenerateScalars(totalSize)
+	points := g2.G2GenerateAffinePoints(totalSize)
+
+	var p g2.G2Projective
+	var out core.DeviceSlice
+	out.Malloc(p.Size(), batchSize)
+
+	err := g2.G2Msm(scalars, points, &cfg, out)
+	if err != runtime.Success {
+		log.Fatalf("G2Msm failed: %v", err)
+	}
}
-
```
`G2Msm` works the same way as normal MSM, the difference is that it uses G2 Points.
diff --git a/docs/docs/icicle/golang-bindings/multi-gpu.md b/docs/docs/icicle/golang-bindings/multi-gpu.md
index 3d7cdf187..186c02018 100644
--- a/docs/docs/icicle/golang-bindings/multi-gpu.md
+++ b/docs/docs/icicle/golang-bindings/multi-gpu.md
@@ -19,48 +19,64 @@ import (
"fmt"
"sync"
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
- bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ bn254 "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/msm"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
)
func main() {
- numDevices, _ := cr.GetDeviceCount()
+ // Load backend using env path
+ runtime.LoadBackendFromEnvOrDefault()
+
+ device := runtime.CreateDevice("CUDA", 0)
+	err := runtime.SetDevice(&device)
+	if err != runtime.Success {
+		panic(err)
+	}
+	numDevices, _ := runtime.GetDeviceCount()
fmt.Println("There are ", numDevices, " devices available")
+
wg := sync.WaitGroup{}
for i := 0; i < numDevices; i++ {
+ internalDevice := runtime.Device{DeviceType: device.DeviceType, Id: int32(i)}
wg.Add(1)
- // RunOnDevice makes sure each MSM runs on a single thread
- cr.RunOnDevice(i, func(args ...any) {
+ runtime.RunOnDevice(&internalDevice, func(args ...any) {
defer wg.Done()
- cfg := bn254.GetDefaultMSMConfig()
+ currentDevice, err := runtime.GetActiveDevice()
+ if err != runtime.Success {
+ panic("Failed to get current device")
+ }
+
+ fmt.Println("Running on ", currentDevice.GetDeviceType(), " ", currentDevice.Id, " device")
+
+ cfg := msm.GetDefaultMSMConfig()
cfg.IsAsync = true
- for _, power := range []int{10, 18} {
- size := 1 << power // 2^pwr
-
- // generate random scalars
- scalars := bn254.GenerateScalars(size)
- points := bn254.GenerateAffinePoints(size)
-
- // create a stream and allocate result pointer
- stream, _ := cr.CreateStream()
- var p bn254.Projective
- var out core.DeviceSlice
- out.MallocAsync(p.Size(), p.Size(), stream)
- // assign stream to device context
- cfg.Ctx.Stream = &stream
-
- // execute MSM
- bn254.Msm(scalars, points, &cfg, out)
- // read result from device
- outHost := make(core.HostSlice[bn254.Projective], 1)
- outHost.CopyFromDeviceAsync(&out, stream)
- out.FreeAsync(stream)
-
- // sync the stream
- cr.SynchronizeStream(&stream)
+ size := 1 << 10
+ scalars := bn254.GenerateScalars(size)
+ points := bn254.GenerateAffinePoints(size)
+
+ stream, _ := runtime.CreateStream()
+ var p bn254.Projective
+ var out core.DeviceSlice
+ _, err = out.MallocAsync(p.Size(), 1, stream)
+ if err != runtime.Success {
+ panic("Allocating bytes on device for Projective results failed")
+ }
+ cfg.StreamHandle = stream
+
+ err = msm.Msm(scalars, points, &cfg, out)
+ if err != runtime.Success {
+ panic("Msm failed")
}
+ outHost := make(core.HostSlice[bn254.Projective], 1)
+ outHost.CopyFromDeviceAsync(&out, stream)
+ out.FreeAsync(stream)
+
+ runtime.SynchronizeStream(stream)
+ runtime.DestroyStream(stream)
})
}
wg.Wait()
@@ -71,7 +87,7 @@ This example demonstrates a basic pattern for distributing tasks across multiple
## Device Management API
-To streamline device management we offer as part of `cuda_runtime` package methods for dealing with devices.
+To streamline device management we offer, as part of the `runtime` package, methods for dealing with devices.
### `RunOnDevice`
@@ -87,7 +103,7 @@ While the goroutine is locked to the host thread, the Go runtime will not assign
**Parameters:**
-- **`deviceId int`**: The ID of the device on which to run the provided function. Device IDs start from 0.
+- **`device *Device`**: A pointer to the `Device` instance to be used to run code.
- **`funcToRun func(args ...any)`**: The function to be executed on the specified device.
- **`args ...any`**: Arguments to be passed to `funcToRun`.
@@ -102,7 +118,8 @@ Any goroutines launched within `funcToRun` are not automatically bound to the sa
**Example:**
```go
-RunOnDevice(0, func(args ...any) {
+device := runtime.CreateDevice("CUDA", 0)
+RunOnDevice(&device, func(args ...any) {
fmt.Println("This runs on GPU 0")
// CUDA-related operations here will target GPU 0
}, nil)
@@ -110,7 +127,7 @@ RunOnDevice(0, func(args ...any) {
### `SetDevice`
-Sets the active device for the current host thread. All subsequent CUDA calls made from this thread will target the specified device.
+Sets the active device for the current host thread. All subsequent calls made from this thread will target the specified device.
:::warning
This function should not be used directly in conjunction with goroutines. If you want to run multi-gpu scenarios with goroutines you should use [RunOnDevice](#runondevice)
@@ -118,38 +135,27 @@ This function should not be used directly in conjunction with goroutines. If you
**Parameters:**
-- **`device int`**: The ID of the device to set as the current device.
+- **`device *Device`**: A pointer to the `Device` instance to be used to run code.
**Returns:**
-- **`CudaError`**: Error code indicating the success or failure of the operation.
+- **`EIcicleError`**: A `runtime.EIcicleError` value, which will be `runtime.Success` if the operation was successful, or an error if something went wrong.
### `GetDeviceCount`
-Retrieves the number of CUDA-capable devices available on the host.
+Retrieves the number of devices available on the host.
**Returns:**
-- **`(int, CudaError)`**: The number of devices and an error code indicating the success or failure of the operation.
+- **`(int, EIcicleError)`**: The number of devices and an error code indicating the success or failure of the operation.
-### `GetDevice`
+### `GetActiveDevice`
-Gets the ID of the currently active device for the calling host thread.
+Gets the currently active device for the calling host thread.
**Returns:**
-- **`(int, CudaError)`**: The ID of the current device and an error code indicating the success or failure of the operation.
-
-### `GetDeviceFromPointer`
-
-Retrieves the device associated with a given pointer.
-
-**Parameters:**
-
-- **`ptr unsafe.Pointer`**: Pointer to query.
-
-**Returns:**
+- **`(*Device, EIcicleError)`**: The device pointer and an error code indicating the success or failure of the operation.
-- **`int`**: The device ID associated with the memory pointed to by `ptr`.
-This documentation should provide a clear understanding of how to effectively manage multiple GPUs in Go applications using CUDA, with a particular emphasis on the `RunOnDevice` function for executing tasks on specific GPUs.
+This documentation should provide a clear understanding of how to effectively manage multiple GPUs in Go applications using CUDA and other backends, with a particular emphasis on the `RunOnDevice` function for executing tasks on specific GPUs.
diff --git a/docs/docs/icicle/golang-bindings/ntt.md b/docs/docs/icicle/golang-bindings/ntt.md
index b51c79f9a..9a947603e 100644
--- a/docs/docs/icicle/golang-bindings/ntt.md
+++ b/docs/docs/icicle/golang-bindings/ntt.md
@@ -6,56 +6,66 @@
package main
import (
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
- bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/ntt"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
- "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft"
+ "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft"
)
func init() {
- cfg := bn254.GetDefaultNttConfig()
- initDomain(18, cfg)
+ // Load backend using env path
+ runtime.LoadBackendFromEnvOrDefault()
+	// Set the CUDA device to use
+ device := runtime.CreateDevice("CUDA", 0)
+ runtime.SetDevice(&device)
+
+ cfg := core.GetDefaultNTTInitDomainConfig()
+ initDomain(18, cfg)
}
-func initDomain[T any](largestTestSize int, cfg core.NTTConfig[T]) core.IcicleError {
- rouMont, _ := fft.Generator(uint64(1 << largestTestSize))
- rou := rouMont.Bits()
- rouIcicle := bn254.ScalarField{}
+func initDomain(largestTestSize int, cfg core.NTTInitDomainConfig) runtime.EIcicleError {
+ rouMont, _ := fft.Generator(uint64(1 << largestTestSize))
+ rou := rouMont.Bits()
+ rouIcicle := bn254.ScalarField{}
+ limbs := core.ConvertUint64ArrToUint32Arr(rou[:])
- rouIcicle.FromLimbs(rou[:])
- e := bn254.InitDomain(rouIcicle, cfg.Ctx, false)
- return e
+ rouIcicle.FromLimbs(limbs)
+ e := ntt.InitDomain(rouIcicle, cfg)
+ return e
}
func main() {
- // Obtain the default NTT configuration with a predefined coset generator.
- cfg := bn254.GetDefaultNttConfig()
+ // Obtain the default NTT configuration with a predefined coset generator.
+ cfg := ntt.GetDefaultNttConfig()
- // Define the size of the input scalars.
- size := 1 << 18
+ // Define the size of the input scalars.
+ size := 1 << 18
- // Generate scalars for the NTT operation.
- scalars := bn254.GenerateScalars(size)
+ // Generate scalars for the NTT operation.
+ scalars := bn254.GenerateScalars(size)
- // Set the direction of the NTT (forward or inverse).
- dir := core.KForward
+ // Set the direction of the NTT (forward or inverse).
+ dir := core.KForward
- // Allocate memory for the results of the NTT operation.
- results := make(core.HostSlice[bn254.ScalarField], size)
+ // Allocate memory for the results of the NTT operation.
+ results := make(core.HostSlice[bn254.ScalarField], size)
- // Perform the NTT operation.
- err := bn254.Ntt(scalars, dir, &cfg, results)
- if err.CudaErrorCode != cr.CudaSuccess {
- panic("NTT operation failed")
- }
+ // Perform the NTT operation.
+ err := ntt.Ntt(scalars, dir, &cfg, results)
+ if err != runtime.Success {
+ panic("NTT operation failed")
+ }
+
+ ntt.ReleaseDomain()
}
```
## NTT Method
```go
-func Ntt[T any](scalars core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) core.IcicleError
+func Ntt[T any](scalars core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) runtime.EIcicleError
```
### Parameters
@@ -67,7 +77,7 @@ func Ntt[T any](scalars core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTCo
### Return Value
-- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the NTT operation.
+- **`EIcicleError`**: A `runtime.EIcicleError` value, which will be `runtime.Success` if the operation was successful, or an error if something went wrong.
## NTT Configuration (NTTConfig)
@@ -75,29 +85,29 @@ The `NTTConfig` structure holds configuration parameters for the NTT operation,
```go
type NTTConfig[T any] struct {
- Ctx cr.DeviceContext
- CosetGen T
- BatchSize int32
- ColumnsBatch bool
- Ordering Ordering
- areInputsOnDevice bool
- areOutputsOnDevice bool
- IsAsync bool
- NttAlgorithm NttAlgorithm
+ StreamHandle runtime.Stream
+ CosetGen T
+ BatchSize int32
+ ColumnsBatch bool
+ Ordering Ordering
+ areInputsOnDevice bool
+ areOutputsOnDevice bool
+ IsAsync bool
+ Ext config_extension.ConfigExtensionHandler
}
```
### Fields
-- **`Ctx`**: Device context containing details like device ID and stream ID.
-- **`CosetGen`**: Coset generator used for coset (i)NTTs, defaulting to no coset being used.
+- **`StreamHandle`**: Specifies the stream (queue) to use for async execution.
+- **`CosetGen`**: Coset generator. Used to perform coset (i)NTTs.
- **`BatchSize`**: The number of NTTs to compute in one operation, defaulting to 1.
-- **`ColumnsBatch`**: If true the function will compute the NTTs over the columns of the input matrix and not over the rows. Defaults to `false`.
-- **`Ordering`**: Ordering of inputs and outputs (`KNN`, `KNR`, `KRN`, `KRR`, `KMN`, `KNM`), affecting how data is arranged.
+- **`ColumnsBatch`**: If true the function will compute the NTTs over the columns of the input matrix and not over the rows.
+- **`Ordering`**: Ordering of inputs and outputs (`KNN`, `KNR`, `KRN`, `KRR`), affecting how data is arranged.
- **`areInputsOnDevice`**: Indicates if input scalars are located on the device.
- **`areOutputsOnDevice`**: Indicates if results are stored on the device.
- **`IsAsync`**: Controls whether the NTT operation runs asynchronously.
-- **`NttAlgorithm`**: Explicitly select the NTT algorithm. Default value: Auto (the implementation selects radix-2 or mixed-radix algorithm based on heuristics).
+- **`Ext`**: Extended configuration for backend.
### Default Configuration
@@ -112,7 +122,7 @@ func GetDefaultNTTConfig[T any](cosetGen T) NTTConfig[T]
Before performing NTT operations, it's necessary to initialize the NTT domain; it only needs to be called once per GPU since the twiddles are cached.
```go
-func InitDomain(primitiveRoot ScalarField, ctx cr.DeviceContext, fastTwiddles bool) core.IcicleError
+func InitDomain(primitiveRoot bn254.ScalarField, cfg core.NTTInitDomainConfig) runtime.EIcicleError
```
This function initializes the domain with a given primitive root, optionally using fast twiddle factors to optimize the computation.
@@ -122,30 +132,9 @@ This function initializes the domain with a given primitive root, optionally usi
-The `ReleaseDomain` function is responsible for releasing the resources associated with a specific domain in the CUDA device context.
+The `ReleaseDomain` function is responsible for releasing the resources associated with a specific domain on the active device.
```go
-func ReleaseDomain(ctx cr.DeviceContext) core.IcicleError
+func ReleaseDomain() runtime.EIcicleError
```
-### Parameters
-
-- **`ctx`**: a reference to the `DeviceContext` object, which represents the CUDA device context.
-
### Return Value
-The function returns a `core.IcicleError`, which represents the result of the operation. If the operation is successful, the function returns `core.IcicleErrorCode(0)`.
-
-### Example
-
-```go
-import (
- "github.com/icicle-crypto/icicle-core/cr"
- "github.com/icicle-crypto/icicle-core/core"
-)
-
-func example() {
- cfg := GetDefaultNttConfig()
- err := ReleaseDomain(cfg.Ctx)
- if err != nil {
- // Handle the error
- }
-}
-```
+- **`EIcicleError`**: A `runtime.EIcicleError` value, which will be `runtime.Success` if the operation was successful, or an error if something went wrong.
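+
+### Example
+
+A minimal sketch with the new API, restoring the example removed above:
+
+```go
+import (
+	"github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/ntt"
+	"github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
+)
+
+func example() {
+	if err := ntt.ReleaseDomain(); err != runtime.Success {
+		// Handle the error
+	}
+}
+```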
diff --git a/docs/docs/icicle/golang-bindings/vec-ops.md b/docs/docs/icicle/golang-bindings/vec-ops.md
index c39c8f2f0..e93d9a0a2 100644
--- a/docs/docs/icicle/golang-bindings/vec-ops.md
+++ b/docs/docs/icicle/golang-bindings/vec-ops.md
@@ -17,9 +17,10 @@ Icicle exposes a number of vector operations which a user can use:
package main
import (
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
- bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/vecOps"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
)
func main() {
@@ -30,8 +31,8 @@ func main() {
cfg := core.DefaultVecOpsConfig()
-	// Perform vector multiplication
+	// Perform vector addition
- err := bn254.VecOp(a, b, out, cfg, core.Add)
- if err != cr.CudaSuccess {
+ err := vecOps.VecOp(a, b, out, cfg, core.Add)
+ if err != runtime.Success {
panic("Vector addition failed")
}
}
@@ -43,9 +44,10 @@ func main() {
package main
import (
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
- bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/vecOps"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
)
func main() {
@@ -56,8 +58,8 @@ func main() {
cfg := core.DefaultVecOpsConfig()
-	// Perform vector multiplication
+	// Perform vector subtraction
- err := bn254.VecOp(a, b, out, cfg, core.Sub)
- if err != cr.CudaSuccess {
+ err := vecOps.VecOp(a, b, out, cfg, core.Sub)
+ if err != runtime.Success {
panic("Vector subtraction failed")
}
}
@@ -69,9 +71,10 @@ func main() {
package main
import (
- "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
- cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
- bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/vecOps"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
)
func main() {
@@ -82,8 +85,8 @@ func main() {
cfg := core.DefaultVecOpsConfig()
// Perform vector multiplication
- err := bn254.VecOp(a, b, out, cfg, core.Mul)
- if err != cr.CudaSuccess {
+ err := vecOps.VecOp(a, b, out, cfg, core.Mul)
+ if err != runtime.Success {
panic("Vector multiplication failed")
}
}
@@ -92,7 +95,7 @@ func main() {
### VecOps Method
```go
-func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret cr.CudaError)
+func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret runtime.EIcicleError)
```
#### Parameters
@@ -105,7 +108,7 @@ func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.V
#### Return Value
-- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the vector operation.
+- **`EIcicleError`**: A `runtime.EIcicleError` value, which will be `runtime.Success` if the operation was successful, or an error if something went wrong.
### VecOpsConfig
@@ -113,21 +116,23 @@ The `VecOpsConfig` structure holds configuration parameters for the vector opera
```go
type VecOpsConfig struct {
- Ctx cr.DeviceContext
- isAOnDevice bool
- isBOnDevice bool
- isResultOnDevice bool
- IsAsync bool
+ StreamHandle runtime.Stream
+ isAOnDevice bool
+ isBOnDevice bool
+ isResultOnDevice bool
+ IsAsync bool
+ Ext config_extension.ConfigExtensionHandler
}
```
#### Fields
-- **Ctx**: Device context containing details like device ID and stream ID.
-- **isAOnDevice**: Indicates if vector `a` is located on the device.
-- **isBOnDevice**: Indicates if vector `b` is located on the device.
-- **isResultOnDevice**: Specifies where the result vector should be stored (device or host memory).
-- **IsAsync**: Controls whether the vector operation runs asynchronously.
+- **`StreamHandle`**: Specifies the stream (queue) to use for async execution.
+- **`isAOnDevice`**: Indicates if vector `a` is located on the device.
+- **`isBOnDevice`**: Indicates if vector `b` is located on the device.
+- **`isResultOnDevice`**: Specifies where the result vector should be stored (device or host memory).
+- **`IsAsync`**: Controls whether the vector operation runs asynchronously.
+- **`Ext`**: Extended configuration for the backend.
#### Default Configuration
@@ -146,7 +151,7 @@ The function takes a matrix represented as a 1D slice and transposes it, storing
### Function
```go
-func TransposeMatrix(in, out core.HostOrDeviceSlice, columnSize, rowSize int, ctx cr.DeviceContext, onDevice, isAsync bool) (ret core.IcicleError)
+func TransposeMatrix(in, out core.HostOrDeviceSlice, columnSize, rowSize int, config core.VecOpsConfig) runtime.EIcicleError
```
## Parameters
@@ -155,13 +160,11 @@ func TransposeMatrix(in, out core.HostOrDeviceSlice, columnSize, rowSize int, ct
- **`out`**: The output matrix is a `core.HostOrDeviceSlice`, which will be the transpose of the input matrix, stored as a 1D slice.
- **`columnSize`**: The number of columns in the input matrix.
- **`rowSize`**: The number of rows in the input matrix.
-- **`ctx`**: The device context `cr.DeviceContext` to be used for the matrix transpose operation.
-- **`onDevice`**: Indicates whether the input and output slices are stored on the device (GPU) or the host (CPU).
-- **`isAsync`**: Indicates whether the matrix transpose operation should be executed asynchronously.
+- **`config`**: A `VecOpsConfig` object containing various configuration options for the vector operations.
## Return Value
-The function returns a `core.IcicleError` value, which represents the result of the matrix transpose operation. If the operation is successful, the returned value will be `0`.
+- **`EIcicleError`**: A `runtime.EIcicleError` value, which will be `runtime.Success` if the operation was successful, or an error if something went wrong.
## Example Usage
@@ -173,11 +176,11 @@ var output = make(core.HostSlice[ScalarField], 20)
// ...
// Get device context
-ctx, _ := cr.GetDefaultDeviceContext()
+cfg := core.DefaultVecOpsConfig()
// Transpose the matrix
-err := TransposeMatrix(input, output, 5, 4, ctx, false, false)
-if err.IcicleErrorCode != core.IcicleErrorCode(0) {
+err := TransposeMatrix(input, output, 5, 4, cfg)
+if err != runtime.Success {
// Handle the error
}
diff --git a/docs/docs/icicle/install_cuda_backend.md b/docs/docs/icicle/install_cuda_backend.md
new file mode 100644
index 000000000..32c76793d
--- /dev/null
+++ b/docs/docs/icicle/install_cuda_backend.md
@@ -0,0 +1,30 @@
+
+# CUDA Backend
+
+## Overview
+
+The CUDA backend in ICICLE V3 is a high-performance, closed-source component designed to accelerate cryptographic computations using NVIDIA GPUs. This backend includes specialized libraries optimized for various cryptographic fields and curves, providing significant speedups for operations such as MSM, NTT, and elliptic curve operations.
+
+## Installation
+
+The CUDA backend is a closed-source component that requires a license. [To install the CUDA backend, see here](./getting_started#installing-and-using-icicle).
+
+### Licensing
+
+:::note
+Currently, the CUDA backend is free to use via Ingonyama’s icicle-cuda-backend-license server. By default, the CUDA backend will attempt to access this server. For more details, please contact support@ingonyama.com.
+:::
+
+The CUDA backend requires a valid license to function. There are two types of CUDA backend licenses:
+
+ 1. **Floating license**: In this mode, you host a license server, provided as a binary. This license supports a limited number of concurrent GPUs (N), which can be distributed across your machines as needed. N is decremented by 1 for each GPU using ICICLE per process. Once the process terminates (or crashes), the licenses are released.
+ 2. **Node locked license**: In this mode, the license is tied to a specific machine. The CUDA backend will accept the license only if it is used on the licensed machine.
+
+**To specify the license server address or file path:**
+
+```sh
+export ICICLE_LICENSE=port@ip # For license server
+export ICICLE_LICENSE=/path/to/license # For node-locked license
+```
+
+For further assistance, contact our support team at support@ingonyama.com.
diff --git a/docs/docs/icicle/introduction.md b/docs/docs/icicle/introduction.md
deleted file mode 100644
index c45985055..000000000
--- a/docs/docs/icicle/introduction.md
+++ /dev/null
@@ -1,247 +0,0 @@
-# Getting started with ICICLE
-
-This guide is oriented towards developers who want to start writing code with the ICICLE libraries. If you just want to run your existing ZK circuits on GPU refer to [this guide](./integrations.md#using-icicle-integrations) please.
-
-## ICICLE repository overview
-
-![ICICLE API overview](../../static/img/apilevels.png)
-
-The diagram above displays the general architecture of ICICLE and the API layers that exist. The CUDA API, which we also call ICICLE Core, is the lowest level and is comprised of CUDA kernels which implement all primitives such as MSM as well as C++ wrappers which expose these methods for different curves.
-
-ICICLE Core compiles into a static library. This library can be used with our official Golang and Rust wrappers or linked with your C++ project. You can also implement a wrapper for it in any other language.
-
-Based on this dependency architecture, the ICICLE repository has three main sections:
-
-- [ICICLE Core](#icicle-core)
-- [ICICLE Rust bindings](#icicle-rust-and-golang-bindings)
-- [ICICLE Golang bindings](#icicle-rust-and-golang-bindings)
-
-### ICICLE Core
-
-[ICICLE Core](/icicle/core) is a library that directly works with GPU by defining CUDA kernels and algorithms that invoke them. It contains code for [fast field arithmetic](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/field/field.cuh), cryptographic primitives used in ZK such as [NTT](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/ntt/), [MSM](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/msm/), [Poseidon Hash](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/poseidon/), [Polynomials](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/polynomials/) and others.
-
-ICICLE Core would typically be compiled into a static library and either used in a third party language such as Rust or Golang, or linked with your own C++ project.
-
-### ICICLE Rust and Golang bindings
-
-- [ICICLE Rust bindings](/icicle/rust-bindings)
-- [ICICLE Golang bindings](/icicle/golang-bindings)
-
-These bindings allow you to easily use ICICLE in a Rust or Golang project. Setting up Golang bindings requires a bit of extra steps compared to the Rust bindings which utilize the `cargo build` tool.
-
-## Running ICICLE
-
-This guide assumes that you have a Linux or Windows machine with an Nvidia GPU installed. If you don't have access to an Nvidia GPU you can access one for free on [Google Colab](https://colab.google/).
-
-:::info note
-
-ICICLE can only run on Linux or Windows. **MacOS is not supported**.
-
-:::
-
-### Prerequisites
-
-- NVCC (version 12.0 or newer)
-- cmake 3.18 and above
-- GCC - version 9 or newer is recommended.
-- Any Nvidia GPU
-- Linux or Windows operating system.
-
-#### Optional Prerequisites
-
-- Docker, latest version.
-- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html)
-
-If you don't wish to install these prerequisites you can follow this tutorial using a [ZK-Container](https://github.com/ingonyama-zk/icicle/blob/main/Dockerfile) (docker container). To learn more about using ZK-Containers [read this](../ZKContainers.md).
-
-### Setting up ICICLE and running tests
-
-The objective of this guide is to make sure you can run the ICICLE Core, Rust and Golang tests. Achieving this will ensure you know how to setup ICICLE and run an ICICLE program. For simplicity, we will be using the ICICLE docker container as our environment, however, you may install the prerequisites on your machine and [skip](#icicle-core-1) the docker section.
-
-#### Setting up environment with Docker
-
-Lets begin by cloning the ICICLE repository:
-
-```sh
-git clone https://github.com/ingonyama-zk/icicle
-```
-
-We will proceed to build the docker image [found here](https://github.com/ingonyama-zk/icicle/blob/main/Dockerfile):
-
-```sh
-docker build -t icicle-demo .
-docker run -it --runtime=nvidia --gpus all --name icicle_container icicle-demo
-```
-
-- `-it` runs the container in interactive mode with a terminal.
-- `--gpus all` Allocate all available GPUs to the container. You can also specify which GPUs to use if you don't want to allocate all.
-- `--runtime=nvidia` Use the NVIDIA runtime, necessary for GPU support.
-
-To read more about these settings reference this [article](https://developer.nvidia.com/nvidia-container-runtime).
-
-If you accidentally close your terminal and want to reconnect just call:
-
-```sh
-docker exec -it icicle_container bash
-```
-
-Lets make sure that we have the correct CUDA version before proceeding
-
-```sh
-nvcc --version
-```
-
-You should see something like this
-
-```sh
-nvcc: NVIDIA (R) Cuda compiler driver
-Copyright (c) 2005-2023 NVIDIA Corporation
-Built on Tue_Aug_15_22:02:13_PDT_2023
-Cuda compilation tools, release 12.2, V12.2.140
-Build cuda_12.2.r12.2/compiler.33191640_0
-```
-
-Make sure the release version is at least 12.0.
-
-#### ICICLE Core
-
-ICICLE Core is found under [`/icicle`](https://github.com/ingonyama-zk/icicle/tree/main/icicle). To build and run the tests first:
-
-```sh
-cd icicle
-```
-
-For this example, we are going to compile ICICLE for a `bn254` curve. However other compilation strategies are supported.
-
-```sh
-mkdir -p build
-cmake -S . -B build -DCURVE=bn254 -DBUILD_TESTS=ON
-cmake --build build -j
-```
-
-`-DBUILD_TESTS` option compiles the tests, without this flag `ctest` won't work.
-`-DCURVE` option tells the compiler which curve to build. You can find a list of supported curves [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/cmake/CurvesCommon.cmake#L2).
-
-The output in `build` folder should include the static libraries for the compiled curve.
-
-To run the test
-
-```sh
-cd build/tests
-ctest
-```
-
-#### ICICLE Rust
-
-The rust bindings work by first compiling the CUDA static libraries as seen [here](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/build.rs). The compilation of CUDA and the Rust library is all handled by the rust build toolchain.
-
-Similar to ICICLE Core here we also have to compile per curve.
-
-Lets compile curve `bn254`
-
-```sh
-cd wrappers/rust/icicle-curves/icicle-bn254
-```
-
-Now lets build our library
-
-```sh
-cargo build --release
-```
-
-This may take a couple of minutes since we are compiling both the CUDA and Rust code.
-
-To run the tests
-
-```sh
-cargo test
-```
-
-We also include some benchmarks
-
-```sh
-cargo bench
-```
-
-#### ICICLE Golang
-
-The Golang bindings require compiling ICICLE Core first. We supply a [build script](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/golang/build.sh) to help build what you need.
-
-Script usage:
-
-```sh
-./build.sh [-curve=] [-field=] [-hash=] [-cuda_version=] [-g2] [-ecntt] [-devmode]
-
-curve - The name of the curve to build or "all" to build all supported curves
-field - The name of the field to build or "all" to build all supported fields
-hash - The name of the hash to build or "all" to build all supported hashes
--g2 - Optional - build with G2 enabled
--ecntt - Optional - build with ECNTT enabled
--devmode - Optional - build in devmode
-```
-
-:::note
-
-If more than one curve or more than one field or more than one hash is supplied, the last one supplied will be built
-
-:::
-
-Once the library has been built, you can use and test the Golang bindings.
-
-To test a specific curve, field or hash, change to it's directory and then run:
-
-```sh
-go test ./tests -count=1 -failfast -timeout 60m -p 2 -v
-```
-
-You will be able to see each test that runs, how long it takes and whether it passed or failed
-
-### Running ICICLE examples
-
-ICICLE examples can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/examples) these examples cover some simple use cases using C++, rust and golang.
-
-Lets run one of our C++ examples, in this case the [MSM example](https://github.com/ingonyama-zk/icicle/blob/main/examples/c%2B%2B/msm/example.cu).
-
-```sh
-cd examples/c++/msm
-./compile.sh
-./run.sh
-```
-
-:::tip
-
-Read through the compile.sh and CMakeLists.txt to understand how to link your own C++ project with ICICLE
-
-:::
-
-#### Running with Docker
-
-In each example directory, ZK-container files are located in a subdirectory `.devcontainer`.
-
-```sh
-msm/
-├── .devcontainer
- ├── devcontainer.json
- └── Dockerfile
-```
-
-Now lets build our docker file and run the test inside it. Make sure you have installed the [optional prerequisites](#optional-prerequisites).
-
-```sh
-docker build -t icicle-example-msm -f .devcontainer/Dockerfile .
-```
-
-Lets start and enter the container
-
-```sh
-docker run -it --rm --gpus all -v .:/icicle-example icicle-example-msm
-```
-
-Inside the container you can run the same commands:
-
-```sh
-./compile.sh
-./run.sh
-```
-
-You can now experiment with our other examples, perhaps try to run a rust or golang example next.
diff --git a/docs/docs/icicle/libraries.md b/docs/docs/icicle/libraries.md
new file mode 100644
index 000000000..00c561e16
--- /dev/null
+++ b/docs/docs/icicle/libraries.md
@@ -0,0 +1,59 @@
+# ICICLE libraries
+
+ICICLE is composed of two main logical parts:
+1. [**ICICLE device library**](#icicle-device)
+2. [**ICICLE template core library**](#icicle-core)
+
+## ICICLE device
+
+The ICICLE device library serves as an abstraction layer for interacting with various hardware devices. It provides a comprehensive interface for tasks such as setting the active device, querying device-specific information like free and total memory, determining the number of available devices, and managing memory allocation. Additionally, it offers functionality for copying data to and from devices, managing task queues (streams) for efficient device utilization, and abstracting the complexities of device management away from the user.
+
+See the programmers guide for more details: [C++](./programmers_guide/cpp#device-management), [Rust](./programmers_guide/rust#device-management), [Go TODO](./programmers_guide/go)
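+
+For orientation, here is a minimal C++ sketch of how these device APIs fit together. It only uses calls shown elsewhere in these docs (`icicle_get_device_count`, `icicle_set_device`, `icicle_malloc`, `icicle_free`, `icicle_create_stream`, `icicle_destroy_stream`); the header name is an assumption for illustration.
+
+```cpp
+#include "icicle/runtime.h" // assumed header name for illustration
+
+void device_tour()
+{
+  int device_count = 0;
+  icicle_get_device_count(device_count); // number of available devices
+
+  Device device = "CPU"; // or e.g. "CUDA" for a GPU backend
+  icicle_set_device(device);
+
+  void* ptr = nullptr;
+  icicle_malloc(&ptr, 1024); // allocate 1024 bytes on the active device
+
+  icicleStreamHandle stream;
+  icicle_create_stream(&stream); // a task queue for async work on this device
+
+  // ... enqueue compute and copies on the stream ...
+
+  icicle_destroy_stream(stream);
+  icicle_free(ptr);
+}
+```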
+
+## ICICLE Core
+
+ICICLE Core is a template library written in C++ that implements fundamental cryptographic operations, including field and curve arithmetic, as well as higher-level APIs such as MSM and NTT.
+
+The Core can be [instantiated](./build_from_source) for different fields, curves, and other cryptographic components, allowing you to tailor it to your specific needs. You can link your application to one or more ICICLE libraries, depending on the requirements of your project. For example, you might only need the babybear library or a combination of babybear and a Merkle tree builder.
+
+
+### Rust
+Each library has a corresponding crate. See [programmers guide](./programmers_guide/general.md) for more details.
+
+### Supported curves, fields and operations
+
+#### Supported curves and operations
+
+| Operation\Curve | [bn254](https://neuromancer.sk/std/bn/bn254) | [bls12-377](https://neuromancer.sk/std/bls/BLS12-377) | [bls12-381](https://neuromancer.sk/std/bls/BLS12-381) | [bw6-761](https://eprint.iacr.org/2020/351) | grumpkin |
+| --------------------------------------------------- | :------------------------------------------: | :---------------------------------------------------: | :---------------------------------------------------: | :-----------------------------------------: | :------: |
+| [MSM](./primitives/msm) | ✅ | ✅ | ✅ | ✅ | ✅ |
+| G2 MSM | ✅ | ✅ | ✅ | ✅ | ❌ |
+| [NTT](./primitives/ntt) | ✅ | ✅ | ✅ | ✅ | ❌ |
+| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| [Vector operations](./primitives/vec_ops) | ✅ | ✅ | ✅ | ✅ | ✅ |
+| [Polynomials](./polynomials/overview) | ✅ | ✅ | ✅ | ✅ | ❌ |
+| [Poseidon](primitives/poseidon) | ✅ | ✅ | ✅ | ✅ | ✅ |
+| [Merkle Tree](primitives/poseidon#the-tree-builder) | ✅ | ✅ | ✅ | ✅ | ✅ |
+
+#### Supported fields and operations
+
+| Operation\Field | [babybear](https://eprint.iacr.org/2023/824.pdf) | [Stark252](https://docs.starknet.io/documentation/architecture_and_concepts/Cryptography/p-value/) |
+| ----------------------------------------- | :----------------------------------------------: | :------------------------------------------------------------------------------------------------: |
+| [Vector operations](./primitives/vec_ops) | ✅ | ✅ |
+| [Polynomials](./polynomials/overview) | ✅ | ✅ |
+| [NTT](primitives/ntt) | ✅ | ✅ |
+| Extension Field | ✅ | ❌ |
+
+#### Supported hashes
+
+| Hash | Sizes |
+| ------ | :------: |
+| Keccak | 256, 512 |
+
+## Backend
+Each backend must implement the device API interface.
+Each backend may implement:
+- One or more ICICLE libraries. For example, only the bn254 curve library.
+- One or more APIs within a library. For example, MSM only.
+
+See [CUDA backend](./install_cuda_backend.md) and [Build Your Own Backend](./build_your_own_backend.md) for more info about implementing a backend.
diff --git a/docs/docs/icicle/migrate_from_v2.md b/docs/docs/icicle/migrate_from_v2.md
new file mode 100644
index 000000000..792f3b435
--- /dev/null
+++ b/docs/docs/icicle/migrate_from_v2.md
@@ -0,0 +1,93 @@
+
+# Migration from Icicle V2 to V3
+
+Icicle V3 introduces a unified interface for high-performance computing across various devices, extending the functionality that was previously limited to GPUs in Icicle V2. This guide will assist you in transitioning from Icicle V2 to V3 by highlighting the key changes and providing examples for both C++ and Rust.
+
+## Key Conceptual Changes
+
+- **Device Independence**: In V2, Icicle was primarily designed for GPU computation, directly utilizing CUDA APIs. In V3, Icicle has evolved to support a broader range of computational devices, including CPUs, GPUs, and other accelerators. As a result, CUDA APIs have been replaced with device-agnostic Icicle APIs.
+
+- **Unified API**: The APIs are now standardized across all devices, ensuring consistent usage and reducing the complexity of managing different hardware backends.
+
+:::warning
+When migrating from V2 to V3, it is important to note that, by default, your code now executes on the CPU. This contrasts with V2, which was exclusively a CUDA library. For details on installing and using CUDA GPUs, refer to the [CUDA backend guide](./install_cuda_backend.md).
+:::
+
+## Migration Guide for C++
+
+### Replacing CUDA APIs with Icicle APIs
+
+In Icicle V3, CUDA-specific APIs have been replaced with Icicle APIs that are designed to be backend-agnostic. This allows your code to run on different devices without requiring modifications.
+
+- **Device Management**: Use Icicle's device management APIs instead of CUDA-specific functions. For example, instead of `cudaSetDevice()`, you would use `icicle_set_device()`.
+
+- **Memory Management**: Replace CUDA memory management functions such as `cudaMalloc()` and `cudaFree()` with Icicle's `icicle_malloc()` and `icicle_free()`.
+
+- **Stream Management**: Replace `cudaStream_t` with `icicleStreamHandle` and use Icicle's stream management functions.
+
+For a detailed overview and examples, please refer to the [Icicle C++ Programmer's Guide](./programmers_guide/cpp.md) for full API details.
+
+### Example Migration
+
+**V2 (CUDA-specific):**
+```cpp
+cudaStream_t stream;
+cudaStreamCreate(&stream);
+void* device_ptr;
+cudaMalloc(&device_ptr, 1024);
+// Perform operations using CUDA APIs
+cudaStreamDestroy(stream);
+cudaFree(device_ptr);
+```
+
+**V3 (Device-agnostic):**
+```cpp
+icicleStreamHandle stream;
+icicle_create_stream(&stream);
+void* device_ptr;
+icicle_malloc(&device_ptr, 1024);
+// Perform operations using Icicle APIs
+icicle_destroy_stream(stream);
+icicle_free(device_ptr);
+```
+
+## Migration Guide for Rust
+
+### Replacing `icicle_cuda_runtime` with `icicle_runtime`
+
+In Icicle V3, the `icicle_cuda_runtime` crate is replaced with the `icicle_runtime` crate. This change reflects the broader support for different devices beyond just CUDA-enabled GPUs.
+
+- **Device Management**: Use `icicle_runtime`'s device management functions instead of those in `icicle_cuda_runtime`. The `Device` struct remains central, but it's now part of a more generalized runtime.
+
+- **Memory Abstraction**: The `DeviceOrHostSlice` trait remains for memory abstraction, allowing seamless data handling between host and device.
+
+- **Stream Management**: Replace `CudaStream` with `IcicleStream`, which now supports broader device types.
+
+### Example Migration
+
+**V2 (`icicle_cuda_runtime`):**
+```rust
+use icicle_cuda_runtime::{CudaStream, DeviceVec, HostSlice};
+
+let mut stream = CudaStream::create().unwrap();
+let mut device_memory = DeviceVec::cuda_malloc(1024).unwrap();
+// Perform operations using CudaStream and related APIs
+stream.synchronize().unwrap();
+```
+
+**V3 (`icicle_runtime`):**
+```rust
+use icicle_runtime::{IcicleStream, DeviceVec, HostSlice};
+
+let mut stream = IcicleStream::create().unwrap();
+let mut device_memory = DeviceVec::device_malloc(1024).unwrap();
+// Perform operations using IcicleStream and related APIs
+stream.synchronize().unwrap();
+```
+
+### Other Considerations
+
+- **API Names**: While most API names remain consistent, they are now part of a more generalized runtime that can support multiple devices. Ensure that you update the crate imports and function calls accordingly.
+- **Backend Loading**: Ensure that you are loading the appropriate backend using the `load_backend_from_env_or_default()` function, which is essential for setting up the runtime environment.
+
+For further details and examples, refer to the [Programmer's Guide](./programmers_guide/general.md).
diff --git a/docs/docs/icicle/multi-device.md b/docs/docs/icicle/multi-device.md
new file mode 100644
index 000000000..07f690a8d
--- /dev/null
+++ b/docs/docs/icicle/multi-device.md
@@ -0,0 +1,79 @@
+
+# Multi-Device with ICICLE
+
+:::info
+If you are looking for the Multi-Device API documentation refer [here](./rust-bindings/multi-gpu.md) for Rust and [here](./golang-bindings/multi-gpu.md) for Golang.
+:::
+
+One common challenge with Zero-Knowledge (ZK) computation is managing large input sizes. It's not uncommon to encounter circuits surpassing 2^25 constraints, which can push even the most advanced compute devices to their limits. To effectively scale and process such large circuits, leveraging multiple devices becomes a necessity.
+
+Multi-Device programming involves developing software to operate across multiple GPUs, CPUs, or other specialized hardware devices. Let's first explore different approaches to Multi-Device programming, then we will cover how ICICLE allows you to easily develop your ZK computations to run across many devices.
+
+## Approaches to Multi-Device Programming
+
+There are many [different strategies](https://github.com/NVIDIA/multi-gpu-programming-models) available for implementing multi-device systems, but they can generally be split into two categories:
+
+### Device Server Approach
+
+This approach typically involves a single or multiple CPUs managing threads that read from and write to multiple devices, such as GPUs, CPUs, or accelerators. You can think of it as a scaled-up HOST-Device model.
+
+![Multi-Device Server Approach](image.png)
+
+This approach doesn't necessarily allow for tackling larger computation sizes, but it does enable the simultaneous computation of tasks that wouldn't fit on a single device.
+
+For example, if you needed to compute two MSMs of size 2^26 on a GPU with 16GB of VRAM, you would normally have to perform these tasks asynchronously. However, by doubling the number of GPUs or other devices in your system, you can now run these computations in parallel.
+
+### Inter-Device Approach
+
+This more sophisticated approach involves leveraging technologies such as [GPUDirect, NCCL, NVSHMEM](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-cwes1084/), NVLink, or other interconnect technologies to combine multiple devices and split a computation across different hardware.
+
+This approach requires redesigning the algorithm at the software level to be compatible with splitting tasks among devices. In some cases, to minimize latency, special inter-device connections can be installed on a server to allow direct communication between multiple devices.
+
+## Writing ICICLE Code for Multi-Device Setups
+
+Currently, ICICLE adopts a Device Server approach, where we assume you have a machine with multiple devices (GPUs, CPUs, etc.) and wish to run computations on each device.
+
+Each thread must set its own device. Subsequent API calls on that thread (including memory management and compute APIs) execute on that device.
+
+### C++
+```cpp
+// Set the device to CPU
+Device device = "CPU";
+icicle_set_device(device);
+```
+### Rust
+```rust
+// Set the device to the first CUDA GPU (GPU-0)
+let device = icicle_runtime::Device::new("CUDA", 0 /* =device_id*/);
+icicle_runtime::set_device(&device).unwrap();
+```
+
+### Go (TODO)
+
+## Best Practices
+
+- **Avoid Hardcoding Device IDs:** If you want your software to take full advantage of all available devices on a machine, use methods such as `icicle_get_device_count()` to support an arbitrary number of devices.
+```cpp
+/**
+ * @brief Checks if the specified device is available.
+ *
+ * @param dev The device to check for availability.
+ * @return eIcicleError Status of the device availability check.
+ * - `SUCCESS` if the device is available.
+ * - `INVALID_DEVICE` if the device is not available.
+ */
+extern "C" eIcicleError icicle_is_device_available(const Device& dev);
+
+/**
+ * @brief Gets the number of available devices.
+ *
+ * @param device_count Output parameter receiving the number of available devices.
+ * @return eIcicleError Status of the device-count query.
+ */
+extern "C" eIcicleError icicle_get_device_count(int& device_count /*OUT*/);
+```
+
+- **Launch One CPU Thread per Device:** To avoid [common errors](https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/) and ensure code readability, we recommend creating a dedicated thread for each device. Within each CPU thread, you can launch as many tasks as you like for a device, as long as they all run on the same device ID. This will make your code more manageable, easier to read, and performant.
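+
+A minimal sketch of this pattern (assuming `Device` can be brace-initialized from a device type string and an id; the exact construction may differ):
+
+```cpp
+#include <thread>
+#include <vector>
+
+void worker(int device_id)
+{
+  Device dev = {"CUDA", device_id}; // assumed initialization for illustration
+  icicle_set_device(dev);           // binds this thread to the device
+  // ... all subsequent ICICLE calls on this thread run on `dev` ...
+}
+
+int main()
+{
+  int device_count = 0;
+  icicle_get_device_count(device_count);
+
+  std::vector<std::thread> threads;
+  for (int i = 0; i < device_count; ++i)
+    threads.emplace_back(worker, i); // one dedicated CPU thread per device
+  for (auto& t : threads)
+    t.join();
+}
+```
+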
+---
+
+This guide provides an overview of multi-device support in ICICLE, explaining the approaches and best practices for efficiently scaling ZK computations across multiple devices. For further information or support, please refer to our [documentation](./) or join the discussion on [Discord](https://discord.gg/6vYrE7waPj).
diff --git a/docs/docs/icicle/multi-gpu.md b/docs/docs/icicle/multi-gpu.md
deleted file mode 100644
index 4a31051ce..000000000
--- a/docs/docs/icicle/multi-gpu.md
+++ /dev/null
@@ -1,61 +0,0 @@
-# Multi GPU with ICICLE
-
-:::info
-
-If you are looking for the Multi GPU API documentation refer [here](./rust-bindings/multi-gpu.md) for Rust and [here](./golang-bindings/multi-gpu.md) for Golang.
-
-:::
-
-One common challenge with Zero-Knowledge computation is managing the large input sizes. It's not uncommon to encounter circuits surpassing 2^25 constraints, pushing the capabilities of even advanced GPUs to their limits. To effectively scale and process such large circuits, leveraging multiple GPUs in tandem becomes a necessity.
-
-Multi-GPU programming involves developing software to operate across multiple GPU devices. Lets first explore different approaches to Multi-GPU programming then we will cover how ICICLE allows you to easily develop youR ZK computations to run across many GPUs.
-
-## Approaches to Multi GPU programming
-
-There are many [different strategies](https://github.com/NVIDIA/multi-gpu-programming-models) available for implementing multi GPU, however, it can be split into two categories.
-
-### GPU Server approach
-
-This approach usually involves a single or multiple CPUs opening threads to read / write from multiple GPUs. You can think about it as a scaled up HOST - Device model.
-
-![alt text](image.png)
-
-This approach won't let us tackle larger computation sizes but it will allow us to compute multiple computations which we wouldn't be able to load onto a single GPU.
-
-For example let's say that you had to compute two MSMs of size 2^26 on a 16GB VRAM GPU you would normally have to perform them asynchronously. However, if you double the number of GPUs in your system you can now run them in parallel.
-
-### Inter GPU approach
-
-This approach involves a more sophisticated approach to multi GPU computation. Using technologies such as [GPUDirect, NCCL, NVSHMEM](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-cwes1084/) and NVLink it's possible to combine multiple GPUs and split a computation among different devices.
-
-This approach requires redesigning the algorithm at the software level to be compatible with splitting amongst devices. In some cases, to lower latency to a minimum, special inter GPU connections would be installed on a server to allow direct communication between multiple GPUs.
-
-## Writing ICICLE Code for Multi GPUs
-
-The approach we have taken for the moment is a GPU Server approach; we assume you have a machine with multiple GPUs and you wish to run some computation on each GPU.
-
-To dive deeper and learn about the API check out the docs for our different ICICLE API
-
-- [Rust Multi GPU APIs](./rust-bindings/multi-gpu.md)
-- [Golang Multi GPU APIs](./golang-bindings/multi-gpu.md)
-- C++ Multi GPU APIs
-
-## Best practices
-
-- Never hardcode device IDs, if you want your software to take advantage of all GPUs on a machine use methods such as `get_device_count` to support arbitrary number of GPUs.
-
-- Launch one CPU thread per GPU. To avoid [nasty errors](https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/) and hard to read code we suggest that for every GPU you create a dedicated thread. Within a CPU thread you should be able to launch as many tasks as you wish for a GPU as long as they all run on the same GPU id. This will make your code way more manageable, easy to read and performant.
-
-## ZKContainer support for multi GPUs
-
-Multi GPU support should work with ZK-Containers by simply defining which devices the docker container should interact with:
-
-```sh
-docker run -it --gpus '"device=0,2"' zk-container-image
-```
-
-If you wish to expose all GPUs
-
-```sh
-docker run --gpus all zk-container-image
-```
diff --git a/docs/docs/icicle/overview.md b/docs/docs/icicle/overview.md
index 2a7f9e847..49fb67ecc 100644
--- a/docs/docs/icicle/overview.md
+++ b/docs/docs/icicle/overview.md
@@ -1,58 +1,77 @@
-# What is ICICLE?
+---
+slug: /icicle/overview
+title: ICICLE Overview
+---
+
+# ICICLE Overview
+
+## What is ICICLE?
[![GitHub Release](https://img.shields.io/github/v/release/ingonyama-zk/icicle)](https://github.com/ingonyama-zk/icicle/releases)
-[ICICLE](https://github.com/ingonyama-zk/icicle) is a cryptography library for ZK using GPUs. ICICLE implements blazing fast cryptographic primitives such as EC operations, MSM, NTT, Poseidon hash and more on GPU.
+[ICICLE](https://github.com/ingonyama-zk/icicle) is a cryptography library designed to accelerate zero-knowledge proofs (ZKPs) using multiple compute backends, including GPUs, CPUs, and potentially other platforms. ICICLE's key strength lies in its ability to implement blazing-fast cryptographic primitives, enabling developers to significantly reduce proving times with minimal effort.
+
+## Key Features
-ICICLE allows developers with minimal GPU experience to effortlessly accelerate their ZK application; from our experiments, even the most naive implementation may yield 10X improvement in proving times.
+- **Acceleration of “zk” Math:** ICICLE provides optimized implementations for cryptographic primitives crucial to zero-knowledge proofs, such as elliptic curve operations, MSM, NTT, Poseidon hash, and more.
+- **Set of Libraries:** ICICLE includes a comprehensive set of libraries supporting various fields, curves, and other cryptographic needs.
+- **Cross-Language Support:** Available bindings for C++, Rust, Go, and potentially Python make ICICLE accessible across different development environments.
+- **Backend Agnosticism:** Develop on CPU and deploy on various backends, including GPUs, specialized hardware, and other emerging platforms, depending on your project's needs.
+- **Extensibility:** Designed for easy integration and extension, allowing you to build and deploy custom backends and cryptographic primitives.
-ICICLE has been used by many leading ZK companies such as [Celer Network](https://github.com/celer-network), [Gnark](https://github.com/Consensys/gnark) and others to accelerate their ZK proving pipeline.
+## Evolution from V2 to V3
-## Dont have access to a GPU?
+Originally, ICICLE was focused solely on GPU acceleration. With the release of V3, ICICLE now supports multiple backends, making it more versatile and adaptable to different hardware environments. Whether you're leveraging the power of GPUs or exploring other compute platforms, ICICLE V3 is designed to fit your needs.
-We understand that not all developers have access to a GPU and we don't want this to limit anyone from developing with ICICLE.
-Here are some ways we can help you gain access to GPUs:
+## Who Uses ICICLE?
-:::note
+ICICLE has been successfully integrated and used by leading ZK companies such as [Celer Network](https://github.com/celer-network), [Gnark](https://github.com/Consensys/gnark), and others to enhance their ZK proving pipelines.
-If none of the following options suit your needs, contact us on [telegram](https://t.me/RealElan) for assistance. We're committed to ensuring that a lack of a GPU doesn't become a bottleneck for you. If you need help with setup or any other issues, we're here to help you.
+## Don't Have Access to a GPU?
-:::
+We understand that not all developers have access to GPUs, but this shouldn't limit your ability to develop with ICICLE. Here are some ways to gain access to GPUs.
### Grants
-At Ingonyama we are interested in accelerating the progress of ZK and cryptography. If you are an engineer, developer or an academic researcher we invite you to checkout [our grant program](https://www.ingonyama.com/blog/icicle-for-researchers-grants-challenges). We will give you access to GPUs and even pay you to do your dream research!
+At Ingonyama, we are committed to accelerating progress in ZK and cryptography. If you're an engineer, developer, or academic researcher, we invite you to check out [our grant program](https://www.ingonyama.com/blog/icicle-for-researchers-grants-challenges). We can provide access to GPUs and even fund your research.
### Google Colab
-This is a great way to get started with ICICLE instantly. Google Colab offers free GPU access to a NVIDIA T4 instance with 16 GB of memory which should be enough for experimenting and even prototyping with ICICLE.
+Google Colab is a great platform to start experimenting with ICICLE instantly. It offers free access to NVIDIA T4 GPUs, which are more than sufficient for experimenting and prototyping with ICICLE.
-For an extensive guide on how to setup Google Colab with ICICLE refer to [this article](./colab-instructions.md).
+For a detailed guide on setting up Google Colab with ICICLE, refer to [this article](./colab-instructions.md).
### Vast.ai
-[Vast.ai](https://vast.ai/) is a global GPU marketplace where you can rent many different types of GPUs by the hour for [competitive pricing](https://vast.ai/pricing). They provide on-demand and interruptible rentals depending on your need or use case; you can learn more about their rental types [here](https://vast.ai/faq#rental-types).
+[Vast.ai](https://vast.ai/) offers a global GPU marketplace where you can rent different types of GPUs by the hour at competitive prices. Whether you need on-demand or interruptible rentals, Vast.ai provides flexibility for various use cases. Learn more about their rental options [here](https://vast.ai/faq#rental-types).
+
+## What Can You Do with ICICLE?
+
+[ICICLE](https://github.com/ingonyama-zk/icicle) can be used similarly to any other cryptography library. Through various integrations, ICICLE has proven effective in multiple use cases:
+
+### Circuit Developers
-## What can you do with ICICLE?
+If you're a circuit developer facing bottlenecks, integrating ICICLE into your prover may solve performance issues. ICICLE is integrated into popular ZK provers like [Gnark](https://github.com/Consensys/gnark) and [Halo2](https://github.com/zkonduit/halo2), enabling immediate GPU acceleration without additional code changes.
-[ICICLE](https://github.com/ingonyama-zk/icicle) can be used in the same way you would use any other cryptography library. While developing and integrating ICICLE into many proof systems, we found some use case categories:
+### Integrating into Existing ZK Provers
-### Circuit developers
+ICICLE allows for selective acceleration of specific parts of your ZK prover, helping to address specific bottlenecks without requiring a complete rewrite of your prover.
-If you are a circuit developer and are experiencing bottlenecks while running your circuits, an ICICLE integrated prover may be the solution.
+### Developing Your Own ZK Provers
-ICICLE has been integrated into a number of popular ZK provers including [Gnark prover](https://github.com/Consensys/gnark) and [Halo2](https://github.com/zkonduit/halo2). This means that you can enjoy GPU acceleration for your existing circuits immediately without writing a single line of code by simply switching on the GPU prover flag!
+For those building ZK provers from the ground up, ICICLE is an ideal tool for creating optimized and scalable provers. The ability to scale across multiple machines within a data center is a key advantage when using ICICLE with GPUs.
-### Integrating into existing ZK provers
+### Developing Proof of Concepts
-From our collaborations we have learned that its possible to accelerate a specific part of your prover to solve for a specific bottleneck.
+ICICLE is also well-suited for prototyping and developing small-scale projects. With bindings for Golang and Rust, you can easily create a library implementing specific cryptographic primitives, such as a KZG commitment library, using ICICLE.
-ICICLE can be used to accelerate specific parts of your prover without completely rewriting your ZK prover.
+---
-### Developing your own ZK provers
+## Get Started with ICICLE
-If your goal is to build a ZK prover from the ground up, ICICLE is an ideal tool for creating a highly optimized and scalable ZK prover. A key benefit of using GPUs with ICICLE is the ability to scale your ZK prover efficiently across multiple machines within a data center.
+Explore the full capabilities of ICICLE by diving into the [Architecture](./arch_overview.md), [Getting Started Guide](./getting_started.md) and the [programmers guide](./programmers_guide/general.md) to learn how to integrate, deploy, and extend ICICLE across different backends.
-### Developing proof of concepts
+If you have any questions or need support, feel free to reach out on [Discord] or [GitHub](https://github.com/ingonyama-zk). We're here to help you accelerate your ZK development with ICICLE.
-ICICLE is also ideal for developing small prototypes. ICICLE has Golang and Rust bindings so you can easily develop a library implementing a specific primitive using ICICLE. An example would be develop a KZG commitment library using ICICLE.
+[ICICLE-OVERVIEW]: ./icicle/overview.md
+[Discord]: https://discord.gg/6vYrE7waPj
\ No newline at end of file
diff --git a/docs/docs/icicle/polynomials/overview.md b/docs/docs/icicle/polynomials/overview.md
index 06058ffe4..881eeef88 100644
--- a/docs/docs/icicle/polynomials/overview.md
+++ b/docs/docs/icicle/polynomials/overview.md
@@ -55,20 +55,8 @@ The Polynomial class encapsulates a polynomial, providing a variety of operation
This section outlines how to use the Polynomial API in C++. Bindings for Rust and Go are detailed under the Bindings sections.
-### Backend Initialization
-
-Initialization with an appropriate factory is required to configure the computational context and backend.
-
-```cpp
-#include "polynomials/polynomials.h"
-#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
-
-// Initialize with a CUDA backend
-Polynomial::initialize(std::make_shared());
-```
-
:::note
-Initialization of a factory must be done per linked curve or field.
+Make sure to set an ICICLE device prior to using the polynomial API.
:::
### Construction
@@ -336,7 +324,7 @@ The Polynomial API includes comprehensive support for multi-GPU environments, a
Like other components of the icicle framework, the Polynomial API allows explicit setting of the current CUDA device:
```cpp
-cudaSetDevice(int deviceID);
+icicle_set_device(devA);
```
This function sets the active CUDA device. All subsequent operations that allocate or deal with polynomial data will be performed on this device.
@@ -347,10 +335,10 @@ Polynomials are always allocated on the current CUDA device at the time of their
```cpp
// Set the device before creating polynomials
-cudaSetDevice(0);
+icicle_set_device(devA);
Polynomial p1 = Polynomial::from_coefficients(coeffs, size);
-cudaSetDevice(1);
+icicle_set_device(devB);
Polynomial p2 = Polynomial::from_coefficients(coeffs, size);
```
@@ -360,7 +348,7 @@ When performing operations that result in the creation of new polynomials (such
```cpp
// Ensure both operands are on the same device
-cudaSetDevice(0);
+icicle_set_device(devA);
auto p3 = p1 + p2; // Throws an exception if p1 and p2 are not on the same device
```
diff --git a/docs/docs/icicle/primitives/Icicle_Release_README.md b/docs/docs/icicle/primitives/Icicle_Release_README.md
new file mode 100644
index 000000000..4d3a76e89
--- /dev/null
+++ b/docs/docs/icicle/primitives/Icicle_Release_README.md
@@ -0,0 +1,91 @@
+
+# Icicle Release README
+
+## Overview
+
+Icicle is a powerful C++ library designed to provide flexible and efficient computation through its modular backend architecture. This README explains how to build and release Icicle for multiple Linux distributions, including Ubuntu 20.04, Ubuntu 22.04, and CentOS 7. It also describes the content of a release and how to use the generated tar files.
+
+## Content of a Release
+
+Each Icicle release includes a tar file containing the build artifacts for a specific distribution. The tar file includes the following structure:
+
+- **`./icicle/include/`**: This directory contains all the necessary header files for using the Icicle library from C++.
+
+- **`./icicle/lib/`**:
+ - **Icicle Libraries**: All the core Icicle libraries are located in this directory. Applications linking to Icicle will use these libraries.
+ - **Backends**: The `./icicle/lib/backend/` directory houses backend libraries, including the CUDA backend. While the CUDA backend is included, it will only be used on machines with a GPU. On machines without a GPU, the CUDA backend is not utilized.
+
+### Considerations
+
+Currently, the CUDA backend is included in every installation tar file, even on machines without a GPU. This ensures consistency across installations but results in additional files being installed that may not be used.
+
+**TODO**: Consider splitting the release into two separate tar files—one with the CUDA backend and one without—depending on the target machine’s hardware capabilities.
+
+## Build Docker Image
+
+To build the Docker images for each distribution and CUDA version, use the following commands:
+
+```bash
+# Ubuntu 22.04, CUDA 12.2
+docker build -t icicle-release-ubuntu22-cuda122 -f Dockerfile.ubuntu22 .
+
+# Ubuntu 20.04, CUDA 12.2
+docker build -t icicle-release-ubuntu20-cuda122 -f Dockerfile.ubuntu20 .
+
+# CentOS 7, CUDA 12.2
+docker build -t icicle-release-centos7-cuda122 -f Dockerfile.centos7 .
+```
+
+### Docker Environment Explanation
+
+The Docker images you build represent the target environment for the release. Each Docker image is tailored to a specific distribution and CUDA version. You first build the Docker image, which sets up the environment, and then use this Docker image to build the release tar file. This ensures that the build process is consistent and reproducible across different environments.
+
+## Build Libraries Inside the Docker
+
+To build the Icicle libraries inside a Docker container and output the tar file to the `release_output` directory:
+
+```bash
+mkdir -p release_output
+docker run --rm --gpus all -v ./icicle:/icicle -v ./release_output:/output -v ./scripts:/scripts icicle-release-ubuntu22-cuda122 bash /scripts/release/build_release_and_tar.sh
+```
+
+This command executes the `build_release_and_tar.sh` script inside the Docker container, which provides the build environment. It maps the source code and output directory to the container, ensuring the generated tar file is available on the host system.
+
+You can replace `icicle-release-ubuntu22-cuda122` with another Docker image tag to build in the corresponding environment (e.g., Ubuntu 20.04 or CentOS 7).
+
+## Installing and Using the Release
+
+1. **Extract the Tar File**:
+ - Download the appropriate tar file for your distribution (Ubuntu 20.04, Ubuntu 22.04, or CentOS 7).
+ - Extract it to your desired location:
+ ```bash
+ tar -xzvf icicle-<distribution>-cuda122.tar.gz -C /path/to/install/location
+ ```
+
+2. **Linking Your Application**:
+ - When compiling your C++ application, link against the Icicle libraries found in `./icicle/lib/`:
+ ```bash
+ g++ -o myapp myapp.cpp -L/path/to/icicle/lib -licicle_device -licicle_field_or_curve
+ ```
+ - Note: You only need to link to the Icicle device and field or curve libraries. The backend libraries are dynamically loaded at runtime.
+
+## Backend Loading
+
+The Icicle library dynamically loads backend libraries at runtime. By default, it searches for backends in the following order:
+
+1. **Environment Variable**: If the `ICICLE_BACKEND_INSTALL_DIR` environment variable is defined, Icicle will prioritize this location.
+2. **Default Directory**: If the environment variable is not set, Icicle will search in the default directory `/opt/icicle/lib/backend`.
+
+### Custom Backend Loading
+
+If you need to load a backend from a custom location at any point during runtime, you can call the following function:
+
+```cpp
+extern "C" eIcicleError icicle_load_backend(const char* path, bool is_recursive);
+```
+
+- **`path`**: The directory where the backend libraries are located.
+- **`is_recursive`**: If `true`, the function will search for backend libraries recursively within the specified path.
+
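+A hedged usage sketch, assuming the release was extracted to the default location so the backends sit under `/opt/icicle/lib/backend`:
+
+```cpp
+// load all backend libraries found (recursively) under the install dir
+eIcicleError err = icicle_load_backend("/opt/icicle/lib/backend", true /*is_recursive*/);
+```
+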
+---
+
diff --git a/docs/docs/icicle/primitives/keccak.md b/docs/docs/icicle/primitives/keccak.md
index 37ea9904d..0d2a0dc90 100644
--- a/docs/docs/icicle/primitives/keccak.md
+++ b/docs/docs/icicle/primitives/keccak.md
@@ -1,5 +1,7 @@
# Keccak
+TODO update for V3
+
[Keccak](https://keccak.team/files/Keccak-implementation-3.2.pdf) is a cryptographic hash function designed by Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van Assche. It was selected as the winner of the NIST hash function competition, becoming the basis for the [SHA-3 standard](https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf).
Keccak operates on a message input of any length and produces a fixed-size hash output. The hash function is built upon the sponge construction, which involves absorbing the input data followed by squeezing out the hash value.
diff --git a/docs/docs/icicle/primitives/merkle.md b/docs/docs/icicle/primitives/merkle.md
new file mode 100644
index 000000000..ab6b1badd
--- /dev/null
+++ b/docs/docs/icicle/primitives/merkle.md
@@ -0,0 +1,3 @@
+# Merkle tree
+
+TODO update for V3
\ No newline at end of file
diff --git a/docs/docs/icicle/primitives/msm.md b/docs/docs/icicle/primitives/msm.md
index 7cb4a0abf..32c6a5de7 100644
--- a/docs/docs/icicle/primitives/msm.md
+++ b/docs/docs/icicle/primitives/msm.md
@@ -1,60 +1,134 @@
-# MSM - Multi scalar multiplication
-
-MSM stands for Multi scalar multiplication, its defined as:
-
-
-
-Where
-
-$G_j \in G$ - points from an Elliptic Curve group.
-
-$a_0, \ldots, a_n$ - Scalars
-
-$MSM(a, G) \in G$ - a single EC (elliptic curve) point
-
-In words, MSM is the sum of scalar and EC point multiplications. We can see from this definition that the core operations occurring are Modular Multiplication and Elliptic curve point addition. Its obvious that multiplication can be computed in parallel and then the products summed, making MSM inherently parallelizable.
-Accelerating MSM is crucial to a ZK protocol's performance due to the [large percent of run time](https://hackmd.io/@0xMonia/SkQ6-oRz3#Hardware-acceleration-in-action) they take when generating proofs.
+# MSM - Multi Scalar Multiplication
+
+## Overview
+
+Multi-Scalar Multiplication (MSM) is a fundamental operation in elliptic curve cryptography and zero-knowledge proofs. It is defined as:
+
+$$
+MSM(a, G) = \sum_{j=0}^{n-1} a_j \cdot G_j
+$$
+
+Where:
+- $G_j \in G$ are points from an elliptic curve group.
+- $a_0, \ldots, a_n$ are scalars.
+- $MSM(a, G) \in G$ is the result, a single elliptic curve point.
+
+MSM is inherently parallelizable, making it a critical operation for optimizing performance in cryptographic protocols like zk-SNARKs. Accelerating MSM can significantly reduce the time required for proof generation.
+
+Accelerating MSM is crucial to a ZK protocol's performance due to the [large percentage of run time](https://hackmd.io/@0xMonia/SkQ6-oRz3#Hardware-acceleration-in-action) it takes when generating proofs.
You can learn more about how MSMs work from this [video](https://www.youtube.com/watch?v=Bl5mQA7UL2I) and from our resource list on [Ingopedia](https://www.ingonyama.com/ingopedia/msm).
-## Supported Bindings
+## C++ API
+
+### `MSMConfig` Struct
+
+The `MSMConfig` struct configures the MSM operation. It allows customization of parameters like the number of precomputed points, the window bitsize (`c`), and memory management. Here's the configuration structure:
+
+```cpp
+struct MSMConfig {
+ icicleStreamHandle stream;
+ int precompute_factor;
+ int c;
+ int bitsize;
+ int batch_size;
+ bool are_points_shared_in_batch;
+ bool are_scalars_on_device;
+ bool are_scalars_montgomery_form;
+ bool are_points_on_device;
+ bool are_points_montgomery_form;
+ bool are_results_on_device;
+ bool is_async;
+ ConfigExtension* ext;
+};
+```
+
+#### Default Configuration
+
+You can obtain a default `MSMConfig` using:
+
+```cpp
+ static MSMConfig default_msm_config()
+ {
+ MSMConfig config = {
+ nullptr, // stream
+ 1, // precompute_factor
+ 0, // c
+ 0, // bitsize
+ 1, // batch_size
+ true, // are_points_shared_in_batch
+ false, // are_scalars_on_device
+ false, // are_scalars_montgomery_form
+ false, // are_points_on_device
+ false, // are_points_montgomery_form
+ false, // are_results_on_device
+ false, // is_async
+ nullptr, // ext
+ };
+ return config;
+ }
+```
+
+### `msm` Function
+
+The `msm` function computes the MSM operation:
+
+```cpp
+template <typename S, typename A, typename P>
+eIcicleError msm(const S* scalars, const A* bases, int msm_size, const MSMConfig& config, P* results);
+```
+
+:::note
+The API is templated and works with all ICICLE curves (if the corresponding library is linked), including G2 groups.
+:::
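+
+A hedged usage sketch for a single MSM; the `projective_t` result type name is an assumption for illustration and comes from the linked curve library:
+
+```cpp
+// scalars (S*) and bases (A*) point to msm_size host elements
+auto config = default_msm_config();
+projective_t result; // assumed result type name for illustration
+eIcicleError err = msm(scalars, bases, msm_size, config, &result);
+```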
+
+### Batched MSM
+
+The MSM supports batch mode - running multiple MSMs in parallel. As long as there is enough memory available, it is always better to use batch mode than to run single MSMs serially. We support running a batch of MSMs that share the same points as well as a batch of MSMs that use different points.
+
+The config fields `are_points_shared_in_batch` and `batch_size` configure the MSM for batch mode, as in the sketch below.
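+
+For example, a batch of four MSMs over a shared set of bases might be configured like this (a sketch; the results buffer must hold `batch_size` outputs):
+
+```cpp
+auto config = default_msm_config();
+config.batch_size = 4;                    // compute 4 MSMs together
+config.are_points_shared_in_batch = true; // all 4 MSMs reuse the same bases
+// scalars then holds 4 * msm_size values and results holds 4 output points
+```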
+
+### G2 MSM
+
+For G2 MSM, use the [same msm API](#msm-function) with the corresponding G2 types.
+
+:::note
+Supported curves have types for both G1 and G2.
+:::
+
+### Precomputation
+
+#### What It Does:
+
+- The function computes a set of additional points derived from the original base points. These precomputed points are stored and later reused during the MSM computation.
+- Purpose: By precomputing and storing these points, the MSM operation can reduce the number of operations needed at runtime, which can significantly speed up the calculation.
+
+#### When to Use:
+
+- Memory vs. Speed Trade-off: Precomputation increases the memory footprint because additional points are stored, but it reduces the computational effort during MSM, making the process faster.
+- Best for Repeated Use: It’s especially beneficial when the same set of base points is used multiple times in different MSM operations.
+
+```cpp
+template <typename A>
+eIcicleError msm_precompute_bases(const A* input_bases, int bases_size, const MSMConfig& config, A* output_bases);
+```
+
+:::note
+The user allocates `output_bases` (in host or device memory) and later passes it as the bases when calling msm, as in the sketch below.
+:::
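+
+A hedged sketch of the flow, assuming the curve's `affine_t` base type; the `bases_size * precompute_factor` output size is also an assumption for illustration:
+
+```cpp
+auto config = default_msm_config();
+config.precompute_factor = 4;
+
+// user-allocated output buffer; assumed size bases_size * precompute_factor
+std::vector<affine_t> precomputed(bases_size * config.precompute_factor);
+msm_precompute_bases(bases, bases_size, config, precomputed.data());
+
+// later: pass the precomputed buffer as the bases of the msm call
+msm(scalars, precomputed.data(), msm_size, config, results);
+```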
+
+## Rust and Go bindings
+
+The Rust and Go bindings provide equivalent functionality for their respective environments. Refer to their documentation for details on usage.
- [Golang](../golang-bindings/msm.md)
-- [Rust](../rust-bindings//msm.md)
+- [Rust](../rust-bindings/msm.md)
+
+## CUDA backend MSM
+This section describes the CUDA MSM implementation and how to customize it (optional).
-## Algorithm description
+### Algorithm description
We follow the bucket method algorithm. The GPU implementation consists of four phases:
@@ -63,67 +137,20 @@ We follow the bucket method algorithm. The GPU implementation consists of four p
3. Buckets Reduction phase - bucket results are multiplied by their corresponding bucket number and each bucket module is reduced to a small number of final results. By default, this is done by an iterative algorithm which is highly parallel. Setting `is_big_triangle` to `true` will switch this phase to the running sum algorithm described in the above YouTube talk which is much less parallel.
4. Final accumulation phase - The final results from the last phase are accumulated using the double-and-add algorithm.
-## Batched MSM
-
-The MSM supports batch mode - running multiple MSMs in parallel. It's always better to use the batch mode instead of running single msms in serial as long as there is enough memory available. We support running a batch of MSMs that share the same points as well as a batch of MSMs that use different points.
-
-## MSM configuration
+### Configuring CUDA MSM
+Use a `ConfigExtension` object to pass backend-specific configuration.
+CUDA-specific MSM configuration:
```cpp
- /**
- * @struct MSMConfig
- * Struct that encodes MSM parameters to be passed into the [MSM](@ref MSM) function. The intended use of this struct
- * is to create it using [default_msm_config](@ref default_msm_config) function and then you'll hopefully only need to
- * change a small number of default values for each of your MSMs.
- */
- struct MSMConfig {
- device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
- int points_size; /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be
- * a number of different points. So, if each MSM re-uses the same set of points, this
- * variable is set equal to the MSM size. And if every MSM uses a distinct set of
- * points, it should be set to the product of MSM size and [batch_size](@ref
- * batch_size). Default value: 0 (meaning it's equal to the MSM size). */
- int precompute_factor; /**< The number of extra points to pre-compute for each point. See the
- * [precompute_msm_points](@ref precompute_msm_points) function, `precompute_factor` passed
- * there needs to be equal to the one used here. Larger values decrease the
- * number of computations to make, on-line memory footprint, but increase the static
- * memory footprint. Default value: 1 (i.e. don't pre-compute). */
- int c; /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket
- * method" that we use to solve the MSM problem. As a rule of thumb, larger value
- * means more on-line memory footprint but also more parallelism and less computational
- * complexity (up to a certain point). Currently pre-computation is independent of
- * \f$ c \f$, however in the future value of \f$ c \f$ here and the one passed into the
- * [precompute_msm_points](@ref precompute_msm_points) function will need to be identical.
- * Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
- int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
- * but if a different (better) upper bound is known, it should be reflected in this
- * variable. Default value: 0 (set to the bitsize of scalar field). */
- int large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur
- * very frequently. Useful for efficient treatment of non-uniform distributions of
- * scalars and "top windows" with few bits. Can be set to 0 to disable separate
- * treatment of large buckets altogether. Default value: 10. */
- int batch_size; /**< The number of MSMs to compute. Default value: 1. */
- bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value:
- * false. */
- bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value:
- * true. */
- bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */
- bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise.
- * Default value: true. */
- bool are_results_on_device; /**< True if the results should be on device and false if they should be on host. If set
- * to false, `is_async` won't take effect because a synchronization is needed to
- * transfer results to the host. Default value: false. */
- bool is_big_triangle; /**< Whether to do "bucket accumulation" serially. Decreases computational complexity
- * but also greatly decreases parallelism, so only suitable for large batches of MSMs.
- * Default value: false. */
- bool is_async; /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be
- * non-blocking and you'd need to synchronize it explicitly by running
- * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM
- * function will block the current CPU thread. */
- };
+ConfigExtension ext;
+ext.set("large_bucket_factor", 15);
+// attach the config-extension to the msm config so the backend can read it
+msm_config.ext = &ext;
+// call msm
+msm(..., msm_config, ...); // the msm backend reads the config-extension
```
-## Choosing optimal parameters
+### Choosing optimal parameters
`is_big_triangle` should be `false` in almost all cases. It might provide better results only for very small MSMs (smaller than 2^8^) with a large batch (larger than 100) but this should be tested per scenario.
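+
+If you do want to enable it, a hedged sketch would pass it through the same config-extension mechanism shown above (this assumes the CUDA backend recognizes an `"is_big_triangle"` key; check your backend version):
+
+```cpp
+// hypothetical sketch: switch bucket reduction to the serial running-sum algorithm
+ConfigExtension ext;
+ext.set("is_big_triangle", true); // assumed CUDA-backend key
+msm_config.ext = &ext;
+msm(..., msm_config, ...);
+```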
Large buckets exist in two cases:
@@ -154,42 +181,23 @@ This gives a good approximation within 10% of the actual required memory for mos
Here is a useful table showing optimal parameters for different MSMs. They are optimal for BLS12-377 curve when running on NVIDIA GeForce RTX 3090 Ti. This is the configuration used:
-```cpp
- msm::MSMConfig config = {
- ctx, // DeviceContext
- N, // points_size
- precomp_factor, // precompute_factor
- user_c, // c
- 0, // bitsize
- 10, // large_bucket_factor
- batch_size, // batch_size
- false, // are_scalars_on_device
- false, // are_scalars_montgomery_form
- true, // are_points_on_device
- false, // are_points_montgomery_form
- true, // are_results_on_device
- false, // is_big_triangle
- true // is_async
- };
-```
-
Here are the parameters and the results for the different cases:
-| MSM size | Batch size | Precompute factor | c | Memory estimation (GB) | Actual memory (GB) | Single MSM time (ms) |
-| --- | --- | --- | --- | --- | --- | --- |
-| 10 | 1 | 1 | 9 | 0.00227 | 0.00277 | 9.2 |
-| 10 | 1 | 23 | 11 | 0.00259 | 0.00272 | 1.76 |
-| 10 | 1000 | 1 | 7 | 0.94 | 1.09 | 0.051 |
-| 10 | 1000 | 23 | 11 | 2.59 | 2.74 | 0.025 |
-| 15 | 1 | 1 | 11 | 0.011 | 0.019 | 9.9 |
-| 15 | 1 | 16 | 16 | 0.061 | 0.065 | 2.4 |
-| 15 | 100 | 1 | 11 | 1.91 | 1.92 | 0.84 |
-| 15 | 100 | 19 | 14 | 6.32 | 6.61 | 0.56 |
-| 18 | 1 | 1 | 14 | 0.128 | 0.128 | 14.4 |
-| 18 | 1 | 15 | 17 | 0.40 | 0.42 | 5.9 |
-| 22 | 1 | 1 | 17 | 1.64 | 1.65 | 68 |
-| 22 | 1 | 13 | 21 | 5.67 | 5.94 | 54 |
-| 24 | 1 | 1 | 18 | 6.58 | 6.61 | 232 |
-| 24 | 1 | 7 | 21 | 12.4 | 13.4 | 199 |
+| MSM size | Batch size | Precompute factor | c | Memory estimation (GB) | Actual memory (GB) | Single MSM time (ms) |
+| -------- | ---------- | ----------------- | --- | ---------------------- | ------------------ | -------------------- |
+| 10 | 1 | 1 | 9 | 0.00227 | 0.00277 | 9.2 |
+| 10 | 1 | 23 | 11 | 0.00259 | 0.00272 | 1.76 |
+| 10 | 1000 | 1 | 7 | 0.94 | 1.09 | 0.051 |
+| 10 | 1000 | 23 | 11 | 2.59 | 2.74 | 0.025 |
+| 15 | 1 | 1 | 11 | 0.011 | 0.019 | 9.9 |
+| 15 | 1 | 16 | 16 | 0.061 | 0.065 | 2.4 |
+| 15 | 100 | 1 | 11 | 1.91 | 1.92 | 0.84 |
+| 15 | 100 | 19 | 14 | 6.32 | 6.61 | 0.56 |
+| 18 | 1 | 1 | 14 | 0.128 | 0.128 | 14.4 |
+| 18 | 1 | 15 | 17 | 0.40 | 0.42 | 5.9 |
+| 22 | 1 | 1 | 17 | 1.64 | 1.65 | 68 |
+| 22 | 1 | 13 | 21 | 5.67 | 5.94 | 54 |
+| 24 | 1 | 1 | 18 | 6.58 | 6.61 | 232 |
+| 24 | 1 | 7 | 21 | 12.4 | 13.4 | 199 |
The optimal values can vary per GPU and per curve. It is best to try a few combinations until you get the best results for your specific case.
diff --git a/docs/docs/icicle/primitives/ntt.md b/docs/docs/icicle/primitives/ntt.md
index 9816b3dce..7d45ef9b1 100644
--- a/docs/docs/icicle/primitives/ntt.md
+++ b/docs/docs/icicle/primitives/ntt.md
@@ -1,5 +1,7 @@
# NTT - Number Theoretic Transform
+## Overview
+
The Number Theoretic Transform (NTT) is a variant of the Fourier Transform used over finite fields, particularly those of integers modulo a prime number. NTT operates in a discrete domain and is used primarily in applications requiring modular arithmetic, such as cryptography and polynomial multiplication.
NTT is defined similarly to the Discrete Fourier Transform (DFT), but instead of using complex roots of unity, it uses roots of unity within a finite field. The definition hinges on the properties of the finite field, specifically the existence of a primitive root of unity of order $N$ (where $N$ is typically a power of 2), and the modulo operation is performed with respect to a specific prime number that supports these roots.
@@ -21,18 +23,7 @@ NTT is particularly useful because it enables efficient polynomial multiplicatio
There exists also INTT which is the inverse operation of NTT. INTT can take as input an output sequence of integers from an NTT and reconstruct the original sequence.
-## Using NTT
-
-### Supported Bindings
-
-- [Golang](../golang-bindings/ntt.md)
-- [Rust](../rust-bindings/ntt.md)
-
-### Examples
-
-- [Rust API examples](https://github.com/ingonyama-zk/icicle/blob/d84ffd2679a4cb8f8d1ac2ad2897bc0b95f4eeeb/examples/rust/ntt/src/main.rs#L1)
-
-- [C++ API examples](https://github.com/ingonyama-zk/icicle/blob/d84ffd2679a4cb8f8d1ac2ad2897bc0b95f4eeeb/examples/c%2B%2B/ntt/example.cu#L1)
+## C++ API
### Ordering
@@ -52,25 +43,168 @@ The `Ordering` enum defines how inputs and outputs are arranged for the NTT oper
Choosing an algorithm is heavily dependent on your use case. For example Cooley-Tukey will often use `kRN` and Gentleman-Sande often uses `kNR`.
-### Modes
+```cpp
+enum class Ordering {
+ kNN, /**< Inputs and outputs are in natural-order. */
+ kNR, /**< Inputs are in natural-order and outputs are in bit-reversed-order. */
+ kRN, /**< Inputs are in bit-reversed-order and outputs are in natural-order. */
+ kRR, /**< Inputs and outputs are in bit-reversed-order. */
+ kNM, /**< Inputs are in natural-order and outputs are in digit-reversed-order. */
+ kMN /**< Inputs are in digit-reversed-order and outputs are in natural-order. */
+};
+```
+
+### `NTTConfig` Struct
+
+The `NTTConfig` struct configures the NTT operation. It allows customization of parameters such as the batch size, column-batch computation, and the ordering of inputs and outputs.
+
+```cpp
+template <typename S>
+struct NTTConfig {
+  icicleStreamHandle stream;      // stream for asynchronous execution
+  S coset_gen;                    // coset generator; S::one() for no coset
+  int batch_size;                 // number of NTTs to compute in one call
+  bool columns_batch;             // if true, the batch elements are the columns of the input matrix
+  Ordering ordering;              // ordering of inputs and outputs, see above
+  bool are_inputs_on_device;      // true if inputs reside on the device
+  bool are_outputs_on_device;     // true if outputs should reside on the device
+  bool is_async;                  // non-blocking execution if true
+  ConfigExtension* ext = nullptr; // backend-specific extensions
+};
+```
+
+#### Default configuration
+
+You can obtain a default `NTTConfig` using:
+```cpp
+template <typename S>
+static NTTConfig<S> default_ntt_config()
+{
+  NTTConfig<S> config = {
+ nullptr, // stream
+ S::one(), // coset_gen
+ 1, // batch_size
+ false, // columns_batch
+ Ordering::kNN, // ordering
+ false, // are_inputs_on_device
+ false, // are_outputs_on_device
+ false, // is_async
+ };
+ return config;
+}
+```
+
+### NTT domain
+Before computing an NTT, you must initialize the domain of the roots of unity.
+
+:::note
+The NTT domain is constructed for a given size $2^N$ and can be used for any NTT of size smaller than or equal to $2^N$. For example, a domain of size 32 can be used to compute NTTs of size 2, 4, 8, 16, and 32.
+:::
+
+```cpp
+template <typename S>
+eIcicleError ntt_init_domain(const S& primitive_root, const NTTInitDomainConfig& config);
+```
+
+:::note
+The domain is constructed per device. When using multiple devices (e.g. GPUs), call it on each device prior to calling `ntt`.
+:::
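+
+For example, a minimal initialization sketch (assuming a field type `scalar_t` exposing `omega()`, as in the examples below):
+
+```cpp
+// build a domain large enough for any NTT of size up to 2^20
+scalar_t primitive_root = scalar_t::omega(20);
+ntt_init_domain(primitive_root, default_ntt_init_domain_config());
+```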
+
+To retrieve a root of unity from the domain:
+```cpp
+template <typename S> S get_root_of_unity(uint64_t max_size);
+```
+
+Finally, release the domain to free up device memory when not required:
+```cpp
+template <typename S> eIcicleError ntt_release_domain();
+```
+
+where `NTTInitDomainConfig` is defined as:
+
+```cpp
+struct NTTInitDomainConfig {
+ icicleStreamHandle stream; /**< Stream for asynchronous execution. */
+ bool is_async; /**< True if operation is asynchronous. Default value is false. */
+ ConfigExtension* ext = nullptr; /**< Backend-specific extensions. */
+};
+
+static NTTInitDomainConfig default_ntt_init_domain_config()
+{
+ NTTInitDomainConfig config = {
+ nullptr, // stream
+ false // is_async
+ };
+ return config;
+}
+```
+
+### `ntt` Function
+
+The `ntt` function computes the NTT:
+
+```cpp
+template <typename S, typename E>
+eIcicleError ntt(const E* input, int size, NTTDir dir, const NTTConfig<S>& config, E* output);
+
+// where NTTDir specifies whether it is a forward or inverse transform
+enum class NTTDir {
+ kForward, /**< Perform forward NTT. */
+ kInverse /**< Perform inverse NTT (iNTT). */
+};
+```
+### EC-NTT
+The [ntt API](#ntt-function) also works for ECNTT on supported curves, given the correct point and scalar types.
+
+### Batch NTT
+
+Batch NTT allows you to compute many NTTs with a single API call. Batch NTT can significantly reduce read/write times as well as computation overhead by executing multiple NTT operations in parallel. Batch mode may also offer better utilization of computational resources (memory and compute).
+
+To compute a batch, set the `batch_size` and `columns_batch` fields of the config struct.
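+
+For example, a minimal batch-configuration sketch (assuming a field type `scalar_t` and raw pointers `input`/`output` to `16 * ntt_size` elements; the names are illustrative):
+
+```cpp
+// compute 16 NTTs of size ntt_size in a single call
+auto config = default_ntt_config<scalar_t>();
+config.batch_size = 16;       // number of NTTs in the batch
+config.columns_batch = false; // inputs are laid out contiguously, one NTT after another
+ntt(input, ntt_size, NTTDir::kForward, config, output);
+```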
+
+### Rust and Go bindings
-NTT also supports two different modes `Batch NTT` and `Single NTT`
+- [Golang](../golang-bindings/ntt.md)
+- [Rust](../rust-bindings/ntt.md)
-Deciding weather to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.
+### Example
-#### Single NTT
+The following example demonstrates how to use ntt and how to pass custom configurations to the CUDA backend; the CUDA-specific options are discussed below.
-Single NTT will launch a single NTT computation.
+```cpp
+#include "icicle/backend/ntt_config.h"
-Choose this mode when your application requires processing individual NTT operations in isolation.
+// allocate and init input/output
+int batch_size = /*...*/;
+int log_ntt_size = /*...*/;
+int ntt_size = 1 << log_ntt_size;
+auto input = std::make_unique<scalar_t[]>(batch_size * ntt_size);
+auto output = std::make_unique<scalar_t[]>(batch_size * ntt_size);
+initialize_input(ntt_size, batch_size, input.get());
-#### Batch NTT Mode
+// Initialize NTT domain with fast twiddles (CUDA backend)
+scalar_t basic_root = scalar_t::omega(log_ntt_size);
+auto ntt_init_domain_cfg = default_ntt_init_domain_config();
+ConfigExtension backend_cfg_ext;
+backend_cfg_ext.set(CudaBackendConfig::CUDA_NTT_FAST_TWIDDLES_MODE, true);
+ntt_init_domain_cfg.ext = &backend_cfg_ext;
+ntt_init_domain(basic_root, ntt_init_domain_cfg);
-Batch NTT allows you to run many NTTs with a single API call. Batch NTT mode can significantly reduce read/write times as well as computation overhead by executing multiple NTT operations in parallel. Batch mode may also offer better utilization of computational resources (memory and compute).
+// ntt configuration
+NTTConfig<scalar_t> config = default_ntt_config<scalar_t>();
+ConfigExtension ntt_cfg_ext;
+config.batch_size = batch_size;
-## Supported algorithms
+// Compute NTT with explicit selection of Mixed-Radix algorithm.
+ntt_cfg_ext.set(CudaBackendConfig::CUDA_NTT_ALGORITHM, CudaBackendConfig::NttAlgorithm::MixedRadix);
+config.ext = &ntt_cfg_ext;
+ntt(input.get(), ntt_size, NTTDir::kForward, config, output.get());
+```
-Our NTT implementation supports two algorithms `radix-2` and `mixed-radix`.
+### CUDA backend NTT
+This section describes the CUDA NTT implementation and how to use it.
+
+Our CUDA NTT implementation supports two algorithms: `radix-2` and `mixed-radix`.
### Radix 2
@@ -157,3 +291,32 @@ Performance really depends on logn size, batch size, ordering, inverse, coset, c
For this reason we implemented our [heuristic auto-selection](https://github.com/ingonyama-zk/icicle/blob/main/icicle/src/ntt/ntt.cu#L573) which should choose the most efficient algorithm in most cases.
We still recommend you benchmark for your specific use case if you think a different configuration would yield better results.
+
+To explicitly choose the algorithm:
+
+```cpp
+#include "icicle/backend/ntt_config.h"
+
+NTTConfig<scalar_t> config = default_ntt_config<scalar_t>();
+ConfigExtension ntt_cfg_ext;
+ntt_cfg_ext.set(CudaBackendConfig::CUDA_NTT_ALGORITHM, CudaBackendConfig::NttAlgorithm::MixedRadix);
+config.ext = &ntt_cfg_ext;
+ntt(input.get(), ntt_size, NTTDir::kForward, config, output.get());
+```
+
+### Fast twiddles
+
+When using the mixed-radix algorithm, it is recommended to initialize the domain in "fast-twiddles" mode. This allocates extra memory for the domain but enables a faster NTT.
+To do so, simply pass this flag to the CUDA backend:
+
+```cpp
+#include "icicle/backend/ntt_config.h"
+
+scalar_t basic_root = scalar_t::omega(log_ntt_size);
+auto ntt_init_domain_cfg = default_ntt_init_domain_config();
+ConfigExtension backend_cfg_ext;
+backend_cfg_ext.set(CudaBackendConfig::CUDA_NTT_FAST_TWIDDLES_MODE, true);
+ntt_init_domain_cfg.ext = &backend_cfg_ext;
+ntt_init_domain(basic_root, ntt_init_domain_cfg);
+```
\ No newline at end of file
diff --git a/docs/docs/icicle/primitives/overview.md b/docs/docs/icicle/primitives/overview.md
index 67956f3e2..4fe315144 100644
--- a/docs/docs/icicle/primitives/overview.md
+++ b/docs/docs/icicle/primitives/overview.md
@@ -1,12 +1,11 @@
-# ICICLE Primitives
-
-This section of the documentation is dedicated to the ICICLE primitives, we will cover the usage and internal details of our primitives such as hashing algorithms, MSM and NTT.
+# ICICLE Compute APIs
+This section of the documentation is dedicated to the main APIs provided by ICICLE. We will cover the usage and internal details of our core primitives, such as Multi-Scalar Multiplication (MSM), Number Theoretic Transform (NTT), and various hashing algorithms. Each primitive has its own dedicated page with examples and explanations for C++, Rust, and Go.
## Supported primitives
-
- [MSM](./msm.md)
- [NTT](./ntt.md)
-- [Keccak Hash](./keccak.md)
-- [Poseidon Hash](./poseidon.md)
+- [Vector Operations](./vec_ops.md)
+- [Polynomials](../polynomials/overview.md)
+- Hashing and commitment schemes coming soon
diff --git a/docs/docs/icicle/primitives/poseidon.md b/docs/docs/icicle/primitives/poseidon.md
index 5626701b9..df2af0930 100644
--- a/docs/docs/icicle/primitives/poseidon.md
+++ b/docs/docs/icicle/primitives/poseidon.md
@@ -1,5 +1,7 @@
# Poseidon
+TODO update for V3
+
[Poseidon](https://eprint.iacr.org/2019/458.pdf) is a popular hash in the ZK ecosystem primarily because its optimized to work over large prime fields, a common setting for ZK proofs, thereby minimizing the number of multiplicative operations required.
Poseidon has also been specifically designed to be efficient when implemented within ZK circuits, Poseidon uses far less constraints compared to other hash functions like Keccak or SHA-256 in the context of ZK circuits.
diff --git a/docs/docs/icicle/primitives/poseidon2.md b/docs/docs/icicle/primitives/poseidon2.md
index cbf20bb5e..fcb2a7e06 100644
--- a/docs/docs/icicle/primitives/poseidon2.md
+++ b/docs/docs/icicle/primitives/poseidon2.md
@@ -1,5 +1,7 @@
# Poseidon2
+TODO update for V3
+
[Poseidon2](https://eprint.iacr.org/2023/323) is a recently released optimized version of Poseidon1. The two versions differ in two crucial points. First, Poseidon is a sponge hash function, while Poseidon2 can be either a sponge or a compression function depending on the use case. Secondly, Poseidon2 is instantiated by new and more efficient linear layers with respect to Poseidon. These changes decrease the number of multiplications in the linear layer by up to 90% and the number of constraints in Plonk circuits by up to 70%. This makes Poseidon2 currently the fastest arithmetization-oriented hash function without lookups.
diff --git a/docs/docs/icicle/primitives/vec_ops.md b/docs/docs/icicle/primitives/vec_ops.md
new file mode 100644
index 000000000..e9e10c1a9
--- /dev/null
+++ b/docs/docs/icicle/primitives/vec_ops.md
@@ -0,0 +1,187 @@
+
+# Vector Operations API
+
+## Overview
+
+The Vector Operations API in Icicle provides a set of functions for performing element-wise and scalar-vector operations on vectors, matrix operations, and miscellaneous operations like bit-reversal and slicing. These operations can be performed on the host or device, with support for asynchronous execution.
+
+### VecOpsConfig
+
+The `VecOpsConfig` struct is a configuration object used to specify parameters for vector operations.
+
+#### Fields
+
+- **`stream: icicleStreamHandle`**: Specifies the CUDA stream for asynchronous execution. If `nullptr`, the default stream is used.
+- **`is_a_on_device: bool`**: Indicates whether the first input vector (`a`) is already on the device. If `false`, the vector will be copied from the host to the device.
+- **`is_b_on_device: bool`**: Indicates whether the second input vector (`b`) is already on the device. If `false`, the vector will be copied from the host to the device. This field is optional.
+- **`is_result_on_device: bool`**: Indicates whether the result should be stored on the device. If `false`, the result will be transferred back to the host.
+- **`is_async: bool`**: Specifies whether the vector operation should be performed asynchronously. When `true`, the operation will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity.
+- **`ext: ConfigExtension*`**: Backend-specific extensions.
+
+#### Default Configuration
+
+```cpp
+static VecOpsConfig default_vec_ops_config() {
+ VecOpsConfig config = {
+ nullptr, // stream
+ false, // is_a_on_device
+ false, // is_b_on_device
+ false, // is_result_on_device
+ false, // is_async
+ };
+ return config;
+}
+```
+
+### Element-wise Operations
+
+These functions perform element-wise operations on two input vectors `a` and `b`, producing an output vector.
+
+#### `vector_add`
+
+Adds two vectors element-wise.
+
+```cpp
+template <typename T>
+eIcicleError vector_add(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);
+```
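+
+All element-wise operations share the same calling pattern. A minimal usage sketch (assuming host-resident arrays `vec_a`, `vec_b`, and `output` of length `size`; the names are illustrative):
+
+```cpp
+// element-wise output[i] = vec_a[i] + vec_b[i], inputs and output on the host
+auto config = default_vec_ops_config();
+eIcicleError err = vector_add(vec_a, vec_b, size, config, output);
+```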
+
+#### `vector_sub`
+
+Subtracts vector `b` from vector `a` element-wise.
+
+```cpp
+template <typename T>
+eIcicleError vector_sub(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);
+```
+
+#### `vector_mul`
+
+Multiplies two vectors element-wise.
+
+```cpp
+template <typename T>
+eIcicleError vector_mul(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);
+```
+
+#### `vector_div`
+
+Divides vector `a` by vector `b` element-wise.
+
+```cpp
+template <typename T>
+eIcicleError vector_div(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);
+```
+
+#### `vector_accumulate`
+
+Adds vector `b` to vector `a`, in place.
+
+```cpp
+template <typename T>
+eIcicleError vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config);
+```
+
+#### `convert_montgomery`
+
+Converts a vector of field elements to/from Montgomery form.
+```cpp
+template <typename T>
+eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output);
+```
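+
+A usage sketch (the `input`/`output` names are illustrative):
+
+```cpp
+// convert host scalars into Montgomery form; pass is_into=false to convert back
+auto config = default_vec_ops_config();
+convert_montgomery(input, size, true /*is_into*/, config, output);
+```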
+
+### Scalar-Vector Operations
+
+These functions apply a scalar operation to each element of a vector.
+
+#### `scalar_add_vec`
+
+Adds a scalar to each element of a vector.
+
+```cpp
+template <typename T>
+eIcicleError scalar_add_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);
+```
+
+#### `scalar_sub_vec`
+
+Subtracts each element of a vector from a scalar, i.e. computes `scalar - vec[i]` for each element.
+
+```cpp
+template <typename T>
+eIcicleError scalar_sub_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);
+```
+
+#### `scalar_mul_vec`
+
+Multiplies each element of a vector by a scalar.
+
+```cpp
+template <typename T>
+eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output);
+```
+
+### Matrix Operations
+
+These functions perform operations on matrices.
+
+#### `matrix_transpose`
+
+Transposes a matrix.
+
+```cpp
+template <typename T>
+eIcicleError matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out);
+```
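+
+For example, a sketch transposing a small row-major matrix (`mat_in`/`mat_out` are illustrative pointers):
+
+```cpp
+// transpose a 2x3 row-major matrix into a 3x2 one
+matrix_transpose(mat_in, 2 /*nof_rows*/, 3 /*nof_cols*/, default_vec_ops_config(), mat_out);
+```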
+
+### Miscellaneous Operations
+
+#### `bit_reverse`
+
+Reorders the vector elements based on a bit-reversal pattern.
+
+```cpp
+template <typename T>
+eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out);
+```
+
+#### `slice`
+
+Extracts a slice from a vector.
+
+```cpp
+template <typename T>
+eIcicleError slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size, const VecOpsConfig& config, T* vec_out);
+```
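+
+A sketch, assuming `size` is the number of output elements (the names are illustrative):
+
+```cpp
+// take every other element starting at index 1:
+// vec_in = [a0, a1, a2, a3, a4, a5] -> vec_out = [a1, a3, a5]
+slice(vec_in, 1 /*offset*/, 2 /*stride*/, 3 /*size*/, default_vec_ops_config(), vec_out);
+```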
+
+#### `highest_non_zero_idx`
+
+Finds the index of the highest (last) non-zero element in a vector.
+
+```cpp
+template <typename T>
+eIcicleError highest_non_zero_idx(const T* vec_in, uint64_t size, const VecOpsConfig& config, int64_t* out_idx);
+```
+
+#### `polynomial_eval`
+
+Evaluates a polynomial at given domain points.
+
+```cpp
+template <typename T>
+eIcicleError polynomial_eval(const T* coeffs, uint64_t coeffs_size, const T* domain, uint64_t domain_size, const VecOpsConfig& config, T* evals /*OUT*/);
+```
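+
+A sketch, evaluating a degree-2 polynomial at two points (assuming a field type `scalar_t` with a `from()` constructor):
+
+```cpp
+// p(x) = 1 + 2x + 3x^2 evaluated at x = 5 and x = 7
+scalar_t coeffs[3] = {scalar_t::from(1), scalar_t::from(2), scalar_t::from(3)};
+scalar_t domain[2] = {scalar_t::from(5), scalar_t::from(7)};
+scalar_t evals[2];
+polynomial_eval(coeffs, 3, domain, 2, default_vec_ops_config(), evals);
+```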
+
+#### `polynomial_division`
+
+Divides two polynomials, returning the quotient and remainder.
+
+```cpp
+template <typename T>
+eIcicleError polynomial_division(const T* numerator, int64_t numerator_deg, const T* denumerator, int64_t denumerator_deg, const VecOpsConfig& config, T* q_out /*OUT*/, uint64_t q_size, T* r_out /*OUT*/, uint64_t r_size);
+```
+
+### Rust and Go bindings
+
+- [Golang](../golang-bindings/vec-ops.md)
+- [Rust](../rust-bindings/vec-ops.md)
diff --git a/docs/docs/icicle/programmers_guide/cpp.md b/docs/docs/icicle/programmers_guide/cpp.md
new file mode 100644
index 000000000..bd534da1e
--- /dev/null
+++ b/docs/docs/icicle/programmers_guide/cpp.md
@@ -0,0 +1,300 @@
+# Icicle C++ Usage Guide
+
+## Overview
+
+This guide covers the usage of Icicle's C++ API, including device management, memory operations, data transfer, synchronization, and compute APIs.
+
+## Device Management
+
+:::note
+See all icicle runtime APIs in [runtime.h](https://github.com/ingonyama-zk/icicle/blob/yshekel/V3/icicle/include/icicle/runtime.h)
+:::
+
+### Loading a Backend
+
+The backend can be loaded from a specific path or from an environment variable. This is essential for setting up the computing environment.
+
+```cpp
+#include "icicle/runtime.h"
+eIcicleError result = icicle_load_backend_from_env_or_default();
+// or load from custom install dir
+eIcicleError result = icicle_load_backend("/path/to/backend/installdir", true);
+```
+
+### Setting and Getting Active Device
+
+You can set the active device for the current thread and retrieve it when needed:
+
+```cpp
+icicle::Device device = {"CUDA", 0}; // or other
+eIcicleError result = icicle_set_device(device);
+// or query current (thread) device
+eIcicleError result = icicle_get_active_device(device);
+```
+
+### Querying Device Information
+
+Retrieve the number of available devices and check if a pointer is allocated on the host or on the active device:
+
+```cpp
+int device_count;
+eIcicleError result = icicle_get_device_count(device_count);
+
+bool is_host_memory;
+eIcicleError result = icicle_is_host_memory(ptr);
+
+bool is_device_memory;
+eIcicleError result = icicle_is_active_device_memory(ptr);
+```
+
+## Memory Management
+
+### Allocating and Freeing Memory
+
+Memory can be allocated and freed on the active device:
+
+```cpp
+void* ptr;
+eIcicleError result = icicle_malloc(&ptr, 1024); // Allocate 1024 bytes
+eIcicleError result = icicle_free(ptr); // Free the allocated memory
+```
+
+### Asynchronous Memory Operations
+
+You can perform memory allocation and deallocation asynchronously using streams:
+
+```cpp
+icicleStreamHandle stream;
+eIcicleError err = icicle_create_stream(&stream);
+
+void* ptr;
+err = icicle_malloc_async(&ptr, 1024, stream);
+err = icicle_free_async(ptr, stream);
+```
+
+### Querying Available Memory
+
+Retrieve the total and available memory on the active device:
+
+```cpp
+size_t total_memory, available_memory;
+eIcicleError err = icicle_get_available_memory(total_memory, available_memory);
+```
+
+### Setting Memory Values
+
+Set memory to a specific value on the active device, synchronously or asynchronously:
+
+```cpp
+eIcicleError err = icicle_memset(ptr, 0, 1024); // Set 1024 bytes to 0
+eIcicleError err = icicle_memset_async(ptr, 0, 1024, stream);
+```
+
+## Data Transfer
+
+### Copying Data
+
+Data can be copied between host and device, or between devices. The location of the memory is inferred from the pointers:
+
+```cpp
+eIcicleError result = icicle_copy(dst, src, size);
+eIcicleError result = icicle_copy_async(dst, src, size, stream);
+```
+
+### Explicit Data Transfers
+
+To avoid device-inference overhead, use explicit copy functions:
+
+```cpp
+eIcicleError result = icicle_copy_to_host(host_dst, device_src, size);
+eIcicleError result = icicle_copy_to_host_async(host_dst, device_src, size, stream);
+
+eIcicleError result = icicle_copy_to_device(device_dst, host_src, size);
+eIcicleError result = icicle_copy_to_device_async(device_dst, host_src, size, stream);
+```
+
+## Stream Management
+
+### Creating and Destroying Streams
+
+Streams are used to manage asynchronous operations:
+
+```cpp
+icicleStreamHandle stream;
+eIcicleError result = icicle_create_stream(&stream);
+eIcicleError result = icicle_destroy_stream(stream);
+```
+
+## Synchronization
+
+### Synchronizing Streams and Devices
+
+Ensure all previous operations on a stream or device are completed before proceeding:
+
+```cpp
+eIcicleError result = icicle_stream_synchronize(stream);
+eIcicleError result = icicle_device_synchronize();
+```
+
+## Device Properties
+
+### Checking Device Availability
+
+Check if a device is available:
+
+```cpp
+icicle::Device dev;
+eIcicleError result = icicle_is_device_available(dev);
+```
+
+### Querying Device Properties
+
+Retrieve properties of the active device:
+
+```cpp
+DeviceProperties properties;
+eIcicleError result = icicle_get_device_properties(properties);
+
+/******************/
+// where DeviceProperties is
+struct DeviceProperties {
+ bool using_host_memory; // Indicates if the device uses host memory
+ int num_memory_regions; // Number of memory regions available on the device
+ bool supports_pinned_memory; // Indicates if the device supports pinned memory
+ // Add more properties as needed
+};
+```
+
+
+## Compute APIs
+
+### Multi-Scalar Multiplication (MSM) Example
+
+Icicle provides high-performance compute APIs such as the Multi-Scalar Multiplication (MSM) for cryptographic operations. Here's a simple example of how to use the MSM API.
+
+```cpp
+#include <iostream>
+#include "icicle/runtime.h"
+#include "icicle/api/bn254.h"
+
+using namespace bn254;
+
+int main()
+{
+ // Load installed backends
+ icicle_load_backend_from_env_or_default();
+
+ // trying to choose CUDA if available, or fallback to CPU otherwise (default device)
+ const bool is_cuda_device_available = (eIcicleError::SUCCESS == icicle_is_device_available("CUDA"));
+ if (is_cuda_device_available) {
+ Device device = {"CUDA", 0}; // GPU-0
+ ICICLE_CHECK(icicle_set_device(device)); // ICICLE_CHECK asserts that the api call returns eIcicleError::SUCCESS
+ } // else we stay on CPU backend
+
+ // Setup inputs
+ int msm_size = 1024;
+ auto scalars = std::make_unique<scalar_t[]>(msm_size);
+ auto points = std::make_unique<affine_t[]>(msm_size);
+ projective_t result;
+
+ // Generate random inputs
+ scalar_t::rand_host_many(scalars.get(), msm_size);
+ projective_t::rand_host_many(points.get(), msm_size);
+
+ // (optional) copy scalars to device memory explicitly
+ scalar_t* scalars_d = nullptr;
+ auto err = icicle_malloc((void**)&scalars_d, sizeof(scalar_t) * msm_size);
+ // Note: need to test err and make sure no errors occurred
+ err = icicle_copy(scalars_d, scalars.get(), sizeof(scalar_t) * msm_size);
+
+ // MSM configuration
+ MSMConfig config = default_msm_config();
+ // tell icicle that the scalars are on device. Note that EC points and result are on host memory in this example.
+ config.are_scalars_on_device = true;
+
+ // Execute the MSM kernel (on the current device)
+ eIcicleError result_code = msm(scalars_d, points.get(), msm_size, config, &result);
+ // OR call bn254_msm(scalars_d, points.get(), msm_size, config, &result);
+
+ // Free the device memory
+ icicle_free(scalars_d);
+
+ // Check for errors
+ if (result_code == eIcicleError::SUCCESS) {
+ std::cout << "MSM result: " << projective_t::to_affine(result) << std::endl;
+ } else {
+ std::cerr << "MSM computation failed with error: " << get_error_string(result_code) << std::endl;
+ }
+
+ return 0;
+}
+```
+
+### Polynomial Operations Example
+
+Here's another example demonstrating polynomial operations using Icicle:
+
+```cpp
+#include <iostream>
+#include "icicle/runtime.h"
+#include "icicle/polynomials/polynomials.h"
+#include "icicle/api/bn254.h"
+
+using namespace bn254;
+
+// define bn254Poly to be a polynomial over the scalar field of bn254
+using bn254Poly = Polynomial<scalar_t>;
+
+static bn254Poly randomize_polynomial(uint32_t size)
+{
+ auto coeff = std::make_unique<scalar_t[]>(size);
+ for (int i = 0; i < size; i++)
+ coeff[i] = scalar_t::rand_host();
+ return bn254Poly::from_rou_evaluations(coeff.get(), size);
+}
+
+int main()
+{
+ // Load backend and set device
+ icicle_load_backend_from_env_or_default();
+
+ // trying to choose CUDA if available, or fallback to CPU otherwise (default device)
+ const bool is_cuda_device_available = (eIcicleError::SUCCESS == icicle_is_device_available("CUDA"));
+ if (is_cuda_device_available) {
+ Device device = {"CUDA", 0}; // GPU-0
+ ICICLE_CHECK(icicle_set_device(device)); // ICICLE_CHECK asserts that the API call returns eIcicleError::SUCCESS
+ } // else we stay on CPU backend
+
+ int poly_size = 1024;
+
+ // build domain for ntt is required for some polynomial ops that rely on ntt
+ ntt_init_domain(scalar_t::omega(12), default_ntt_init_domain_config());
+
+ // randomize polynomials f(x),g(x) over the scalar field of bn254
+ bn254Poly f = randomize_polynomial(poly_size);
+ bn254Poly g = randomize_polynomial(poly_size);
+
+ // Perform polynomial multiplication
+ auto result = f * g; // Executes on the current device
+
+ ICICLE_LOG_INFO << "Done";
+
+ return 0;
+}
+```
+
+In this example, polynomial multiplication runs on CUDA or CPU, showcasing the flexibility and power of Icicle's compute APIs.
+
+## Error Handling
+
+### Checking for Errors
+
+Icicle APIs return an `eIcicleError` enumeration value. Always check the returned value to ensure that operations were successful.
+
+```cpp
+if (result != eIcicleError::SUCCESS) {
+ // Handle error
+}
+```
+
+This guide provides an overview of the essential APIs available in Icicle for C++. The provided examples should help you get started with integrating Icicle into your high-performance computing projects.
diff --git a/docs/docs/icicle/programmers_guide/general.md b/docs/docs/icicle/programmers_guide/general.md
new file mode 100644
index 000000000..3d2062c56
--- /dev/null
+++ b/docs/docs/icicle/programmers_guide/general.md
@@ -0,0 +1,107 @@
+
+# Icicle Programmer's Guide
+
+## Compute APIs
+
+Icicle offers a variety of compute APIs, including Number Theoretic Transforms (NTT), Multi Scalar Multiplication (MSM), vector operations, Elliptic Curve NTT (ECNTT), polynomials, and more. These APIs follow a consistent structure, making it straightforward to apply the same usage patterns across different operations.
+
+[Check out all details about compute APIs here](../primitives/overview.md).
+
+### Common Structure of Compute APIs
+
+Each compute API in Icicle typically involves the following components:
+
+- **Inputs and Outputs**: The data to be processed and the resulting output are passed to the API functions. These can reside either on the host (CPU) or on a device (GPU).
+
+- **Parameters**: Parameters such as the size of data to be processed are provided to control the computation.
+
+- **Configuration Struct**: A configuration struct is used to specify additional options for the computation. This struct has default values but can be customized as needed.
+
+The configuration struct allows users to modify settings such as:
+
+- Specifying whether inputs and outputs are on the host or device.
+- Adjusting the data layout for specific optimizations.
+- Passing custom options to the backend implementation through an extension mechanism, such as setting the number of CPU cores to use.
+
+### Example (C++)
+
+```cpp
+#include "icicle/vec_ops.h"
+
+// Create config struct for vector add
+VecOpsConfig config = default_vec_ops_config();
+// optionally modify the config struct here
+
+// Call the API
+eIcicleError err = vector_add(vec_a, vec_b, size, config, vec_res);
+```
+
+Where `VecOpsConfig` is defined in `icicle/vec_ops.h`:
+
+```cpp
+struct VecOpsConfig {
+ icicleStreamHandle stream; /**< Stream for asynchronous execution. */
+ bool is_a_on_device; /**< True if `a` is on the device, false if it is not. Default value: false. */
+ bool is_b_on_device; /**< True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */
+ bool is_result_on_device; /**< If true, the output is preserved on the device, otherwise on the host. Default value: false. */
+ bool is_async; /**< Whether to run the vector operations asynchronously. */
+ ConfigExtension* ext = nullptr; /**< Backend-specific extension. */
+};
+```
+
+This pattern is consistent across most Icicle APIs, in C++/Rust/Go, providing flexibility while maintaining a familiar structure. For NTT, MSM, and other operations, include the corresponding header and call the template APIs.
+
+### Config struct extension
+
+In special cases, where an application wants to specify backend specific options, this is achieved with a config-extension struct.
+For example, the CPU backend has an option controlling how many threads to use for a vector addition; it looks as follows:
+```cpp
+#include "icicle/vec_ops.h"
+
+// Create config struct for vector add
+VecOpsConfig config = default_vec_ops_config();
+ConfigExtension ext;
+config.ext = &ext;
+ext.set("n_threads", 8); // tell the CPU backend to use 8 threads
+// Call the API
+eIcicleError err = vector_add(vec_a, vec_b, size, config, vec_res);
+```
+
+:::note
+This is not device-agnostic behavior, meaning such code is aware of the backend.
+Having said that, it is not an error to pass options to a backend that is not aware of them.
+:::
+
+## Device Abstraction
+
+Icicle provides a device abstraction layer that allows you to interact with different compute devices such as CPUs and GPUs seamlessly. The device abstraction ensures that your code can work across multiple hardware platforms without modification.
+
+### Device Management
+
+- **Loading Backends**: Backends are loaded dynamically based on the environment configuration or a specified path.
+- **Setting Active Device**: The active device for a thread can be set, allowing for targeted computation on a specific device.
+
+## Streams
+
+Streams in Icicle allow for asynchronous execution and memory operations, enabling parallelism and non-blocking execution. Streams are associated with specific devices, and you can create, destroy, and synchronize streams to manage your workflow.
+
+:::note
+For compute APIs, streams go into the `config.stream` field along with the `is_async=true` config flag.
+:::
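+
+For example, a minimal sketch combining the stream APIs with the `VecOpsConfig` shown above (using the C++ runtime calls; `vec_a`, `vec_b`, `vec_res`, and `size` are illustrative):
+
+```cpp
+// run a vector addition asynchronously on a stream
+icicleStreamHandle stream;
+icicle_create_stream(&stream);
+
+VecOpsConfig config = default_vec_ops_config();
+config.stream = stream; // dispatch the op to this stream
+config.is_async = true; // return without blocking the CPU thread
+
+eIcicleError err = vector_add(vec_a, vec_b, size, config, vec_res);
+
+icicle_stream_synchronize(stream); // wait for the op to complete
+icicle_destroy_stream(stream);
+```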
+
+### Memory Management
+
+Icicle provides functions for allocating, freeing, and managing memory across devices. Memory operations can be performed synchronously or asynchronously, depending on the use case.
+
+### Data Transfer
+
+Data transfer between the host and devices, or between different devices, is handled through a set of APIs that ensure efficient and error-checked operations. Asynchronous operations are supported to maximize performance.
+
+### Synchronization
+
+Synchronization ensures that all previous operations on a stream or device are completed. This is crucial when coordinating between multiple operations that depend on one another.
+
+## Additional Information
+
+- **Error Handling**: Icicle uses a specific error enumeration (`eIcicleError`) to handle and return error states across its APIs.
+- **Device Properties**: You can query various properties of devices to tailor operations according to the hardware capabilities.
diff --git a/docs/docs/icicle/programmers_guide/go.md b/docs/docs/icicle/programmers_guide/go.md
new file mode 100644
index 000000000..1cd6bec11
--- /dev/null
+++ b/docs/docs/icicle/programmers_guide/go.md
@@ -0,0 +1,296 @@
+# ICICLE Golang Usage Guide
+
+## Overview
+
+This guide covers the usage of Icicle's Golang API, including device management, memory operations, data transfer, synchronization, and compute APIs.
+
+## Device Management
+
+:::note
+See all ICICLE runtime APIs in [runtime.go](https://github.com/ingonyama-zk/icicle/blob/yshekel/V3/wrappers/golang/runtime/runtime.go)
+:::
+
+### Loading a Backend
+
+The backend can be loaded from a specific path or from an environment variable. This is essential for setting up the computing environment.
+
+```go
+import "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
+
+result := runtime.LoadBackendFromEnvOrDefault()
+// or load from custom install dir
+result := runtime.LoadBackend("/path/to/backend/installdir", true)
+```
+
+### Setting and Getting Active Device
+
+You can set the active device for the current thread and retrieve it when needed:
+
+```go
+device := runtime.CreateDevice("CUDA", 0) // or other
+result := runtime.SetDevice(device)
+// or query current (thread) device
+activeDevice := runtime.GetActiveDevice()
+```
+
+### Querying Device Information
+
+Retrieve the number of available devices and check if a pointer is allocated on the host or on the active device:
+
+```go
+numDevices := runtime.GetDeviceCount()
+
+var ptr unsafe.Pointer
+isHostMemory := runtime.IsHostMemory(ptr)
+isDeviceMemory := runtime.IsActiveDeviceMemory(ptr)
+```
+
+## Memory Management
+
+### Allocating and Freeing Memory
+
+Memory can be allocated and freed on the active device:
+
+```go
+ptr, err := runtime.Malloc(1024) // Allocate 1024 bytes
+err = runtime.Free(ptr) // Free the allocated memory
+```
+
+### Asynchronous Memory Operations
+
+You can perform memory allocation and deallocation asynchronously using streams:
+
+```go
+stream, err := runtime.CreateStream()
+
+ptr, err := runtime.MallocAsync(1024, stream)
+err = runtime.FreeAsync(ptr, stream)
+```
+
+### Querying Available Memory
+
+Retrieve the total and available memory on the active device:
+
+```go
+availableMemory, err := runtime.GetAvailableMemory()
+freeMemory := availableMemory.Free
+totalMemory := availableMemory.Total
+```
+
+### Setting Memory Values
+
+Set memory to a specific value on the active device, synchronously or asynchronously:
+
+```go
+err := runtime.Memset(ptr, 0, 1024) // Set 1024 bytes to 0
+err = runtime.MemsetAsync(ptr, 0, 1024, stream)
+```
+
+## Data Transfer
+
+### Explicit Data Transfers
+
+To avoid device-inference overhead, use explicit copy functions:
+
+```go
+result := runtime.CopyToHost(host_dst, device_src, size)
+result := runtime.CopyToHostAsync(host_dst, device_src, size, stream)
+result := runtime.CopyToDevice(device_dst, host_src, size)
+result := runtime.CopyToDeviceAsync(device_dst, host_src, size, stream)
+```
+
+## Stream Management
+
+### Creating and Destroying Streams
+
+Streams are used to manage asynchronous operations:
+
+```go
+stream, err := runtime.CreateStream()
+err = runtime.DestroyStream(stream)
+```
+
+## Synchronization
+
+### Synchronizing Streams and Devices
+
+Ensure all previous operations on a stream or device are completed before proceeding:
+
+```go
+err := runtime.StreamSynchronize(stream)
+err = runtime.DeviceSynchronize()
+```
+
+## Device Properties
+
+### Checking Device Availability
+
+Check if a device is available:
+
+```go
+dev := runtime.CreateDevice("CPU", 0)
+isCPUAvail := runtime.IsDeviceAvailable(dev)
+```
+
+### Querying Device Properties
+
+Retrieve properties of the active device:
+
+```go
+properties, err := runtime.GetDeviceProperties()
+
+/******************/
+// where DeviceProperties is
+type DeviceProperties struct {
+ UsingHostMemory bool // Indicates if the device uses host memory
+ NumMemoryRegions int32 // Number of memory regions available on the device
+ SupportsPinnedMemory bool // Indicates if the device supports pinned memory
+}
+```
+
+## Compute APIs
+
+### Multi-Scalar Multiplication (MSM) Example
+
+Icicle provides high-performance compute APIs such as the Multi-Scalar Multiplication (MSM) for cryptographic operations. Here's a simple example of how to use the MSM API.
+
+```go
+package main
+
+import (
+ "fmt"
+
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
+
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254"
+ bn254Msm "github.com/ingonyama-zk/icicle/v3/wrappers/golang/curves/bn254/msm"
+)
+
+func main() {
+
+ // Load installed backends
+ runtime.LoadBackendFromEnvOrDefault()
+
+ // trying to choose CUDA if available, or fallback to CPU otherwise (default device)
+ deviceCuda := runtime.CreateDevice("CUDA", 0) // GPU-0
+ if runtime.IsDeviceAvailable(&deviceCuda) {
+ runtime.SetDevice(&deviceCuda)
+ } // else we stay on CPU backend
+
+ // Setup inputs
+ const size = 1 << 18
+
+ // Generate random inputs
+ scalars := bn254.GenerateScalars(size)
+ points := bn254.GenerateAffinePoints(size)
+
+ // (optional) copy scalars to device memory explicitly
+ var scalarsDevice core.DeviceSlice
+ scalars.CopyToDevice(&scalarsDevice, true)
+
+ // MSM configuration
+ cfgBn254 := core.GetDefaultMSMConfig()
+
+ // allocate memory for the result
+ result := make(core.HostSlice[bn254.Projective], 1)
+
+ // execute bn254 MSM on device
+ err := bn254Msm.Msm(scalarsDevice, points, &cfgBn254, result)
+
+ // Check for errors
+ if err != runtime.Success {
+ errorString := fmt.Sprint(
+ "bn254 Msm failed: ", err)
+ panic(errorString)
+ }
+
+ // free explicitly allocated device memory
+ scalarsDevice.Free()
+}
+```
+
+### Polynomial Operations Example
+
+Here's another example demonstrating polynomial operations using Icicle:
+
+```go
+package main
+
+import (
+ "fmt"
+
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/core"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime"
+
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/fields/babybear"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/fields/babybear/ntt"
+ "github.com/ingonyama-zk/icicle/v3/wrappers/golang/fields/babybear/polynomial"
+)
+
+func initBabybearDomain() runtime.EIcicleError {
+ cfgInitDomain := core.GetDefaultNTTInitDomainConfig()
+ rouIcicle := babybear.ScalarField{}
+ rouIcicle.FromUint32(1461624142)
+ return ntt.InitDomain(rouIcicle, cfgInitDomain)
+}
+
+func init() {
+ // Load installed backends
+ runtime.LoadBackendFromEnvOrDefault()
+
+ // trying to choose CUDA if available, or fallback to CPU otherwise (default device)
+ deviceCuda := runtime.CreateDevice("CUDA", 0) // GPU-0
+ if runtime.IsDeviceAvailable(&deviceCuda) {
+ runtime.SetDevice(&deviceCuda)
+ } // else we stay on CPU backend
+
+ // build domain for ntt is required for some polynomial ops that rely on ntt
+ err := initBabybearDomain()
+ if err != runtime.Success {
+ errorString := fmt.Sprint(
+ "Babybear Domain initialization failed: ", err)
+ panic(errorString)
+ }
+}
+
+func main() {
+
+ // Setup inputs
+ const polySize = 1 << 10
+
+ // randomize two polynomials over babybear field
+ var fBabybear polynomial.DensePolynomial
+ defer fBabybear.Delete()
+ var gBabybear polynomial.DensePolynomial
+ defer gBabybear.Delete()
+ fBabybear.CreateFromCoeffecitients(babybear.GenerateScalars(polySize))
+ gBabybear.CreateFromCoeffecitients(babybear.GenerateScalars(polySize / 2))
+
+ // Perform polynomial multiplication
+ rBabybear := fBabybear.Multiply(&gBabybear) // Executes on the current device
+ defer rBabybear.Delete()
+ rDegree := rBabybear.Degree()
+
+ fmt.Println("f Degree: ", fBabybear.Degree())
+ fmt.Println("g Degree: ", gBabybear.Degree())
+ fmt.Println("r Degree: ", rDegree)
+}
+```
+
+In this example, the polynomial multiplication is used to perform polynomial multiplication on CUDA or CPU, showcasing the flexibility and power of Icicle's compute APIs.
+
+## Error Handling
+
+### Checking for Errors
+
+Icicle APIs return an `EIcicleError` enumeration value. Always check the returned value to ensure that operations were successful.
+
+```go
+if result != runtime.Success {
+ // Handle error
+}
+```
+
+This guide provides an overview of the essential APIs available in Icicle for Golang. The provided examples should help you get started with integrating Icicle into your high-performance computing projects.
diff --git a/docs/docs/icicle/programmers_guide/rust.md b/docs/docs/icicle/programmers_guide/rust.md
new file mode 100644
index 000000000..c3f89f514
--- /dev/null
+++ b/docs/docs/icicle/programmers_guide/rust.md
@@ -0,0 +1,247 @@
+
+# Icicle Rust Usage Guide
+
+## Overview
+
+This guide covers the usage of Icicle’s Rust API, including device management, memory operations, data transfer, synchronization, and compute APIs.
+
+## Build the Rust Application and Execute
+
+To successfully build and execute the Rust application using ICICLE, you need to define the ICICLE dependencies in your Cargo.toml file:
+
+```toml
+[dependencies]
+icicle-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", branch="main" }
+icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", branch="main" }
+icicle-babybear = { git = "https://github.com/ingonyama-zk/icicle.git", branch="main" }
+# add other ICICLE crates here as needed
+```
+
+Once the dependencies are defined, you can build and run your application using the following command:
+```bash
+cargo run --release
+```
+
+This will compile your Rust application with optimizations and execute it.
+
+:::note
+The icicle-runtime crate is used to load backends, select a device, and interact with the device in an abstract way when managing memory, streams, and other resources, as explained in this guide.
+:::
+
+## Device Management
+
+### Loading a Backend
+
+The backend can be loaded from a specific path or from an environment variable. This is essential for setting up the computing environment.
+
+```rust
+use icicle_runtime::runtime;
+
+runtime::load_backend_from_env_or_default().unwrap();
+// or load from custom install dir
+runtime::load_backend("/path/to/backend/installdir").unwrap();
+```
+
+### Setting and Getting Active Device
+
+You can set the active device for the current thread and retrieve it when needed:
+
+```rust
+use icicle_runtime::Device;
+
+let device = Device::new("CUDA", 0); // or other
+icicle_runtime::set_device(&device).unwrap();
+
+let active_device = icicle_runtime::get_active_device().unwrap();
+```
+
+### Querying Device Information
+
+Retrieve the number of available devices and check if a pointer is allocated on the host or on the active device:
+
+```rust
+let device_count = icicle_runtime::get_device_count().unwrap();
+```
+
+## Memory Management
+
+### Allocating and Freeing Memory
+
+Memory can be allocated on the active device using the `DeviceVec` API. This memory allocation is flexible, as it supports allocation on any device, including the CPU if the CPU backend is used.
+
+```rust
+use icicle_runtime::memory::DeviceVec;
+
+// Allocate 1024 elements on the device
+let mut device_memory: DeviceVec<u8> = DeviceVec::<u8>::device_malloc(1024).unwrap();
+```
+
+The memory is released when the `DeviceVec` object is dropped.
+
+### Asynchronous Memory Operations
+
+Asynchronous memory operations can be performed using streams. This allows for non-blocking execution, with memory allocation and deallocation occurring asynchronously.
+```rust
+use icicle_runtime::stream::IcicleStream;
+use icicle_runtime::memory::DeviceVec;
+
+let mut stream = IcicleStream::create().unwrap(); // mutability is for the destroy() method
+
+// Allocate 1024 elements asynchronously on the device
+let mut device_memory: DeviceVec<u8> = DeviceVec::<u8>::device_malloc_async(1024, &stream).unwrap();
+
+// dispatch additional copy, compute etc. ops to the stream
+
+// Synchronize the stream to ensure all operations are complete
+stream.synchronize().unwrap();
+stream.destroy().unwrap();
+```
+
+:::note
+Streams need to be explicitly destroyed before being dropped.
+:::
+
+### Querying Available Memory
+
+You can retrieve the total and available memory on the active device using the `get_available_memory` function.
+
+```rust
+use icicle_runtime::memory::get_available_memory;
+
+// Retrieve total and available memory on the active device
+let (total_memory, available_memory) = get_available_memory().unwrap();
+
+println!("Total memory: {}", total_memory);
+println!("Available memory: {}", available_memory);
+```
+
+This function returns a tuple containing the total memory and the currently available memory on the device. It is essential for managing and optimizing resource usage in your applications.
+
+## Data Transfer
+
+### Copying Data
+
+Data can be copied between the host and device, or between devices. The location of the memory is handled by the `HostOrDeviceSlice` and `DeviceSlice` traits:
+
+```rust
+use icicle_runtime::memory::{DeviceVec, HostSlice};
+
+// Copy data from host to device
+let input = vec![1, 2, 3, 4];
+let mut d_mem = DeviceVec::<i32>::device_malloc(input.len()).unwrap();
+d_mem.copy_from_host(HostSlice::from_slice(&input)).unwrap();
+// OR
+d_mem.copy_from_host_async(HostSlice::from_slice(&input), &stream).unwrap();
+
+// Copy data back from device to host
+let mut output = vec![0; input.len()];
+d_mem.copy_to_host(HostSlice::from_mut_slice(&mut output)).unwrap();
+// OR
+d_mem.copy_to_host_async(HostSlice::from_mut_slice(&mut output), &stream).unwrap();
+```
+## Stream Management
+
+### Creating and Destroying Streams
+
+Streams in Icicle are used to manage asynchronous operations, ensuring that computations can run in parallel without blocking the CPU thread:
+
+```rust
+use icicle_runtime::stream::IcicleStream;
+
+// Create a stream
+let mut stream = IcicleStream::create().unwrap();
+
+// Destroy the stream
+stream.destroy().unwrap();
+```
+
+## Synchronization
+
+### Synchronizing Streams and Devices
+
+Synchronization ensures that all previous operations on a stream or device are completed before moving on to the next task. This is crucial when coordinating between multiple dependent operations:
+
+```rust
+use icicle_runtime::stream::IcicleStream;
+
+// Synchronize the stream
+stream.synchronize().unwrap();
+
+// Synchronize the device
+icicle_runtime::device_synchronize().unwrap();
+```
+These functions ensure that your operations are properly ordered and completed before the program proceeds, which is critical in parallel computing environments.
+
+## Device Properties
+
+### Checking Device Availability
+
+Check if a specific device is available and retrieve a list of registered devices:
+```rust
+use icicle_runtime::Device;
+
+let cuda_device = Device::new("CUDA", 0);
+if icicle_runtime::is_device_available(&cuda_device) {
+ println!("CUDA device is available.");
+} else {
+ println!("CUDA device is not available.");
+}
+
+let registered_devices = icicle_runtime::get_registered_devices().unwrap();
+println!("Registered devices: {:?}", registered_devices);
+```
+
+### Querying Device Properties
+
+Retrieve properties of the active device to understand its capabilities and configurations:
+
+```rust
+use icicle_runtime::Device;
+
+let cuda_device = Device::new("CUDA", 0);
+if icicle_runtime::is_device_available(&cuda_device) {
+ icicle_runtime::set_device(&cuda_device);
+ let device_props = icicle_runtime::get_device_properties().unwrap();
+ println!("Device using host memory: {}", device_props.using_host_memory);
+}
+```
+
+These functions allow you to query device capabilities and ensure that your application is running on the appropriate hardware.
+
+## Compute APIs
+
+### Multi-Scalar Multiplication (MSM) Example
+
+Icicle provides high-performance compute APIs such as Multi-Scalar Multiplication (MSM) for cryptographic operations. Here's a simple example of how to use the MSM API in Rust.
+
+```rust
+// Using bls12-377 curve
+use icicle_bls12_377::curve::{CurveCfg, G1Projective, ScalarCfg};
+use icicle_core::{curve::Curve, msm, msm::MSMConfig, traits::GenerateRandom};
+use icicle_runtime::{device::Device, memory::HostSlice};
+
+fn main() {
+ // Load backend and set device
+ let _ = icicle_runtime::runtime::load_backend_from_env_or_default();
+ let cuda_device = Device::new("CUDA", 0);
+ if icicle_runtime::is_device_available(&cuda_device) {
+ icicle_runtime::set_device(&cuda_device).unwrap();
+ }
+
+ let size = 1024;
+
+ // Randomize inputs
+ let points = CurveCfg::generate_random_affine_points(size);
+ let scalars = ScalarCfg::generate_random(size);
+
+ let mut msm_results = vec![G1Projective::zero(); 1];
+ msm::msm(
+ HostSlice::from_slice(&scalars),
+ HostSlice::from_slice(&points),
+ &MSMConfig::default(),
+ HostSlice::from_mut_slice(&mut msm_results[..]),
+ )
+ .unwrap();
+ println!("MSM result = {:?}", msm_results);
+}
+```
diff --git a/docs/docs/icicle/rust-bindings.md b/docs/docs/icicle/rust-bindings.md
index f3e6142f6..bbfb16d9c 100644
--- a/docs/docs/icicle/rust-bindings.md
+++ b/docs/docs/icicle/rust-bindings.md
@@ -4,9 +4,9 @@ Rust bindings allow you to use ICICLE as a rust library.
`icicle-core` defines all interfaces, macros and common methods.
-`icicle-cuda-runtime` defines DeviceContext which can be used to manage a specific GPU as well as wrapping common CUDA methods.
+`icicle-runtime` contains runtime APIs for memory management, stream management and more.
-`icicle-curves` implements all interfaces and macros from icicle-core for each curve. For example icicle-bn254 implements curve bn254. Each curve has its own build script which will build the CUDA libraries for that curve as part of the rust-toolchain build.
+`icicle-curves` / `icicle-fields` implement all interfaces and macros from icicle-core for each curve. For example icicle-bn254 implements curve bn254. Each curve has its own build script which will build the ICICLE libraries for that curve as part of the rust-toolchain build.
## Using ICICLE Rust bindings in your project
@@ -14,12 +14,12 @@ Simply add the following to your `Cargo.toml`.
```toml
# GPU Icicle integration
-icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git" }
+icicle-runtime = { git = "https://github.com/ingonyama-zk/icicle.git" }
icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git" }
icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git" }
```
-`icicle-bn254` being the curve you wish to use and `icicle-core` and `icicle-cuda-runtime` contain ICICLE utilities and CUDA wrappers.
+`icicle-bn254` being the curve you wish to use, while `icicle-core` and `icicle-runtime` contain ICICLE utilities and runtime APIs.
If you wish to point to a specific ICICLE branch add `branch = ""` or `tag = ""` to the ICICLE dependency. For a specific commit add `rev = ""`.
@@ -27,61 +27,38 @@ When you build your project ICICLE will be built as part of the build command.
## How do the rust bindings work?
-The rust bindings are just rust wrappers for ICICLE Core static libraries which can be compiled. We integrate the compilation of the static libraries into rusts toolchain to make usage seamless and easy. This is achieved by [extending rusts build command](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/build.rs).
+The rust bindings are rust crates that wrap the ICICLE Core libraries (C++). Each crate can wrap one or more ICICLE core libraries, which are built as part of the crate's build.
-```rust
-use cmake::Config;
-use std::env::var;
-
-fn main() {
- println!("cargo:rerun-if-env-changed=CXXFLAGS");
- println!("cargo:rerun-if-changed=../../../../icicle");
-
- let cargo_dir = var("CARGO_MANIFEST_DIR").unwrap();
- let profile = var("PROFILE").unwrap();
-
- let out_dir = Config::new("../../../../icicle")
- .define("BUILD_TESTS", "OFF") //TODO: feature
- .define("CURVE", "bn254")
- .define("CMAKE_BUILD_TYPE", "Release")
- .build_target("icicle")
- .build();
-
- println!("cargo:rustc-link-search={}/build", out_dir.display());
-
- println!("cargo:rustc-link-lib=ingo_bn254");
- println!("cargo:rustc-link-lib=stdc++");
- // println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
- println!("cargo:rustc-link-lib=cudart");
-}
-```
+:::note
+Since ICICLE V3, core libraries are shared libraries. This means they must be installed in a directory the linker can find. In addition, an application that depends on ICICLE must ensure ICICLE is installed on the target machine.
+:::
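+
+In Rust this typically means loading a backend once at startup, before any compute call; a minimal sketch, using the loader name as it appears in the ICICLE examples:
+
+```rust
+fn main() {
+    // Load an installed backend (e.g. CUDA) from its install directory,
+    // falling back to the built-in CPU backend if none is found.
+    let _ = icicle_runtime::runtime::load_backend_from_env_or_default();
+}
+```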
## Supported curves, fields and operations
### Supported curves and operations
| Operation\Curve | bn254 | bls12_377 | bls12_381 | bw6-761 | grumpkin |
-| --- | :---: | :---: | :---: | :---: | :---: |
-| MSM | ✅ | ✅ | ✅ | ✅ | ✅ |
-| G2 | ✅ | ✅ | ✅ | ✅ | ❌ |
-| NTT | ✅ | ✅ | ✅ | ✅ | ❌ |
-| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
-| VecOps | ✅ | ✅ | ✅ | ✅ | ✅ |
-| Polynomials | ✅ | ✅ | ✅ | ✅ | ❌ |
-| Poseidon | ✅ | ✅ | ✅ | ✅ | ✅ |
-| Merkle Tree | ✅ | ✅ | ✅ | ✅ | ✅ |
+| --------------- | :---: | :-------: | :-------: | :-----: | :------: |
+| MSM | ✅ | ✅ | ✅ | ✅ | ✅ |
+| G2 | ✅ | ✅ | ✅ | ✅ | ❌ |
+| NTT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| VecOps | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Polynomials | ✅ | ✅ | ✅ | ✅ | ❌ |
+| Poseidon | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Merkle Tree | ✅ | ✅ | ✅ | ✅ | ✅ |
### Supported fields and operations
| Operation\Field | babybear | stark252 |
-| --- | :---: | :---: |
-| VecOps | ✅ | ✅ |
-| Polynomials | ✅ | ✅ |
-| NTT | ✅ | ✅ |
-| Extension Field | ✅ | ❌ |
+| --------------- | :------: | :------: |
+| VecOps | ✅ | ✅ |
+| Polynomials | ✅ | ✅ |
+| NTT | ✅ | ✅ |
+| Extension Field | ✅ | ❌ |
### Supported hashes
-| Hash | Sizes |
-| --- | :---: |
+| Hash | Sizes |
+| ------ | :------: |
| Keccak | 256, 512 |
diff --git a/docs/docs/icicle/rust-bindings/ecntt.md b/docs/docs/icicle/rust-bindings/ecntt.md
index c2f790aef..426324a93 100644
--- a/docs/docs/icicle/rust-bindings/ecntt.md
+++ b/docs/docs/icicle/rust-bindings/ecntt.md
@@ -10,13 +10,7 @@ pub fn ecntt(
dir: NTTDir,
cfg: &NTTConfig<C::ScalarField>,
output: &mut (impl HostOrDeviceSlice<Projective<C>> + ?Sized),
-) -> IcicleResult<()>
-where
-    C::ScalarField: FieldImpl,
-    <C::ScalarField as FieldImpl>::Config: ECNTT<C>,
-{
-    // ... function implementation ...
-}
+) -> Result<(), eIcicleError>
```
## Parameters
@@ -28,4 +22,4 @@ where
## Return Value
-- **`IcicleResult<()>`**: This function returns an `IcicleResult` which is a wrapper type that indicates success or failure of the NTT computation. On success, it contains `Ok(())`.
+- **`Result<(), eIcicleError>`**: The function returns a `Result` indicating success or failure of the ECNTT computation. On success it contains `Ok(())`; on failure, an `eIcicleError` describing the problem.
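+
+For illustration, a hedged sketch of a forward ECNTT over host memory (import paths and `generate_random_projective_points` are assumptions based on the other binding docs; the NTT domain must be constructed first, as in the NTT bindings):
+
+```rust
+use icicle_bls12_377::curve::{CurveCfg, G1Projective, ScalarField};
+use icicle_core::{curve::Curve, ecntt::ecntt, ntt};
+use icicle_runtime::memory::HostSlice;
+
+fn ecntt_example() {
+    let size = 1 << 10;
+
+    // The NTT domain must be initialized before any (EC)NTT call.
+    ntt::initialize_domain(
+        ntt::get_root_of_unity::<ScalarField>(size as u64),
+        &ntt::NTTInitDomainConfig::default(),
+    )
+    .unwrap();
+
+    let points = CurveCfg::generate_random_projective_points(size);
+    let mut results = vec![G1Projective::zero(); size];
+    ecntt(
+        HostSlice::from_slice(&points),
+        ntt::NTTDir::kForward,
+        &ntt::NTTConfig::<ScalarField>::default(),
+        HostSlice::from_mut_slice(&mut results),
+    )
+    .unwrap();
+}
+```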
diff --git a/docs/docs/icicle/rust-bindings/keccak.md b/docs/docs/icicle/rust-bindings/keccak.md
index 9c8b231c7..637ad3e63 100644
--- a/docs/docs/icicle/rust-bindings/keccak.md
+++ b/docs/docs/icicle/rust-bindings/keccak.md
@@ -1,5 +1,7 @@
# Keccak
+TODO update for V3
+
## Keccak Example
```rust
diff --git a/docs/docs/icicle/rust-bindings/msm-pre-computation.md b/docs/docs/icicle/rust-bindings/msm-pre-computation.md
deleted file mode 100644
index 687704fb2..000000000
--- a/docs/docs/icicle/rust-bindings/msm-pre-computation.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# MSM Pre computation
-
-To understand the theory behind MSM pre computation technique refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
-
-## `precompute_points`
-
-Precomputes bases for the multi-scalar multiplication (MSM) by extending each base point with its multiples, facilitating more efficient MSM calculations.
-
-```rust
-pub fn precompute_points<C: Curve + MSM<C>>(
-    points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
-    msm_size: i32,
-    cfg: &MSMConfig,
-    output_bases: &mut DeviceSlice<Affine<C>>,
-) -> IcicleResult<()>
-```
-
-### Parameters
-
-- **`points`**: The original set of affine points (\(P_1, P_2, ..., P_n\)) to be used in the MSM. For batch MSM operations, this should include all unique points concatenated together.
-- **`msm_size`**: The size of a single msm in order to determine optimal parameters.
-- **`cfg`**: The MSM configuration parameters.
-- **`output_bases`**: The output buffer for the extended bases. Its size must be `points.len() * precompute_factor`. This buffer should be allocated on the device for GPU computations.
-
-#### Returns
-
-`Ok(())` if the operation is successful, or an `IcicleResult` error otherwise.
-
-#### Description
-
-This function extends each provided base point $(P)$ with its multiples $(2^lP, 2^{2l}P, ..., 2^{(precompute_factor - 1) \cdot l}P)$, where $(l)$ is a level of precomputation determined by the `precompute_factor`. The extended set of points facilitates faster MSM computations by allowing the MSM algorithm to leverage precomputed multiples of base points, reducing the number of point additions required during the computation.
-
-The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.
-
-#### Example Usage
-
-```rust
-let cfg = MSMConfig::default();
-let precompute_factor = 4; // Number of points to precompute
-let mut extended_bases = HostOrDeviceSlice::cuda_malloc(expected_size).expect("Failed to allocate memory for extended bases");
-
-// Precompute the bases using the specified factor
-precompute_points(&points, msm_size, &cfg, &mut extended_bases)
- .expect("Failed to precompute bases");
-```
diff --git a/docs/docs/icicle/rust-bindings/msm.md b/docs/docs/icicle/rust-bindings/msm.md
index 8cfccf053..e4f86444f 100644
--- a/docs/docs/icicle/rust-bindings/msm.md
+++ b/docs/docs/icicle/rust-bindings/msm.md
@@ -1,54 +1,14 @@
# MSM
-## Example
-
-```rust
-use icicle_bn254::curve::{CurveCfg, G1Projective, ScalarCfg};
-use icicle_core::{curve::Curve, msm, traits::GenerateRandom};
-use icicle_cuda_runtime::{memory::HostOrDeviceSlice, stream::CudaStream};
-
-fn main() {
- let size: usize = 1 << 10; // Define the number of points and scalars
-
- // Generate random points and scalars
- println!("Generating random G1 points and scalars for BN254...");
- let points = CurveCfg::generate_random_affine_points(size);
- let scalars = ScalarCfg::generate_random(size);
-
- // Wrap points and scalars in HostOrDeviceSlice for MSM
- let points_host = HostOrDeviceSlice::Host(points);
- let scalars_host = HostOrDeviceSlice::Host(scalars);
-
- // Allocate memory on the CUDA device for MSM results
- let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).expect("Failed to allocate CUDA memory for MSM results");
-
- // Create a CUDA stream for asynchronous execution
- let stream = CudaStream::create().expect("Failed to create CUDA stream");
- let mut cfg = msm::MSMConfig::default();
- cfg.ctx.stream = &stream;
- cfg.is_async = true; // Enable asynchronous execution
-
- // Execute MSM on the device
- println!("Executing MSM on device...");
- msm::msm(&scalars_host, &points_host, &cfg, &mut msm_results).expect("Failed to execute MSM");
-
- // Synchronize CUDA stream to ensure MSM execution is complete
- stream.synchronize().expect("Failed to synchronize CUDA stream");
-
- // Optionally, move results to host for further processing or printing
- println!("MSM execution complete.");
-}
-```
-
## MSM API Overview
```rust
-pub fn msm<C: Curve>(
-    scalars: &HostOrDeviceSlice<C::ScalarField>,
-    points: &HostOrDeviceSlice<Affine<C>>,
+pub fn msm<C: Curve + MSM<C>>(
+    scalars: &(impl HostOrDeviceSlice<C::ScalarField> + ?Sized),
+    bases: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
cfg: &MSMConfig,
-    results: &mut HostOrDeviceSlice<Projective<C>>,
-) -> IcicleResult<()>
+    results: &mut (impl HostOrDeviceSlice<Projective<C>> + ?Sized),
+) -> Result<(), eIcicleError>;
```
### Parameters
@@ -61,36 +21,33 @@ pub fn msm(
### MSM Config
```rust
-pub struct MSMConfig<'a> {
- pub ctx: DeviceContext<'a>,
- points_size: i32,
+pub struct MSMConfig {
+ pub stream_handle: IcicleStreamHandle,
pub precompute_factor: i32,
pub c: i32,
- pub bitsize: i32,
- pub large_bucket_factor: i32,
+ pub bitsize: i32,
batch_size: i32,
+ are_points_shared_in_batch: bool,
are_scalars_on_device: bool,
pub are_scalars_montgomery_form: bool,
are_points_on_device: bool,
pub are_points_montgomery_form: bool,
- are_results_on_device: bool,
- pub is_big_triangle: bool,
+ are_results_on_device: bool,
pub is_async: bool,
+ pub ext: ConfigExtension,
}
```
-- **`ctx: DeviceContext`**: Specifies the device context, device id and the CUDA stream for asynchronous execution.
-- **`point_size: i32`**:
+- **`stream_handle: IcicleStreamHandle`**: Specifies a stream for asynchronous execution.
- **`precompute_factor: i32`**: Determines the number of extra points to pre-compute for each point, affecting memory footprint and performance.
- **`c: i32`**: The "window bitsize," a parameter controlling the computational complexity and memory footprint of the MSM operation.
- **`bitsize: i32`**: The number of bits of the largest scalar, typically equal to the bit size of the scalar field.
-- **`large_bucket_factor: i32`**: Adjusts the algorithm's sensitivity to frequently occurring buckets, useful for non-uniform scalar distributions.
- **`batch_size: i32`**: The number of MSMs to compute in a single batch, for leveraging parallelism.
- **`are_scalars_montgomery_form`**: Set to `true` if scalars are in montgomery form.
- **`are_points_montgomery_form`**: Set to `true` if points are in montgomery form.
- **`are_scalars_on_device: bool`**, **`are_points_on_device: bool`**, **`are_results_on_device: bool`**: Indicate whether the corresponding buffers are on the device memory.
-- **`is_big_triangle`**: If `true` MSM will run in Large triangle accumulation if `false` Bucket accumulation will be chosen. Default value: false.
- **`is_async: bool`**: Whether to perform the MSM operation asynchronously.
+- **`ext: ConfigExtension`**: Extended, backend-specific configuration (see the sketch below).
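+
+A sketch of tuning a few of these fields (values are illustrative only):
+
+```rust
+use icicle_bls12_377::curve::{CurveCfg, G1Projective, ScalarCfg};
+use icicle_core::{curve::Curve, msm, msm::MSMConfig, traits::GenerateRandom};
+use icicle_runtime::memory::HostSlice;
+
+fn msm_with_custom_config() {
+    let size = 1024;
+    let points = CurveCfg::generate_random_affine_points(size);
+    let scalars = ScalarCfg::generate_random(size);
+    let mut results = vec![G1Projective::zero(); 1];
+
+    let mut cfg = MSMConfig::default();
+    cfg.c = 16;           // fix the window bitsize instead of letting the backend choose
+    cfg.is_async = false; // blocking call; no manual synchronization needed
+
+    msm::msm(
+        HostSlice::from_slice(&scalars),
+        HostSlice::from_slice(&points),
+        &cfg,
+        HostSlice::from_mut_slice(&mut results[..]),
+    )
+    .unwrap();
+}
+```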
### Usage
@@ -98,73 +55,63 @@ The `msm` function is designed to compute the sum of multiple scalar-point multi
When performing MSM operations, it's crucial to match the size of the `scalars` and `points` arrays correctly and ensure that the `results` buffer is appropriately sized to hold the output. The `MSMConfig` should be set up to reflect the specifics of the operation, including whether the operation should be asynchronous and any device-specific settings.
-## How do I toggle between the supported algorithms?
-
-When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle reduction and `is_big_triangle=false` will activate iterative reduction.
+## Example
```rust
-...
+// Using bls12-377 curve
+use icicle_bls12_377::curve::{CurveCfg, G1Projective, ScalarCfg};
+use icicle_core::{curve::Curve, msm, msm::MSMConfig, traits::GenerateRandom};
+use icicle_runtime::{device::Device, memory::HostSlice};
+
+fn main() {
+ // Load backend and set device ...
-let mut cfg_bls12377 = msm::get_default_msm_config::<CurveCfg>();
+ // Randomize inputs
+ let size = 1024;
+ let points = CurveCfg::generate_random_affine_points(size);
+ let scalars = ScalarCfg::generate_random(size);
-// is_big_triangle will determine which algorithm to use
-cfg_bls12377.is_big_triangle = true;
+ let mut msm_results = vec![G1Projective::zero(); 1];
+ msm::msm(
+ HostSlice::from_slice(&scalars),
+ HostSlice::from_slice(&points),
+ &MSMConfig::default(),
+ HostSlice::from_mut_slice(&mut msm_results[..]),
+ )
+ .unwrap();
+ println!("MSM result = {:?}", msm_results);
+}
-msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
-...
```
-You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).
+## Batched MSM
-## How do I toggle between MSM modes?
+For batched MSM, allocate the results array with length equal to the batch size, and use the `are_points_shared_in_batch` flag in the config struct to indicate whether all MSMs in the batch reuse the same points; a sketch follows below.
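+
+A hedged sketch (assuming the flag is settable as described above):
+
+```rust
+use icicle_bls12_377::curve::{CurveCfg, G1Projective, ScalarCfg};
+use icicle_core::{curve::Curve, msm, msm::MSMConfig, traits::GenerateRandom};
+use icicle_runtime::memory::HostSlice;
+
+fn batched_msm() {
+    let batch_size = 4;
+    let msm_size = 1024;
+
+    // One shared set of bases; the scalar vectors of the batch are concatenated.
+    let points = CurveCfg::generate_random_affine_points(msm_size);
+    let scalars = ScalarCfg::generate_random(batch_size * msm_size);
+
+    let mut cfg = MSMConfig::default();
+    cfg.are_points_shared_in_batch = true; // every MSM in the batch reuses `points`
+
+    // The batch size is derived from the length of the results buffer.
+    let mut results = vec![G1Projective::zero(); batch_size];
+    msm::msm(
+        HostSlice::from_slice(&scalars),
+        HostSlice::from_slice(&points),
+        &cfg,
+        HostSlice::from_mut_slice(&mut results[..]),
+    )
+    .unwrap();
+}
+```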
-Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
+## Precomputation
-```rust
-...
-
-let mut msm_result: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
-msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();
+Precomputes bases for the multi-scalar multiplication (MSM) by extending each base point with its multiples, facilitating more efficient MSM calculations.
-...
+```rust
+/// Returns `Ok(())` if no errors occurred or a `eIcicleError` otherwise.
+pub fn precompute_bases<C: Curve + MSM<C>>(
+    points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
+    config: &MSMConfig,
+    output_bases: &mut DeviceSlice<Affine<C>>,
+) -> Result<(), eIcicleError>;
```
-In the example above we allocate a single expected result which the MSM method will interpret as `batch_size=1` and run a single MSM.
-
-In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.
-
-```rust
-...
+### Parameters
-let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(10).unwrap();
-msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
+- **`points`**: The original set of affine points (\(P_1, P_2, ..., P_n\)) to be used in the MSM. For batch MSM operations, this should include all unique points concatenated together.
+- **`config`**: The MSM configuration parameters, including the `precompute_factor` to be used.
+- **`output_bases`**: The output buffer for the extended bases. Its size must be `points.len() * precompute_factor`. This buffer should be allocated on the device for GPU computations.
-...
-```
+#### Returns
-Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).
+`Ok(())` if the operation is successful, or an `eIcicleError` error otherwise.
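+
+A usage sketch (sizes are illustrative; note that the same `precompute_factor` is set in the config passed to both `precompute_bases` and `msm`):
+
+```rust
+use icicle_bls12_377::curve::{CurveCfg, G1Affine, G1Projective, ScalarCfg};
+use icicle_core::{curve::Curve, msm, msm::MSMConfig, traits::GenerateRandom};
+use icicle_runtime::memory::{DeviceVec, HostSlice};
+
+fn msm_with_precomputed_bases() {
+    let size = 1024;
+    let precompute_factor = 4;
+    let points = CurveCfg::generate_random_affine_points(size);
+    let scalars = ScalarCfg::generate_random(size);
+
+    let mut cfg = MSMConfig::default();
+    cfg.precompute_factor = precompute_factor;
+
+    // The output buffer lives on the device and is `precompute_factor` times larger.
+    let mut precomputed =
+        DeviceVec::<G1Affine>::device_malloc(size * precompute_factor as usize).unwrap();
+    msm::precompute_bases(HostSlice::from_slice(&points), &cfg, &mut precomputed[..]).unwrap();
+
+    // Pass the precomputed bases, with the same config, to the MSM itself.
+    let mut results = vec![G1Projective::zero(); 1];
+    msm::msm(
+        HostSlice::from_slice(&scalars),
+        &precomputed[..],
+        &cfg,
+        HostSlice::from_mut_slice(&mut results[..]),
+    )
+    .unwrap();
+}
+```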
## Parameters for optimal performance
Please refer to the [primitive description](../primitives/msm#choosing-optimal-parameters)
-
-## Support for G2 group
-
-MSM also supports G2 group.
-
-Using MSM in G2 requires a G2 config, and of course your Points should also be G2 Points.
-
-```rust
-...
-
-let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
-let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
-let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
-let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
-
-msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
-
-...
-```
-
-Here you can [find an example](https://github.com/ingonyama-zk/icicle/blob/5a96f9937d0a7176d88c766bd3ef2062b0c26c37/examples/rust/msm/src/main.rs#L114) of MSM on G2 Points.
diff --git a/docs/docs/icicle/rust-bindings/multi-gpu.md b/docs/docs/icicle/rust-bindings/multi-gpu.md
index 8e36a5396..29cee1aba 100644
--- a/docs/docs/icicle/rust-bindings/multi-gpu.md
+++ b/docs/docs/icicle/rust-bindings/multi-gpu.md
@@ -1,5 +1,7 @@
# Multi GPU APIs
+TODO update for V3
+
To learn more about the theory of Multi GPU programming refer to [this part](../multi-gpu.md) of documentation.
Here we will cover the core multi GPU apis and a [example](#a-multi-gpu-example)
diff --git a/docs/docs/icicle/rust-bindings/ntt.md b/docs/docs/icicle/rust-bindings/ntt.md
index 9d0b8f62f..06758972e 100644
--- a/docs/docs/icicle/rust-bindings/ntt.md
+++ b/docs/docs/icicle/rust-bindings/ntt.md
@@ -1,61 +1,21 @@
# NTT
-## Example
-
-```rust
-use icicle_bn254::curve::{ScalarCfg, ScalarField};
-use icicle_core::{ntt::{self, NTT}, traits::GenerateRandom};
-use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice, stream::CudaStream};
-
-fn main() {
- let size = 1 << 12; // Define the size of your input, e.g., 2^10
-
- let icicle_omega = ::get_root_of_unity(
- size.try_into()
- .unwrap(),
- )
-
- // Generate random inputs
- println!("Generating random inputs...");
- let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));
-
- // Allocate memory on CUDA device for NTT results
- let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).expect("Failed to allocate CUDA memory");
-
- // Create a CUDA stream
- let stream = CudaStream::create().expect("Failed to create CUDA stream");
- let ctx = DeviceContext::default(); // Assuming default device context
- ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();
-
- // Configure NTT
- let mut cfg = ntt::NTTConfig::default();
- cfg.ctx.stream = &stream;
- cfg.is_async = true; // Set to true for asynchronous execution
-
- // Execute NTT on device
- println!("Executing NTT on device...");
- ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).expect("Failed to execute NTT");
-
- // Synchronize CUDA stream to ensure completion
- stream.synchronize().expect("Failed to synchronize CUDA stream");
-
- // Optionally, move results to host for further processing or verification
- println!("NTT execution complete.");
-}
-```
-
## NTT API overview
```rust
-pub fn ntt<T, F>(
-    input: &HostOrDeviceSlice<T>,
+pub fn ntt<T, F>(
+    input: &(impl HostOrDeviceSlice<T> + ?Sized),
dir: NTTDir,
cfg: &NTTConfig<F>,
-    output: &mut HostOrDeviceSlice<T>,
-) -> IcicleResult<()>
-```
+    output: &mut (impl HostOrDeviceSlice<T> + ?Sized),
+) -> Result<(), eIcicleError>;
-`ntt:ntt` expects:
+pub fn ntt_inplace<T, F>(
+    inout: &mut (impl HostOrDeviceSlice<T> + ?Sized),
+    dir: NTTDir,
+    cfg: &NTTConfig<F>,
+) -> Result<(), eIcicleError>
+```
- **`input`** - buffer to read the inputs of the NTT from.
- **`dir`** - whether to compute forward or inverse NTT.
@@ -67,16 +27,16 @@ The `input` and `output` buffers can be on device or on host. Being on host mean
### NTT Config
```rust
-pub struct NTTConfig<'a, S> {
- pub ctx: DeviceContext<'a>,
+pub struct NTTConfig<S> {
+ pub stream_handle: IcicleStreamHandle,
pub coset_gen: S,
pub batch_size: i32,
pub columns_batch: bool,
pub ordering: Ordering,
- are_inputs_on_device: bool,
- are_outputs_on_device: bool,
+ pub are_inputs_on_device: bool,
+ pub are_outputs_on_device: bool,
pub is_async: bool,
- pub ntt_algorithm: NttAlgorithm,
+ pub ext: ConfigExtension,
}
```
@@ -84,7 +44,7 @@ The `NTTConfig` struct is a configuration object used to specify parameters for
#### Fields
-- **`ctx: DeviceContext<'a>`**: Specifies the device context, including the device ID and the stream ID.
+- **`stream_handle: IcicleStreamHandle`**: Specifies the stream (queue) to use for async execution.
- **`coset_gen: S`**: Defines the coset generator used for coset (i)NTTs. By default, this is set to `S::one()`, indicating that no coset is being used.
@@ -99,102 +59,48 @@ The `NTTConfig` struct is a configuration object used to specify parameters for
- **`are_outputs_on_device: bool`**: Indicates whether the output data is preloaded in device memory. If `false` outputs will be copied from device to host. If the inputs and outputs are the same pointer NTT will be computed in place.
- **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. When set to `true`, the NTT function will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity and correctness.
+- **`ext: ConfigExtension`**: Extended, backend-specific configuration.
-- **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
-`Auto` will select `Radix 2` or `Mixed Radix` algorithm based on heuristics.
-`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure that you want to
-
-#### Usage
-
-Example initialization with default settings:
-
-```rust
-let default_config = NTTConfig::default();
-```
-
-Customizing the configuration:
-
-```rust
-let custom_config = NTTConfig {
- ctx: custom_device_context,
- coset_gen: my_coset_generator,
- batch_size: 10,
- columns_batch: false,
- ordering: Ordering::kRN,
- are_inputs_on_device: true,
- are_outputs_on_device: true,
- is_async: false,
- ntt_algorithm: NttAlgorithm::MixedRadix,
-};
-```
-
-### Modes
-
-NTT supports two different modes `Batch NTT` and `Single NTT`
-
-You may toggle between single and batch NTT by simply configure `batch_size` to be larger then 1 in your `NTTConfig`.
+#### Example
```rust
-let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
-cfg.batch_size = 10 // your ntt using this config will run in batch mode.
-```
-
-`batch_size=1` would keep our NTT in single NTT mode.
-
-Deciding weather to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.
-
-### Initializing the NTT Domain
-
-Before performing NTT operations, its necessary to initialize the NTT domain, It only needs to be called once per GPU since the twiddles are cached.
-
-```rust
-ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();
-```
-
-### `initialize_domain`
-
-```rust
-pub fn initialize_domain<F>(primitive_root: F, ctx: &DeviceContext, fast_twiddles: bool) -> IcicleResult<()>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: NTT<F>;
+// Setting Bn254 points and scalars
+println!("Generating random inputs on host for bn254...");
+let scalars = Bn254ScalarCfg::generate_random(size);
+let mut ntt_results = DeviceVec::<Bn254ScalarField>::device_malloc(size).unwrap();
+
+// constructing NTT domain
+initialize_domain(
+    ntt::get_root_of_unity::<Bn254ScalarField>(
+ size.try_into()
+ .unwrap(),
+ ),
+ &ntt::NTTInitDomainConfig::default(),
+)
+.unwrap();
+
+// Using default config
+let cfg = ntt::NTTConfig::<Bn254ScalarField>::default();
+
+// Computing NTT
+ntt::ntt(
+ HostSlice::from_slice(&scalars),
+ ntt::NTTDir::kForward,
+ &cfg,
+ &mut ntt_results[..],
+)
+.unwrap();
```
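+
+A hedged sketch of the in-place variant (continuing with the `size` and `cfg` from the example above):
+
+```rust
+// In-place: the input buffer is overwritten with the NTT results.
+let mut data = Bn254ScalarCfg::generate_random(size);
+ntt::ntt_inplace(HostSlice::from_mut_slice(&mut data), ntt::NTTDir::kForward, &cfg)
+    .unwrap();
+```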
-#### Parameters
+### NTT Domain
-- **`primitive_root`**: The primitive root of unity, chosen based on the maximum NTT size required for the computations. It must be of an order that is a power of two. This root is used to generate twiddle factors that are essential for the NTT operations.
-
-- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
-
-#### Returns
-
-- **`IcicleResult<()>`**: Will return an error if the operation fails.
-
-#### Parameters
-
-- **`primitive_root`**: The primitive root of unity, chosen based on the maximum NTT size required for the computations. It must be of an order that is a power of two. This root is used to generate twiddle factors that are essential for the NTT operations.
-
-- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
-
-#### Returns
-
-- **`IcicleResult<()>`**: Will return an error if the operation fails.
-
-### Releasing the domain
-
-The `release_domain` function is responsible for releasing the resources associated with a specific domain in the CUDA device context.
+Before performing NTT operations, it is mandatory to construct the domain as [explained here](../primitives/ntt.md#ntt-domain).
+In Rust, the following functions construct and release the domain, and retrieve a root of unity from it:
```rust
-pub fn release_domain<F>(ctx: &DeviceContext) -> IcicleResult<()>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: NTT<F>
+pub trait NTTDomain<F: FieldImpl> {
+ pub fn initialize_domain(primitive_root: F, config: &NTTInitDomainConfig) -> Result<(), eIcicleError>;
+ pub fn release_domain() -> Result<(), eIcicleError>;
+ pub fn get_root_of_unity(max_size: u64) -> F;
+}
```
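+
+For example, a minimal sketch of the domain lifecycle, assuming the free-function forms used in the example above:
+
+```rust
+use icicle_bn254::curve::ScalarField;
+use icicle_core::ntt;
+
+fn domain_lifecycle() {
+    // Construct the domain once, large enough for the biggest NTT you will run.
+    let max_size = 1u64 << 20;
+    ntt::initialize_domain(
+        ntt::get_root_of_unity::<ScalarField>(max_size),
+        &ntt::NTTInitDomainConfig::default(),
+    )
+    .unwrap();
+
+    // ... run NTTs ...
+
+    // Release the domain to free the cached twiddle factors.
+    ntt::release_domain::<ScalarField>().unwrap();
+}
+```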
-
-#### Parameters
-
-- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
-
-#### Returns
-
-The function returns an `IcicleResult<()>`, which represents the result of the operation. If the operation is successful, the function returns `Ok(())`, otherwise it returns an error.
diff --git a/docs/docs/icicle/rust-bindings/polynomials.md b/docs/docs/icicle/rust-bindings/polynomials.md
index c168aff48..edb99c917 100644
--- a/docs/docs/icicle/rust-bindings/polynomials.md
+++ b/docs/docs/icicle/rust-bindings/polynomials.md
@@ -110,8 +110,7 @@ These traits are implemented for references to DensePolynomial (i.e., &DensePoly
In addition to the traits, the following methods are implemented:
```rust
-impl DensePolynomial {
- pub fn init_cuda_backend() -> bool {...}
+impl DensePolynomial {
// Returns a mutable slice of the polynomial coefficients on the device
pub fn coeffs_mut_slice(&mut self) -> &mut DeviceSlice {...}
}
@@ -131,7 +130,7 @@ Functions within the DensePolynomial API that deal with polynomial coefficients
```rust
// Assume `coeffs` could either be in host memory or CUDA device memory
-let coeffs: DeviceSlice = DeviceVec::<ScalarField>::cuda_malloc(coeffs_len).unwrap();
+let coeffs: DeviceSlice = DeviceVec::<ScalarField>::device_malloc(coeffs_len).unwrap();
let p_from_coeffs = PolynomialBabyBear::from_coeffs(&coeffs, coeffs.len());
// Similarly for evaluations from roots of unity
@@ -152,8 +151,6 @@ First, choose the appropriate field implementation for your polynomial operation
```rust
use icicle_babybear::polynomials::DensePolynomial as PolynomialBabyBear;
-// Initialize the CUDA backend for polynomial operations
-PolynomialBabyBear::init_cuda_backend();
let f = PolynomialBabyBear::from_coeffs(...);
// now use f by calling the implemented traits
@@ -234,7 +231,7 @@ f.eval_on_domain(HostSlice::from_slice(&domain), HostSlice::from_mut_slice(&mut
// Evaluate on roots-of-unity-domain
let domain_log_size = 4;
-let mut device_evals = DeviceVec::<ScalarField>::cuda_malloc(1 << domain_log_size).unwrap();
+let mut device_evals = DeviceVec::<ScalarField>::device_malloc(1 << domain_log_size).unwrap();
f.eval_on_rou_domain(domain_log_size, &mut device_evals[..]);
```
@@ -246,7 +243,7 @@ Read or copy polynomial coefficients for further processing:
let x_squared_coeff = f.get_coeff(2); // Coefficient of x^2
// Copy coefficients to a device-specific memory space
-let mut device_mem = DeviceVec::<ScalarField>::cuda_malloc(coeffs.len()).unwrap();
+let mut device_mem = DeviceVec::<ScalarField>::device_malloc(coeffs.len()).unwrap();
f.copy_coeffs(0, &mut device_mem[..]);
```
diff --git a/docs/docs/icicle/rust-bindings/vec-ops.md b/docs/docs/icicle/rust-bindings/vec-ops.md
index e8cafdfda..61aa71570 100644
--- a/docs/docs/icicle/rust-bindings/vec-ops.md
+++ b/docs/docs/icicle/rust-bindings/vec-ops.md
@@ -1,56 +1,6 @@
# Vector Operations API
-Our vector operations API which is part of `icicle-cuda-runtime` package, includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory.
-
-## Examples
-
-### Addition of Scalars
-
-```rust
-use icicle_bn254::curve::{ScalarCfg, ScalarField};
-use icicle_core::vec_ops::{add_scalars};
-
-let test_size = 1 << 18;
-
-let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
-let b: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
-let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::zero(); test_size]);
-
-let cfg = VecOpsConfig::default();
-add_scalars(&a, &b, &mut result, &cfg).unwrap();
-```
-
-### Subtraction of Scalars
-
-```rust
-use icicle_bn254::curve::{ScalarCfg, ScalarField};
-use icicle_core::vec_ops::{sub_scalars};
-
-let test_size = 1 << 18;
-
-let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
-let b: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
-let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::zero(); test_size]);
-
-let cfg = VecOpsConfig::default();
-sub_scalars(&a, &b, &mut result, &cfg).unwrap();
-```
-
-### Multiplication of Scalars
-
-```rust
-use icicle_bn254::curve::{ScalarCfg, ScalarField};
-use icicle_core::vec_ops::{mul_scalars};
-
-let test_size = 1 << 18;
-
-let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
-let ones: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::one(); test_size]);
-let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::zero(); test_size]);
-
-let cfg = VecOpsConfig::default();
-mul_scalars(&a, &ones, &mut result, &cfg).unwrap();
-```
+Our vector operations API includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory.
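+
+For example, a minimal sketch of element-wise addition on host memory, using the `add_scalars` signature listed below:
+
+```rust
+use icicle_bn254::curve::{ScalarCfg, ScalarField};
+use icicle_core::traits::{FieldImpl, GenerateRandom};
+use icicle_core::vec_ops::{add_scalars, VecOpsConfig};
+use icicle_runtime::memory::HostSlice;
+
+fn add_example() {
+    let size = 1 << 12;
+    let a = ScalarCfg::generate_random(size);
+    let b = ScalarCfg::generate_random(size);
+    let mut result = vec![ScalarField::zero(); size];
+
+    let cfg = VecOpsConfig::default();
+    add_scalars(
+        HostSlice::from_slice(&a),
+        HostSlice::from_slice(&b),
+        HostSlice::from_mut_slice(&mut result),
+        &cfg,
+    )
+    .unwrap();
+}
+```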
## Vector Operations Configuration
@@ -61,22 +11,24 @@ The `VecOpsConfig` struct encapsulates the settings for vector operations, inclu
Defines configuration parameters for vector operations.
```rust
-pub struct VecOpsConfig<'a> {
- pub ctx: DeviceContext<'a>,
- is_a_on_device: bool,
- is_b_on_device: bool,
- is_result_on_device: bool,
+pub struct VecOpsConfig {
+ pub stream_handle: IcicleStreamHandle,
+ pub is_a_on_device: bool,
+ pub is_b_on_device: bool,
+ pub is_result_on_device: bool,
pub is_async: bool,
+ pub ext: ConfigExtension,
}
```
#### Fields
-- **`ctx: DeviceContext<'a>`**: Specifies the device context for the operation, including the device ID and memory pool.
-- **`is_a_on_device`**: Indicates if the first operand vector resides in device memory.
-- **`is_b_on_device`**: Indicates if the second operand vector resides in device memory.
-- **`is_result_on_device`**: Specifies if the result vector should be stored in device memory.
-- **`is_async`**: Enables asynchronous operation. If `true`, operations are non-blocking; otherwise, they block the current thread.
+- **`stream_handle: IcicleStreamHandle`**: Specifies the stream (queue) to use for async execution.
+- **`is_a_on_device: bool`**: Indicates whether input `a` has been preloaded in device memory. If `false`, it will be copied from host to device.
+- **`is_b_on_device: bool`**: Indicates whether input `b` has been preloaded in device memory. If `false`, it will be copied from host to device.
+- **`is_result_on_device: bool`**: Indicates whether the result buffer resides in device memory. If `false`, the result will be copied from device to host.
+- **`is_async: bool`**: Specifies whether the vector operation should be performed asynchronously.
+- **`ext: ConfigExtension`**: Extended, backend-specific configuration.
### Default Configuration
@@ -86,122 +38,67 @@ pub struct VecOpsConfig<'a> {
let cfg = VecOpsConfig::default();
```
-These are the default settings.
-
-```rust
-impl<'a> Default for VecOpsConfig<'a> {
- fn default() -> Self {
- Self::default_for_device(DEFAULT_DEVICE_ID)
- }
-}
-
-impl<'a> VecOpsConfig<'a> {
- pub fn default_for_device(device_id: usize) -> Self {
- VecOpsConfig {
- ctx: DeviceContext::default_for_device(device_id),
- is_a_on_device: false,
- is_b_on_device: false,
- is_result_on_device: false,
- is_async: false,
- }
- }
-}
-```
-
## Vector Operations
Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors.
-### `VecOps` Trait
-
-```rust
-pub trait VecOps<F> {
-    fn add(
-        a: &HostOrDeviceSlice<F>,
-        b: &HostOrDeviceSlice<F>,
-        result: &mut HostOrDeviceSlice<F>,
-        cfg: &VecOpsConfig,
-    ) -> IcicleResult<()>;
-
-    fn sub(
-        a: &HostOrDeviceSlice<F>,
-        b: &HostOrDeviceSlice<F>,
-        result: &mut HostOrDeviceSlice<F>,
-        cfg: &VecOpsConfig,
-    ) -> IcicleResult<()>;
-
-    fn mul(
-        a: &HostOrDeviceSlice<F>,
-        b: &HostOrDeviceSlice<F>,
-        result: &mut HostOrDeviceSlice<F>,
-        cfg: &VecOpsConfig,
-    ) -> IcicleResult<()>;
-}
-```
-
-#### Methods
+### Methods
All operations are element-wise, and the results are placed into the `result` parameter. These operations are not in place.
- **`add`**: Computes the element-wise sum of two vectors.
+- **`accumulate`**: Adds input `b` to `a` in place.
- **`sub`**: Computes the element-wise difference between two vectors.
- **`mul`**: Performs element-wise multiplication of two vectors.
+- **`transpose`**: Performs matrix transpose.
+- **`bit_reverse` / `bit_reverse_inplace`**: Reorders the elements according to their bit-reversed indices.
-## MatrixTranspose API Documentation
-
-This section describes the functionality of the `TransposeMatrix` function used for matrix transposition.
-The function takes a matrix represented as a 1D slice and transposes it, storing the result in another 1D slice.
-
-### Function
```rust
-pub fn transpose_matrix<F>(
-    input: &HostOrDeviceSlice<F>,
-    row_size: u32,
-    column_size: u32,
-    output: &mut HostOrDeviceSlice<F>,
-    ctx: &DeviceContext,
-    on_device: bool,
-    is_async: bool,
-) -> IcicleResult<()>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: VecOps<F>
-```
+pub fn add_scalars<F>(
+    a: &(impl HostOrDeviceSlice<F> + ?Sized),
+    b: &(impl HostOrDeviceSlice<F> + ?Sized),
+    result: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &VecOpsConfig,
+) -> Result<(), eIcicleError>;
+
+pub fn accumulate_scalars<F>(
+    a: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    b: &(impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &VecOpsConfig,
+) -> Result<(), eIcicleError>;
+
+pub fn sub_scalars<F>(
+    a: &(impl HostOrDeviceSlice<F> + ?Sized),
+    b: &(impl HostOrDeviceSlice<F> + ?Sized),
+    result: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &VecOpsConfig,
+) -> Result<(), eIcicleError>;
+
+pub fn mul_scalars<F>(
+    a: &(impl HostOrDeviceSlice<F> + ?Sized),
+    b: &(impl HostOrDeviceSlice<F> + ?Sized),
+    result: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &VecOpsConfig,
+) -> Result<(), eIcicleError>;
-### Parameters
-
-- **`input`**: A slice representing the input matrix. The slice can be stored on either the host or the device.
-- **`row_size`**: The number of rows in the input matrix.
-- **`column_size`**: The number of columns in the input matrix.
-- **`output`**: A mutable slice to store the transposed matrix. The slice can be stored on either the host or the device.
-- **`ctx`**: A reference to the `DeviceContext`, which provides information about the device where the operation will be performed.
-- **`on_device`**: A boolean flag indicating whether the inputs and outputs are on the device.
-- **`is_async`**: A boolean flag indicating whether the operation should be performed asynchronously.
-
-### Return Value
-
-`Ok(())` if the operation is successful, or an `IcicleResult` error otherwise.
-
-### Example
-
-```rust
-use icicle::HostOrDeviceSlice;
-use icicle::DeviceContext;
-use icicle::FieldImpl;
-use icicle::VecOps;
-
-let input: HostOrDeviceSlice<F> = // ...;
-let mut output: HostOrDeviceSlice<F> = // ...;
-let ctx: DeviceContext = // ...;
-
-transpose_matrix(&input, 5, 4, &mut output, &ctx, true, false)
- .expect("Failed to transpose matrix");
-```
-
-The function takes a matrix represented as a 1D slice, transposes it, and stores the result in another 1D slice. The input and output slices can be stored on either the host or the device, and the operation can be performed synchronously or asynchronously.
-
-The function is generic and can work with any type `F` that implements the `FieldImpl` trait. The `::Config` type must also implement the `VecOps` trait, which provides the `transpose` method used to perform the actual transposition.
-
-The function returns an `IcicleResult<()>`, indicating whether the operation was successful or not.
+pub fn transpose_matrix<F>(
+    input: &(impl HostOrDeviceSlice<F> + ?Sized),
+    nof_rows: u32,
+    nof_cols: u32,
+    output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &VecOpsConfig,
+) -> Result<(), eIcicleError>;
+
+pub fn bit_reverse<F>(
+    input: &(impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &VecOpsConfig,
+    output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+) -> Result<(), eIcicleError>;
+
+pub fn bit_reverse_inplace<F>(
+    input: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    cfg: &VecOpsConfig,
+) -> Result<(), eIcicleError>;
+```
\ No newline at end of file
diff --git a/docs/docs/introduction.md b/docs/docs/introduction.md
index 14cb3ae72..b173cf5f9 100644
--- a/docs/docs/introduction.md
+++ b/docs/docs/introduction.md
@@ -6,42 +6,37 @@ title: ''
# Welcome to Ingonyama's Developer Documentation
-Ingonyama is a next-generation semiconductor company, focusing on Zero-Knowledge Proof hardware acceleration. We build accelerators for advanced cryptography, unlocking real-time applications. Our focus is on democratizing access to compute intensive cryptography and making it accessible for developers to build on top of.
+Ingonyama is a next-generation semiconductor company focusing on Zero-Knowledge Proof hardware acceleration. We build accelerators for advanced cryptography, unlocking real-time applications. Our focus is on democratizing access to compute-intensive cryptography and making it accessible for developers to build upon.
-Currently our flagship products are:
+Our flagship product is **ICICLE**:
-- **ICICLE**:
- [ICICLE](https://github.com/ingonyama-zk/icicle) is a fully featured GPU accelerated cryptography library for building ZK provers. ICICLE allows you to accelerate your existing ZK protocols in a matter of hours or implement your protocol from scratch on GPU.
+#### **ICICLE v3**
+[ICICLE v3](https://github.com/ingonyama-zk/icicle) is a versatile cryptography library designed to support multiple compute backends, including CUDA, CPU, and potentially others like Metal, WebGPU, Vulkan, and ZPU. Originally focused on GPU acceleration, ICICLE has evolved to offer backend-agnostic cryptographic acceleration, allowing you to build ZK provers or other cryptographic applications with ease, leveraging the best available hardware for your needs.
----
-
-## Our current take on hardware acceleration
-
-We believe GPUs are as important for ZK as for AI.
+- **Multiple Backend Support:** Develop on CPU and deploy on various backends including CUDA and potentially Metal, WebGPU, Vulkan, ZPU, or even remote machines.
+- **Cross-Language Compatibility:** Use ICICLE across multiple programming languages such as C++, Rust, Go, and possibly Python.
+- **Optimized for ZKPs:** Accelerate cryptographic operations like elliptic curve operations, MSM, NTT, Poseidon hash, and more.
-- GPUs are a perfect match for ZK compute - around 97% of ZK protocol runtime is parallel by nature.
-- GPUs are simple for developers to use and scale compared to other hardware platforms.
-- GPUs are extremely competitive in terms of power / performance and price (3x cheaper compared to FPGAs).
-- GPUs are popular and readily available.
+**Learn more about ICICLE and its multi-backend support [here][ICICLE-OVERVIEW].**
-For a more in-depth understanding on this topic we suggest you read [our article on the subject](https://www.ingonyama.com/blog/revisiting-paradigm-hardware-acceleration-for-zero-knowledge-proofs).
+---
-Despite our current focus on GPUs we are still hard at work developing a ZPU (ZK Processing Unit), with the goal of offering a programmable hardware platform for ZK. To read more about ZPUs we suggest you read this [article](https://medium.com/@ingonyama/zpu-the-zero-knowledge-processing-unit-f886a48e00e0).
+## Our Approach to Hardware Acceleration
-## ICICLE
+We believe that GPUs are as critical for ZK as they are for AI.
-[ICICLE](https://github.com/ingonyama-zk/icicle) is a cryptography library for ZK using GPUs.
-ICICLE implements blazing fast cryptographic primitives such as EC operations, MSM, NTT, Poseidon hash and more on GPU.
+- **Parallelism:** Approximately 97% of ZK protocol runtime is naturally parallel, making GPUs an ideal match.
+- **Developer-Friendly:** GPUs offer simplicity in scaling and usage compared to other hardware platforms.
+- **Cost-Effective:** GPUs provide a competitive balance of power, performance, and cost, often being 3x cheaper than FPGAs.
-ICICLE is designed to be easy to use, developers don't have to touch a single line of CUDA code. Our Rust and Golang bindings allow your team to transition from CPU to GPU with minimal changes.
+For a more in-depth understanding on this topic we suggest you read [our article on the subject](https://www.ingonyama.com/blog/revisiting-paradigm-hardware-acceleration-for-zero-knowledge-proofs).
-Learn more about ICICLE and GPUs [here][ICICLE-OVERVIEW].
## Get in Touch
If you have any questions, ideas, or are thinking of building something in this space, join the discussion on [Discord]. You can explore our code on [github](https://github.com/ingonyama-zk) or read some of [our research papers](https://github.com/ingonyama-zk/papers).
-Follow us on [Twitter](https://x.com/Ingo_zk) and [YouTube](https://www.youtube.com/@ingo_ZK) and sign up for our [mailing list](https://wkf.ms/3LKCbdj) to get our latest announcements.
+Follow us on [Twitter](https://x.com/Ingo_zk) and [YouTube](https://www.youtube.com/@ingo_ZK), and join us IRL at our [next event](https://www.ingonyama.com/events).
[ICICLE-OVERVIEW]: ./icicle/overview.md
[Discord]: https://discord.gg/6vYrE7waPj
diff --git a/docs/sidebars.js b/docs/sidebars.js
index 48cb95704..c23d9b836 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -14,171 +14,213 @@ module.exports = {
},
collapsed: false,
items: [
- {
- type: "doc",
- label: "Getting started",
- id: "icicle/introduction"
- },
- {
- type: "doc",
- label: "ICICLE Core",
- id: "icicle/core",
- },
{
type: "category",
- label: "Primitives",
+ label: "Getting started",
link: {
type: `doc`,
- id: 'icicle/primitives/overview',
+ id: "icicle/getting_started",
},
- collapsed: true,
+ collapsed: false,
items: [
{
type: "doc",
- label: "MSM",
- id: "icicle/primitives/msm",
- },
- {
- type: "doc",
- label: "NTT",
- id: "icicle/primitives/ntt",
+ label: "Build ICICLE from source",
+ id: "icicle/build_from_source",
},
+ ],
+ },
+ {
+ type: "category",
+ label: "Architecture overview",
+ link: {
+ type: `doc`,
+ id: "icicle/arch_overview"
+ },
+ collapsed: false,
+ items: [
{
type: "doc",
- label: "Keccak Hash",
- id: "icicle/primitives/keccak",
+ label: "CUDA Backend",
+ id: "icicle/install_cuda_backend"
},
{
type: "doc",
- label: "Poseidon Hash",
- id: "icicle/primitives/poseidon",
+ label: "Multi Device Support",
+ id: "icicle/multi-device",
},
{
type: "doc",
- label: "Poseidon2 Hash",
- id: "icicle/primitives/poseidon2",
+ label: "Build Your Own Backend",
+ id: "icicle/build_your_own_backend"
},
- ],
- },
- {
- type: "doc",
- label: "Polynomials",
- id: "icicle/polynomials/overview",
+ ]
},
{
type: "doc",
- label: "Multi GPU Support",
- id: "icicle/multi-gpu",
+ label: "ICICLE libraries",
+ id: "icicle/libraries",
},
{
type: "category",
- label: "Golang bindings",
+ label: "Programmers guide",
link: {
type: `doc`,
- id: "icicle/golang-bindings",
+ id: "icicle/programmers_guide/general",
},
- collapsed: true,
+ collapsed: false,
items: [
- {
- type: "category",
- label: "MSM",
- link: {
- type: `doc`,
- id: "icicle/golang-bindings/msm",
- },
- collapsed: true,
- items: [
- {
- type: "doc",
- label: "MSM pre computation",
- id: "icicle/golang-bindings/msm-pre-computation",
- }
- ]
- },
{
type: "doc",
- label: "NTT",
- id: "icicle/golang-bindings/ntt",
+ label: "C++",
+ id: "icicle/programmers_guide/cpp",
},
{
type: "doc",
- label: "EC-NTT",
- id: "icicle/golang-bindings/ecntt",
+ label: "Rust",
+ id: "icicle/programmers_guide/rust",
},
{
type: "doc",
- label: "Vector operations",
- id: "icicle/golang-bindings/vec-ops",
- },
- {
- type: "doc",
- label: "Keccak Hash",
- id: "icicle/golang-bindings/keccak",
- },
- {
- type: "doc",
- label: "Multi GPU Support",
- id: "icicle/golang-bindings/multi-gpu",
- },
- ]
+ label: "Go",
+ id: "icicle/programmers_guide/go",
+ }
+ ],
},
{
type: "category",
- label: "Rust bindings",
+ label: "Compute API",
link: {
type: `doc`,
- id: "icicle/rust-bindings",
+ id: 'icicle/primitives/overview',
},
collapsed: true,
items: [
- {
- type: "category",
- label: "MSM",
- link: {
- type: `doc`,
- id: "icicle/rust-bindings/msm",
- },
- collapsed: true,
- items: [
- {
- type: "doc",
- label: "MSM pre computation",
- id: "icicle/rust-bindings/msm-pre-computation",
- }
- ]
- },
{
type: "doc",
- label: "NTT",
- id: "icicle/rust-bindings/ntt",
+ label: "MSM",
+ id: "icicle/primitives/msm",
},
{
type: "doc",
- label: "EC-NTT",
- id: "icicle/rust-bindings/ecntt",
+ label: "NTT / ECNTT",
+ id: "icicle/primitives/ntt",
},
{
type: "doc",
label: "Vector operations",
- id: "icicle/rust-bindings/vec-ops",
+ id: "icicle/primitives/vec_ops",
},
{
type: "doc",
- label: "Keccak Hash",
- id: "icicle/rust-bindings/keccak",
+ label: "Polynomials",
+ id: "icicle/polynomials/overview",
},
{
- type: "doc",
- label: "Multi GPU Support",
- id: "icicle/rust-bindings/multi-gpu",
+ type: "category",
+ label: "Golang bindings",
+ link: {
+ type: `doc`,
+ id: "icicle/golang-bindings",
+ },
+ collapsed: true,
+ items: [
+ {
+ type: "category",
+ label: "MSM",
+ link: {
+ type: `doc`,
+ id: "icicle/golang-bindings/msm",
+ },
+ collapsed: true,
+ items: [
+ {
+ type: "doc",
+ label: "MSM pre computation",
+ id: "icicle/golang-bindings/msm-pre-computation",
+ }
+ ]
+ },
+ {
+ type: "doc",
+ label: "NTT",
+ id: "icicle/golang-bindings/ntt",
+ },
+ {
+ type: "doc",
+ label: "EC-NTT",
+ id: "icicle/golang-bindings/ecntt",
+ },
+ {
+ type: "doc",
+ label: "Vector operations",
+ id: "icicle/golang-bindings/vec-ops",
+ },
+ {
+ type: "doc",
+ label: "Multi GPU Support",
+ id: "icicle/golang-bindings/multi-gpu",
+ },
+ ]
},
{
- type: "doc",
- label: "Polynomials",
- id: "icicle/rust-bindings/polynomials",
+ type: "category",
+ label: "Rust bindings",
+ link: {
+ type: `doc`,
+ id: "icicle/rust-bindings",
+ },
+ collapsed: true,
+ items: [
+ {
+ type: "doc",
+ label: "MSM",
+ id: "icicle/rust-bindings/msm",
+ },
+ {
+ type: "doc",
+ label: "NTT",
+ id: "icicle/rust-bindings/ntt",
+ },
+ {
+ type: "doc",
+ label: "ECNTT",
+ id: "icicle/rust-bindings/ecntt",
+ },
+ {
+ type: "doc",
+ label: "Vector operations",
+ id: "icicle/rust-bindings/vec-ops",
+ },
+ {
+ type: "doc",
+ label: "Polynomials",
+ id: "icicle/rust-bindings/polynomials",
+ },
+ {
+ type: "doc",
+ label: "Multi GPU Support (TODO)",
+ id: "icicle/rust-bindings/multi-gpu",
+ },
+ ],
},
],
},
+ {
+ type: "doc",
+ label: "Migrate from ICICLE V2",
+ id: "icicle/migrate_from_v2",
+ },
+ {
+ type: "doc",
+ label: "Benchmarks",
+ id: "icicle/benchmarks",
+ },
+ {
+ type: "doc",
+ label: "FAQ and Troubleshooting",
+ id: "icicle/faq_and_troubleshooting",
+ },
{
type: "doc",
label: "Google Colab Instructions",
@@ -191,11 +233,6 @@ module.exports = {
},
]
},
- {
- type: "doc",
- label: "ZK Containers",
- id: "ZKContainers",
- },
{
type: "doc",
label: "Ingonyama Grant program",
diff --git a/examples/ZKContainer.md b/examples/ZKContainer.md
deleted file mode 100644
index 1eacf3c29..000000000
--- a/examples/ZKContainer.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# ZKContainer
-
-We recommend using [ZKContainer](https://www.ingonyama.com/blog/product-announcement-zk-containers), where we have already preinstalled all the required dependencies, to run Icicle examples.
-To use our containers you will need [Docker](https://www.docker.com/) and [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html).
-
-In each example directory, ZKContainer files are located in a subdirectory `.devcontainer`.
-
-- File `Dockerfile` specifies how to build an image of a ZKContainer.
-- File `devcontainer.json` enables running ZKContainer from Visual Studio Code.
-
-## Running ZKContainer from shell
-
-```sh
-docker build -t icicle-example-poseidon -f .devcontainer/Dockerfile .
-```
-
-To run the example interactively, start the container
-
-```sh
-docker run -it --rm --gpus all -v .:/icicle-example icicle-example-poseidon
-```
-
-Inside the container, run the commands for building the library for whichever [build system](../README.md#build-systems) you choose to use.
diff --git a/examples/c++/best-practice-ntt/CMakeLists.txt b/examples/c++/best-practice-ntt/CMakeLists.txt
index d6e5b7d7d..d523b645c 100644
--- a/examples/c++/best-practice-ntt/CMakeLists.txt
+++ b/examples/c++/best-practice-ntt/CMakeLists.txt
@@ -1,23 +1,16 @@
cmake_minimum_required(VERSION 3.18)
+
set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
-if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
- set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-else()
- set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
-endif ()
-project(example LANGUAGES CUDA CXX)
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
-set(CMAKE_CUDA_FLAGS_RELEASE "")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+project(example)
+
+add_executable(example example.cpp)
+target_include_directories(example PRIVATE "../../../icicle/include" "..")
+target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle")
+message("${CMAKE_BINARY_DIR}/icicle")
+target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device)
+if(BACKEND_DIR)
+ add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}")
+endif()
-add_executable(
- example
- example.cu
-)
-target_include_directories(example PRIVATE "../../../icicle/include")
-target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
-set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
diff --git a/examples/c++/best-practice-ntt/README.md b/examples/c++/best-practice-ntt/README.md
index 0839105aa..0ba43ee29 100644
--- a/examples/c++/best-practice-ntt/README.md
+++ b/examples/c++/best-practice-ntt/README.md
@@ -17,17 +17,19 @@ Typically, you concurrently
## Best-Practices
-1. Use three separate CUDA streams for Download, Upload, and Compute operations
-2. Use pinned (page-locked) memory on host to speed data bus transfers. Calling `cudaHostAlloc` allocates pinned memory.
-3. Use in-place NTT to save on device memory.
+1. Use three separate streams for Download to device, Upload from device, and Compute operations
+2. Future: Use pinned (page-locked) memory on host to speed data bus transfers.
+3. Compute in-place NTT.
## Running the example
-To change the default curve BN254, edit `compile.sh` and `CMakeLists.txt`
+To change the default curve BN254, edit `run.sh` and `CMakeLists.txt`
```sh
-./compile.sh
-./run.sh
+# for CPU
+./run.sh -d CPU
+# for CUDA
+./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
```
To compare with ICICLE baseline (i.e. non-concurrent) NTT, you can run [this example](../ntt/README.md).
diff --git a/examples/c++/best-practice-ntt/compile.sh b/examples/c++/best-practice-ntt/compile.sh
deleted file mode 100755
index 2506ff967..000000000
--- a/examples/c++/best-practice-ntt/compile.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-# Exit immediately on error
-set -e
-
-mkdir -p build/example
-mkdir -p build/icicle
-
-# Configure and build Icicle
-cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=OFF -DMSM=OFF
-cmake --build build/icicle
-
-# Configure and build the example application
-cmake -S . -B build/example
-cmake --build build/example
-
diff --git a/examples/c++/best-practice-ntt/example.cpp b/examples/c++/best-practice-ntt/example.cpp
new file mode 100644
index 000000000..791491cf7
--- /dev/null
+++ b/examples/c++/best-practice-ntt/example.cpp
@@ -0,0 +1,126 @@
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "icicle/runtime.h"
+#include "icicle/api/bn254.h"
+using namespace bn254;
+
+#include "examples_utils.h"
+#include "icicle/backend/ntt_config.h"
+
+void initialize_input(const unsigned ntt_size, const unsigned batch_size, scalar_t* elements)
+{
+ for (unsigned i = 0; i < ntt_size * batch_size; i++) {
+ elements[i] = scalar_t::from(i + 1);
+ }
+}
+
+int main(int argc, char* argv[])
+{
+ try_load_and_set_backend_device(argc, argv);
+
+ // set these parameters to match the desired NTT size and batch size
+ const unsigned log_ntt_size = 20;
+ const unsigned batch_size = 16;
+
+ scalar_t basic_root = scalar_t::omega(log_ntt_size);
+
+ const unsigned ntt_size = 1 << log_ntt_size;
+ std::cout << "log NTT size: " << log_ntt_size << std::endl;
+ std::cout << "Batch size: " << batch_size << std::endl;
+
+ // Create separate streams for overlapping data transfers and kernel execution.
+ icicleStreamHandle stream_compute, stream_h2d, stream_d2h;
+ ICICLE_CHECK(icicle_create_stream(&stream_compute));
+ ICICLE_CHECK(icicle_create_stream(&stream_h2d));
+ ICICLE_CHECK(icicle_create_stream(&stream_d2h));
+
+ // Initialize NTT domain
+ std::cout << "Init NTT domain" << std::endl;
+ auto ntt_init_domain_cfg = default_ntt_init_domain_config();
+ // set CUDA backend specific flag for init_domain
+ ConfigExtension backend_cfg_ext;
+ backend_cfg_ext.set(CudaBackendConfig::CUDA_NTT_FAST_TWIDDLES_MODE, true);
+ ntt_init_domain_cfg.ext = &backend_cfg_ext;
+ ICICLE_CHECK(bn254_ntt_init_domain(&basic_root, &ntt_init_domain_cfg));
+
+ std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl;
+ int nof_blocks = 32;
+ int block_size = ntt_size * batch_size / nof_blocks;
+ std::cout << "Number of blocks: " << nof_blocks << ", block size: " << block_size << " Bytes" << std::endl;
+
+ // on-host pinned data
+ scalar_t* h_inp[2];
+ scalar_t* h_out[2];
+ for (int i = 0; i < 2; i++) {
+ h_inp[i] = new scalar_t[ntt_size * batch_size];
+ h_out[i] = new scalar_t[ntt_size * batch_size];
+ }
+
+ // on-device in-place data
+ // we need two on-device vectors to overlap data transfers with NTT kernel execution
+ scalar_t* d_vec[2];
+ for (int i = 0; i < 2; i++) {
+ ICICLE_CHECK(icicle_malloc((void**)&d_vec[i], sizeof(scalar_t) * ntt_size * batch_size));
+ }
+
+ // initialize input data
+ initialize_input(ntt_size, batch_size, h_inp[0]);
+ initialize_input(ntt_size, batch_size, h_inp[1]);
+
+ // ntt configuration
+ NTTConfig config_compute = default_ntt_config();
+ config_compute.batch_size = batch_size;
+ config_compute.are_inputs_on_device = true;
+ config_compute.are_outputs_on_device = true;
+ config_compute.is_async = true;
+ config_compute.stream = stream_compute;
+ // backend specific config extension
+ ConfigExtension ntt_cfg_ext;
+ ntt_cfg_ext.set(CudaBackendConfig::CUDA_NTT_ALGORITHM, CudaBackendConfig::NttAlgorithm::MixedRadix);
+ config_compute.ext = &ntt_cfg_ext;
+
+ for (int run = 0; run < 10; run++) {
+ int vec_compute = run % 2;
+ int vec_transfer = (run + 1) % 2;
+ std::cout << "Run: " << run << std::endl;
+ std::cout << "Compute Vector: " << vec_compute << std::endl;
+ std::cout << "Transfer Vector: " << vec_transfer << std::endl;
+ START_TIMER(inplace);
+ bn254_ntt(d_vec[vec_compute], ntt_size, NTTDir::kForward, &config_compute, d_vec[vec_compute]);
+ // we have to delay upload to device relative to download from device by one block: preserve write after read
+ for (int i = 0; i <= nof_blocks; i++) {
+ if (i < nof_blocks) {
+ // copy result back from device to host
+ ICICLE_CHECK(icicle_copy_async(
+ &h_out[vec_transfer][i * block_size], &d_vec[vec_transfer][i * block_size], sizeof(scalar_t) * block_size,
+ stream_d2h));
+ }
+ if (i > 0) {
+ // copy next input from host to device to alternate buffer
+ ICICLE_CHECK(icicle_copy_async(
+ &d_vec[vec_transfer][(i - 1) * block_size], &h_inp[vec_transfer][(i - 1) * block_size],
+ sizeof(scalar_t) * block_size, stream_h2d));
+ }
+ // synchronize upload and download at the end of the block to ensure data integrity
+ ICICLE_CHECK(icicle_stream_synchronize(stream_d2h));
+ ICICLE_CHECK(icicle_stream_synchronize(stream_h2d));
+ }
+ // synchronize compute stream with the end of the computation
+ ICICLE_CHECK(icicle_stream_synchronize(stream_compute));
+ END_TIMER(inplace, "Concurrent In-Place NTT");
+ }
+
+ // Clean-up
+ for (int i = 0; i < 2; i++) {
+ ICICLE_CHECK(icicle_free(d_vec[i]));
+ delete[](h_inp[i]);
+ delete[](h_out[i]);
+ }
+ ICICLE_CHECK(icicle_destroy_stream(stream_compute));
+ ICICLE_CHECK(icicle_destroy_stream(stream_d2h));
+ ICICLE_CHECK(icicle_destroy_stream(stream_h2d));
+ return 0;
+}
diff --git a/examples/c++/best-practice-ntt/example.cu b/examples/c++/best-practice-ntt/example.cu
deleted file mode 100644
index 341f42a9e..000000000
--- a/examples/c++/best-practice-ntt/example.cu
+++ /dev/null
@@ -1,149 +0,0 @@
-#include <chrono>
-#include <cstdio>
-#include <iostream>
-#include <string>
-
-#include "curves/params/bn254.cuh"
-#include "api/bn254.h"
-using namespace bn254;
-using namespace ntt;
-
-const std::string curve = "BN254";
-
-typedef scalar_t S;
-typedef scalar_t E;
-
-const unsigned max_log_ntt_size = 27;
-
-void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
-{
- for (unsigned i = 0; i < ntt_size * nof_ntts; i++) {
- elements[i] = E::from(i + 1);
- }
-}
-
-using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
-#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) \
- printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
-
-int main(int argc, char** argv)
-{
- cudaDeviceReset();
- cudaDeviceProp deviceProperties;
- int deviceId = 0;
- cudaGetDeviceProperties(&deviceProperties, deviceId);
- std::string gpu_full_name = deviceProperties.name;
- std::cout << gpu_full_name << std::endl;
- std::string gpu_name = gpu_full_name;
-
- std::cout << "Curve: " << curve << std::endl;
-
- S basic_root = S::omega(max_log_ntt_size);
-
- // change these parameters to match the desired NTT size and batch size
- const unsigned log_ntt_size = 22;
- const unsigned nof_ntts = 16;
-
- std::cout << "log NTT size: " << log_ntt_size << std::endl;
- const unsigned ntt_size = 1 << log_ntt_size;
-
- std::cout << "Batch size: " << nof_ntts << std::endl;
-
- // Create separate CUDA streams for overlapping data transfers and kernel execution.
- cudaStream_t stream_compute, stream_h2d, stream_d2h;
- cudaStreamCreate(&stream_compute);
- cudaStreamCreate(&stream_h2d);
- cudaStreamCreate(&stream_d2h);
-
- // Create device context for NTT computation
- auto ctx_compute = device_context::DeviceContext{
- stream_compute, // stream
- 0, // device_id
- 0, // mempool
- };
-
- // Initialize NTT domain and configuration
- bn254_initialize_domain(&basic_root, ctx_compute, /* fast twiddles */ true);
- NTTConfig config_compute = default_ntt_config(ctx_compute);
- config_compute.ntt_algorithm = NttAlgorithm::MixedRadix;
- config_compute.batch_size = nof_ntts;
- config_compute.are_inputs_on_device = true;
- config_compute.are_outputs_on_device = true;
- config_compute.is_async = true;
-
- std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl;
- int nof_blocks = 32;
- std::cout << "Number of blocks: " << nof_blocks << std::endl;
- int block_size = ntt_size * nof_ntts / nof_blocks;
-
- // on-host pinned data
- E* h_inp[2];
- E* h_out[2];
- for (int i = 0; i < 2; i++) {
- cudaHostAlloc((void**)&h_inp[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
- cudaHostAlloc((void**)&h_out[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
- }
-
- // on-device in-place data
- // we need two on-device vectors to overlap data transfers with NTT kernel execution
- E* d_vec[2];
- for (int i = 0; i < 2; i++) {
- cudaMalloc((void**)&d_vec[i], sizeof(E) * ntt_size * nof_ntts);
- }
-
- // initialize input data
- initialize_input(ntt_size, nof_ntts, h_inp[0]);
- initialize_input(ntt_size, nof_ntts, h_inp[1]);
-
- cudaEvent_t compute_start, compute_stop;
- cudaEventCreate(&compute_start);
- cudaEventCreate(&compute_stop);
-
- for (int run = 0; run < 10; run++) {
- int vec_compute = run % 2;
- int vec_transfer = (run + 1) % 2;
- std::cout << "Run: " << run << std::endl;
- std::cout << "Compute Vector: " << vec_compute << std::endl;
- std::cout << "Transfer Vector: " << vec_transfer << std::endl;
- START_TIMER(inplace);
- cudaEventRecord(compute_start, stream_compute);
- bn254_ntt_cuda(d_vec[vec_compute], ntt_size, NTTDir::kForward, config_compute, d_vec[vec_compute]);
- cudaEventRecord(compute_stop, stream_compute);
- // we have to delay upload to device relative to download from device by one block: preserve write after read
- for (int i = 0; i <= nof_blocks; i++) {
- if (i < nof_blocks) {
- cudaMemcpyAsync(
- &h_out[vec_transfer][i * block_size], &d_vec[vec_transfer][i * block_size], sizeof(E) * block_size,
- cudaMemcpyDeviceToHost, stream_d2h);
- }
- if (i > 0) {
- cudaMemcpyAsync(
- &d_vec[vec_transfer][(i - 1) * block_size], &h_inp[vec_transfer][(i - 1) * block_size],
- sizeof(E) * block_size, cudaMemcpyHostToDevice, stream_h2d);
- }
- // synchronize upload and download at the end of the block to ensure data integrity
- cudaStreamSynchronize(stream_d2h);
- cudaStreamSynchronize(stream_h2d);
- }
- // synchronize compute stream with the end of the computation
- cudaEventSynchronize(compute_stop);
- float milliseconds = 0;
- cudaEventElapsedTime(&milliseconds, compute_start, compute_stop);
- END_TIMER(inplace, "Concurrent In-Place NTT");
- std::cout << "NTT time: " << milliseconds << " ms" << std::endl;
- };
-
- // Clean-up
- for (int i = 0; i < 2; i++) {
- cudaFree(d_vec[i]);
- cudaFreeHost(h_inp[i]);
- cudaFreeHost(h_out[i]);
- }
- cudaEventDestroy(compute_start);
- cudaEventDestroy(compute_stop);
- cudaStreamDestroy(stream_compute);
- cudaStreamDestroy(stream_d2h);
- cudaStreamDestroy(stream_h2d);
- return 0;
-}
diff --git a/examples/c++/best-practice-ntt/run.sh b/examples/c++/best-practice-ntt/run.sh
index 01eca66ba..879390d0a 100755
--- a/examples/c++/best-practice-ntt/run.sh
+++ b/examples/c++/best-practice-ntt/run.sh
@@ -1,2 +1,66 @@
#!/bin/bash
-./build/example/example
+
+# Exit immediately if a command exits with a non-zero status
+set -e
+
+# Function to display usage information
+show_help() {
+ echo "Usage: $0 [-d DEVICE_TYPE] [-b ICICLE_BACKEND_INSTALL_DIR]"
+ echo
+ echo "Options:"
+ echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
+ echo " -b ICICLE_BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
+ echo " -h Show this help message"
+ exit 0
+}
+
+# Parse command line options
+while getopts ":d:b:h" opt; do
+ case ${opt} in
+ d )
+ DEVICE_TYPE=$OPTARG
+ ;;
+ b )
+ ICICLE_BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
+ ;;
+ h )
+ show_help
+ ;;
+ \? )
+ echo "Invalid option: -$OPTARG" 1>&2
+ show_help
+ ;;
+ : )
+ echo "Invalid option: -$OPTARG requires an argument" 1>&2
+ show_help
+ ;;
+ esac
+done
+
+# Set default values if not provided
+: "${DEVICE_TYPE:=CPU}"
+: "${ICICLE_BACKEND_INSTALL_DIR:=}"
+
+# Create necessary directories
+mkdir -p build/example
+mkdir -p build/icicle
+
+ICICLE_DIR=$(realpath "../../../icicle/")
+ICICLE_CUDA_SOURCE_DIR="${ICICLE_DIR}/backend/cuda"
+
+# Build Icicle and the example app that links to it
+if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${ICICLE_BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_SOURCE_DIR}" ]; then
+ echo "Building icicle with CUDA backend"
+  cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DMSM=OFF -DG2=OFF -DECNTT=OFF -DCUDA_BACKEND=local -S "${ICICLE_DIR}" -B build/icicle
+ export ICICLE_BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend")
+else
+ echo "Building icicle without CUDA backend, ICICLE_BACKEND_INSTALL_DIR=${ICICLE_BACKEND_INSTALL_DIR}"
+ export ICICLE_BACKEND_INSTALL_DIR="${ICICLE_BACKEND_INSTALL_DIR}"
+  cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -S "${ICICLE_DIR}" -B build/icicle
+fi
+cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/example
+
+cmake --build build/icicle -j
+cmake --build build/example -j
+
+./build/example/example "$DEVICE_TYPE"
diff --git a/examples/c++/examples_utils.h b/examples/c++/examples_utils.h
new file mode 100644
index 000000000..e71c03bd3
--- /dev/null
+++ b/examples/c++/examples_utils.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <chrono>
+#include "icicle/runtime.h"
+
+// Timer
+using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg) \
+ printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+// Load and choose backend
+void try_load_and_set_backend_device(int argc = 0, char** argv = nullptr)
+{
+ icicle_load_backend_from_env_or_default();
+
+ const char* selected_device = argc > 1 ? argv[1] : nullptr;
+ if (selected_device) {
+ ICICLE_LOG_INFO << "selecting " << selected_device << " device";
+ ICICLE_CHECK(icicle_set_device(selected_device));
+ return;
+ }
+
+ // trying to choose CUDA if available, or fallback to CPU otherwise (default device)
+ const bool is_cuda_device_available = (eIcicleError::SUCCESS == icicle_is_device_available("CUDA"));
+ if (is_cuda_device_available) {
+ Device device = {"CUDA", 0}; // GPU-0
+ ICICLE_LOG_INFO << "setting " << device;
+ ICICLE_CHECK(icicle_set_device(device));
+ return;
+ }
+
+ ICICLE_LOG_INFO << "CUDA device not available, falling back to CPU";
+}
\ No newline at end of file
diff --git a/examples/c++/install-and-use-icicle/CMakeLists.txt b/examples/c++/install-and-use-icicle/CMakeLists.txt
new file mode 100644
index 000000000..1b86fab2b
--- /dev/null
+++ b/examples/c++/install-and-use-icicle/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Add the executable
+add_executable(example example.cpp)
+# Link the libraries
+target_link_libraries(example icicle_device icicle_field_bn254 icicle_curve_bn254)
+
+# OPTIONAL (if not installed in default location)
+
+# The following sets compile-time and runtime paths for headers and libs, assuming
+# - headers in /custom/path/icicle/include
+# - libs in /custom/path/icicle/lib
+
+# Include directories
+target_include_directories(example PUBLIC /custom/path/icicle/include)
+# Library directories
+target_link_directories(example PUBLIC /custom/path/icicle/lib/)
+# Set the RPATH so linker finds icicle libs at runtime
+set_target_properties(example PROPERTIES
+ BUILD_RPATH /custom/path/icicle/lib/
+ INSTALL_RPATH /custom/path/icicle/lib/)
\ No newline at end of file
diff --git a/examples/c++/install-and-use-icicle/README.md b/examples/c++/install-and-use-icicle/README.md
new file mode 100644
index 000000000..f0a161833
--- /dev/null
+++ b/examples/c++/install-and-use-icicle/README.md
@@ -0,0 +1,117 @@
+# Example: Install and use ICICLE (C++)
+
+This example demonstrates how to install ICICLE binaries and use them in a C++ application.
+
+Download release binaries from our [github release page](https://github.com/ingonyama-zk/icicle/releases):
+- **Frontend** icicle30-ubuntu22.tar.gz
+- **Backend** icicle30-ubuntu22-cuda122.tar.gz
+
+> [!NOTE]
+> The names of the files are based on the release version. Ensure you update the tar file names in the example if you’re using a different release.
+
+## Optional: Using Docker
+
+While not mandatory, this example can be demonstrated in an Ubuntu 22 Docker container.
+```bash
+docker run -it --rm --gpus all -v ./:/workspace -w /workspace icicle-release-ubuntu22-cuda122 bash
+```
+
+This command starts a bash session in the Docker container, with GPUs enabled and the example files mapped to /workspace in the container.
+
+### Building the docker image
+
+The Docker image is based on NVIDIA’s image for Ubuntu 22.04 and can be built from the following Dockerfile:
+
+```dockerfile
+# Use the official NVIDIA CUDA development image for Ubuntu 22.04
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+
+# Install necessary packages
+RUN apt-get update && apt-get install -y \
+ build-essential \
+ cmake \
+ tar
+```
+
+Build the Docker image with the following command:
+```bash
+docker build -t icicle-release-ubuntu22-cuda122 .
+```
+
+## Extract tars and install ICICLE
+
+### Extracting and Installing the Frontend
+```bash
+cd release
+# extract frontend part
+tar xzvf icicle30-ubuntu22.tar.gz
+cp -r ./icicle/lib/* /usr/lib/
+cp -r ./icicle/include/icicle/ /usr/local/include/ # copy C++ headers
+```
+
+### Extracting and Installing the CUDA Backend (Optional)
+
+```bash
+# extract CUDA backend (OPTIONAL)
+tar xzvf icicle30-ubuntu22-cuda122.tar.gz -C /opt
+rm -rf icicle # remove the extracted dir
+```
+
+## Compile and Link the C++ Example with ICICLE
+
+```bash
+cd ..
+mkdir build
+cmake -S . -B build && cmake --build build
+```
+
+## Launch the executable
+
+```bash
+./build/example
+```
+
+## Install ICICLE in a Custom Location
+
+If installing in a custom location such as /custom/path:
+```bash
+mkdir -p /custom/path
+cd release
+tar xzvf icicle30-ubuntu22.tar.gz -C /custom/path
+tar xzvf icicle30-ubuntu22-cuda122.tar.gz -C /custom/path # OPTIONAL
+```
+
+### Build your app and link to ICICLE
+
+When installing ICICLE in a custom location, you need to specify the include and library paths so that the compiler and linker can find them at build time and the loader can find the libs at runtime. Add the following to your CMake file:
+```cmake
+# Include directories
+target_include_directories(example PUBLIC /custom/path/icicle/include)
+# Library directories
+target_link_directories(example PUBLIC /custom/path/icicle/lib/)
+# Set the RPATH so linker finds icicle libs at runtime
+set_target_properties(example PROPERTIES
+ BUILD_RPATH /custom/path/icicle/lib/
+ INSTALL_RPATH /custom/path/icicle/lib/)
+```
+
+### Compile the example
+
+```bash
+cd ..
+mkdir build
+cmake -S . -B build && cmake --build build
+```
+
+### Launch the executable
+
+Since the CUDA backend is installed to `/custom/path`, we need to set the environment variable accordingly:
+```bash
+export ICICLE_BACKEND_INSTALL_DIR=/custom/path/icicle/lib/backend
+./build/example
+```
+
+Alternatively, you can use the following API in your code:
+```cpp
+extern "C" eIcicleError icicle_load_backend(const char* path, bool is_recursive);
+```
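+
+For example, a minimal illustrative call that loads backends from the custom install directory at startup (the path mirrors the layout above):
+
+```cpp
+icicle_load_backend("/custom/path/icicle/lib/backend", true /*is_recursive*/);
+```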
diff --git a/examples/c++/install-and-use-icicle/example.cpp b/examples/c++/install-and-use-icicle/example.cpp
new file mode 100644
index 000000000..f86eceb97
--- /dev/null
+++ b/examples/c++/install-and-use-icicle/example.cpp
@@ -0,0 +1,92 @@
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include "icicle/runtime.h"
+#include "icicle/api/bn254.h"
+
+using namespace bn254; // brings bn254 types (e.g. scalar_t) into scope without the bn254:: prefix
+
+// Utility function to print arrays
+template <typename T>
+void print_array(const T* arr, int size)
+{
+ for (int i = 0; i < size; ++i) {
+ std::cout << "\t" << i << ": " << arr[i] << std::endl;
+ }
+}
+
+int main(int argc, char* argv[])
+{
+ // Load installed backends
+ icicle_load_backend_from_env_or_default();
+
+ // Check if GPU is available
+ Device device_cpu = {"CPU", 0};
+ const bool is_cuda_device_available = (eIcicleError::SUCCESS == icicle_is_device_available("CUDA"));
+ Device device_gpu = {"CUDA", 0};
+ if (is_cuda_device_available) {
+ ICICLE_LOG_INFO << "GPU is available";
+ } else {
+ ICICLE_LOG_INFO << "GPU is not available, falling back to CPU only";
+ device_gpu = device_cpu;
+ }
+
+ // Example input (on host memory) for NTT
+ const unsigned log_ntt_size = 2;
+ const unsigned ntt_size = 1 << log_ntt_size;
+  auto input_cpu = std::make_unique<scalar_t[]>(ntt_size);
+ scalar_t::rand_host_many(input_cpu.get(), ntt_size);
+
+ // Allocate output on host memory
+  auto output_cpu = std::make_unique<scalar_t[]>(ntt_size);
+ scalar_t root_of_unity = scalar_t::omega(log_ntt_size);
+  auto ntt_config = default_ntt_config<scalar_t>();
+
+ // Part 1: Running NTT on CPU
+ std::cout << "Part 1: compute on CPU: " << std::endl;
+ icicle_set_device(device_cpu);
+ ntt_init_domain(root_of_unity, default_ntt_init_domain_config()); // Initialize NTT domain for CPU
+  ntt(input_cpu.get(), ntt_size, NTTDir::kForward, default_ntt_config<scalar_t>(), output_cpu.get());
+ print_array(output_cpu.get(), ntt_size);
+
+ // Part 2: Running NTT on GPU
+ std::cout << "Part 2: compute on GPU (from/to CPU memory): " << std::endl;
+ icicle_set_device(device_gpu);
+ ntt_init_domain(root_of_unity, default_ntt_init_domain_config()); // Initialize NTT domain for GPU
+ ntt(input_cpu.get(), ntt_size, NTTDir::kForward, ntt_config, output_cpu.get());
+ print_array(output_cpu.get(), ntt_size);
+
+ // Allocate, copy data to GPU and compute on GPU memory
+ std::cout << "Part 2: compute on GPU (from/to GPU memory): " << std::endl;
+ scalar_t* input_gpu = nullptr;
+ scalar_t* output_gpu = nullptr;
+ icicle_malloc((void**)&input_gpu, ntt_size * sizeof(scalar_t));
+ icicle_malloc((void**)&output_gpu, ntt_size * sizeof(scalar_t));
+ icicle_copy(input_gpu, input_cpu.get(), ntt_size * sizeof(scalar_t));
+ ntt_config.are_inputs_on_device = true;
+ ntt_config.are_outputs_on_device = true;
+ ntt(input_gpu, ntt_size, NTTDir::kForward, ntt_config, output_gpu);
+ icicle_copy(output_cpu.get(), output_gpu, ntt_size * sizeof(scalar_t));
+ print_array(output_cpu.get(), ntt_size);
+
+  // Part 3: Using both CPU and GPU: compute the NTT on the GPU and the INTT back on the CPU
+  auto output_intt_cpu = std::make_unique<scalar_t[]>(ntt_size);
+
+ // Step 1: Compute NTT on GPU
+ std::cout << "Part 3: compute NTT on GPU (NTT input): " << std::endl;
+ icicle_set_device(device_gpu);
+ ntt_config.are_inputs_on_device = false; // using host memory now
+ ntt_config.are_outputs_on_device = false;
+ ntt(input_cpu.get(), ntt_size, NTTDir::kForward, ntt_config, output_cpu.get());
+ print_array(input_cpu.get(), ntt_size);
+
+ // Step 2: Compute INTT on CPU
+ std::cout << "Part 3: compute INTT on CPU (INTT output): " << std::endl;
+ icicle_set_device(device_cpu);
+ ntt(output_cpu.get(), ntt_size, NTTDir::kInverse, ntt_config, output_intt_cpu.get());
+ print_array(output_intt_cpu.get(), ntt_size);
+
+ // Assert that INTT output is the same as NTT input
+ assert(0 == memcmp(input_cpu.get(), output_intt_cpu.get(), ntt_size * sizeof(scalar_t)));
+
+ return 0;
+}
\ No newline at end of file
diff --git a/examples/c++/install-and-use-icicle/run.sh b/examples/c++/install-and-use-icicle/run.sh
new file mode 100755
index 000000000..c2bed7d1b
--- /dev/null
+++ b/examples/c++/install-and-use-icicle/run.sh
@@ -0,0 +1 @@
+echo "Check out the README file. You will have to download an ICICLE release and follow instructions"
diff --git a/examples/c++/msm/CMakeLists.txt b/examples/c++/msm/CMakeLists.txt
index b29e07c79..d523b645c 100644
--- a/examples/c++/msm/CMakeLists.txt
+++ b/examples/c++/msm/CMakeLists.txt
@@ -1,23 +1,16 @@
cmake_minimum_required(VERSION 3.18)
+
set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
-if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
- set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-else()
- set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
-endif ()
-project(example LANGUAGES CUDA CXX)
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
-set(CMAKE_CUDA_FLAGS_RELEASE "")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+project(example)
+
+add_executable(example example.cpp)
+target_include_directories(example PRIVATE "../../../icicle/include" "..")
+target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle")
+message("${CMAKE_BINARY_DIR}/icicle")
+target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device)
+if(BACKEND_DIR)
+ add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}")
+endif()
-add_executable(
- example
- example.cu
-)
-target_include_directories(example PRIVATE "../../../icicle/include")
-target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
-set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
diff --git a/examples/c++/msm/README.md b/examples/c++/msm/README.md
index dbb62be6e..ca6d7e8da 100644
--- a/examples/c++/msm/README.md
+++ b/examples/c++/msm/README.md
@@ -1,46 +1,43 @@
# Icicle example: Multi-Scalar Multiplication (MSM)
-## Best-Practices
-
-We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
-
## Key-Takeaway
`Icicle` provides CUDA C++ template function `MSM` to accelerate [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
## Concise Usage Explanation
-1. Select the curve
-2. Include an MSM template
-3. Configure MSM
-4. Call the template
+1. Include the curve API
+2. Configure MSM
+3. Call the MSM API
```c++
-#define CURVE_ID 1
-#include "icicle/appUtils/msm/msm.cu"
+#include "icicle/api/bn254.h"
...
-msm::MSMConfig config = {...};
+MSMConfig config = default_msm_config();
...
-msm::MSM(scalars, points, size, config, &result);
+bn254_msm(scalars, points, size, &config, &result);
```
-In this example we use `BN254` curve (`CURVE_ID=1`). The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where input `points[]` use affine coordinates, and `result` uses projective coordinates.
+In this example we use the `BN254` curve. The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where input `points[]` use affine coordinates, and `result` uses projective coordinates.
**Parameters:**
-The configuration is passed to the kernel as a structure of type `msm::MSMConfig`. Some of the most important fields are listed below:
+The configuration is passed to the kernel as a structure of type `MSMConfig`. Some of the most important fields are listed below:
- `are_scalars_on_device`, `are_points_on_device`, `are_results_on_device`: location of the data
- `is_async`: blocking vs. non-blocking kernel call
-- `large_bucket_factor`: distinguishes between large bucket and normal bucket sizes. If there is a scalar distribution that is skewed heavily to a few values we can operate on those separately from the rest of the values. The ideal value here can vary by circuit (based on the distribution of scalars) but start with 10 and adjust to see if it improves performance.
+- In addition, backend-specific parameters can be passed via `config.ext`. For example, the CUDA backend accepts a `large_bucket_factor` parameter (see the sketch below).
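+
+A minimal sketch of passing such a backend-specific option (the extension key name here is an assumption for illustration, not confirmed by this example):
+
+```c++
+MSMConfig config = default_msm_config();
+ConfigExtension ext;
+ext.set("large_bucket_factor", 10); // hypothetical CUDA-backend key
+config.ext = &ext;
+bn254_msm(scalars, points, size, &config, &result);
+```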
## Running the example
-- `cd` to your example directory
-- compile with `./compile.sh`
-- run with `./run.sh`
+```sh
+# for CPU
+./run.sh -d CPU
+# for CUDA
+./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
+```
## What's in the example
@@ -49,4 +46,4 @@ The configuration is passed to the kernel as a structure of type `msm::MSMConfig
3. Configure and execute MSM using on-host data
4. Copy inputs on-device
5. Configure and execute MSM using on-device data
-6. Repeat the above steps for G2 points
+6. Repeat step 3 with G2 points
diff --git a/examples/c++/msm/compile.sh b/examples/c++/msm/compile.sh
deleted file mode 100755
index 7e8d781a0..000000000
--- a/examples/c++/msm/compile.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-# Exit immediately on error
-set -e
-
-mkdir -p build/example
-mkdir -p build/icicle
-
-# Configure and build Icicle
-cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=ON
-cmake --build build/icicle
-
-# Configure and build the example application
-cmake -S . -B build/example
-cmake --build build/example
\ No newline at end of file
diff --git a/examples/c++/msm/example.cpp b/examples/c++/msm/example.cpp
new file mode 100644
index 000000000..b3b3acc6f
--- /dev/null
+++ b/examples/c++/msm/example.cpp
@@ -0,0 +1,101 @@
+#include <iostream>
+#include <memory>
+
+#include "icicle/runtime.h"
+#include "icicle/api/bn254.h"
+using namespace bn254;
+
+#include "examples_utils.h"
+
+int main(int argc, char* argv[])
+{
+ try_load_and_set_backend_device(argc, argv);
+
+ std::cout << "\nIcicle example: Muli-Scalar Multiplication (MSM)" << std::endl;
+ std::cout << "Example parameters" << std::endl;
+
+ int batch_size = 1;
+ unsigned msm_size = 1 << 10;
+ int N = batch_size * msm_size;
+ std::cout << "Batch size: " << batch_size << std::endl;
+ std::cout << "MSM size: " << msm_size << std::endl;
+
+ std::cout << "\nPart I: use G1 points" << std::endl;
+
+ std::cout << "Generating random inputs on-host" << std::endl;
+  auto scalars = std::make_unique<scalar_t[]>(N);
+  auto points = std::make_unique<affine_t[]>(N);
+ projective_t result;
+ scalar_t::rand_host_many(scalars.get(), N);
+ projective_t::rand_host_many(points.get(), N);
+
+ std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
+
+ auto config = default_msm_config();
+ config.batch_size = batch_size;
+
+ std::cout << "\nRunning MSM kernel with on-host inputs" << std::endl;
+ // Execute the MSM kernel
+ START_TIMER(MSM_host_mem);
+ ICICLE_CHECK(bn254_msm(scalars.get(), points.get(), msm_size, &config, &result));
+ END_TIMER(MSM_host_mem, "MSM from host-memory took");
+ std::cout << projective_t::to_affine(result) << std::endl;
+
+ DeviceProperties device_props;
+ ICICLE_CHECK(icicle_get_device_properties(device_props));
+ // If device does not share memory with host, copy inputs explicitly and execute msm with device pointers
+ if (!device_props.using_host_memory) {
+ std::cout << "\nReconfiguring MSM to use on-device inputs" << std::endl;
+ config.are_results_on_device = true;
+ config.are_scalars_on_device = true;
+ config.are_points_on_device = true;
+
+ std::cout << "Copying inputs to-device" << std::endl;
+ scalar_t* scalars_d;
+ affine_t* points_d;
+ projective_t* result_d;
+
+ ICICLE_CHECK(icicle_malloc((void**)&scalars_d, sizeof(scalar_t) * N));
+ ICICLE_CHECK(icicle_malloc((void**)&points_d, sizeof(affine_t) * N));
+ ICICLE_CHECK(icicle_malloc((void**)&result_d, sizeof(projective_t)));
+ ICICLE_CHECK(icicle_copy(scalars_d, scalars.get(), sizeof(scalar_t) * N));
+ ICICLE_CHECK(icicle_copy(points_d, points.get(), sizeof(affine_t) * N));
+
+ std::cout << "Running MSM kernel with on-device inputs" << std::endl;
+ // Execute the MSM kernel
+ START_TIMER(MSM_device_mem);
+ ICICLE_CHECK(msm(scalars_d, points_d, msm_size, config, result_d));
+ END_TIMER(MSM_device_mem, "MSM from device-memory took");
+
+ // Copy the result back to the host
+ icicle_copy(&result, result_d, sizeof(projective_t));
+ // Print the result
+ std::cout << projective_t::to_affine(result) << std::endl;
+ // Free the device memory
+ icicle_free(scalars_d);
+ icicle_free(points_d);
+ icicle_free(result_d);
+ }
+
+ std::cout << "\nPart II: use G2 points" << std::endl;
+
+ std::cout << "Generating random inputs on-host" << std::endl;
+ // use the same scalars
+  auto g2_points = std::make_unique<g2_affine_t[]>(N);
+ g2_projective_t::rand_host_many(g2_points.get(), N);
+
+ std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
+ config.are_results_on_device = false;
+ config.are_scalars_on_device = false;
+ config.are_points_on_device = false;
+ g2_projective_t g2_result;
+ START_TIMER(MSM_g2);
+ ICICLE_CHECK(bn254_g2_msm(scalars.get(), g2_points.get(), msm_size, &config, &g2_result));
+ END_TIMER(MSM_g2, "MSM G2 from host-memory took");
+ std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
+
+ // Similar to G1 MSM, can explicitly copy to device and execute the G2 MSM using device pointers
+
+ return 0;
+}
diff --git a/examples/c++/msm/example.cu b/examples/c++/msm/example.cu
deleted file mode 100644
index abdc3f5cb..000000000
--- a/examples/c++/msm/example.cu
+++ /dev/null
@@ -1,124 +0,0 @@
-#include <iostream>
-
-#include "api/bn254.h"
-using namespace bn254;
-
-int main(int argc, char* argv[])
-{
- std::cout << "Icicle example: Muli-Scalar Multiplication (MSM)" << std::endl;
- std::cout << "Example parameters" << std::endl;
- int batch_size = 1;
- std::cout << "Batch size: " << batch_size << std::endl;
- unsigned msm_size = 1048576;
- std::cout << "MSM size: " << msm_size << std::endl;
- int N = batch_size * msm_size;
-
- std::cout << "Part I: use G1 points" << std::endl;
-
- std::cout << "Generating random inputs on-host" << std::endl;
- scalar_t* scalars = new scalar_t[N];
- affine_t* points = new affine_t[N];
- projective_t result;
- scalar_t::rand_host_many(scalars, N);
- projective_t::rand_host_many_affine(points, N);
-
- std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
- device_context::DeviceContext ctx = device_context::get_default_device_context();
- msm::MSMConfig config = {
- ctx, // ctx
- 0, // points_size
- 1, // precompute_factor
- 0, // c
- 0, // bitsize
- 10, // large_bucket_factor
- 1, // batch_size
- false, // are_scalars_on_device
- false, // are_scalars_montgomery_form
- false, // are_points_on_device
- false, // are_points_montgomery_form
- false, // are_results_on_device
- false, // is_big_triangle
- false, // is_async
- };
- config.batch_size = batch_size;
-
- std::cout << "Running MSM kernel with on-host inputs" << std::endl;
- cudaStream_t stream = config.ctx.stream;
- // Execute the MSM kernel
- bn254_msm_cuda(scalars, points, msm_size, config, &result);
- std::cout << projective_t::to_affine(result) << std::endl;
-
- std::cout << "Copying inputs on-device" << std::endl;
- scalar_t* scalars_d;
- affine_t* points_d;
- projective_t* result_d;
- cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
- cudaMalloc(&points_d, sizeof(affine_t) * N);
- cudaMalloc(&result_d, sizeof(projective_t));
- cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
- cudaMemcpy(points_d, points, sizeof(affine_t) * N, cudaMemcpyHostToDevice);
-
- std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
- config.are_results_on_device = true;
- config.are_scalars_on_device = true;
- config.are_points_on_device = true;
-
- std::cout << "Running MSM kernel with on-device inputs" << std::endl;
- // Execute the MSM kernel
- bn254_msm_cuda(scalars_d, points_d, msm_size, config, result_d);
-
- // Copy the result back to the host
- cudaMemcpy(&result, result_d, sizeof(projective_t), cudaMemcpyDeviceToHost);
- // Print the result
- std::cout << projective_t::to_affine(result) << std::endl;
- // Free the device memory
- cudaFree(scalars_d);
- cudaFree(points_d);
- cudaFree(result_d);
- // Free the host memory, keep scalars for G2 example
- delete[] points;
-
- std::cout << "Part II: use G2 points" << std::endl;
-
- std::cout << "Generating random inputs on-host" << std::endl;
- // use the same scalars
- g2_affine_t* g2_points = new g2_affine_t[N];
- g2_projective_t::rand_host_many_affine(g2_points, N);
-
- std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
- config.are_results_on_device = false;
- config.are_scalars_on_device = false;
- config.are_points_on_device = false;
- g2_projective_t g2_result;
- bn254_g2_msm_cuda(scalars, g2_points, msm_size, config, &g2_result);
- std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
-
- std::cout << "Copying inputs on-device" << std::endl;
- g2_affine_t* g2_points_d;
- g2_projective_t* g2_result_d;
- cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
- cudaMalloc(&g2_points_d, sizeof(g2_affine_t) * N);
- cudaMalloc(&g2_result_d, sizeof(g2_projective_t));
- cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
- cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_t) * N, cudaMemcpyHostToDevice);
-
- std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
- config.are_results_on_device = true;
- config.are_scalars_on_device = true;
- config.are_points_on_device = true;
-
- std::cout << "Running MSM kernel with on-device inputs" << std::endl;
- bn254_g2_msm_cuda(scalars_d, g2_points_d, msm_size, config, g2_result_d);
- cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_t), cudaMemcpyDeviceToHost);
- std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
-
- cudaFree(scalars_d);
- cudaFree(g2_points_d);
- cudaFree(g2_result_d);
- delete[] g2_points;
- delete[] scalars;
- cudaStreamDestroy(stream);
- return 0;
-}
diff --git a/examples/c++/msm/run.sh b/examples/c++/msm/run.sh
index 01eca66ba..9a5bdf29d 100755
--- a/examples/c++/msm/run.sh
+++ b/examples/c++/msm/run.sh
@@ -1,2 +1,66 @@
#!/bin/bash
-./build/example/example
+
+# Exit immediately if a command exits with a non-zero status
+set -e
+
+# Function to display usage information
+show_help() {
+ echo "Usage: $0 [-d DEVICE_TYPE] [-b ICICLE_BACKEND_INSTALL_DIR]"
+ echo
+ echo "Options:"
+ echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
+ echo " -b ICICLE_BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
+ echo " -h Show this help message"
+ exit 0
+}
+
+# Parse command line options
+while getopts ":d:b:h" opt; do
+ case ${opt} in
+ d )
+ DEVICE_TYPE=$OPTARG
+ ;;
+ b )
+ ICICLE_BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
+ ;;
+ h )
+ show_help
+ ;;
+ \? )
+ echo "Invalid option: -$OPTARG" 1>&2
+ show_help
+ ;;
+ : )
+ echo "Invalid option: -$OPTARG requires an argument" 1>&2
+ show_help
+ ;;
+ esac
+done
+
+# Set default values if not provided
+: "${DEVICE_TYPE:=CPU}"
+: "${ICICLE_BACKEND_INSTALL_DIR:=}"
+
+# Create necessary directories
+mkdir -p build/example
+mkdir -p build/icicle
+
+ICICLE_DIR=$(realpath "../../../icicle/")
+ICICLE_CUDA_SOURCE_DIR="${ICICLE_DIR}/backend/cuda"
+
+# Build Icicle and the example app that links to it
+if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${ICICLE_BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_SOURCE_DIR}" ]; then
+ echo "Building icicle with CUDA backend"
+  cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DECNTT=OFF -DCUDA_BACKEND=local -S "${ICICLE_DIR}" -B build/icicle
+ export ICICLE_BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend")
+else
+ echo "Building icicle without CUDA backend, ICICLE_BACKEND_INSTALL_DIR=${ICICLE_BACKEND_INSTALL_DIR}"
+ export ICICLE_BACKEND_INSTALL_DIR="${ICICLE_BACKEND_INSTALL_DIR}"
+  cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -S "${ICICLE_DIR}" -B build/icicle
+fi
+cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/example
+
+cmake --build build/icicle -j
+cmake --build build/example -j
+
+./build/example/example "$DEVICE_TYPE"
diff --git a/examples/c++/multi-gpu-poseidon/CMakeLists.txt b/examples/c++/multi-gpu-poseidon/CMakeLists.txt
deleted file mode 100644
index 24746ce79..000000000
--- a/examples/c++/multi-gpu-poseidon/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-cmake_minimum_required(VERSION 3.18)
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
-set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
-if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
- set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-else()
- set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
-endif ()
-project(icicle LANGUAGES CUDA CXX)
-
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
-set(CMAKE_CUDA_FLAGS_RELEASE "")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
-# change the path to your Icicle location
-add_executable(
- example
- example.cu
-)
-target_include_directories(example PRIVATE "../../../icicle/include")
-target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
-target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
-find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
-target_link_libraries(example ${NVML_LIBRARY})
-set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
diff --git a/examples/c++/multi-gpu-poseidon/README.md b/examples/c++/multi-gpu-poseidon/README.md
deleted file mode 100644
index 5fb110fe6..000000000
--- a/examples/c++/multi-gpu-poseidon/README.md
+++ /dev/null
@@ -1,52 +0,0 @@
-# Icicle example: using multiple GPU to hash large dataset
-
-## Best-Practices
-
-This example builds on [single GPU Poseidon example](../poseidon/README.md) so we recommend to run it first.
-
-## Key-Takeaway
-
-Use `device_context::DeviceContext` variable to select GPU to use.
-Use C++ threads to compute `Icicle` primitives on different GPUs in parallel.
-
-## Concise Usage Explanation
-
-1. Include c++ threads
-
-```c++
-#include <thread>
-```
-
-2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id.
-
-```c++
-void threadPoseidon(device_context::DeviceContext ctx, ...) {...}
-```
-
-3. Initialize device contexts for different GPUs
-
-```c++
-device_context::DeviceContext ctx0 = device_context::get_default_device_context();
-ctx0.device_id=0;
-device_context::DeviceContext ctx1 = device_context::get_default_device_context();
-ctx1.device_id=1;
-```
-
-4. Finally, spawn the threads and wait for their completion
-
-```c++
-std::thread thread0(threadPoseidon, ctx0, ...);
-std::thread thread1(threadPoseidon, ctx1, ...);
-thread0.join();
-thread1.join();
-```
-
-## What's in the example
-
-This is a **toy** example executing the first step of Filecoin's Pre-Commit 2 phase: compute $2^{30}$ Poseidon hashes, one for each column of an $11 \times 2^{30}$ matrix.
-
-1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions`
-2. Hash two partitions in parallel on two GPUs
-3. Hash two partitions in series on one GPU
-4. Compare execution times
-
diff --git a/examples/c++/multi-gpu-poseidon/compile.sh b/examples/c++/multi-gpu-poseidon/compile.sh
deleted file mode 100755
index ab4e191d9..000000000
--- a/examples/c++/multi-gpu-poseidon/compile.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-# Exit immediately on error
-set -e
-
-mkdir -p build/example
-mkdir -p build/icicle
-
-# Configure and build Icicle
-cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
-cmake --build build/icicle
-
-# Configure and build the example application
-cmake -S . -B build/example
-cmake --build build/example
\ No newline at end of file
diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
deleted file mode 100644
index fcc3d5275..000000000
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ /dev/null
@@ -1,152 +0,0 @@
-#include <chrono>
-#include <iostream>
-#include <nvml.h>
-#include <thread>
-
-#include "api/bn254.h"
-#include "gpu-utils/error_handler.cuh"
-
-#include "poseidon/poseidon.cuh"
-#include "hash/hash.cuh"
-
-using namespace poseidon;
-using namespace bn254;
-
-void checkCudaError(cudaError_t error)
-{
- if (error != cudaSuccess) {
- std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
- // Handle the error, e.g., exit the program or throw an exception.
- }
-}
-
-// these global constants go into template calls
-const int size_col = 11;
-
-void threadPoseidon(
- device_context::DeviceContext ctx,
- unsigned size_partition,
- scalar_t* layers,
- scalar_t* column_hashes,
-  Poseidon<scalar_t>* poseidon)
-{
- cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
- if (err_result != cudaSuccess) {
- std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
- return;
- }
- HashConfig column_config = default_hash_config(ctx);
- cudaError_t err = poseidon->hash_many(layers, column_hashes, (size_t) size_partition, size_col, 1, column_config);
- checkCudaError(err);
-}
-
-using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
-#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) \
- printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
-
-#define CHECK_ALLOC(ptr) \
- if ((ptr) == nullptr) { \
- std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
- exit(EXIT_FAILURE); \
- }
-
-int main()
-{
- const unsigned size_row = (1 << 30);
- const unsigned nof_partitions = 64;
- const unsigned size_partition = size_row / nof_partitions;
- // layers is allocated only for one partition, need to reuse for different partitions
- const uint32_t size_layers = size_col * size_partition;
-
- nvmlInit();
- unsigned int deviceCount;
- nvmlDeviceGetCount(&deviceCount);
- std::cout << "Available GPUs: " << deviceCount << std::endl;
-
- for (unsigned int i = 0; i < deviceCount; ++i) {
- nvmlDevice_t device;
- nvmlMemory_t memory;
- char name[NVML_DEVICE_NAME_BUFFER_SIZE];
- nvmlDeviceGetHandleByIndex(i, &device);
- nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
- nvmlDeviceGetMemoryInfo(device, &memory);
- std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total / 1024 / 1024
- << "/" << memory.free / 1024 / 1024 << std::endl;
- }
-
- const unsigned memory_partition = sizeof(scalar_t) * (size_col + 1) * size_partition / 1024 / 1024;
- std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
-
- //===============================================================================
- // Key: multiple devices are supported by device context
- //===============================================================================
-
- device_context::DeviceContext ctx0 = device_context::get_default_device_context();
- ctx0.device_id = 0;
- device_context::DeviceContext ctx1 = device_context::get_default_device_context();
- ctx1.device_id = 1;
-
- std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
-  scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
- CHECK_ALLOC(layers0);
- scalar_t s = scalar_t::zero();
- for (unsigned i = 0; i < size_col * size_partition; i++) {
- layers0[i] = s;
- s = s + scalar_t::one();
- }
-  scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
- CHECK_ALLOC(layers1);
- s = scalar_t::zero() + scalar_t::one();
- for (unsigned i = 0; i < size_col * size_partition; i++) {
- layers1[i] = s;
- s = s + scalar_t::one();
- }
-
-  scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
- CHECK_ALLOC(column_hash0);
-  scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
- CHECK_ALLOC(column_hash1);
-
-  Poseidon<scalar_t> column_poseidon0(size_col, ctx0);
- cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
- if (err_result != cudaSuccess) {
- std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
-    return -1;
- }
-  Poseidon<scalar_t> column_poseidon1(size_col, ctx1);
-
- std::cout << "Parallel execution of Poseidon threads" << std::endl;
- START_TIMER(parallel);
- std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
- std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_poseidon1);
-
- // Wait for the threads to finish
- thread0.join();
- thread1.join();
- END_TIMER(parallel, "2 GPUs");
- std::cout << "Output Data from Thread 0: ";
- std::cout << column_hash0[0] << std::endl;
- std::cout << "Output Data from Thread 1: ";
- std::cout << column_hash1[0] << std::endl;
-
- std::cout << "Sequential execution of Poseidon threads" << std::endl;
- START_TIMER(sequential);
- std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
- thread2.join();
- std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_poseidon0);
- thread3.join();
- END_TIMER(sequential, "1 GPU");
- std::cout << "Output Data from Thread 2: ";
- std::cout << column_hash0[0] << std::endl;
- std::cout << "Output Data from Thread 3: ";
- std::cout << column_hash1[0] << std::endl;
-
- nvmlShutdown();
- return 0;
-}
diff --git a/examples/c++/multi-gpu-poseidon/run.sh b/examples/c++/multi-gpu-poseidon/run.sh
deleted file mode 100755
index 01eca66ba..000000000
--- a/examples/c++/multi-gpu-poseidon/run.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-./build/example/example
diff --git a/examples/c++/multiply/.devcontainer/Dockerfile b/examples/c++/multiply/.devcontainer/Dockerfile
deleted file mode 100644
index 770ad6fe4..000000000
--- a/examples/c++/multiply/.devcontainer/Dockerfile
+++ /dev/null
@@ -1,23 +0,0 @@
-# Make sure NVIDIA Container Toolkit is installed on your host
-
-# Use NVIDIA base image
-FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
-
-# Update and install dependencies
-RUN apt-get update && apt-get install -y \
- nsight-systems-12.2 \
- cmake \
- protobuf-compiler \
- curl \
- build-essential \
- git \
- && rm -rf /var/lib/apt/lists/*
-
-# Clone Icicle from a GitHub repository
-RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle
-
-# Set the working directory in the container
-WORKDIR /icicle-example
-
-# Specify the default command for the container
-CMD ["/bin/bash"]
diff --git a/examples/c++/multiply/.devcontainer/devcontainer.json b/examples/c++/multiply/.devcontainer/devcontainer.json
deleted file mode 100644
index 2dd93aa30..000000000
--- a/examples/c++/multiply/.devcontainer/devcontainer.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
- "name": "Icicle Examples - Multiply",
- "build": {
- "dockerfile": "Dockerfile"
- },
- "workspaceMount": "source=${localWorkspaceFolder}/.,target=/icicle-example,type=bind",
- "workspaceFolder": "/icicle-example",
- "runArgs": [
- "--gpus",
- "all"
- ],
- "postCreateCommand": [
- "nvidia-smi"
- ],
- "customizations": {
- "vscode": {
- "extensions": [
- "ms-vscode.cmake-tools",
- "ms-azuretools.vscode-docker",
- "ms-vscode.cpptools-extension-pack"
- ]
- }
- }
-}
diff --git a/examples/c++/multiply/CMakeLists.txt b/examples/c++/multiply/CMakeLists.txt
deleted file mode 100644
index f7048bb8a..000000000
--- a/examples/c++/multiply/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-cmake_minimum_required(VERSION 3.18)
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
-set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
-if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
- set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-else()
- set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
-endif ()
-project(example LANGUAGES CUDA CXX)
-
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
-set(CMAKE_CUDA_FLAGS_RELEASE "")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
-add_executable(
- example
- example.cu
-)
-target_include_directories(example PRIVATE "../../../icicle/include")
-target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
-find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
-target_link_libraries(example ${NVML_LIBRARY})
-set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
diff --git a/examples/c++/multiply/README.md b/examples/c++/multiply/README.md
deleted file mode 100644
index 0e253d3af..000000000
--- a/examples/c++/multiply/README.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# Icicle example: Multiplication
-
-## Best-Practices
-
-We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
-
-## Key-Takeaway
-
-`Icicle` accelerates multiplication operation `*` using [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm)
-
-## Concise Usage Explanation
-
-Define a `CURVE_ID` and include curve configuration header:
-
-```c++
-#define CURVE_ID 1
-#include "curves/curve_config.cuh"
-```
-
-The values of `CURVE_ID` for different curves are in the above header. Multiplication is accelerated both for field scalars and point fields.
-
-```c++
-using namespace curve_config;
-scalar_t a;
-point_field_t b;
-```
-
-## Running the example
-
-- `cd` to your example directory
-- compile with `./compile.sh`
-- run with `./run.sh`
-
-## What's in the example
-
-1. Define the parameters for the example such as vector size
-2. Generate random vectors on-host
-3. Copy them on-device
-4. Execute element-wise vector multiplication on-device
-5. Copy results on-host
-
diff --git a/examples/c++/multiply/compile.sh b/examples/c++/multiply/compile.sh
deleted file mode 100755
index de35c62da..000000000
--- a/examples/c++/multiply/compile.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-# Exit immediately on error
-set -e
-
-mkdir -p build/example
-mkdir -p build/icicle
-
-# Configure and build Icicle
-cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
-cmake --build build/icicle
-
-# Configure and build the example application
-cmake -S . -B build/example
-cmake --build build/example
\ No newline at end of file
diff --git a/examples/c++/multiply/example.cu b/examples/c++/multiply/example.cu
deleted file mode 100644
index 77eb7a0b4..000000000
--- a/examples/c++/multiply/example.cu
+++ /dev/null
@@ -1,162 +0,0 @@
-#include <chrono>
-#include <iomanip>
-#include <iostream>
-#include <nvml.h>
-
-#include "api/bn254.h"
-#include "vec_ops/vec_ops.cuh"
-
-using namespace vec_ops;
-using namespace bn254;
-
-typedef scalar_t T;
-
-int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elements, device_context::DeviceContext ctx)
-{
- vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
- config.is_a_on_device = true;
- config.is_b_on_device = true;
- config.is_result_on_device = true;
-  cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
- if (err != cudaSuccess) {
- std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
- return 0;
- }
- return 0;
-}
-
-int main(int argc, char** argv)
-{
- const unsigned vector_size = 1 << 15;
- const unsigned repetitions = 1 << 15;
-
- cudaError_t err;
- nvmlInit();
- nvmlDevice_t device;
- nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
- std::cout << "Icicle-Examples: vector multiplications" << std::endl;
- char name[NVML_DEVICE_NAME_BUFFER_SIZE];
- if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
- std::cout << "GPU Model: " << name << std::endl;
- } else {
- std::cerr << "Failed to get GPU model name." << std::endl;
- }
- unsigned power_limit;
- nvmlDeviceGetPowerManagementLimit(device, &power_limit);
-
- std::cout << "Vector size: " << vector_size << std::endl;
- std::cout << "Repetitions: " << repetitions << std::endl;
- std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
-
- unsigned int baseline_power;
- nvmlDeviceGetPowerUsage(device, &baseline_power);
- std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
- unsigned baseline_temperature;
- if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &baseline_temperature) == NVML_SUCCESS) {
- std::cout << "Baseline GPU Temperature: " << baseline_temperature << " C" << std::endl;
- } else {
- std::cerr << "Failed to get GPU temperature." << std::endl;
- }
-
- // host data
- T* host_in1 = (T*)malloc(vector_size * sizeof(T));
- T* host_in2 = (T*)malloc(vector_size * sizeof(T));
- std::cout << "Initializing vectors with random data" << std::endl;
- T::rand_host_many(host_in1, vector_size);
- T::rand_host_many(host_in2, vector_size);
- // device data
- device_context::DeviceContext ctx = device_context::get_default_device_context();
- T* device_in1;
- T* device_in2;
- T* device_out;
-
- err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
- if (err != cudaSuccess) {
- std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
- return 0;
- }
-
- err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
- if (err != cudaSuccess) {
- std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
- return 0;
- }
-
- err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
- if (err != cudaSuccess) {
- std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
- return 0;
- }
-
- // copy from host to device
- err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
- if (err != cudaSuccess) {
- std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
- return 0;
- }
-
- err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
- if (err != cudaSuccess) {
- std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
- return 0;
- }
-
- std::cout << "Starting warm-up" << std::endl;
- // Warm-up loop
- for (int i = 0; i < repetitions; i++) {
- vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
- }
-
- std::cout << "Starting benchmarking" << std::endl;
- unsigned power_before;
- nvmlDeviceGetPowerUsage(device, &power_before);
- std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
- std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit
- << " %" << std::endl;
- unsigned temperature_before;
- if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
- std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
- } else {
- std::cerr << "Failed to get GPU temperature." << std::endl;
- }
- auto start_time = std::chrono::high_resolution_clock::now();
- // Benchmark loop
- for (int i = 0; i < repetitions; i++) {
- vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
- }
- auto end_time = std::chrono::high_resolution_clock::now();
-  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
- std::cout << "Elapsed time: " << duration.count() << " microseconds" << std::endl;
- unsigned power_after;
- nvmlDeviceGetPowerUsage(device, &power_after);
- std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
- std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
- << " %" << std::endl;
- unsigned temperature_after;
- if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
- std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
- } else {
- std::cerr << "Failed to get GPU temperature." << std::endl;
- }
-
- // Report performance in GMPS: Giga Multiplications Per Second
- double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
- std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
-
- // Optional: validate multiplication
- T* host_out = (T*)malloc(vector_size * sizeof(T));
-
- cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
-
- // validate multiplication here...
-
- // clean up and exit
- free(host_in1);
- free(host_in2);
- free(host_out);
- cudaFree(device_in1);
- cudaFree(device_in2);
- cudaFree(device_out);
- nvmlShutdown();
- return 0;
-}
\ No newline at end of file
diff --git a/examples/c++/multiply/run.sh b/examples/c++/multiply/run.sh
deleted file mode 100755
index 01eca66ba..000000000
--- a/examples/c++/multiply/run.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-./build/example/example
diff --git a/examples/c++/ntt/CMakeLists.txt b/examples/c++/ntt/CMakeLists.txt
index de0a1a22d..d523b645c 100644
--- a/examples/c++/ntt/CMakeLists.txt
+++ b/examples/c++/ntt/CMakeLists.txt
@@ -1,23 +1,16 @@
cmake_minimum_required(VERSION 3.18)
+
set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
-if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
- set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-else()
- set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
-endif ()
-project(example LANGUAGES CUDA CXX)
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
-set(CMAKE_CUDA_FLAGS_RELEASE "")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+project(example)
+
+add_executable(example example.cpp)
+target_include_directories(example PRIVATE "../../../icicle/include" "..")
+target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle")
+message("${CMAKE_BINARY_DIR}/icicle")
+target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device)
+if(BACKEND_DIR)
+ add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}")
+endif()
-add_executable(
- example
- example.cu
-)
-target_include_directories(example PRIVATE "../../../icicle/include")
-target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
-set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
\ No newline at end of file
diff --git a/examples/c++/ntt/README.md b/examples/c++/ntt/README.md
index 28e8dd451..f506eba99 100644
--- a/examples/c++/ntt/README.md
+++ b/examples/c++/ntt/README.md
@@ -1,33 +1,35 @@
# Icicle example: Number-Theoretical Transform (NTT)
-## Best-Practices
-
-We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
-
## Key-Takeaway
`Icicle` provides a CUDA C++ template function, NTT, for the [Number Theoretical Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md), also known as the Discrete Fourier Transform.
## Concise Usage Explanation
+1. Include the curve API
+2. Initialize the NTT domain
+3. Call the NTT API
+
```c++
-// Select the curve
-#define CURVE_ID 1
-// Include NTT template
-#include "appUtils/ntt/ntt.cu"
-using namespace curve_config;
-using namespace ntt;
-// Configure NTT
-NTTConfig<scalar_t> config=DefaultNTTConfig<scalar_t>();
-// Call NTT
-NTT(input, ntt_size, NTTDir::kForward, config, output);
+#include "icicle/api/bn254.h"
+...
+auto ntt_init_domain_cfg = default_ntt_init_domain_config();
+...
+bn254_ntt_init_domain(&basic_root, ntt_init_domain_cfg);
+NTTConfig<scalar_t> config = default_ntt_config<scalar_t>();
+...
+bn254_ntt(input.get(), ntt_size, NTTDir::kForward, config, output.get());
```
+
## Running the example
-- `cd` to your example directory
-- compile with `./compile.sh`
-- run with `./run.sh`
+```sh
+# for CPU
+./run.sh -d CPU
+# for CUDA
+./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
+```
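+
+`run.sh` replaces the deleted `compile.sh`; a rough sketch of the equivalent
+manual steps (assuming the same two-stage build layout the old script used) is:
+
+```sh
+# build the ICICLE bn254 libraries the example links against
+cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
+cmake --build build/icicle
+# then build and run the example itself
+cmake -S . -B build/example
+cmake --build build/example
+./build/example/example -d CPU
+```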
## What's in the example
diff --git a/examples/c++/ntt/compile.sh b/examples/c++/ntt/compile.sh
deleted file mode 100755
index de35c62da..000000000
--- a/examples/c++/ntt/compile.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-# Exit immediately on error
-set -e
-
-mkdir -p build/example
-mkdir -p build/icicle
-
-# Configure and build Icicle
-cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
-cmake --build build/icicle
-
-# Configure and build the example application
-cmake -S . -B build/example
-cmake --build build/example
\ No newline at end of file
diff --git a/examples/c++/ntt/example.cpp b/examples/c++/ntt/example.cpp
new file mode 100644
index 000000000..67aeb7963
--- /dev/null
+++ b/examples/c++/ntt/example.cpp
@@ -0,0 +1,106 @@
+#include <iostream>
+
+#include "icicle/runtime.h"
+
+#include "icicle/api/bn254.h"
+using namespace bn254;
+
+#include "examples_utils.h"
+#include "icicle/backend/ntt_config.h"
+
+void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, scalar_t* elements);
+int validate_output(const unsigned ntt_size, const unsigned nof_ntts, scalar_t* elements);
+
+int main(int argc, char* argv[])
+{
+ try_load_and_set_backend_device(argc, argv);
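+ // parses the -d <device> / -b <backend dir> flags (see README) and selects
+ // the corresponding backend device for the rest of the run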
+
+ std::cout << "\nIcicle Examples: Number Theoretical Transform (NTT)" << std::endl;
+ const unsigned log_ntt_size = 20;
+ const unsigned ntt_size = 1 << log_ntt_size;
+ const unsigned batch_size = 2;
+
+ std::cout << "Example parameters:" << std::endl;
+ std::cout << "NTT size: " << ntt_size << std::endl;
+ std::cout << "batch size: " << batch_size << std::endl;
+
+ std::cout << "\nGenerating input data for lowest and highest harmonics" << std::endl;
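+ // inputs for batch_size NTTs of ntt_size scalars each, stored contiguously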
+ auto input = std::make_unique<scalar_t[]>(batch_size * ntt_size);
+ auto output = std::make_unique<scalar_t[]>(batch_size * ntt_size);
+ initialize_input(ntt_size, batch_size, input.get());
+
+ // Initialize NTT domain
+ std::cout << "\nInit NTT domain" << std::endl;
+ scalar_t basic_root = scalar_t::omega(log_ntt_size /*NTT_LOG_SIZE*/);
+ auto ntt_init_domain_cfg = default_ntt_init_domain_config();
+ ConfigExtension backend_cfg_ext;
+ backend_cfg_ext.set(
+ CudaBackendConfig::CUDA_NTT_FAST_TWIDDLES_MODE, true); // optionally construct fast_twiddles for CUDA backend
+ ntt_init_domain_cfg.ext = &backend_cfg_ext;
+ ICICLE_CHECK(bn254_ntt_init_domain(&basic_root, &ntt_init_domain_cfg));
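+ // the twiddle-factor domain is built once from basic_root and reused by the
+ // ntt calls that follow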
+
+ // ntt configuration
+ NTTConfig