diff --git a/.azure-pipelines/azure-pipelines-linux.yml b/.azure-pipelines/azure-pipelines-linux.yml
index d9dc024..a44f210 100755
--- a/.azure-pipelines/azure-pipelines-linux.yml
+++ b/.azure-pipelines/azure-pipelines-linux.yml
@@ -32,6 +32,27 @@ jobs:
   variables: {}
 
   steps:
+  - script: |
+         sudo mkdir -p /opt/empty_dir || true
+         for d in \
+                  /opt/ghc \
+                  /opt/hostedtoolcache \
+                  /usr/lib/jvm \
+                  /usr/local/.ghcup \
+                  /usr/local/lib/android \
+                  /usr/local/share/powershell \
+                  /usr/share/dotnet \
+                  /usr/share/swift \
+                  ; do
+           sudo rsync --stats -a --delete /opt/empty_dir/ $d || true
+         done
+         sudo apt-get purge -y -f firefox \
+                                  google-chrome-stable \
+                                  microsoft-edge-stable
+         sudo apt-get autoremove -y >& /dev/null
+         sudo apt-get autoclean -y >& /dev/null
+         df -h
+    displayName: Manage disk space
   # configure qemu binfmt-misc running.  This allows us to run docker containers
   # embedded qemu-static
   - script: |
diff --git a/.azure-pipelines/azure-pipelines-osx.yml b/.azure-pipelines/azure-pipelines-osx.yml
new file mode 100755
index 0000000..c6d4e7d
--- /dev/null
+++ b/.azure-pipelines/azure-pipelines-osx.yml
@@ -0,0 +1,49 @@
+# This file was generated automatically from conda-smithy. To update this configuration,
+# update the conda-forge.yml and/or the recipe/meta.yaml.
+# -*- mode: yaml -*-
+
+jobs:
+- job: osx
+  pool:
+    vmImage: macOS-13
+  strategy:
+    matrix:
+      osx_64_python3.10.____cpython:
+        CONFIG: osx_64_python3.10.____cpython
+        UPLOAD_PACKAGES: 'True'
+      osx_64_python3.11.____cpython:
+        CONFIG: osx_64_python3.11.____cpython
+        UPLOAD_PACKAGES: 'True'
+      osx_64_python3.12.____cpython:
+        CONFIG: osx_64_python3.12.____cpython
+        UPLOAD_PACKAGES: 'True'
+      osx_64_python3.13.____cp313:
+        CONFIG: osx_64_python3.13.____cp313
+        UPLOAD_PACKAGES: 'True'
+      osx_64_python3.9.____cpython:
+        CONFIG: osx_64_python3.9.____cpython
+        UPLOAD_PACKAGES: 'True'
+  timeoutInMinutes: 360
+  variables: {}
+
+  steps:
+  # TODO: Fast finish on azure pipelines?
+  - script: |
+      export CI=azure
+      export flow_run_id=azure_$(Build.BuildNumber).$(System.JobAttempt)
+      export remote_url=$(Build.Repository.Uri)
+      export sha=$(Build.SourceVersion)
+      export OSX_FORCE_SDK_DOWNLOAD="1"
+      export GIT_BRANCH=$BUILD_SOURCEBRANCHNAME
+      export FEEDSTOCK_NAME=$(basename ${BUILD_REPOSITORY_NAME})
+      if [[ "${BUILD_REASON:-}" == "PullRequest" ]]; then
+        export IS_PR_BUILD="True"
+      else
+        export IS_PR_BUILD="False"
+      fi
+      ./.scripts/run_osx_build.sh
+    displayName: Run OSX build
+    env:
+      BINSTAR_TOKEN: $(BINSTAR_TOKEN)
+      FEEDSTOCK_TOKEN: $(FEEDSTOCK_TOKEN)
+      STAGING_BINSTAR_TOKEN: $(STAGING_BINSTAR_TOKEN)
\ No newline at end of file
diff --git a/.azure-pipelines/azure-pipelines-win.yml b/.azure-pipelines/azure-pipelines-win.yml
new file mode 100755
index 0000000..1985588
--- /dev/null
+++ b/.azure-pipelines/azure-pipelines-win.yml
@@ -0,0 +1,50 @@
+# This file was generated automatically from conda-smithy. To update this configuration,
+# update the conda-forge.yml and/or the recipe/meta.yaml.
+# -*- mode: yaml -*-
+
+jobs:
+- job: win
+  pool:
+    vmImage: windows-2022
+  strategy:
+    matrix:
+      win_64_python3.10.____cpython:
+        CONFIG: win_64_python3.10.____cpython
+        UPLOAD_PACKAGES: 'True'
+      win_64_python3.11.____cpython:
+        CONFIG: win_64_python3.11.____cpython
+        UPLOAD_PACKAGES: 'True'
+      win_64_python3.12.____cpython:
+        CONFIG: win_64_python3.12.____cpython
+        UPLOAD_PACKAGES: 'True'
+      win_64_python3.13.____cp313:
+        CONFIG: win_64_python3.13.____cp313
+        UPLOAD_PACKAGES: 'True'
+      win_64_python3.9.____cpython:
+        CONFIG: win_64_python3.9.____cpython
+        UPLOAD_PACKAGES: 'True'
+  timeoutInMinutes: 360
+  variables:
+    CONDA_BLD_PATH: D:\\bld\\
+    MINIFORGE_HOME: D:\Miniforge
+    UPLOAD_TEMP: D:\\tmp
+
+  steps:
+
+    - script: |
+        call ".scripts\run_win_build.bat"
+      displayName: Run Windows build
+      env:
+        MINIFORGE_HOME: $(MINIFORGE_HOME)
+        CONDA_BLD_PATH: $(CONDA_BLD_PATH)
+        PYTHONUNBUFFERED: 1
+        CONFIG: $(CONFIG)
+        CI: azure
+        flow_run_id: azure_$(Build.BuildNumber).$(System.JobAttempt)
+        remote_url: $(Build.Repository.Uri)
+        sha: $(Build.SourceVersion)
+        UPLOAD_PACKAGES: $(UPLOAD_PACKAGES)
+        UPLOAD_TEMP: $(UPLOAD_TEMP)
+        BINSTAR_TOKEN: $(BINSTAR_TOKEN)
+        FEEDSTOCK_TOKEN: $(FEEDSTOCK_TOKEN)
+        STAGING_BINSTAR_TOKEN: $(STAGING_BINSTAR_TOKEN)
\ No newline at end of file
diff --git a/.scripts/build_steps.sh b/.scripts/build_steps.sh
index f8051ab..2b517d9 100755
--- a/.scripts/build_steps.sh
+++ b/.scripts/build_steps.sh
@@ -35,7 +35,7 @@ mv /opt/conda/conda-meta/history /opt/conda/conda-meta/history.$(date +%Y-%m-%d-
 echo > /opt/conda/conda-meta/history
 micromamba install --root-prefix ~/.conda --prefix /opt/conda \
     --yes --override-channels --channel conda-forge --strict-channel-priority \
-    pip  python=3.12 conda-build conda-forge-ci-setup=4 "conda-build>=24.1"
+    pip  rattler-build conda-forge-ci-setup=4 "conda-build>=24.1"
 export CONDA_LIBMAMBA_SOLVER_NO_CHANNELS_FROM_INSTALLED=1
 
 # set up the condarc
@@ -57,20 +57,16 @@ if [[ -f "${FEEDSTOCK_ROOT}/LICENSE.txt" ]]; then
 fi
 
 if [[ "${BUILD_WITH_CONDA_DEBUG:-0}" == 1 ]]; then
-    if [[ "x${BUILD_OUTPUT_ID:-}" != "x" ]]; then
-        EXTRA_CB_OPTIONS="${EXTRA_CB_OPTIONS:-} --output-id ${BUILD_OUTPUT_ID}"
-    fi
-    conda debug "${RECIPE_ROOT}" -m "${CI_SUPPORT}/${CONFIG}.yaml" \
-        ${EXTRA_CB_OPTIONS:-} \
-        --clobber-file "${CI_SUPPORT}/clobber_${CONFIG}.yaml"
-
-    # Drop into an interactive shell
-    /bin/bash
+    echo "rattler-build currently doesn't support debug mode"
 else
-    conda-build "${RECIPE_ROOT}" -m "${CI_SUPPORT}/${CONFIG}.yaml" \
-        --suppress-variables ${EXTRA_CB_OPTIONS:-} \
-        --clobber-file "${CI_SUPPORT}/clobber_${CONFIG}.yaml" \
-        --extra-meta flow_run_id="${flow_run_id:-}" remote_url="${remote_url:-}" sha="${sha:-}"
+
+    rattler-build build --recipe "${RECIPE_ROOT}" \
+     -m "${CI_SUPPORT}/${CONFIG}.yaml" \
+     ${EXTRA_CB_OPTIONS:-} \
+     --target-platform "${HOST_PLATFORM}" \
+     --extra-meta flow_run_id="${flow_run_id:-}" \
+     --extra-meta remote_url="${remote_url:-}" \
+     --extra-meta sha="${sha:-}"
     ( startgroup "Inspecting artifacts" ) 2> /dev/null
 
     # inspect_artifacts was only added in conda-forge-ci-setup 4.9.4
diff --git a/README.md b/README.md
index 4cf8459..f01ce9a 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,6 @@ Documentation: https://triton-lang.org/
 This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives.
 The aim of Triton is to provide an open-source environment to write fast code at higher productivity than CUDA, but also with higher flexibility than other existing DSLs.
 
-
 Current build status
 ====================
 
diff --git a/conda-forge.yml b/conda-forge.yml
index 993de48..6cc1712 100644
--- a/conda-forge.yml
+++ b/conda-forge.yml
@@ -1,6 +1,9 @@
+azure:
+  free_disk_space: true
 github:
   branch_name: main
   tooling_branch_name: main
 conda_forge_output_validation: true
 conda_build:
   pkg_format: '2'
+conda_build_tool: rattler-build
diff --git a/recipe/build.bat b/recipe/build.bat
new file mode 100644
index 0000000..7da3d99
--- /dev/null
+++ b/recipe/build.bat
@@ -0,0 +1,31 @@
+rem Windows build script for the triton package: points the build at the
+rem dependencies installed in the conda PREFIX and installs the Python package with pip.
+
+rem remove outdated vendored headers
+rem (rmdir on a quoted backslash path: del does not remove directories, and cmd
+rem parses a forward slash in a del argument as the start of a switch)
+if exist "%SRC_DIR%\python\triton\third_party" rmdir /s /q "%SRC_DIR%\python\triton\third_party"
+
+set JSON_SYSPATH=%PREFIX%
+set LLVM_SYSPATH=%PREFIX%
+set PYBIND11_SYSPATH=%SP_DIR%/pybind11
+
+rem these don't seem to be actually used, but they prevent downloads
+set TRITON_PTXAS_PATH=%PREFIX%/bin/ptxas
+set TRITON_CUOBJDUMP_PATH=%PREFIX%/bin/cuobjdump
+set TRITON_NVDISASM_PATH=%PREFIX%/bin/nvdisasm
+set TRITON_CUDACRT_PATH=%PREFIX%
+set TRITON_CUDART_PATH=%PREFIX%
+set TRITON_CUPTI_PATH=%PREFIX%
+
+set MAX_JOBS=%CPU_COUNT%
+
+rem the build does not run C++ unittests, and they implicitly fetch gtest
+rem no easy way of passing this, not really worth a whole patch
+sed -i -e '/TRITON_BUILD_UT/s:\bON:OFF:' CMakeLists.txt
+if errorlevel 1 exit 1
+
+cd python
+if errorlevel 1 exit 1
+%PYTHON% -m pip install . -vv
+if errorlevel 1 exit 1
diff --git a/recipe/build.sh b/recipe/build.sh
index cd4e083..e5c4519 100644
--- a/recipe/build.sh
+++ b/recipe/build.sh
@@ -19,5 +19,9 @@ export TRITON_CUPTI_PATH=$PREFIX
 
 export MAX_JOBS=$CPU_COUNT
 
+# The package build never runs the C++ unit tests, and enabling them makes the build implicitly fetch gtest.
+# There is no easy way to pass the option through, so flip TRITON_BUILD_UT to OFF here rather than carry a whole patch.
+sed -i -e '/TRITON_BUILD_UT/s:\bON:OFF:' CMakeLists.txt
+
 cd python
 $PYTHON -m pip install . -vv
diff --git a/recipe/conda_build_config.yaml b/recipe/conda_build_config.yaml
index db95812..66f77d3 100644
--- a/recipe/conda_build_config.yaml
+++ b/recipe/conda_build_config.yaml
@@ -21,7 +21,11 @@ fortran_compiler_version:                         # [linux64]
   - 13                                            # [linux64]
 cdt_name:                                         # [linux64]
   - cos7                                          # [linux64]
-cuda_compiler:                                    # [linux64]
-  - cuda-nvcc                                     # [linux64]
-cuda_compiler_version:                            # [linux64]
-  - 12.6                                          # [linux64]
+cuda_compiler:                                    # [linux64 or win]
+  - cuda-nvcc                                     # [linux64 or win]
+cuda_compiler_version:                            # [linux64 or win]
+  - 12.6                                          # [linux64 or win]
+c_compiler:                                       # [win]
+  - clang                                         # [win]
+cxx_compiler:                                     # [win]
+  - clangxx                                       # [win]
diff --git a/recipe/patches/0011-Don-t-specify-A-x64-option-and-reuse-cmake-build-typ.patch b/recipe/patches/0011-Don-t-specify-A-x64-option-and-reuse-cmake-build-typ.patch
new file mode 100644
index 0000000..d97c8a5
--- /dev/null
+++ b/recipe/patches/0011-Don-t-specify-A-x64-option-and-reuse-cmake-build-typ.patch
@@ -0,0 +1,34 @@
+From 95b7ae2159e6a61cf960c8d9efd631ab2e0b6e35 Mon Sep 17 00:00:00 2001
+From: Anatoly Myachev <anatoly.myachev@intel.com>
+Date: Wed, 30 Oct 2024 14:56:52 +0100
+Subject: [PATCH 11/11] Don't specify `-A x64` option and reuse cmake build
+ type config for Windows (#5014)
+
+The `-A` argument is not compatible with the Ninja generator.
+
+Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
+---
+ python/setup.py | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/python/setup.py b/python/setup.py
+index c60dc6158..da924502b 100644
+--- a/python/setup.py
++++ b/python/setup.py
+@@ -378,12 +378,10 @@ class CMakeBuild(build_ext):
+         cfg = get_build_type()
+         build_args = ["--config", cfg]
+ 
++        cmake_args += [f"-DCMAKE_BUILD_TYPE={cfg}"]
+         if platform.system() == "Windows":
+             cmake_args += [f"-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"]
+-            if sys.maxsize > 2**32:
+-                cmake_args += ["-A", "x64"]
+         else:
+-            cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg]
+             max_jobs = os.getenv("MAX_JOBS", str(2 * os.cpu_count()))
+             build_args += ['-j' + max_jobs]
+ 
+-- 
+2.47.1
+
diff --git a/recipe/patches/0012-remove-unused-requirement-filelock-4356.patch b/recipe/patches/0012-remove-unused-requirement-filelock-4356.patch
new file mode 100644
index 0000000..f9038e1
--- /dev/null
+++ b/recipe/patches/0012-remove-unused-requirement-filelock-4356.patch
@@ -0,0 +1,71 @@
+From 4c4c6a4327508db8ca6adc0fb01fdb6ec16572ff Mon Sep 17 00:00:00 2001
+From: Cunshun Xia <505481172@qq.com>
+Date: Fri, 19 Jul 2024 23:11:15 +0800
+Subject: [PATCH 12/12] remove unused requirement 'filelock' (#4356)
+
+The core Triton is a small number of people, and we receive many PRs
+(thank
+you!).  To help us review your code more quickly, **if you are a new
+contributor (less than 3 PRs merged) we ask that you complete the
+following
+tasks and include the filled-out checklist in your PR description.**
+
+Complete the following tasks before sending your PR, and replace `[ ]`
+with
+`[x]` to indicate you have done them.
+
+- [x] I am not making a trivial change, such as fixing a typo in a
+comment.
+
+- [x] I have written a PR description following these
+  [rules](https://cbea.ms/git-commit/#why-not-how).
+
+- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
+
+- Select one of the following.
+  - [ ] I have added tests.
+    - `/test` for `lit` tests
+    - `/unittest` for C++ tests
+    - `/python/test` for end-to-end tests
+  - [x] This PR does not need a test because `FILL THIS IN`.
+
+- Select one of the following.
+  - [x] I have not added any `lit` tests.
+- [ ] The `lit` tests I have added follow these [best
+practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices),
+including the "tests should be minimal" section. (Usually running Python
+code
+    and using the instructions it generates is not minimal.)
+
+Signed-off-by: cunshunxia <cunshunxia@tencent.com>
+---
+ python/setup.py | 6 ------
+ 1 file changed, 6 deletions(-)
+
+diff --git a/python/setup.py b/python/setup.py
+index da924502b..30c317328 100644
+--- a/python/setup.py
++++ b/python/setup.py
+@@ -572,11 +572,6 @@ def get_entry_points():
+     return entry_points
+ 
+ 
+-def get_install_requires():
+-    install_requires = ["filelock"]
+-    return install_requires
+-
+-
+ setup(
+     name=os.environ.get("TRITON_WHEEL_NAME", "triton"),
+     version="3.1.0" + os.environ.get("TRITON_WHEEL_VERSION_SUFFIX", ""),
+@@ -586,7 +581,6 @@ setup(
+     long_description="",
+     packages=get_packages(),
+     entry_points=get_entry_points(),
+-    install_requires=get_install_requires(),
+     package_data=package_data,
+     include_package_data=True,
+     ext_modules=[CMakeExtension("triton", "triton/_C/")],
+-- 
+2.47.1
+
diff --git a/recipe/meta.yaml b/recipe/recipe.yaml
similarity index 53%
rename from recipe/meta.yaml
rename to recipe/recipe.yaml
index 8e7d03a..60d7691 100644
--- a/recipe/meta.yaml
+++ b/recipe/recipe.yaml
@@ -1,17 +1,19 @@
-{% set version = "3.1.0" %}
-# Triton no longer tags releases, but there are release branches, e.g.
-# https://github.com/triton-lang/triton/commits/release/3.1.x/
-# Check if the commit id from Pytorch's latest pinned commit in
-# https://github.com/pytorch/pytorch/blob/v{{ pytorch_ver }}/.ci/docker/ci_commit_pins/triton.txt
-# can be found on one of those release branches, and use that as the version
-{% set git_commit = "5fe38ffd73c2ac6ed6323b554205186696631c6f" %}
+context:
+  version: "3.1.0"
+  # Triton no longer tags releases, but there are release branches, e.g.
+  # https://github.com/triton-lang/triton/commits/release/3.1.x/
+  # Check if the commit id from Pytorch's latest pinned commit in
+  # https://github.com/pytorch/pytorch/blob/v{{ pytorch_ver }}/.ci/docker/ci_commit_pins/triton.txt
+  # can be found on one of those release branches, and use that as the version
+  git_commit: "5fe38ffd73c2ac6ed6323b554205186696631c6f"  # quoted: a digits-only or exponent-like hash must stay a string
+  build_number: 3
 
 package:
   name: triton
-  version: {{ version }}
+  version: ${{ version }}
 
 source:
-  url: https://github.com/openai/triton/archive/{{ git_commit }}.tar.gz
+  url: https://github.com/openai/triton/archive/${{ git_commit }}.tar.gz
   sha256: 933babc32b69872efbce05fe8be61129fecf52c724fadea42d8c7b2d10e16ad9
   patches:
     - patches/0001-Remove-Werror-that-cause-false-positive-build-failur.patch
@@ -34,23 +36,32 @@ source:
     # https://github.com/triton-lang/triton/commit/f48dbc1b106c93144c198fbf3c4f30b2aab9d242
     - patches/0009-CODEGEN-Support-CUDA-12.6-4588.patch
     - patches/0010-Use-system-PATH-to-find-tools-in-CONDA_PREFIX.patch
+    # https://github.com/triton-lang/triton/commit/0591b3756bd4143b7163235c0eca4d718948e982
+    - patches/0011-Don-t-specify-A-x64-option-and-reuse-cmake-build-typ.patch
+    # https://github.com/triton-lang/triton/commit/3bfdbc0cba3e4838364bf6bd204fa522e1665458
+    # (the code that used filelock had already been removed upstream earlier)
+    - patches/0012-remove-unused-requirement-filelock-4356.patch
 
 build:
-  number: 2
-  # TODO: windows support should be available from next version;
-  #       CPU-only support still under development
-  skip: true  # [win or cuda_compiler_version == "None"]
-  string: cuda{{ cuda_compiler_version | replace('.', '') }}py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}
+  number: ${{ build_number }}
+  string: cuda${{ cuda_compiler_version | version_to_buildstring }}py${{ python | version_to_buildstring }}h${{ hash }}_${{ build_number }}
+  # TODO: CPU-only support still under development
+  # No success enabling Windows build as of 3.1.0:
+  # https://github.com/conda-forge/triton-feedstock/pull/29#issuecomment-2564371725
+  skip: 'win or cuda_compiler_version == "None"'
 
 requirements:
   build:
-    - {{ compiler('cxx') }}
-    - {{ compiler('cuda') }}
-    - {{ stdlib('c') }}
+    - ${{ compiler('cxx') }}
+    - ${{ compiler('cuda') }}
+    - ${{ stdlib('c') }}
     - ninja
     - cmake
     - mlir
     - llvmdev
+    - if: win
+      then: m2-sed
+      else: sed
   host:
     - python
     - pybind11
@@ -63,43 +74,43 @@ requirements:
     - cuda-cupti-dev
   run:
     - python
-    - filelock
-    - pytorch =*=cuda*
-    - lit
+    - setuptools
     - cuda-nvcc
     - cuda-cuobjdump
     - cuda-cudart
     - cuda-cupti
 
-test:
-  imports:
-    - triton
-    - triton._C.libtriton
-  requires:
-    - pip
-    - pytest
-    - scipy
-  source_files:
-    - python/test
-  commands:
-    - pip check
-    # test suite essentially depends on availability of a physical GPU,
-    # see https://github.com/openai/triton/issues/466;
-    # run a test that does not require a GPU but checks
-    # if triton.compile() works
-    - pytest -v python/test/unit/tools/test_aot.py::test_ttgir_to_ptx
+tests:
+  - python:
+      imports:
+        - triton
+        - triton._C.libtriton
+      pip_check: true
+  - files:
+      source:
+        - python/test/
+    requirements:
+      run:
+        - pip
+        - pytest
+        - scipy
+    script:
+      # test suite essentially depends on availability of a physical GPU,
+      # see https://github.com/openai/triton/issues/466;
+      # run a test that does not require a GPU but checks
+      # if triton.compile() works
+      - pytest -v python/test/unit/tools/test_aot.py::test_ttgir_to_ptx
 
 about:
-  home: https://github.com/openai/triton
   license: MIT
-  license_family: MIT
   license_file: LICENSE
   summary: Development repository for the Triton language and compiler
   description: |
     This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives.
     The aim of Triton is to provide an open-source environment to write fast code at higher productivity than CUDA, but also with higher flexibility than other existing DSLs.
-  doc_url: https://triton-lang.org/
-  dev_url: https://github.com/openai/triton
+  homepage: https://github.com/openai/triton
+  repository: https://github.com/openai/triton
+  documentation: https://triton-lang.org/
 
 extra:
   recipe-maintainers: