From 0a07488ed2c47765e337e290bd138c0e6e459cbd Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Tue, 19 Oct 2021 03:25:14 -0700 Subject: [PATCH] use irange for loops 1 (#66741) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/66741 Modified loops in files under fbsource/fbcode/caffe2/ from the format `for(TYPE var=x0;var<x_max;x++)` to the format `for(const auto var : c10::irange(x_max))`. #include +#include #include #include @@ -97,7 +98,7 @@ static at::Tensor newAtTensor( std::vector shapeVec{}; shapeVec.reserve(rank); auto numel = 1; - for (auto i = 0; i < rank; ++i) { + for (const auto i : c10::irange(rank)) { shapeVec.push_back(shapeArr[i]); numel *= shapeArr[i]; } @@ -521,7 +522,7 @@ at::IValue JIValue::JIValueToAtIValue( std::vector elements; elements.reserve(n); - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto jivalue_element = jarray->getElement(i); auto element = JIValue::JIValueToAtIValue(jivalue_element); elements.push_back(std::move(element)); } @@ -535,7 +536,7 @@ at::IValue JIValue::JIValueToAtIValue( size_t n = jArrayPinned.size(); c10::List list{}; list.reserve(n); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { list.push_back(jArrayPinned[i]); } return at::IValue{std::move(list)}; @@ -547,7 +548,7 @@ at::IValue JIValue::JIValueToAtIValue( size_t n = jArrayPinned.size(); c10::List list{}; list.reserve(n); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { list.push_back(jArrayPinned[i]); } return at::IValue{std::move(list)}; @@ -559,7 +560,7 @@ at::IValue JIValue::JIValueToAtIValue( size_t n = jArrayPinned.size(); c10::List list{}; list.reserve(n); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { list.push_back(jArrayPinned[i]); } return at::IValue{std::move(list)}; @@ -572,7 +573,7 @@ at::IValue JIValue::JIValueToAtIValue( size_t n = jArray->size(); c10::List list{}; list.reserve(n); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { list.push_back( TensorHybrid::newAtTensorFromJTensor(jArray->getElement(i))); } @@ -594,7 +595,7 @@ at::IValue JIValue::JIValueToAtIValue( c10::impl::GenericList list{c10::unshapedType(first_element.type())}; list.reserve(n); list.push_back(first_element); - for (auto i = 1; i < n; ++i) { + for (const auto i : c10::irange(1, n)) { auto jivalue_element = jarray->getElement(i); auto element = JIValue::JIValueToAtIValue(jivalue_element); list.push_back(element); diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp index b67799672ec291..86fd1e2260f9ca 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -157,7 +158,7 @@ class PytorchJni : public facebook::jni::HybridClass { std::vector inputs{}; size_t n = jinputs->size(); inputs.reserve(n); - for (size_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i)); if (at::kVulkan == deviceType_) { inputs.push_back( @@ -186,7 +187,7 @@ class PytorchJni : public facebook::jni::HybridClass { std::vector inputs{}; size_t n = jinputs->size(); inputs.reserve(n); - for (size_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i)); if (at::kVulkan == deviceType_) { inputs.push_back( diff --git a/aten/src/ATen/BatchingRegistrations.cpp
b/aten/src/ATen/BatchingRegistrations.cpp index b2dcaa04b12c7d..e2292e1964e029 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { @@ -329,7 +330,7 @@ Tensor permute_batching_rule(const Tensor& self, IntArrayRef dims) { VmapDimVector all_dims_physical; all_dims_physical.reserve(self_physical.tensor().dim()); - for (int64_t bdim = 0; bdim < self_physical.numBatchDims(); bdim++) { + for (const auto bdim : c10::irange(self_physical.numBatchDims())) { all_dims_physical.push_back(bdim); } all_dims_physical.insert( diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 229eaf48be5020..d2fb3ac96305fa 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -130,7 +131,7 @@ inline Tensor sort_strides(Tensor& tensor_) { IntArrayRef strides = tensor_.strides(); std::vector indices; indices.reserve(tensor_.ndimension()); - for (int64_t i = 0; i < tensor_.ndimension(); i++) { + for (const auto i : c10::irange(tensor_.ndimension())) { indices.push_back(i); } std::sort(indices.begin(), indices.end(), [&strides](int64_t i1, int64_t i2) { @@ -196,7 +197,7 @@ inline bool _all_equal_numel(at::ArrayRef tensors) { if (tensors.size() == 0) return true; int64_t all_numel = tensors[0].numel(); - for (size_t i = 1; i < tensors.size(); i++) { + for (const auto i : c10::irange(1, tensors.size())) { if (tensors[i].numel() != all_numel) return false; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index bbd021ef5504c2..28d7ea35094a35 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -351,7 +352,7 @@ static inline void manual_seed(uint64_t seed) { // available. In that case, we must not seed CUDA; it will fail! 
const auto num_gpus = detail::getCUDAHooks().getNumGPUs(); if (hasCUDA() && num_gpus > 0) { - for (int i = 0; i < num_gpus; i++) { + for (const auto i : c10::irange(num_gpus)) { auto cuda_gen = globalContext().defaultGenerator( Device(at::kCUDA, static_cast(i)) ); diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index 710c27170958af..35588ac62a29cc 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -197,7 +197,7 @@ std::vector infer_dense_strides(IntArrayRef tensor_sizes, IntArrayRef t // compute output strides which preserves the input tensor's memory layout std::vector out_strides(ndim); int64_t curr_stride = 1; - for (size_t i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { int64_t idx = perm[i]; out_strides[idx] = curr_stride; // Note: for size 0, we simply treated it as 1, it really doesn't matter here diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 3f15e778618b68..55a392c8d9cc3a 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -266,7 +267,7 @@ inline std::vector expand_outplace(TensorList to_expand) { // expands a list of Tensors; ignores undefined (null) tensors bool first = true; DimVector sizes; - for (size_t i = 0; i < to_expand.size(); ++i) { + for (const auto i : c10::irange(to_expand.size())) { if (!to_expand[i].defined()) { continue; } else if (first) { @@ -278,7 +279,7 @@ inline std::vector expand_outplace(TensorList to_expand) { } std::vector result(to_expand.size()); - for (size_t i = 0; i < to_expand.size(); ++i) { + for (const auto i : c10::irange(to_expand.size())) { if (!to_expand[i].defined()) { continue; } else if (to_expand[i].sizes().equals(sizes)) { @@ -299,7 +300,7 @@ static inline Tensor sum_to(Tensor tensor, const IntArrayRef shape) { c10::SmallVector reduce_dims; const at::IntArrayRef sizes = tensor.sizes(); const int64_t leading_dims = sizes.size() - shape.size(); - for (int64_t i = 0; i < leading_dims; ++i) { + for (const auto i : c10::irange(leading_dims)) { reduce_dims.push_back(i); } for (int64_t i = leading_dims; i < static_cast(sizes.size()); ++i) { @@ -320,7 +321,7 @@ static inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) { if (ndim > target_dim) { return false; } - for (size_t i = 0; i < ndim; i++) { + for (const auto i : c10::irange(ndim)) { int64_t size = shape[ndim - i - 1]; int64_t target = desired[target_dim - i - 1]; if (size != target && size != 1) { diff --git a/aten/src/ATen/MemoryOverlap.cpp b/aten/src/ATen/MemoryOverlap.cpp index 232fda6bac10f0..edeca5e4bac1a4 100644 --- a/aten/src/ATen/MemoryOverlap.cpp +++ b/aten/src/ATen/MemoryOverlap.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace at { @@ -17,7 +18,7 @@ MemOverlap has_internal_overlap(TensorImpl* t) { auto strides = t->strides(); auto sizes = t->sizes(); - for (size_t i = 0; i < strides.size(); ++i) { + for (const auto i : c10::irange(strides.size())) { if (strides[i] == 0 && sizes[i] > 1) { return MemOverlap::YES; } diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 782a266a8aeda3..24a85b4ce7085a 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -225,7 +225,7 @@ std::vector compute_squeeze_outnames(const Tensor& tensor) { } std::vector outnames; auto tensor_names = tensor.names(); - for (int64_t d = 0; d < tensor.dim(); d++) { + for (const auto d : 
c10::irange(tensor.dim())) { if (tensor.sizes()[d] != 1) { outnames.push_back(tensor_names[d]); } @@ -242,7 +242,7 @@ std::vector compute_diagonal_outnames( } std::vector outnames; auto tensor_names = tensor.names(); - for (int64_t d = 0; d < tensor.dim(); d++) { + for (const auto d : c10::irange(tensor.dim())) { if (d == dim1 || d == dim2) { continue; } diff --git a/aten/src/ATen/ParallelNative.cpp b/aten/src/ATen/ParallelNative.cpp index 753dbdb751e68e..bade0b26d54d81 100644 --- a/aten/src/ATen/ParallelNative.cpp +++ b/aten/src/ATen/ParallelNative.cpp @@ -6,6 +6,7 @@ #ifndef C10_MOBILE #include +#include #else #include #endif // C10_MOBILE @@ -87,7 +88,7 @@ TaskThreadPoolBase& _get_intraop_pool() { // `fn` will be called with params: (thread_pool_task_id, task_id). void _run_with_pool(const std::function& fn, size_t range) { #ifndef C10_MOBILE - for (size_t i = 1; i < range; ++i) { + for (const auto i : c10::irange(1, range)) { _get_intraop_pool().run([fn, i]() { fn((int)i, i); }); } // Run the first task on the current thread directly. diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index e2fc89a9db8498..78ebb25e15b1f5 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { struct TORCH_API SparseTensorImpl : public TensorImpl { @@ -109,7 +110,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { bool shrinking_dense_dim = false; auto sparse_size_original = sizes().slice(0, sparse_dim); auto sparse_size_new = size.slice(0, sparse_dim); - for (int64_t i = 0; i < sparse_dim; i++) { + for (const auto i : c10::irange(sparse_dim)) { if (sparse_size_new[i] < sparse_size_original[i]) { shrinking_sparse_dims = true; break; @@ -117,7 +118,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { } auto dense_size_original = sizes().slice(sparse_dim); auto dense_size_new = size.slice(sparse_dim); - for (int64_t i = 0; i < dense_dim; i++) { + for (const auto i : c10::irange(dense_dim)) { if (dense_size_new[i] < dense_size_original[i]) { shrinking_dense_dim = true; break; diff --git a/aten/src/ATen/SparseTensorUtils.cpp b/aten/src/ATen/SparseTensorUtils.cpp index 564eeda03c3daa..d5811b933e7ca5 100644 --- a/aten/src/ATen/SparseTensorUtils.cpp +++ b/aten/src/ATen/SparseTensorUtils.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace sparse { @@ -98,7 +99,7 @@ Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz) { at::parallel_for(0, nnz, 10000, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t h, hp0, hp1; - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { hp0 = indices[i]; hp1 = (i+1 == nnz) ? 
dim : indices[i+1]; if (hp0 != hp1) { diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp index 5c402155c6feaa..95d70132f43f95 100644 --- a/aten/src/ATen/TensorIndexing.cpp +++ b/aten/src/ATen/TensorIndexing.cpp @@ -1,6 +1,7 @@ #include #include +#include namespace at { namespace indexing { @@ -31,7 +32,7 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index) std::ostream& operator<<(std::ostream& stream, const std::vector& tensor_indices) { stream << "("; - for (size_t i = 0; i < tensor_indices.size(); i++) { + for (const auto i : c10::irange(tensor_indices.size())) { stream << tensor_indices[i]; if (i < tensor_indices.size() - 1) stream << ", "; } diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 263f4914e0d5e2..71c9c3feb9e76b 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -335,7 +336,7 @@ static inline Tensor scalarToTensor(const Scalar& v, const TensorOptions& option // strip away unit dimensions from the left of 'src' static inline IntArrayRef slicePrefix1sSize(const IntArrayRef& sizes) { size_t first_non1_src = sizes.size(); - for (size_t i = 0; i < sizes.size(); ++i) { + for (const auto i : c10::irange(sizes.size())) { if (sizes[i] != 1) { first_non1_src = i; break; @@ -439,7 +440,7 @@ static inline Tensor applySlicing( "too many indices for tensor of dimension ", (int)self_sizes.size()); Tensor result = self; - for (size_t i = 0; i < indices.size(); i++) { + for (const auto i : c10::irange(indices.size())) { auto& obj = indices[i]; result = handleDimInMultiDimIndexing( /*prev_dim_result=*/result, diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 84298660aedfa1..cea4805bc080af 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -36,8 +36,8 @@ inline void get_base_ptrs(char** ptrs, ArrayRef operands) { } inline void get_strides(int64_t* strides, ArrayRef operands, int64_t ndim) { - for (int64_t dim = 0; dim < ndim; ++dim) { - for (size_t arg = 0; arg < operands.size(); ++arg) { + for (const auto dim : c10::irange(ndim)) { + for (const auto arg : c10::irange(operands.size())) { *strides++ = operands[arg].stride_bytes[dim]; } } @@ -214,7 +214,7 @@ void TensorIteratorBase::reorder_dimensions() { // returns 1 if the dim0 should come after dim1, -1 if dim0 should come // before dim1, and 0 if the comparison is ambiguous. 
auto should_swap = [&](size_t dim0, size_t dim1) { - for (int arg = 0; arg < ntensors(); arg++) { + for (const auto arg : c10::irange(ntensors())) { // ignore undefined or incorrectly sized tensors if (operands_[arg].stride_bytes.empty() || operands_[arg].will_resize) { continue; @@ -251,7 +251,7 @@ void TensorIteratorBase::reorder_dimensions() { }; // insertion sort with support for ambiguous comparisons - for (int i = 1; i < ndim(); i++) { + for (const auto i : c10::irange(1, ndim())) { int dim1 = i; for (int dim0 = i - 1; dim0 >= 0; dim0--) { int comparison = should_swap(perm_[dim0], perm_[dim1]); @@ -497,7 +497,7 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) { StrideVector TensorIteratorBase::compatible_stride(int element_size) const { auto stride = StrideVector(); int64_t next_stride = element_size; - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { stride.push_back(next_stride); next_stride *= shape_[dim]; } @@ -510,14 +510,14 @@ DimVector TensorIteratorBase::invert_perm(IntArrayRef input) const { TORCH_INTERNAL_ASSERT(!has_coalesced_dimensions_); TORCH_INTERNAL_ASSERT(input.size()==perm_.size()); auto res = DimVector(input.size()); //no initialization needed, every value in res should be written to. - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { res[perm_[dim]] = input[dim]; } return res; } void TensorIteratorBase::allocate_or_resize_outputs() { - for (int i = 0; i < num_outputs_; i++) { + for (const auto i : c10::irange(num_outputs_)) { auto& op = operands_[i]; if (!op.tensor_base().defined() || op.will_resize) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); @@ -525,8 +525,8 @@ void TensorIteratorBase::allocate_or_resize_outputs() { op.stride_bytes = compatible_stride(element_size); // check if permutation is just an inverted order bool inverted = true; - for (int i = 0; i < ndim(); i++) { - if (perm_[i] != ndim() - i - 1) { + for (const auto j : c10::irange(ndim())) { + if (perm_[j] != ndim() - j - 1) { inverted = false; break; } @@ -539,7 +539,7 @@ void TensorIteratorBase::allocate_or_resize_outputs() { set_output(i, tensor_shape, {}, original_options(op), names_); } else { auto tensor_stride = invert_perm(op.stride_bytes); - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { tensor_stride[dim] /= element_size; } set_output(i, tensor_shape, tensor_stride, original_options(op), names_); @@ -593,7 +593,7 @@ void TensorIteratorBase::coalesce_dimensions() { if (shape0 == 1 || shape1 == 1) { return true; } - for (int i = 0; i < ntensors(); i++) { + for (const auto i : c10::irange(ntensors())) { auto& stride = operands_[i].stride_bytes; if (shape0 * stride[dim0] != stride[dim1]) { return false; @@ -604,14 +604,14 @@ void TensorIteratorBase::coalesce_dimensions() { // replace each operands stride at dim0 with its stride at dim1 auto replace_stride = [&](int dim0, int dim1) { - for (int i = 0; i < ntensors(); i++) { + for (const auto i : c10::irange(ntensors())) { auto& stride = operands_[i].stride_bytes; stride[dim0] = stride[dim1]; } }; int prev_dim = 0; - for (int dim = 1; dim < ndim(); dim++) { + for (const auto dim : c10::irange(1, ndim())) { if (can_coalesce(prev_dim, dim)) { if (shape_[prev_dim] == 1) { replace_stride(prev_dim, dim); @@ -627,7 +627,7 @@ void TensorIteratorBase::coalesce_dimensions() { } shape_.resize(prev_dim + 1); - for (int i = 0; i < ntensors(); i++) { + for (const auto i : 
c10::irange(ntensors())) { operands_[i].stride_bytes.resize(ndim()); } has_coalesced_dimensions_ = true; @@ -670,7 +670,7 @@ void TensorIteratorBase::permute_dimensions(IntArrayRef perm) { auto reorder = [perm](IntArrayRef data) { auto res = DimVector(data.size(), 0); - for (size_t i = 0; i < perm.size(); i++) { + for (const auto i : c10::irange(perm.size())) { res[i] = data[perm[i]]; } return res; @@ -687,7 +687,7 @@ void TensorIteratorBase::permute_dimensions(IntArrayRef perm) { int64_t TensorIteratorBase::num_output_elements() const { int64_t elem = 1; - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { if (operands_[0].stride_bytes[dim] != 0 || shape_[dim] == 0) { elem *= shape_[dim]; } @@ -697,7 +697,7 @@ int64_t TensorIteratorBase::num_output_elements() const { int TensorIteratorBase::num_reduce_dims() const { int count = 0; - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { if (operands_[0].stride_bytes[dim] == 0) { count++; } @@ -760,7 +760,7 @@ bool TensorIteratorBase::is_contiguous() const { bool TensorIteratorBase::is_scalar(int arg) const { const auto& stride = operands_[arg].stride_bytes; - for (int i = 0; i < ndim(); i++) { + for (const auto i : c10::irange(ndim())) { if (stride[i] != 0 && shape_[i] != 1) { return false; } @@ -815,7 +815,7 @@ void TensorIteratorBase::narrow(int dim, int64_t start, int64_t size) { void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indices) { TORCH_INTERNAL_ASSERT(start_dim <= ndim()); - for (int i = start_dim; i < ndim(); ++i) { + for (const auto i : c10::irange(start_dim, ndim())) { for (auto& op : operands_) { op.data = ((char*)op.data) + op.stride_bytes[i] * indices[i - start_dim]; } @@ -1063,13 +1063,13 @@ void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { void TensorIteratorBase::mark_outputs() { // TODO: merge this into populate_operands - for (int i = 0; i < num_outputs_; i++) { + for (const auto i : c10::irange(num_outputs_)) { operands_[i].is_output = true; const auto& output = tensor(i); if (!output.defined()) continue; // check if output is also an input - for (int arg = num_outputs_; arg < ntensors(); arg++) { + for (const auto arg : c10::irange(num_outputs_, ntensors())) { const auto& input = tensor(arg); if (output.is_same(input)) { operands_[i].is_read_write = true; @@ -1086,7 +1086,7 @@ void TensorIteratorBase::mark_resize_outputs(const TensorIteratorConfig& config) if (config.static_shape_.has_value()) { return; } - for (int i = 0; i < num_outputs_; i++) { + for (const auto i : c10::irange(num_outputs_)) { const auto& output = tensor(i); if (output.defined() && !output.sizes().equals(shape_)) { if (config.resize_outputs_ && !operands_[i].is_read_write) { @@ -1104,11 +1104,11 @@ void TensorIteratorBase::compute_mem_overlaps(const TensorIteratorConfig& config if (!config.check_mem_overlap_) { return; } - for (int i = 0; i < num_outputs_; i++) { + for (const auto i : c10::irange(num_outputs_)) { const auto& output = tensor_base(i); if (!output.defined()) continue; assert_no_internal_overlap(output); - for (int j = num_outputs_; j < ntensors(); j++) { + for (const auto j : c10::irange(num_outputs_, ntensors())) { const auto& input = tensor_base(j); if (!input.is_same(output)) { assert_no_partial_overlap(output, input); @@ -1164,7 +1164,7 @@ void TensorIteratorBase::compute_strides(const TensorIteratorConfig& config) { op.stride_bytes.resize(ndim(), 0); else op.stride_bytes.resize(ndim()); - for (size_t i = 
0; i < original_shape.size(); i++) { + for (const auto i : c10::irange(original_shape.size())) { // see NOTE: [Computing output strides] if (original_shape[i] == 1 && shape_[offset + i] !=1) { op.stride_bytes[offset + i] = 0; @@ -1183,7 +1183,7 @@ bool TensorIteratorBase::can_use_32bit_indexing() const { } for (auto& op : operands_) { int64_t max_offset = 1; - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { max_offset += (shape_[dim] - 1) * op.stride_bytes[dim]; } if (max_offset > max_value) { @@ -1245,7 +1245,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { switch (setup_type) { case FastSetupType::CONTIGUOUS: { - for (int i = 0; i < num_outputs_; i++){ + for (const auto i : c10::irange(num_outputs_)) { auto& op = operands_[i]; if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); @@ -1256,7 +1256,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { } case FastSetupType::CHANNELS_LAST: { - for (int i = 0; i < num_outputs_; i++){ + for (const auto i : c10::irange(num_outputs_)) { auto& op = operands_[i]; if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); @@ -1273,7 +1273,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { if (tensor(i_defined).defined()) break; } TORCH_CHECK(i_defined >= 0, "Can not find a defined tensor when fast allocating memory to outputs"); - for (int i = 0; i < num_outputs_; i++){ + for (const auto i : c10::irange(num_outputs_)) { auto& op = operands_[i]; if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index 329598695dc9cc..6a35650c96d203 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -322,9 +323,9 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { char** base, const int64_t* strides, int64_t size0, int64_t size1) { PtrVector data(base, base + ntensor); const int64_t* outer_strides = &strides[ntensor]; - for (int64_t i = 0; i < size1; i++) { + for (const auto i : c10::irange(size1)) { if (i > 0) { - for (int64_t arg = 0; arg < ntensor; arg++) { + for (const auto arg : c10::irange(ntensor)) { data[arg] += outer_strides[arg]; } } @@ -397,7 +398,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { bool has_contiguous_first_dim() const { int num_tensors = ntensors(); - for (int i = 0; i < num_tensors; i++) { + for (const auto i : c10::irange(num_tensors)) { if (strides(i)[0] != element_size(i)) { return false; } diff --git a/aten/src/ATen/TensorIteratorInternal.h b/aten/src/ATen/TensorIteratorInternal.h index 57477bcb1d4030..72e5939b351798 100644 --- a/aten/src/ATen/TensorIteratorInternal.h +++ b/aten/src/ATen/TensorIteratorInternal.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include namespace at { @@ -24,9 +25,9 @@ inline void get_data_ptrs( const int64_t ntensors = base.size(); const int64_t ndim = counter.size(); std::copy(base.begin(), base.end(), ptrs); - for (int64_t dim = 0; dim < ndim; ++dim) { + for (const auto dim : c10::irange(ndim)) { int64_t value = counter[dim]; - for (int64_t arg = 0; arg < ntensors; ++arg) { + for (const auto arg : c10::irange(ntensors)) { ptrs[arg] += value * strides[dim * ntensors + arg]; } } diff --git 
a/aten/src/ATen/TensorNames.cpp b/aten/src/ATen/TensorNames.cpp index 9c28924dc0aee2..683de258a2ebd8 100644 --- a/aten/src/ATen/TensorNames.cpp +++ b/aten/src/ATen/TensorNames.cpp @@ -56,7 +56,7 @@ TensorNames::TensorNames(ArrayRef names, int64_t start, int64_t end) { start = maybe_wrap_dim(start, names.size()); end = maybe_wrap_dim(end, names.size()); names_.reserve(end - start); - for (int64_t idx = start; idx < end; ++idx) { + for (const auto idx : c10::irange(start, end)) { names_.emplace_back(names, idx); } } diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 1ec9f9c291c0ab..3426bff7b4b8da 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -323,7 +324,7 @@ size_t computeStorageNbytes( // size of the underlying storage is 1 bigger than the offset // of the last element according to stride size_t size = 1; - for(size_t i = 0; i < sizes.size(); i++) { + for (const auto i : c10::irange(sizes.size())) { if(sizes[i] == 0) { return 0; } diff --git a/aten/src/ATen/VmapTransforms.cpp b/aten/src/ATen/VmapTransforms.cpp index 07ff77fe2b746f..4bda903545fdf8 100644 --- a/aten/src/ATen/VmapTransforms.cpp +++ b/aten/src/ATen/VmapTransforms.cpp @@ -83,7 +83,7 @@ VmapDimVector VmapPhysicalView::getPhysicalShape(IntArrayRef logical_shape) cons static BatchDims computeFrontBatchDimsFromLevels(std::bitset levels_bitset) { BatchDims bdims; int64_t dim = 0; - for (int64_t level = 0; level < kVmapNumLevels; level++) { + for (const auto level : c10::irange(kVmapNumLevels)) { if (!levels_bitset[level]) { continue; } @@ -208,7 +208,7 @@ MultiBatchVmapTransform::logicalToPhysical(TensorList logical_tensors) { VmapDimVector batch_sizes(num_batch_dims, 1); for (const auto& physical_tensor : physical_tensors) { auto physical_sizes = physical_tensor.sizes(); - for (int64_t dim = 0; dim < num_batch_dims; dim++) { + for (const auto dim : c10::irange(num_batch_dims)) { if (physical_sizes[dim] != 1) { batch_sizes[dim] = physical_sizes[dim]; } diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index 13e605c920ec13..24fe684c6dc61c 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -2,6 +2,7 @@ #include #include +#include #include namespace at { @@ -40,7 +41,7 @@ static inline void maybe_wrap_dims_n(int64_t* dims, int64_t ndims, int64_t dim_p } int64_t min = -dim_post_expr; int64_t max = dim_post_expr - 1; - for (int64_t i = 0; i < ndims; ++i) { + for (const auto i : c10::irange(ndims)) { auto &dim = dims[i]; if (dim < min || dim > max) { TORCH_CHECK_INDEX(false, @@ -85,7 +86,7 @@ static inline int64_t legacy_cat_wrap_dim(int64_t dim, TensorList tensors) { // wrap negative dims in a vector static inline void wrap_all_dims(std::vector& dims_to_wrap, int64_t tensor_total_dims) { - for (size_t i = 0; i < dims_to_wrap.size(); i++) { + for (const auto i : c10::irange(dims_to_wrap.size())) { dims_to_wrap[i] = maybe_wrap_dim(dims_to_wrap[i], tensor_total_dims); } } diff --git a/aten/src/ATen/WrapDimUtilsMulti.h b/aten/src/ATen/WrapDimUtilsMulti.h index a2af1b0dcd7195..e1d2266e24efba 100644 --- a/aten/src/ATen/WrapDimUtilsMulti.h +++ b/aten/src/ATen/WrapDimUtilsMulti.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -15,7 +16,7 @@ constexpr size_t dim_bitset_size = 64; static inline std::bitset dim_list_to_bitset(IntArrayRef dims, int64_t ndims) { TORCH_CHECK(ndims <= (int64_t) dim_bitset_size, "only tensors with up to 
", dim_bitset_size, " dims are supported"); std::bitset seen; - for (size_t i = 0; i < dims.size(); i++) { + for (const auto i : c10::irange(dims.size())) { size_t dim = maybe_wrap_dim(dims[i], ndims); TORCH_CHECK(!seen[dim], "dim ", dim, " appears multiple times in the list of dims"); seen[dim] = true; diff --git a/aten/src/ATen/benchmarks/stateful_conv1d.cpp b/aten/src/ATen/benchmarks/stateful_conv1d.cpp index 60502773ca57a0..527dcc439dcdce 100644 --- a/aten/src/ATen/benchmarks/stateful_conv1d.cpp +++ b/aten/src/ATen/benchmarks/stateful_conv1d.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -33,7 +34,7 @@ static void stateful_conv1d(benchmark::State& state) { )"); std::vector> inputs; - for (int i = 0; i < 10; ++i) { + for (const auto i : c10::irange(10)) { std::vector input; // NOLINTNEXTLINE(modernize-use-emplace) input.push_back(torch::rand({batch_size, input_channels, width})); @@ -70,8 +71,8 @@ static void GenerateSizes(benchmark::internal::Benchmark* b) { for (size_t input_channels = 32; input_channels < 256; input_channels *= 2) { for (size_t output_channels = 32; output_channels < 256; output_channels *= 2) { - for (size_t kernel = 3; kernel < 8; ++kernel) { - for (size_t batch_size = 1; batch_size < 5; ++batch_size) { + for (const auto kernel : c10::irange(3, 8)) { + for (const auto batch_size : c10::irange(1, 5)) { for (size_t width = 32; width < 256; width *= 2) { b->Args({input_channels, output_channels, kernel, batch_size, width, true}); b->Args({input_channels, output_channels, kernel, batch_size, width, false}); diff --git a/aten/src/ATen/core/Array.h b/aten/src/ATen/core/Array.h index 6e0fce606efc80..4754f72cda0f74 100644 --- a/aten/src/ATen/core/Array.h +++ b/aten/src/ATen/core/Array.h @@ -4,6 +4,7 @@ // device code. 
#include +#include namespace at { namespace detail { diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index dbbed6e3b07858..68fcd8c29398b1 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -44,7 +45,7 @@ static std::tuple __printFormat(std::ostream& stream, const Ten } bool intMode = true; auto self_p = self.data_ptr(); - for(int64_t i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { auto z = self_p[i]; if(std::isfinite(z)) { if(z != std::ceil(z)) { @@ -70,7 +71,7 @@ static std::tuple __printFormat(std::ostream& stream, const Ten } else { expMin = fabs(self_p[offset]); expMax = fabs(self_p[offset]); - for(int64_t i = offset; i < size; i++) { + for (const auto i : c10::irange(offset, size)) { double z = fabs(self_p[i]); if(std::isfinite(z)) { if(z < expMin) { @@ -130,7 +131,8 @@ static std::tuple __printFormat(std::ostream& stream, const Ten static void __printIndent(std::ostream &stream, int64_t indent) { - for(int64_t i = 0; i < indent; i++) { + for (const auto i : c10::irange(indent)) { + (void)i; //Suppress unused variable warning stream << " "; } } @@ -168,7 +170,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line printScale(stream,scale); __printIndent(stream, indent); } - for(int64_t l = 0; l < self.size(0); l++) { + for (const auto l : c10::irange(self.size(0))) { Tensor row = self.select(0,l); double *row_ptr = row.data_ptr(); for(int64_t c = firstColumn; c < lastColumn+1; c++) { @@ -198,8 +200,9 @@ void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) bool start = true; bool finished = false; counter[0] = -1; - for(size_t i = 1; i < counter.size(); i++) + for (const auto i : c10::irange(1, counter.size())) { counter[i] = 0; + } while(true) { for(int64_t i = 0; self.ndimension()-2; i++) { counter[i] = counter[i] + 1; @@ -269,7 +272,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi printScale(stream, scale); } double* tensor_p = tensor.data_ptr(); - for (int64_t i = 0; i < tensor.size(0); i++) { + for (const auto i : c10::irange(tensor.size(0))) { stream << std::setw(sz) << tensor_p[i]/scale << std::endl; } } @@ -284,7 +287,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi __printTensor(stream, tensor, linesize); } stream << "[ " << tensor_.toString() << "{" << tensor.size(0); - for(int64_t i = 1; i < tensor.ndimension(); i++) { + for (const auto i : c10::irange(1, tensor.ndimension())) { stream << "," << tensor.size(i); } stream << "}"; diff --git a/aten/src/ATen/core/MT19937RNGEngine.h b/aten/src/ATen/core/MT19937RNGEngine.h index 40c1ba5f584ade..68b9c0c7e64c46 100644 --- a/aten/src/ATen/core/MT19937RNGEngine.h +++ b/aten/src/ATen/core/MT19937RNGEngine.h @@ -1,5 +1,7 @@ #pragma once +#include + // define constants like M_PI and C keywords for MSVC #ifdef _MSC_VER #ifndef _USE_MATH_DEFINES @@ -8,9 +10,9 @@ #include #endif -#include -#include #include +#include +#include namespace at { @@ -155,7 +157,7 @@ class mt19937_engine { data_.seed_ = seed; data_.seeded_ = true; data_.state_[0] = seed & 0xffffffff; - for(int j = 1; j < MERSENNE_STATE_N; j++) { + for (const auto j : c10::irange(1, MERSENNE_STATE_N)) { data_.state_[j] = (1812433253 * (data_.state_[j-1] ^ (data_.state_[j-1] >> 30)) + j); } data_.left_ = 1; diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index 
e18f9d35ca2f04..9d65522b5d96b2 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -134,7 +135,7 @@ class GenericPackedTensorAccessorBase { const source_index_t* sizes_, const source_index_t* strides_) : data_(data_) { - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { this->sizes_[i] = sizes_[i]; this->strides_[i] = strides_[i]; } diff --git a/aten/src/ATen/core/boxing/impl/test_helpers.h b/aten/src/ATen/core/boxing/impl/test_helpers.h index 9ca06878f1539f..93b11dc853f00f 100644 --- a/aten/src/ATen/core/boxing/impl/test_helpers.h +++ b/aten/src/ATen/core/boxing/impl/test_helpers.h @@ -7,6 +7,7 @@ #include #include #include +#include template inline std::vector makeStack(Inputs&&... inputs) { @@ -87,7 +88,7 @@ inline void expectThrows(Functor&& functor, const char* expectMessageContains) { template void expectListEquals(c10::ArrayRef expected, std::array actual) { EXPECT_EQ(expected.size(), actual.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { EXPECT_EQ(expected[i], actual[i]); } } @@ -95,7 +96,7 @@ void expectListEquals(c10::ArrayRef expected, std::array actual) { template void expectListEquals(c10::ArrayRef expected, c10::ArrayRef actual) { EXPECT_EQ(expected.size(), actual.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { EXPECT_EQ(expected[i], actual[i]); } } @@ -103,7 +104,7 @@ void expectListEquals(c10::ArrayRef expected, c10::ArrayRef actual) { template void expectListEquals(c10::ArrayRef expected, c10::List actual) { EXPECT_EQ(expected.size(), actual.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { EXPECT_EQ(expected[i], actual.get(i)); } } @@ -111,7 +112,7 @@ void expectListEquals(c10::ArrayRef expected, c10::List actual) { template void expectListEquals(c10::ArrayRef expected, std::vector actual) { EXPECT_EQ(expected.size(), actual.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { EXPECT_EQ(expected[i], actual[i]); } } diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index 5289f9fa01142f..31dd09836cbe66 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -171,7 +172,7 @@ struct TORCH_API DispatchKeyExtractor final { "The function schema has ", schema.arguments().size(), " arguments but this PyTorch build only supports ", c10::utils::bitset::NUM_BITS()); c10::utils::bitset dispatch_arg_indices_reverse; - for (size_t index = 0; index < schema.arguments().size(); ++index) { + for (const auto index : c10::irange(schema.arguments().size())) { if (schema.arguments()[index].type()->isSubtypeOf(*TensorType::get()) || schema.arguments()[index].type()->isSubtypeOf( *ListType::ofTensors()) || diff --git a/aten/src/ATen/core/dispatch/backend_fallback_test.cpp b/aten/src/ATen/core/dispatch/backend_fallback_test.cpp index 1fb14cf205b94c..19981988962a95 100644 --- a/aten/src/ATen/core/dispatch/backend_fallback_test.cpp +++ b/aten/src/ATen/core/dispatch/backend_fallback_test.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include using namespace at; @@ -51,7 +52,7 @@ void 
generic_wrapper_fallback(const c10::OperatorHandle& op, torch::jit::Stack* // Unwrap all arguments auto args = torch::jit::pop(*stack, num_arguments); - for (size_t i = 0; i < num_arguments; i++) { + for (const auto i : c10::irange(num_arguments)) { // TODO: Handle tensor list if (args[i].isTensor()) { auto* impl = args[i].unsafeToTensorImpl(); @@ -70,7 +71,7 @@ void generic_wrapper_fallback(const c10::OperatorHandle& op, torch::jit::Stack* // Rewrap outputs auto rets = torch::jit::pop(*stack, num_returns); - for (size_t i = 0; i < num_returns; i++) { + for (const auto i : c10::irange(num_returns)) { // TODO: Handle tensor list if (rets[i].isTensor()) { torch::jit::push(*stack, at::detail::make_tensor(std::move(rets[i]).toTensor())); // yes move! diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index 3da6958eaf3441..211c55662f2b82 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index 712192b0823062..b7aab0730c7d5c 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -16,7 +16,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) out << "("; bool seen_kwarg_only = false; - for(size_t i = 0; i < schema.arguments().size(); ++i) { + for (const auto i : c10::irange(schema.arguments().size())) { if (i > 0) out << ", "; if (schema.arguments()[i].kwarg_only() && !seen_kwarg_only) { out << "*, "; @@ -35,7 +35,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) const auto& returns = schema.returns(); out << "("; - for(size_t i = 0; i < returns.size(); ++i) { + for (const auto i : c10::irange(returns.size())) { if (i > 0) { out << ", "; } @@ -53,7 +53,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) inline size_t findFirstOutArg(const std::vector& args) { // find the start of out args in the schema - for (size_t out_start_idx = 0; out_start_idx < args.size(); out_start_idx++) { + for (const auto out_start_idx : c10::irange(args.size())) { if (args.at(out_start_idx).is_out()) { return out_start_idx; } @@ -122,7 +122,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith( && arguments().size() >= old.arguments().size())) { return false; } - for (size_t i = 0; i < returns().size(); ++i) { + for (const auto i : c10::irange(returns().size())) { // Backwards compatibility requires covariance on argument types // (i.e. more generic), and contravariance on return types (i.e. // more specific). 
@@ -138,7 +138,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith( size_t new_out_start_idx = findFirstOutArg(arguments()); // make sure among the default args, they are backward compatible - for (size_t i = 0; i < old_out_start_idx; i++) { + for (const auto i : c10::irange(old_out_start_idx)) { if (!arguments().at(i).isBackwardCompatibleWith( old.arguments().at(i), why_not)) { return false; @@ -146,7 +146,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith( } // // Validate that all new arguments provided has a default value - for (size_t i = old_out_start_idx; i < new_out_start_idx; ++i) { + for (const auto i : c10::irange(old_out_start_idx, new_out_start_idx)) { if (!arguments().at(i).default_value()) { if (why_not) { *why_not @@ -160,7 +160,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith( } // now compare the out args - for (size_t i = old_out_start_idx; i < old.arguments().size(); i++) { + for (const auto i : c10::irange(old_out_start_idx, old.arguments().size())) { if (!arguments() .at(i - old_out_start_idx + new_out_start_idx) .isBackwardCompatibleWith(old.arguments().at(i), why_not)) { @@ -238,7 +238,7 @@ inline void FunctionSchema::checkAndNormalizeInputs( *this); size_t consumed_kwargs = 0; - for (size_t pos = 0; pos < arguments().size(); ++pos) { + for (const auto pos : c10::irange(arguments().size())) { const auto& argument = arguments()[pos]; if (pos < inputs.size()) { checkArg(inputs[pos], argument, pos); @@ -298,7 +298,7 @@ inline bool isSubtypeOfList( if (child.size() != parent.size()) { return false; } - for (size_t i = 0; i < child.size(); ++i) { + for (const auto i : c10::irange(child.size())) { const Argument& c = child[i]; const Argument& p = parent[i]; if (c.name() != p.name()) { diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 062af97af793b8..7fe5aa48258d28 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -1114,7 +1114,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { } std::ostringstream oss; oss << devices[0]; - for (size_t idx = 1; idx < devices.size(); idx++) { + for (const auto idx : c10::irange(1, devices.size())) { if (idx == devices.size() - 1) { oss << " and "; } else { @@ -1131,7 +1131,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { return c10::kCPU; } c10::DeviceType deviceType = devices[0].type(); - for (size_t idx = 1; idx < devices.size(); idx++) { + for (const auto idx : c10::irange(1, devices.size())) { TORCH_CHECK_VALUE( devices[idx].type() == deviceType, "Expected all devices to be of the same type, but got a mismatch between ", @@ -1151,7 +1151,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { [](const c10::Device& a, const c10::Device& b) { return a.index() < b.index(); }); // Deduplicate by compacting. 
size_t targetIdx = 0; - for (size_t sourceIdx = 0; sourceIdx < devices.size(); sourceIdx++) { + for (const auto sourceIdx : c10::irange(devices.size())) { TORCH_CHECK_VALUE( devices[sourceIdx].has_index(), "Expected devices to have indices, got ", devices[sourceIdx]); diff --git a/aten/src/ATen/core/op_registration/infer_schema.cpp b/aten/src/ATen/core/op_registration/infer_schema.cpp index 3807e420086a7f..df1925aba5ed1a 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.cpp +++ b/aten/src/ATen/core/op_registration/infer_schema.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace c10 { @@ -20,7 +21,7 @@ std::string fastToString(size_t x) { std::vector createArgumentVector(c10::ArrayRef args) { std::vector result; result.reserve(args.size()); - for (size_t i = 0; i < args.size(); ++i) { + for (const auto i : c10::irange(args.size())) { // Arguments are named "_" result.emplace_back(fastToString(i), (*args[i].getTypeFn)()); } @@ -49,7 +50,7 @@ C10_EXPORT c10::optional findSchemaDifferences(const FunctionSchema " vs " + guts::to_string(rhs.returns().size()); } - for (size_t i = 0; i < lhs.arguments().size(); ++i) { + for (const auto i : c10::irange(lhs.arguments().size())) { const TypePtr& leftType = lhs.arguments()[i].type(); const TypePtr& rightType = rhs.arguments()[i].type(); // Type::operator== is virtual. Comparing pointers first is @@ -61,7 +62,7 @@ C10_EXPORT c10::optional findSchemaDifferences(const FunctionSchema } } - for (size_t i = 0; i < lhs.returns().size(); ++i) { + for (const auto i : c10::irange(lhs.returns().size())) { const TypePtr& leftType = lhs.returns()[i].type(); const TypePtr& rightType = rhs.returns()[i].type(); // See above about comparing pointers first. diff --git a/aten/src/ATen/core/qualified_name.h b/aten/src/ATen/core/qualified_name.h index 4770a3cf334080..b8065d9d5085f7 100644 --- a/aten/src/ATen/core/qualified_name.h +++ b/aten/src/ATen/core/qualified_name.h @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace c10 { @@ -69,7 +70,7 @@ struct QualifiedName { // Can't be a prefix if it's bigger return false; } - for (size_t i = 0; i < thisAtoms.size(); i++) { + for (const auto i : c10::irange(thisAtoms.size())) { if (thisAtoms[i] != otherAtoms[i]) { return false; } @@ -116,7 +117,7 @@ struct QualifiedName { reserve += e.size() + 1; } out.reserve(reserve); - for (size_t i = 0; i < v.size(); ++i) { + for (const auto i : c10::irange(v.size())) { if (i != 0) { out.push_back(delimiter); } diff --git a/aten/src/ATen/core/stack.h b/aten/src/ATen/core/stack.h index 021e8a02104f22..35bb9964eb398e 100644 --- a/aten/src/ATen/core/stack.h +++ b/aten/src/ATen/core/stack.h @@ -4,6 +4,7 @@ #include #include +#include // TODO move this to c10 namespace @@ -108,7 +109,7 @@ static inline IValue pop(Stack* stack) { static inline std::vector pop(Stack& stack, size_t n) { std::vector result; result.reserve(n); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { result.push_back(std::move(peek(stack, i, n))); } drop(stack, n); diff --git a/aten/src/ATen/cpu/vec/functional_base.h b/aten/src/ATen/cpu/vec/functional_base.h index 7bd04e637c7e3c..eb160577e8694e 100644 --- a/aten/src/ATen/cpu/vec/functional_base.h +++ b/aten/src/ATen/cpu/vec/functional_base.h @@ -4,6 +4,7 @@ // See Note [Do not compile initializers with AVX] #include +#include namespace at { namespace vec { @@ -16,7 +17,7 @@ inline scalar_t vec_reduce_all( using Vec = vec::Vectorized; scalar_t acc_arr[Vec::size()]; acc_vec.store(acc_arr); - for (int64_t i = 1; i < 
size; i++) { + for (const auto i : c10::irange(1, size)) { std::array acc_arr_next = {0}; acc_arr_next[0] = acc_arr[i]; Vec acc_vec_next = Vec::loadu(acc_arr_next.data()); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index 40276ba8365d51..f6db6fdc49a4a9 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -4,6 +4,7 @@ // See Note [Do not compile initializers with AVX] #include +#include #include #include @@ -109,7 +110,7 @@ template <> class Vectorized> { Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { __at_align__ c10::complex tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -293,7 +294,7 @@ template <> class Vectorized> { __at_align__ c10::complex y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h index f40196320022bc..a4181a8abb8b21 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h @@ -4,6 +4,7 @@ // See Note [Do not compile initializers with AVX] #include +#include #include #include #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) @@ -144,7 +145,7 @@ template <> class Vectorized> { Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { __at_align__ c10::complex tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -327,7 +328,7 @@ template <> class Vectorized> { __at_align__ c10::complex y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h index f92f44e562a9d4..b64f910fbb6d8a 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h @@ -5,6 +5,7 @@ #include #include +#include #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) #include #endif @@ -72,7 +73,7 @@ template <> class Vectorized { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
- for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } std::memcpy( @@ -103,7 +104,7 @@ template <> class Vectorized { Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; store(tmp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -180,7 +181,7 @@ template <> class Vectorized { __at_align__ double tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igamma(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -190,7 +191,7 @@ template <> class Vectorized { __at_align__ double tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igammac(tmp[i], tmp_x[i]); } return loadu(tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h index deb95429843738..57a594f6354c49 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h @@ -5,6 +5,7 @@ #include #include +#include #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) #include #endif @@ -80,7 +81,7 @@ template <> class Vectorized { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } std::memcpy( @@ -109,7 +110,7 @@ template <> class Vectorized { Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -217,7 +218,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igamma(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -227,7 +228,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igammac(tmp[i], tmp_x[i]); } return loadu(tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h index 2aac442d2123d9..465266b8b55dac 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h @@ -5,6 +5,7 @@ #include #include +#include // Sleef offers vectorized versions of some transcedentals // such as sin, cos, tan etc.. 
// However for now opting for STL, since we are not building @@ -221,7 +222,7 @@ template <> class Vectorized { } else { __at_align__ float tmp_values[size()]; - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } std::memcpy( @@ -287,7 +288,7 @@ template <> class Vectorized { __at_align__ float tmp[size()]; __at_align__ float res[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { if (_isnan(tmp[i])) { std::memset(static_cast(&res[i]), 0xFF, sizeof(float)); } else { @@ -299,7 +300,7 @@ template <> class Vectorized { Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -336,7 +337,7 @@ template <> class Vectorized { __at_align__ float tmp_exp[size()]; store(tmp); exp.store(tmp_exp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = std::atan2(tmp[i], tmp_exp[i]); } return loadu(tmp); @@ -371,7 +372,7 @@ template <> class Vectorized { __at_align__ float tmp_q[size()]; store(tmp); q.store(tmp_q); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = std::fmod(tmp[i], tmp_q[i]); } return loadu(tmp); @@ -381,7 +382,7 @@ template <> class Vectorized { __at_align__ float tmp_b[size()]; store(tmp); b.store(tmp_b); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = std::hypot(tmp[i], tmp_b[i]); } return loadu(tmp); @@ -397,7 +398,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igamma(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -407,7 +408,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igammac(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -429,7 +430,7 @@ template <> class Vectorized { __at_align__ float tmp_b[size()]; store(tmp); b.store(tmp_b); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = std::nextafter(tmp[i], tmp_b[i]); } return loadu(tmp); @@ -494,7 +495,7 @@ template <> class Vectorized { __at_align__ float tmp_exp[size()]; store(tmp); exp.store(tmp_exp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = std::pow(tmp[i], tmp_exp[i]); } return loadu(tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index 5ee9919abca02c..2808c19bb3bb3d 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -6,6 +6,7 @@ #include #include #include +#include #include namespace at { @@ -98,7 +99,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
- for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int64_t)); @@ -221,7 +222,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int32_t)); @@ -435,7 +436,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int16_t)); @@ -684,7 +685,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int8_t)); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index 65ef6c62683943..504fac94dde44d 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -6,6 +6,8 @@ #include #include #include + +#include #include #include #include @@ -745,7 +747,7 @@ struct VectorizedQuantizedConverter { std::array vals; VectorizedQuantizedConverter(T val) { - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { vals[i] = val.val_; } } @@ -763,9 +765,9 @@ struct VectorizedQuantizedConverter { Vectorized zero_point, Vectorized scale_zp_premul) const { float_vec_return_type rv; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { float tmp_vals[8]; - for (int j = 0; j < 8; ++j) { + for (const auto j : c10::irange(8)) { tmp_vals[j] = at::native::dequantize_val( scale[j], zero_point[j], T(vals[8 * i + j])); } @@ -822,7 +824,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 8], 8); } @@ -838,7 +840,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::max(vals[i], b.vals[i]); } return retval; @@ -846,7 +848,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -861,7 +863,7 @@ struct Vectorized : public 
VectorizedQuantizedConverter< Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); } @@ -870,7 +872,7 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval[0].vals[i] = vals[i] - b.vals[i]; } return retval; @@ -881,7 +883,7 @@ struct Vectorized : public VectorizedQuantizedConverter< float multiplier, int32_t zero_point) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = nearbyint(static_cast(inp[0].vals[i]) * multiplier) + zero_point; @@ -954,7 +956,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 8], 8); } @@ -970,7 +972,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::max(vals[i], b.vals[i]); } return retval; @@ -978,7 +980,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -992,7 +994,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); } @@ -1002,8 +1004,8 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; constexpr int elem_per_int_vec = size() / int_num_vecs(); - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { retval[i].vals[j] = static_cast(vals[i * elem_per_int_vec + j]) - static_cast(b.vals[i * elem_per_int_vec + j]); @@ -1019,8 +1021,8 @@ struct Vectorized : public VectorizedQuantizedConverter< constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); Vectorized retval; - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { int32_t rounded = nearbyint(static_cast(inp[i].vals[j]) * multiplier) + zero_point; @@ -1074,7 +1076,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 8], 8); } @@ -1090,7 +1092,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = 
std::max(vals[i], b.vals[i]); } return retval; @@ -1098,7 +1100,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -1113,7 +1115,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); } @@ -1123,8 +1125,8 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; constexpr int elem_per_int_vec = size() / int_num_vecs(); - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { retval[i].vals[j] = static_cast(vals[i * elem_per_int_vec + j]) - static_cast(b.vals[i * elem_per_int_vec + j]); @@ -1140,8 +1142,8 @@ struct Vectorized : public VectorizedQuantizedConverter< constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); Vectorized retval; - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { int32_t rounded = nearbyint(static_cast(inp[i].vals[j]) * multiplier) + zero_point; diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h index 3a3e0daade098b..fefe5a0a4c9aba 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace vec { @@ -167,7 +168,7 @@ class Vectorized { Vectorized map(ComplexDbl (*const f)(ComplexDbl)) const { __at_align__ ComplexDbl tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -176,7 +177,7 @@ class Vectorized { Vectorized map(ComplexDbl (*const f)(const ComplexDbl&)) const { __at_align__ ComplexDbl tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -454,7 +455,7 @@ class Vectorized { __at_align__ ComplexDbl y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h index 712de24597dcfa..92beb6bc227ff2 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace at { namespace vec { @@ -222,7 +223,7 @@ class Vectorized { Vectorized map(ComplexFlt (*const f)(ComplexFlt)) const { __at_align__ ComplexFlt tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ 
-231,7 +232,7 @@ class Vectorized { Vectorized map(ComplexFlt (*const f)(const ComplexFlt&)) const { __at_align__ ComplexFlt tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -430,7 +431,7 @@ class Vectorized { __at_align__ ComplexFlt y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h index 2482af6ec2324f..7a80c24e42c6a9 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h @@ -3,6 +3,8 @@ #include #include #include + +#include #include #include diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h index 6fc22f0f7d3362..7dc3fdc6eafc38 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h @@ -4,6 +4,7 @@ // See Note [Do not compile initializers with AVX] #include +#include #include #include #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) @@ -149,7 +150,7 @@ template <> class Vectorized> { Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { __at_align__ c10::complex tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -357,7 +358,7 @@ template <> class Vectorized> { __at_align__ c10::complex y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h index dfd070604c40c5..a9876dd5fcadc5 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h @@ -4,6 +4,7 @@ // See Note [Do not compile initializers with AVX] #include +#include #include #include #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) @@ -667,7 +668,7 @@ template <> class Vectorized> { Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { __at_align__ c10::complex tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -858,7 +859,7 @@ template <> class Vectorized> { __at_align__ c10::complex y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h index 7128219748a061..7035b3e0f5d4b8 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h @@ -5,6 +5,7 @@ #include #include +#include #if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) #include #endif @@ -87,7 +88,7 @@ template <> class Vectorized { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. 
We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } std::memcpy( @@ -120,7 +121,7 @@ template <> class Vectorized { Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; store(tmp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -200,7 +201,7 @@ template <> class Vectorized { __at_align__ double tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igamma(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -210,7 +211,7 @@ template <> class Vectorized { __at_align__ double tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igammac(tmp[i], tmp_x[i]); } return loadu(tmp); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h index 1a2b113de9d367..70866b15eb7085 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h @@ -5,6 +5,7 @@ #include #include +#include #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) #include #endif @@ -104,7 +105,7 @@ template <> class Vectorized { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } std::memcpy( @@ -135,7 +136,7 @@ template <> class Vectorized { Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -246,7 +247,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igamma(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -256,7 +257,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igammac(tmp[i], tmp_x[i]); } return loadu(tmp); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h index f28c14ed3f73f4..92cbe6b6abd6d5 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace at { namespace vec { @@ -100,7 +101,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
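// Illustrative sketch, not code from the patch: the scalar-fallback pattern rewritten
// in the map/igamma/pow hunks above spills both operands to stack arrays, applies the
// scalar function per lane, and reloads. With c10::irange the index is const and its
// type is deduced from the loop bound. kLanes and apply_lanewise are hypothetical
// names; the real code operates on Vectorized<float>, not a plain float buffer.
#include <c10/util/irange.h>
#include <cmath>

constexpr int kLanes = 16;     // stand-in for Vectorized<float>::size()

inline void apply_lanewise(float (&a)[kLanes], const float (&b)[kLanes]) {
  // Before: for (int64_t i = 0; i < size(); i++) { ... }
  for (const auto i : c10::irange(kLanes)) {
    a[i] = std::pow(a[i], b[i]);
  }
}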
- for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int64_t)); @@ -253,7 +254,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int32_t)); @@ -485,7 +486,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int16_t)); @@ -761,7 +762,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int8_t)); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h index 3a1eda8874f1af..3ed7899bb75b60 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h @@ -6,6 +6,8 @@ #include #include #include + +#include #include #include #include @@ -744,7 +746,7 @@ struct VectorizedQuantizedConverter { std::array vals; VectorizedQuantizedConverter(T val) { - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { vals[i] = val.val_; } } @@ -762,9 +764,9 @@ struct VectorizedQuantizedConverter { Vectorized zero_point, Vectorized scale_zp_premul) const { float_vec_return_type rv; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { float tmp_vals[16]; - for (int j = 0; j < 16; ++j) { + for (const auto j : c10::irange(16)) { tmp_vals[j] = at::native::dequantize_val( scale[j], zero_point[j], T(vals[16 * i + j])); } @@ -829,7 +831,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 16], 16); } @@ -845,7 +847,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::max(vals[i], b.vals[i]); } return retval; @@ -853,7 +855,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -868,7 +870,7 @@ struct Vectorized : public 
VectorizedQuantizedConverter< Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); } @@ -877,7 +879,7 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval[0].vals[i] = vals[i] - b.vals[i]; } return retval; @@ -888,7 +890,7 @@ struct Vectorized : public VectorizedQuantizedConverter< float multiplier, int32_t zero_point) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = nearbyint(static_cast(inp[0].vals[i]) * multiplier) + zero_point; @@ -961,7 +963,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 16], 16); } @@ -977,7 +979,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::max(vals[i], b.vals[i]); } return retval; @@ -985,7 +987,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -999,7 +1001,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); } @@ -1009,8 +1011,8 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; constexpr int elem_per_int_vec = size() / int_num_vecs(); - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { retval[i].vals[j] = static_cast(vals[i * elem_per_int_vec + j]) - static_cast(b.vals[i * elem_per_int_vec + j]); @@ -1026,8 +1028,8 @@ struct Vectorized : public VectorizedQuantizedConverter< constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); Vectorized retval; - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { int32_t rounded = nearbyint(static_cast(inp[i].vals[j]) * multiplier) + zero_point; @@ -1081,7 +1083,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 16], 16); } @@ -1097,7 +1099,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] 
= std::max(vals[i], b.vals[i]); } return retval; @@ -1105,7 +1107,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -1120,7 +1122,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); } @@ -1130,8 +1132,8 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; constexpr int elem_per_int_vec = size() / int_num_vecs(); - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { retval[i].vals[j] = static_cast(vals[i * elem_per_int_vec + j]) - static_cast(b.vals[i * elem_per_int_vec + j]); @@ -1147,8 +1149,8 @@ struct Vectorized : public VectorizedQuantizedConverter< constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); Vectorized retval; - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { int32_t rounded = nearbyint(static_cast(inp[i].vals[j]) * multiplier) + zero_point; diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h index da5f318bf530cc..d6c921eddde262 100644 --- a/aten/src/ATen/cpu/vec/vec_base.h +++ b/aten/src/ATen/cpu/vec/vec_base.h @@ -31,6 +31,7 @@ #include #include #include +#include // These macros helped us unify vec_base.h #ifdef CPU_CAPABILITY_AVX512 @@ -150,7 +151,7 @@ struct Vectorized { static Vectorized blend(const Vectorized& a, const Vectorized& b) { int64_t mask = mask_; Vectorized vector; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { if (mask & 0x01) { vector[i] = b[i]; } else { @@ -165,7 +166,7 @@ struct Vectorized { Vectorized vector; int_same_size_t buffer[size()]; mask.store(buffer); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { if (buffer[i] & 0x01) { vector[i] = b[i]; @@ -178,14 +179,14 @@ struct Vectorized { template // step sometimes requires a higher precision type (e.g., T=int, step_t=double) static Vectorized arange(T base = static_cast(0), step_t step = static_cast(1)) { Vectorized vector; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { vector.values[i] = base + i * step; } return vector; } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { Vectorized vector; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { if (i < count) { vector[i] = b[i]; } else { @@ -340,7 +341,7 @@ struct Vectorized { } Vectorized atan2(const Vectorized &exp) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = std::atan2(values[i], exp[i]); } return ret; @@ -380,7 +381,7 @@ struct Vectorized { // U is for SFINAE purposes only. Make sure it is not changed. 
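// Illustrative sketch, not code from the patch: the fallback arange() rewrite in the
// vec_base.h hunks above shows that the loop index remains usable in arithmetic after
// the c10::irange conversion. kLanes and arange_fill are hypothetical names; the real
// code fills Vectorized<T>::values rather than a plain array.
#include <c10/util/irange.h>

constexpr int kLanes = 8;

inline void arange_fill(float (&values)[kLanes], float base, float step) {
  for (const auto i : c10::irange(kLanes)) {
    // i is deduced as int (the type of kLanes) and participates in arithmetic as before.
    values[i] = base + i * step;
  }
}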
static_assert(std::is_same::value, "U must be T"); Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = std::fmod(values[i], q[i]); } return ret; @@ -423,7 +424,7 @@ struct Vectorized { } Vectorized hypot(const Vectorized &b) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = std::hypot(values[i], b[i]); } return ret; @@ -436,14 +437,14 @@ struct Vectorized { } Vectorized igamma(const Vectorized &x) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = calc_igamma(values[i], x[i]); } return ret; } Vectorized igammac(const Vectorized &x) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = calc_igammac(values[i], x[i]); } return ret; @@ -456,7 +457,7 @@ struct Vectorized { } Vectorized nextafter(const Vectorized &b) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = std::nextafter(values[i], b[i]); } return ret; @@ -494,7 +495,7 @@ struct Vectorized { } Vectorized pow(const Vectorized &exp) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = std::pow(values[i], exp[i]); } return ret; @@ -808,7 +809,7 @@ inline gather(T const* base_addr, const Vectorized>& vindex) int_same_size_t index_arr[size]; vindex.store(static_cast(index_arr)); T buffer[size]; - for (int64_t i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; } return Vectorized::loadu(static_cast(buffer)); @@ -826,7 +827,7 @@ inline mask_gather(const Vectorized& src, T const* base_addr, mask.store(static_cast(mask_arr)); vindex.store(static_cast(index_arr)); T buffer[size]; - for (int64_t i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { if (mask_arr[i] & 0x01) { // check highest bit buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; } else { @@ -872,7 +873,7 @@ inline Vectorized> convert_to_int_of_same_size(const Vectoriz T src_arr[size]; src.store(static_cast(src_arr)); int_same_size_t buffer[size]; - for (int64_t i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { buffer[i] = static_cast>(src_arr[i]); } return Vectorized>::loadu(static_cast(buffer)); @@ -899,7 +900,7 @@ deinterleave2(const Vectorized& a, const Vectorized& b) { T buffer2[size]; a.store(static_cast(a_arr)); b.store(static_cast(b_arr)); - for (int64_t i = 0; i < half_size; i++) { + for (const auto i : c10::irange(half_size)) { buffer1[i] = a_arr[i * 2]; buffer1[half_size + i] = b_arr[i * 2]; buffer2[i] = a_arr[i * 2 + 1]; @@ -931,7 +932,7 @@ interleave2(const Vectorized& a, const Vectorized& b) { T buffer2[size]; a.store(static_cast(a_arr)); b.store(static_cast(b_arr)); - for (int64_t i = 0; i < half_size; i++) { + for (const auto i : c10::irange(half_size)) { buffer1[i * 2] = a_arr[i]; buffer1[i * 2 + 1] = b_arr[i]; buffer2[i * 2] = a_arr[half_size + i]; @@ -946,7 +947,8 @@ inline void convert(const src_T *src, dst_T *dst, int64_t n) { #ifndef _MSC_VER # pragma unroll #endif - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { + (void)i; //Suppress unused variable warning *dst = c10::static_cast_with_inter_type::apply(*src); src++; dst++; diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index ab542cb3bdab04..d6a6205ab1c249 
100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -4,6 +4,7 @@ #include #include +#include #define CUDABLAS_POSINT_CHECK(FD, X) \ TORCH_CHECK( \ @@ -295,7 +296,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { c, CUDA_R_16F, ldc, stridec, num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { - for (int64_t i = 0; i < num_batches; ++i) { + for (const auto i : c10::irange(num_batches)) { at::cuda::blas::gemm( transa, transb, m, n, k, diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index f52280e9d2401d..6a617edaf2777f 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -47,11 +48,11 @@ void TensorDescriptor::set(cudnnDataType_t datatype, IntArrayRef t_sizes, IntArr #undef STR int size[CUDNN_DIM_MAX]; int stride[CUDNN_DIM_MAX]; - for (size_t i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { size[i] = static_cast(t_sizes[i]); stride[i] = static_cast(t_strides[i]); } - for (size_t i = dim; i < pad; ++i) { + for (const auto i : c10::irange(dim, pad)) { size[i] = 1; stride[i] = 1; } @@ -126,10 +127,10 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo "cuDNN filters (a.k.a. weights) must be contiguous in desired memory_format"); int size[CUDNN_DIM_MAX]; - for (int i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { size[i] = (int) t.size(i); } - for (int i = dim; i < pad; ++i) { + for (const auto i : c10::irange(dim, pad)) { size[i] = (int) 1; } dim = std::max(dim, pad); diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp index 6911b1ad216bd3..ead45a52dad1a1 100644 --- a/aten/src/ATen/miopen/Descriptors.cpp +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -1,5 +1,6 @@ #include #include +#include #include @@ -39,11 +40,11 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr #undef STR int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; - for (size_t i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { size[i] = static_cast(t_sizes[i]); stride[i] = static_cast(t_strides[i]); } - for (size_t i = dim; i < pad; ++i) { + for (const auto i : c10::irange(dim, pad)) { size[i] = 1; stride[i] = 1; } @@ -103,10 +104,10 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; - for (int i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { size[i] = (int) t.size(i); } - for (int i = dim; i < pad; ++i) { + for (const auto i : c10::irange(dim, pad)) { size[i] = (int) 1; } diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 37700bb5867939..9db577792942ee 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -500,7 +500,7 @@ inline void _rrelu_with_noise_train( scalar_t* noise_data = noise.data_ptr(); auto gen = at::get_generator_or_default(generator, detail::getDefaultCPUGenerator()); std::lock_guard lock(gen->mutex_); - for (int64_t i = 0; i < input.numel(); i++) { + for (const auto i : c10::irange(input.numel())) { if (input_data[i] <= 0) { at::uniform_real_distribution uniform(lower, upper); const scalar_t r = (scalar_t)uniform(gen); @@ -610,7 +610,7 @@ void inline prelu_cpu_kernel_share_weights( auto weight_val = weight.data_ptr()[0]; at::parallel_for(0, input_numel, 1000, [&](int64_t start, int64_t end) { - 
for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { scalar_t input_data_val = input_data[i]; // to allow for compiler optimization, here splitting into two lines: scalar_t r = (input_data_val > 0) ? scalar_t(1) : weight_val; @@ -725,7 +725,7 @@ void inline prelu_cpu_backward_kernel_share_weights( scalar_t sum = at::parallel_reduce(0, input_numel, 1000, scalar_t(0), [&](int64_t start, int64_t end, scalar_t ident) -> scalar_t { scalar_t partial_sum = ident; - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { scalar_t input_data_val = input_data[i]; scalar_t grad_out_data_val = grad_out_data[i]; // to allow for compiler optimization, here splitting into two lines: @@ -839,7 +839,9 @@ std::tuple prelu_backward_cpu(const Tensor& grad_out_, const Ten std::vector reduce_dims; reduce_dims.push_back(0); if (dims > 2) { - for(int64_t i = 2; i < dims; i++) reduce_dims.push_back(i); + for (const auto i : c10::irange(2, dims)) { + reduce_dims.push_back(i); + } } weight_grad = weight_grad_collector.sum(reduce_dims); } diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp index 2324b958b34f51..b0be043e30692b 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace at { @@ -16,7 +17,7 @@ namespace { { TORCH_CHECK(output_size.size() == 2, "adaptive_avg_pool2d: output_size must be 2"); int64_t ndim = input.ndimension(); - for (int64_t i = 1; i < ndim; i++) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(input.size(i) > 0, "adaptive_avg_pool2d(): Expected input to have non-zero size for non-batch dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " @@ -52,7 +53,7 @@ namespace { const Tensor& input) { int64_t ndim = grad_output.ndimension(); - for (int64_t i = 1; i < ndim; i++) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(grad_output.size(i) > 0, "adaptive_avg_pool2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, " "but grad_output has sizes ", grad_output.sizes(), " with dimension ", i, " being " diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp index f7565b554d896e..41515259c33e1a 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace at { namespace native { @@ -33,19 +34,19 @@ static void adaptive_avg_pool3d_out_frame( int64_t istrideH, int64_t istrideW) { at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) { - for (int64_t d = start; d < end; d++) { + for (const auto d : c10::irange(start, end)) { /* loop over output */ - for (int64_t ot = 0; ot < osizeT; ot++) { + for (const auto ot : c10::irange(osizeT)) { int istartT = start_index(ot, osizeT, isizeT); int iendT = end_index(ot, osizeT, isizeT); int kT = iendT - istartT; - for (int64_t oh = 0; oh < osizeH; oh++) { + for (const auto oh : c10::irange(osizeH)) { int istartH = start_index(oh, osizeH, isizeH); int iendH = end_index(oh, osizeH, isizeH); int kH = iendH - istartH; - for (int64_t ow = 0; ow < osizeW; ow++) { + for (const auto ow : c10::irange(osizeW)) { int istartW = start_index(ow, osizeW, isizeW); int iendW = end_index(ow, osizeW, isizeW); int kW = iendW - istartW; @@ -58,9 +59,9 @@ static void 
adaptive_avg_pool3d_out_frame( /* compute local average: */ scalar_t sum = 0; - for (int it = 0; it < kT; it++) { - for (int ih = 0; ih < kH; ih++) { - for (int iw = 0; iw < kW; iw++) { + for (const auto it : c10::irange(kT)) { + for (const auto ih : c10::irange(kH)) { + for (const auto iw : c10::irange(kW)) { scalar_t val = *(ip + it * istrideT + ih * istrideH + iw * istrideW); sum += val; @@ -83,7 +84,7 @@ void adaptive_avg_pool3d_out_cpu_template( IntArrayRef output_size) { TORCH_CHECK(output_size.size() == 3, "adaptive_avg_pool3d: output_size must be 3"); - for (int64_t i = 1; i < input.ndimension(); i++) { + for (const auto i : c10::irange(1, input.ndimension())) { TORCH_CHECK( input.size(i) > 0, "adaptive_avg_pool3d(): Expected input to have non-zero size for non-batch dimensions, " @@ -148,7 +149,7 @@ void adaptive_avg_pool3d_out_cpu_template( auto input_data = input.data_ptr(); auto output_data = output.data_ptr(); at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; ++b) { + for (const auto b : c10::irange(start, end)) { adaptive_avg_pool3d_out_frame( input_data + b * input.stride(0), output_data + b * sizeD * osizeT * osizeH * osizeW, @@ -181,22 +182,22 @@ static void adaptive_avg_pool3d_backward_out_frame( int64_t osizeH, int64_t osizeW) { at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) { - for (int64_t d = start; d < end; d++) { + for (const auto d : c10::irange(start, end)) { scalar_t* gradInput_p_d = gradInput_p + d * isizeT * isizeW * isizeH; scalar_t* gradOutput_p_d = gradOutput_p + d * osizeT * osizeW * osizeH; /* calculate average */ - for (int64_t ot = 0; ot < osizeT; ot++) { + for (const auto ot : c10::irange(osizeT)) { int istartT = start_index(ot, osizeT, isizeT); int iendT = end_index(ot, osizeT, isizeT); int kT = iendT - istartT; - for (int64_t oh = 0; oh < osizeH; oh++) { + for (const auto oh : c10::irange(osizeH)) { int istartH = start_index(oh, osizeH, isizeH); int iendH = end_index(oh, osizeH, isizeH); int kH = iendH - istartH; - for (int64_t ow = 0; ow < osizeW; ow++) { + for (const auto ow : c10::irange(osizeW)) { int istartW = start_index(ow, osizeW, isizeW); int iendW = end_index(ow, osizeW, isizeW); int kW = iendW - istartW; @@ -205,9 +206,9 @@ static void adaptive_avg_pool3d_backward_out_frame( gradOutput_p_d[ot * osizeH * osizeW + oh * osizeW + ow] / kT / kH / kW; - for (int it = istartT; it < iendT; it++) { - for (int ih = istartH; ih < iendH; ih++) { - for (int iw = istartW; iw < iendW; iw++) { + for (const auto it : c10::irange(istartT, iendT)) { + for (const auto ih : c10::irange(istartH, iendH)) { + for (const auto iw : c10::irange(istartW, iendW)) { /* update gradient */ gradInput_p_d[it * isizeH * isizeW + ih * isizeW + iw] += grad_delta; @@ -265,7 +266,7 @@ Tensor& adaptive_avg_pool3d_backward_out_cpu_template( scalar_t* gradInput_data = gradInput.data_ptr(); scalar_t* gradOutput_data = gradOutput.data_ptr(); at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; b++) { + for (const auto b : c10::irange(start, end)) { adaptive_avg_pool3d_backward_out_frame( gradInput_data + b * sizeD * isizeT * isizeH * isizeW, gradOutput_data + b * sizeD * osizeT * osizeH * osizeW, diff --git a/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp index bc9bc60b9da957..6634d74a2e3f84 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp @@ -1,6 +1,7 @@ #include #include 
#include +#include namespace at { @@ -10,7 +11,7 @@ TORCH_META_FUNC(adaptive_max_pool2d) (const Tensor& input, IntArrayRef output_si TORCH_CHECK(ndim == 3 || ndim == 4, "adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: ", input.sizes()); - for (int64_t i = 1; i < ndim; i++) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(input.size(i) > 0, "adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, @@ -51,7 +52,7 @@ TORCH_META_FUNC(adaptive_max_pool2d_backward) int64_t ndim = grad_output.ndimension(); TORCH_CHECK(ndim == 3 || ndim == 4, "adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: ", grad_output.sizes()); - for (int64_t i = 1; i < ndim; i++) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(grad_output.size(i) > 0, "adaptive_max_pooling2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, " "but grad_output has sizes ", grad_output.sizes(), " with dimension ", i, diff --git a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp index 257670fc7c9d09..3bf1186b3bce82 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include @@ -11,7 +12,7 @@ TORCH_META_FUNC(adaptive_max_pool3d) (const Tensor& input, IntArrayRef output_si TORCH_CHECK( ndim == 4 || ndim == 5, "adaptive_max_pool3d(): Expected 4D or 5D tensor, but got: ", input.sizes()); - for (int64_t i = 1; i < ndim; i++) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK( input.size(i) > 0, "adaptive_max_pool3d(): Expected input to have non-zero size for non-batch dimensions, " @@ -96,8 +97,7 @@ static void adaptive_max_pool3d_single_out_frame( int64_t istrideW) { at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { - for (auto d = start; d < end; d++) - { + for (const auto d : c10::irange(start, end)) { /* loop over output */ int64_t ot, oh, ow; for(ot = 0; ot < osizeT; ot++) @@ -176,8 +176,7 @@ static void adaptive_max_pool3d_out_frame( int64_t istrideW) { at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) - { + for (const auto b : c10::irange(start, end)) { adaptive_max_pool3d_single_out_frame(input_data+b*istrideB, output_data+b*sizeD*osizeT*osizeH*osizeW, indices_data+b*sizeD*osizeT*osizeH*osizeW, sizeD, @@ -203,8 +202,7 @@ static void adaptive_max_pool3d_backward_single_out_frame( int64_t osizeW) { at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { - for (auto d = start; d < end; d++) - { + for (const auto d : c10::irange(start, end)) { scalar_t *gradInput_p_d = gradInput_p + d*isizeT*isizeH*isizeW; scalar_t *gradOutput_p_d = gradOutput_p + d*osizeT*osizeH*osizeW; int64_t *ind_p_d = ind_p + d*osizeT*osizeH*osizeW; @@ -244,8 +242,7 @@ static void adaptive_max_pool3d_backward_out_frame( int64_t osizeW) { at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) - { + for (const auto b : c10::irange(start, end)) { adaptive_max_pool3d_backward_single_out_frame(gradInput_data+b*sizeD*isizeT*isizeH*isizeW, gradOutput_data+b*sizeD*osizeT*osizeH*osizeW, indices_data+b*sizeD*osizeT*osizeH*osizeW, sizeD, diff --git a/aten/src/ATen/native/AveragePool3d.cpp b/aten/src/ATen/native/AveragePool3d.cpp index 658936f329fcd6..7d3febede6f9a5 100644 --- a/aten/src/ATen/native/AveragePool3d.cpp +++ 
b/aten/src/ATen/native/AveragePool3d.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include @@ -169,8 +170,7 @@ static void avg_pool3d_out_frame( c10::optional divisor_override) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { + for (const auto k : c10::irange(start, end)) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t i, j, ti; @@ -315,7 +315,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( scalar_t *output_data = output.data_ptr(); at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) { + for (const auto p : c10::irange(start, end)) { avg_pool3d_out_frame( input_data + p * istride, output_data + p * ostride, nslices, itime, iwidth, iheight, @@ -358,8 +358,7 @@ static void avg_pool3d_backward_out_frame( c10::optional divisor_override) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { + for (const auto k : c10::irange(start, end)) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t i, j, ti; @@ -500,8 +499,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cpu) ( scalar_t *gradOutput_data = gradOutput.data_ptr(); at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { avg_pool3d_backward_out_frame( gradInput_data + p * istride, gradOutput_data + p * ostride, nslices, itime, iwidth, iheight, diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index 16713ab03786eb..a910cf1fd46fc9 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -63,7 +63,7 @@ void apply_reflect_conj_tri_single(scalar_t* self, int64_t n, int64_t stride, bo std::function loop = [](int64_t, int64_t){}; if (upper) { loop = [&](int64_t start, int64_t end) { - for (int64_t i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { for (int64_t j = i + 1; j < n; j++) { self[i * stride + j] = conj_impl(self[j * stride + i]); } @@ -71,8 +71,8 @@ void apply_reflect_conj_tri_single(scalar_t* self, int64_t n, int64_t stride, bo }; } else { loop = [&](int64_t start, int64_t end) { - for (int64_t i = start; i < end; i++) { - for (int64_t j = 0; j < i; j++) { + for (const auto i : c10::irange(start, end)) { + for (const auto j : c10::irange(i)) { self[i * stride + j] = conj_impl(self[j * stride + i]); } } @@ -106,7 +106,7 @@ void apply_cholesky_inverse(Tensor& input, Tensor& infos, bool upper) { auto n = input.size(-2); auto lda = std::max(1, n); - for (int64_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { scalar_t* input_working_ptr = &input_data[i * input_matrix_stride]; int* info_working_ptr = &infos_data[i]; lapackCholeskyInverse(uplo, n, input_working_ptr, lda, info_working_ptr); @@ -501,7 +501,7 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) { lwork = std::max(1, real_impl(wkopt)); Tensor work = at::empty({lwork}, self.options()); - for (int64_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index b52a0f20a35c9c..9cf1f995f3ca9a 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ 
-2,6 +2,7 @@ #include #include #include +#include #if AT_BUILD_WITH_BLAS() extern "C" double ddot_(int *n, double *x, int *incx, double *y, int *incy); @@ -151,7 +152,7 @@ inline void scal(int64_t n, scalar_t a, scalar_t *x, int64_t incx) blas_impl::scal_fast_path(&i_n, &a, x, &i_incx); return; } - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { if (a == scalar_t(0)) { x[i * incx] = 0; } else { @@ -176,11 +177,10 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t } if ((trans == 'T') || (trans == 't')) { - for (int64_t i = 0; i < n; i++) - { + for (const auto i : c10::irange(n)) { scalar_t sum = 0; scalar_t *row_ = a + lda * i; - for (int64_t j = 0; j < m; j++) { + for (const auto j : c10::irange(m)) { sum += x[j * incx] * row_[j]; } if (beta == scalar_t(0)) { @@ -192,10 +192,10 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t } else { if (beta != scalar_t(1) && beta != scalar_t(0)) scal(m, beta, y, incy); - for (int64_t j = 0; j < n; j++) { + for (const auto j : c10::irange(n)) { scalar_t *column_ = a + lda * j; scalar_t z = alpha * x[j * incx]; - for (int64_t i = 0; i < m; i++) { + for (const auto i : c10::irange(m)) { //output values are ignored if beta is 0, and set to 0, nans and infs are not propagated if (j==0 && beta==scalar_t(0)) { y[i * incy] = scalar_t(0); diff --git a/aten/src/ATen/native/Bucketization.cpp b/aten/src/ATen/native/Bucketization.cpp index e4ad35f59fc698..63b88510a6f4e9 100644 --- a/aten/src/ATen/native/Bucketization.cpp +++ b/aten/src/ATen/native/Bucketization.cpp @@ -2,6 +2,7 @@ #include #include #include +#include /* Implement a TF like searchsorted and a bucketize function running on cpu * @@ -58,7 +59,7 @@ void searchsorted_cpu_contiguous(Tensor& result, const Tensor& input, const Tens bool is_1d_boundaries = boundaries.dim() == 1; at::parallel_for(0, numel_in, SEARCHSORTED_GRAIN_SIZE, [&](int64_t start, int64_t end) { - for (int64_t i = start; i < end; ++i) { + for (const auto i : c10::irange(start, end)) { // If boundaries tensor is 1d, we always search the entire boundary tensor int64_t start_bd = is_1d_boundaries ? 
0 : i / idim_in * idim_bd; const input_t *data_bd_start = &data_bd[start_bd]; diff --git a/aten/src/ATen/native/Col2Im.cpp b/aten/src/ATen/native/Col2Im.cpp index efc41bea0c207a..f1e08a887c841e 100644 --- a/aten/src/ATen/native/Col2Im.cpp +++ b/aten/src/ATen/native/Col2Im.cpp @@ -5,6 +5,7 @@ #include #include +#include // Note [im2col/col2im output padding] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -150,7 +151,7 @@ static void col2im_out_cpu_template( stride_width + 1; - for (int64_t elt = 0; elt < batch_size; elt++) { + for (const auto elt : c10::irange(batch_size)) { input_n = input.select(0, elt); output_n = output.select(0, elt); diff --git a/aten/src/ATen/native/ComplexHelper.h b/aten/src/ATen/native/ComplexHelper.h index 1b9a94e9089068..e9efd4b7c88db2 100644 --- a/aten/src/ATen/native/ComplexHelper.h +++ b/aten/src/ATen/native/ComplexHelper.h @@ -24,7 +24,7 @@ inline Tensor view_tensor( inline DimVector computeStrideForViewAsReal(IntArrayRef oldstride) { DimVector res(oldstride.size() + 1); - for(size_t i = 0; i < oldstride.size(); i++) { + for (const auto i : c10::irange(oldstride.size())) { res[i] = oldstride[i] * 2; } res.back() = 1; diff --git a/aten/src/ATen/native/ConstantPadNd.cpp b/aten/src/ATen/native/ConstantPadNd.cpp index 71bbfde152895a..f7a2d76ed52280 100644 --- a/aten/src/ATen/native/ConstantPadNd.cpp +++ b/aten/src/ATen/native/ConstantPadNd.cpp @@ -47,7 +47,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) new_shape.emplace_back(input_sizes[i]); } - for (size_t i = 0; i < (size_t)l_pad; i++) { + for (const auto i : c10::irange((size_t)l_pad)) { auto pad_idx = pad.size() - ((i + 1) * 2); auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 9f0abaf7e61cc3..7a1dcde7b8ce37 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include namespace at { namespace native { @@ -35,7 +36,7 @@ static inline std::vector conv_output_size( std::vector output_size(dim); output_size[0] = input_size[input_batch_size_dim]; output_size[1] = weight_size[weight_output_channels_dim]; - for (size_t d = 2; d < dim; ++d) { + for (const auto d : c10::irange(2, dim)) { auto dilation_ = has_dilation ? 
dilation[d - 2] : 1; auto kernel = dilation_ * (weight_size[d] - 1) + 1; output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1; @@ -53,7 +54,7 @@ static inline std::vector conv_input_size( std::vector input_size(dim); input_size[0] = output_size[output_batch_size_dim]; input_size[1] = weight_size[weight_input_channels_dim] * groups; - for (size_t d = 2; d < dim; ++d) { + for (const auto d : c10::irange(2, dim)) { int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1; input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) + kernel + output_padding[d - 2]; @@ -69,7 +70,7 @@ static inline std::vector conv_weight_size( std::vector weight_size(dim); weight_size[0] = output_size[1]; weight_size[1] = input_size[1] / groups; - for (size_t d = 2; d < dim; ++d) { + for (const auto d : c10::irange(2, dim)) { int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2] + 2 * padding[d - 2] - output_padding[d - 2]; weight_size[d] = (kernel - 1) / dilation[d - 2] + 1; diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 78eb889f8cfa6e..e8baf42b8c9bb1 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -975,7 +975,7 @@ at::Tensor _convolution( } else { std::vector outputs(params.groups); input = input.contiguous(); - for (int g = 0; g < params.groups; ++g) { + for (const auto g : c10::irange(params.groups)) { auto input_g = subtensor(input, 1, params.groups, g); auto weight_g = subtensor(weight, 0, params.groups, g); auto bias_g = subtensor(bias, 0, params.groups, g); @@ -1212,7 +1212,7 @@ std::tuple _convolution_double_backward( const c10::option } } else { std::vector gWt_list(groups); - for (int g = 0; g < groups; ++g) { + for (const auto g : c10::irange(groups)) { auto ggIt_g = subvariable(ggIt, 0, groups, g); auto gOt_g = subvariable(gOt, 0, groups, g); if (gOt_g.is_cuda()) { @@ -1239,7 +1239,7 @@ std::tuple _convolution_double_backward( const c10::option // the ConvForward kernels don't support asymmetric padding. 
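// Illustrative sketch, not code from the patch: the two-argument form used in the conv
// hunks above, c10::irange(2, dim), walks only the spatial dimensions, exactly like
// `for (size_t d = 2; d < dim; ++d)`. The formula below is a simplified, dilation-free
// variant of the conv_output_size logic; spatial_output_size is a hypothetical name and
// the batch/channel entries (0 and 1) are left for the caller to fill.
#include <c10/util/irange.h>
#include <cstdint>
#include <vector>

inline std::vector<int64_t> spatial_output_size(
    const std::vector<int64_t>& input_size,   // N, C, d1, d2, ...
    const std::vector<int64_t>& kernel_size,  // spatial extents only
    const std::vector<int64_t>& stride,
    const std::vector<int64_t>& padding) {
  const auto dim = input_size.size();
  std::vector<int64_t> output_size(dim);
  for (const auto d : c10::irange(2, dim)) {
    output_size[d] =
        (input_size[d] + 2 * padding[d - 2] - kernel_size[d - 2]) / stride[d - 2] + 1;
  }
  return output_size;
}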
auto gW_size = gW.sizes(); auto w_size = weight.sizes(); - for (size_t i = 2; i < gW_size.size(); ++i) { + for (const auto i : c10::irange(2, gW_size.size())) { if (gW_size[i] > w_size[i]) { gW = gW.narrow(i, 0, w_size[i]); gW_size = gW.sizes(); @@ -1268,7 +1268,7 @@ std::tuple _convolution_double_backward( const c10::option // rather than narrowing the computed gI auto gI_size = gI.sizes(); auto i_size = input.sizes(); - for (size_t i = 2; i < gI_size.size(); ++i) { + for (const auto i : c10::irange(2, gI_size.size())) { if (gI_size[i] > i_size[i]) { gI = gI.narrow(i, 0, i_size[i]); gI_size = gI.sizes(); @@ -1289,7 +1289,7 @@ std::tuple _convolution_double_backward( const c10::option gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape; } } else { - for(size_t i = 0; i < kernel_size.size(); ++i) { + for (const auto i : c10::irange(kernel_size.size())) { // Check if whole input has been used or not auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.dilation[i] - 2 * gi_conv_params.padding[i] diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 67a045ad1a198d..f9f2bb88daf1c2 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace at { namespace native { @@ -299,7 +300,7 @@ void slow_conv2d_backward_out_cpu_template( at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { auto fgrad_input = std::make_unique( c10::multiply_integers(finput.sizes().slice(1))); - for (int64_t t = start; t < end; t++) { + for (const auto t : c10::irange(start, end)) { auto grad_input_t = grad_input_a[t]; auto grad_output_t = grad_output_a[t]; slow_conv2d_backward_update_grad_input_frame( @@ -478,7 +479,7 @@ std::tuple slow_conv2d_forward_out_cpu( auto weight_2d_a = weight_2d.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (int64_t t = start; t < end; t++) { + for (const auto t : c10::irange(start, end)) { auto input_t = input_a[t]; auto output_t = output_a[t]; auto finput_t = finput_a[t]; diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index cd8ba16903f0b6..88d4245f9d93c8 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -6,6 +6,7 @@ #include #include #include +#include constexpr int64_t CONV3D_GRAIN_SALT = 20; @@ -358,7 +359,7 @@ void slow_conv3d_backward_out_cpu_template( auto fgrad_input_a = fgrad_input.accessor(); auto weight_2d_a = weight2d.accessor(); - for (int64_t t = start; t < end; t++) { + for (const auto t : c10::irange(start, end)) { auto grad_input_t = grad_input_a[t]; auto grad_output_t = grad_output_a[t]; auto fgrad_input_t = fgrad_input_a[t]; @@ -462,7 +463,7 @@ static void slow_conv3d_backward_parameters_out_cpu_template( auto grad_weight_2d_a = grad_weight_2d.accessor(); auto grad_output_a = grad_output_contiguous.accessor(); auto finput_a = finput.accessor(); - for (int64_t t = 0; t < batch_size; t++) { + for (const auto t : c10::irange(batch_size)) { auto grad_output_t = grad_output_a[t]; auto finput_t = finput_a[t]; slow_conv3d_backward_weight_frame( @@ -564,7 +565,7 @@ std::tuple slow_conv3d_forward_out_cpu(const Tensor& at::parallel_for( 0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) { - for (int64_t t = start; t < end; t++) { + for (const auto t : c10::irange(start, end)) { auto input_t = input_a[t]; auto output_t = output_a[t]; 
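// Illustrative standalone sketch, not code from the patch: the chunking pattern rewritten
// throughout the ConvolutionMM and pooling hunks. at::parallel_for hands each worker a
// [start, end) range, and the per-element loop over that range becomes
// c10::irange(start, end). process_batch is a hypothetical name for this sketch.
#include <ATen/Parallel.h>
#include <c10/util/irange.h>
#include <cstdint>
#include <vector>

inline void process_batch(std::vector<float>& items) {
  at::parallel_for(0, static_cast<int64_t>(items.size()), /*grain_size=*/0,
                   [&](int64_t start, int64_t end) {
    // Before: for (auto t = start; t < end; t++) { ... }
    for (const auto t : c10::irange(start, end)) {
      items[t] *= 2.0f;  // stand-in for the real per-sample work
    }
  });
}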
auto finput_t = finput_a[t]; diff --git a/aten/src/ATen/native/ConvolutionTBC.cpp b/aten/src/ATen/native/ConvolutionTBC.cpp index 2bd0f5ae4b9e3b..c90577822218e9 100644 --- a/aten/src/ATen/native/ConvolutionTBC.cpp +++ b/aten/src/ATen/native/ConvolutionTBC.cpp @@ -1,5 +1,6 @@ #include #include +#include #include namespace at { @@ -39,7 +40,7 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in weight_size[2], }, self.options()); output.copy_(bias.expand(output.sizes())); - for (int k = 0; k < kw; k++) { + for (const auto k : c10::irange(kw)) { int iShift = std::max(0, static_cast(k - real_pad)); int oShift = std::max(0, static_cast(real_pad - k)); // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index e2c2a1150c5729..c28ca2b66ef8f7 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #ifdef USE_FBGEMM @@ -65,16 +66,16 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { int nc = std::min(NC - C, BLOCK_SZ); // 1. copy columns from src to buf - for (int c = 0; c < nc; c++) { + for (const auto c : c10::irange(nc)) { memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(scalar_t)); } // 2. transpose buf in place int rc_max = std::max(nr, nc); int rc_min = std::min(nr, nc); - for (int r = 0; r < rc_max; r++) { + for (const auto r : c10::irange(rc_max)) { int end = std::min(r, rc_min); - for (int c = 0; c < end; c++) { + for (const auto c : c10::irange(end)) { scalar_t tmp = bp[r + BLOCK_SZ * c]; bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; bp[r * BLOCK_SZ + c] = tmp; @@ -82,7 +83,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { } // 3. 
copy rows from buf to dst - for (int r = 0; r < nr; r++) { + for (const auto r : c10::irange(nr)) { memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(scalar_t)); } } diff --git a/aten/src/ATen/native/Cross.cpp b/aten/src/ATen/native/Cross.cpp index 70dba97520c008..49f3c80e27d509 100644 --- a/aten/src/ATen/native/Cross.cpp +++ b/aten/src/ATen/native/Cross.cpp @@ -3,6 +3,7 @@ #include #include +#include namespace at { namespace native { @@ -30,7 +31,7 @@ Tensor & cross_out(const Tensor & input, const Tensor & other, const c10::option int64_t dim = -1; if(!dimension.has_value()) { - for(int64_t i = 0; i < input.dim(); i++) { + for (const auto i : c10::irange(input.dim())) { if(input.size(i) == 3) { dim = i; break; diff --git a/aten/src/ATen/native/DilatedConvolutionUtils.h b/aten/src/ATen/native/DilatedConvolutionUtils.h index 0f9bf90ab5a169..2d4815799b10f2 100644 --- a/aten/src/ATen/native/DilatedConvolutionUtils.h +++ b/aten/src/ATen/native/DilatedConvolutionUtils.h @@ -5,6 +5,7 @@ #include #include +#include #define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ TORCH_CHECK( \ @@ -43,7 +44,7 @@ std::vector get_output_size( IntArrayRef pad_size, IntArrayRef dilation_size) { std::vector sizes; - for (int index = 0; index < dim; index++) { + for (const auto index : c10::irange(dim)) { sizes.push_back( div_rtn( input.size(index + input.dim() - dim) + 2 * pad_size[index] - diff --git a/aten/src/ATen/native/DilatedMaxPool3d.cpp b/aten/src/ATen/native/DilatedMaxPool3d.cpp index 21398c09067598..57fa6f9ea691cf 100644 --- a/aten/src/ATen/native/DilatedMaxPool3d.cpp +++ b/aten/src/ATen/native/DilatedMaxPool3d.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -37,8 +38,7 @@ static void max_pool3d_with_indices_single_out_frame( int dilationH) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { + for (const auto k : c10::irange(start, end)) { /* loop over output */ // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t i, j, ti; @@ -120,8 +120,7 @@ static void max_pool3d_with_indices_out_frame( int dilationT, int dilationW, int dilationH) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { max_pool3d_with_indices_single_out_frame( input_data + p * istride, output_data + p * ostride, @@ -285,8 +284,7 @@ static void max_pool3d_with_indices_backward_single_out_frame( int dilationH) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { + for (const auto k : c10::irange(start, end)) { scalar_t *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight; scalar_t *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight; int64_t *indz_p_k = indz_p + k * otime * owidth * oheight; @@ -330,8 +328,7 @@ static void max_pool3d_with_indices_backward_out_frame( int dilationT, int dilationW, int dilationH) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { max_pool3d_with_indices_backward_single_out_frame( gradInput_data + p * istride, gradOutput_data + p * ostride, diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index ac56071edb8054..c4a6ec6cef5561 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace at { namespace native { @@ -16,8 +17,10 @@ 
Tensor make_feature_noise(const Tensor& input) { sizes.reserve(input.dim()); sizes.push_back(input_sizes[0]); sizes.push_back(input_sizes[1]); - for (int64_t i = 2; i < input.dim(); ++i) + for (const auto i : c10::irange(2, input.dim())) { + (void)i; //Suppress unused variable warning sizes.push_back(1); + } return at::empty(sizes, input.options()); } diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 959005c52b2fc4..cac0cbe7130f26 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -123,7 +123,7 @@ Tensor embedding_dense_backward_cpu( auto parallel_section = [&](index_t start, index_t end) { TensorIterator iter(add_iter); - for (int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { if (indices_data[i] != padding_idx) { index_t k = indices_data[i]; if (k >= start && k < end) { @@ -167,7 +167,7 @@ Tensor & embedding_renorm_cpu_( // Note that we cannot use at::parallel_for here because we perform operations on // Tensor inside the loop. See github.com/pytorch/pytorch/issues/28370 for more details. - for (auto i = 0; i < num_indices; i++) { + for (const auto i : c10::irange(num_indices)) { if (i > 0 && sorted_indices[i] == sorted_indices[i - 1]) { continue; } diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 53477e2e20e989..66ae4b4f7956cb 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -107,7 +107,7 @@ index_select_add(const Tensor &select_indices, auto output_stride0 = output.strides()[0]; auto output_stride1 = output.strides()[1]; - for (int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // We can skip indices equal to padding_idx so they are not included in // the reduction if (select_indices_data[i] != padding_idx) { @@ -247,7 +247,7 @@ index_select_add(const Tensor &select_indices, auto output_stride0 = output.strides()[0]; auto output_stride1 = output.strides()[1]; auto numel = add_indices.numel(); - for (int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // We can skip indices equal to padding_idx so they are not included in // the reduction if (select_indices_data[i] != padding_idx) { @@ -302,14 +302,14 @@ index_select_scale_add(const Tensor &select_indices, auto* scale_data = scale.data_ptr(); auto scale_stride = scale.strides()[0]; - for (int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // We can skip indices equal to padding_idx so they are not included in // the reduction if (select_indices_data[i] != padding_idx) { auto* src_base = src_data + src_stride0 * select_indices_data[i]; auto* output_base = output_data + output_stride0 * add_indices_data[i]; auto scale = scale_data[i * scale_stride]; - for (int64_t j = 0; j < ddim; j++) { + for (const auto j : c10::irange(ddim)) { output_base[j * output_stride1] += src_base[j * src_stride1] * scale; } } else if (bag_size.defined()) { @@ -419,14 +419,14 @@ index_select_scale_add(const Tensor &select_indices, auto numel = add_indices.numel(); - for (int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // We can skip indices equal to padding_idx so they are not included in // the reduction if (select_indices_data[i] != padding_idx) { auto* src_base = src_data + src_stride0 * select_indices_data[i]; auto* output_base = output_data + output_stride0 * add_indices_data[i]; auto scale = scale_data[i * scale_stride]; - for (int64_t j = 0; j < ddim; 
j++) { + for (const auto j : c10::irange(ddim)) { output_base[j * output_stride1] += src_base[j * src_stride1] * scale; } } else if (bag_size.defined()) { diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index 91f0534d5fa1f6..acaad52a299591 100644 --- a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace at { namespace native { @@ -63,7 +64,7 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { if (nDims > 2) { int64_t dim1 = height; - for (int64_t i = 1; i < nDims; i++) { + for (const auto i : c10::irange(1, nDims)) { if (self.size(i) != dim1) { AT_ERROR("all dimensions of input must be of equal length"); } @@ -76,7 +77,7 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { int64_t size = std::min(height, width); int64_t stride = 0; - for (int64_t i = 0; i < nDims; i++) { + for (const auto i : c10::irange(nDims)) { stride += self.stride(i); } strides.push_back(stride); diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index e7eacd3e76e0f7..bdff052e94b001 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -32,7 +33,7 @@ TORCH_META_FUNC(fractional_max_pool2d) ( int64_t ndims = input.ndimension(); TORCH_CHECK(ndims == 3 || ndims == 4, "fractional_max_pool2d(): Expected 3D or 4D tensor, but got: ", input.sizes()); - for (int64_t i = 1; i < ndims; ++i) { + for (const auto i : c10::irange(1, ndims)) { TORCH_CHECK(input.size(i) > 0, "fractional_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, but got", input.sizes(), " with dimension ", i, " being empty."); @@ -106,7 +107,7 @@ static void fractional_max_pool2d_out_single_batch_frame( int outputW, int outputH, int poolSizeW, int poolSizeH) { at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { - for (auto plane = start; plane < end; ++plane) { + for (const auto plane : c10::irange(start, end)) { /* each plane contains 2 random samples, one for W and one for H */ scalar_t* randomSamplesForPlane = randomSamples + plane * 2; @@ -177,7 +178,7 @@ static void fractional_max_pool2d_out_frame( return; } at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) { - for (auto batch = start; batch < end; ++batch) { + for (const auto batch : c10::irange(start, end)) { fractional_max_pool2d_out_single_batch_frame( input + batch * numPlanes * inputH * inputW, output + batch * numPlanes * outputH * outputW, @@ -254,7 +255,7 @@ static void fractional_max_pool2d_backward_out_single_batch_frame( int inputW, int inputH, int outputW, int outputH) { at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { - for (auto plane = start; plane < end; plane++) { + for (const auto plane : c10::irange(start, end)) { scalar_t* gradInputForPlane = gradInput + plane * inputW * inputH; scalar_t* gradOutputForPlane = gradOutput + plane * outputW * outputH; int64_t* indicesForPlane = indices + plane * outputW * outputH; @@ -291,7 +292,7 @@ static void fractional_max_pool2d_backward_out_frame( return; } at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) { - for (auto batch = start; batch < end; ++batch) { + for (const auto batch : c10::irange(start, end)) { fractional_max_pool2d_backward_out_single_batch_frame( gradInput + batch * numPlanes * inputH * inputW, gradOutput + 
batch * numPlanes * outputH * outputW, diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index 279ff92467733d..237f9d4395bcea 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -44,7 +44,7 @@ static void fractional_max_pool3d_out_single_batch_frame( int64_t poolSizeT, int64_t poolSizeH, int64_t poolSizeW) { at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { - for (auto plane = start; plane < end; ++plane) { + for (const auto plane : c10::irange(start, end)) { /* each plane contains 3 random samples, one for T, one for W, and one for H */ scalar_t* randomSamplesForPlane = randomSamples + plane * 3; @@ -126,7 +126,7 @@ static void fractional_max_pool3d_out_frame( } at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) { - for (auto batch = start; batch < end; ++batch) { + for (const auto batch : c10::irange(start, end)) { fractional_max_pool3d_out_single_batch_frame( input + batch * numPlanes * inputW * inputH * inputT, output + batch * numPlanes * outputW * outputH * outputT, @@ -171,7 +171,7 @@ void fractional_max_pool3d_out_cpu_template( TORCH_CHECK(ndims == 4 || ndims == 5, "fractional_max_pool3d_out(): Expected 4D or 5D tensor, but got: ", input_.sizes()); - for (int64_t i = 1; i < ndims; ++i) { + for (const auto i : c10::irange(1, ndims)) { TORCH_CHECK(input_.size(i) > 0, "fractional_max_pool3d_out(): Expected input to have non-zero size for non-batch dimensions, but got", input_.sizes(), " with dimension ", i, " being empty."); @@ -243,7 +243,7 @@ static void fractional_max_pool3d_backward_out_single_batch_frame( int64_t outputT, int64_t outputH, int64_t outputW) { at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { - for (auto plane = start; plane < end; plane++) { + for (const auto plane : c10::irange(start, end)) { scalar_t* gradInputForPlane = gradInput + plane * inputT * inputH * inputW; scalar_t* gradOutputForPlane = gradOutput + plane * outputT * outputH * outputW; @@ -284,7 +284,7 @@ static void fractional_max_pool3d_backward_out_frame( } at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) { - for (auto batch = start; batch < end; ++batch) { + for (const auto batch : c10::irange(start, end)) { fractional_max_pool3d_backward_out_single_batch_frame( gradInput + batch * numPlanes * inputW * inputH * inputT, gradOutput + batch * numPlanes * outputW * outputH * outputT, diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index df3ff5d73f7ebb..740f725167a63a 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace at { namespace native { @@ -51,12 +52,12 @@ namespace { scalar_t *grid_ptr = grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { - for (int64_t n = start; n < end; ++n) { + for (const auto n : c10::irange(start, end)) { scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t d = 0; d < out_D; ++d) { - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { + for (const auto d : c10::irange(out_D)) { + for (const auto h : c10::irange(out_H)) { + for (const auto w : c10::irange(out_W)) { // get the corresponding input x, y, z co-ordinates from grid scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; scalar_t ix = 
*grid_ptr_NDHW; @@ -222,12 +223,12 @@ namespace { scalar_t *gGrid_ptr = grad_grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { - for (int64_t n = start; n < end; ++n) { + for (const auto n : c10::irange(start, end)) { scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN; - for (int64_t d = 0; d < out_D; ++d) { - for (int64_t h = 0; h < out_H; ++h) { + for (const auto d : c10::irange(out_D)) { + for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) { // get the corresponding input x, y, z co-ordinates from grid scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; @@ -416,11 +417,11 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, scalar_t *grid_ptr = grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { - for (int64_t n = start; n < end; ++n) { + for (const auto n : c10::irange(start, end)) { scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { + for (const auto h : c10::irange(out_H)) { + for (const auto w : c10::irange(out_W)) { // get the corresponding input x, y, z co-ordinates from grid scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; scalar_t x = *grid_ptr_NHW; @@ -505,7 +506,7 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, scalar_t coefficients[4]; // Interpolate 4 values in the x directon - for (int64_t i = 0; i < 4; ++i) { + for (const auto i : c10::irange(4)) { coefficients[i] = cubic_interp1d( get_value_bounded(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners), get_value_bounded(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners), @@ -578,11 +579,11 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, scalar_t *gGrid_ptr = grad_grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { - for (int64_t n = start; n < end; ++n) { + for (const auto n : c10::irange(start, end)) { scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; - for (int64_t h = 0; h < out_H; ++h) { + for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { // get the corresponding input x, y co-ordinates from grid scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; @@ -703,8 +704,8 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC+= inp_sC) { scalar_t gOut = *gOut_ptr_NCHW; - for (int64_t i = 0; i < 4; ++i) { - for (int64_t j = 0; j < 4; ++j) { + for (const auto i : c10::irange(4)) { + for (const auto j : c10::irange(4)) { // set input gradient add_value_bounded(gInp_ptr_NC, ix_nw - 1 + i, iy_nw - 1 + j, @@ -857,7 +858,7 @@ Tensor grid_sampler(const Tensor& input, const Tensor& grid, !(input.dim() == 5 && static_cast(interpolation_mode) == GridSamplerInterpolation::Bicubic), "grid_sampler(): bicubic interpolation only supports 4D input" ); - for (int64_t i = 2; i < 
input.dim(); i++) { + for (const auto i : c10::irange(2, input.dim())) { TORCH_CHECK(input.size(i) > 0, "grid_sampler(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp index f66e4d44544616..c4b05bc18b566f 100644 --- a/aten/src/ATen/native/Im2Col.cpp +++ b/aten/src/ATen/native/Im2Col.cpp @@ -5,6 +5,7 @@ #include #include +#include namespace at { namespace native { @@ -91,7 +92,7 @@ static void im2col_out_cpu_template( Tensor input_n; Tensor output_n; - for (int64_t elt = 0; elt < batch_size; elt++) { + for (const auto elt : c10::irange(batch_size)) { input_n = input.select(0, elt); output_n = output.select(0, elt); diff --git a/aten/src/ATen/native/IndexingUtils.h b/aten/src/ATen/native/IndexingUtils.h index 5b938e9536c486..2dea9a0e94d416 100644 --- a/aten/src/ATen/native/IndexingUtils.h +++ b/aten/src/ATen/native/IndexingUtils.h @@ -2,6 +2,7 @@ #include #include #include +#include #include @@ -31,7 +32,7 @@ static C10_UNUSED std::vector expandTensors(const Tensor & self, const t } // The sizes of the ByteTensor mask or bool tensor must match the sizes of the // corresponding dimensions in self - for (int64_t j = 0; j < index.dim(); j++) { + for (const auto j : c10::irange(index.dim())) { int64_t srcIdx = result.size() + j; if (index.size(j) != self.size(srcIdx)) { invalid_mask(self, srcIdx, index, j); @@ -39,7 +40,7 @@ static C10_UNUSED std::vector expandTensors(const Tensor & self, const t } // Replace with nonzeros auto nonzero = index.nonzero(); - for (int64_t j = 0; j < index.dim(); j++) { + for (const auto j : c10::irange(index.dim())) { result.emplace_back(nonzero.select(1, j)); } } else { diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 4c9be81055bfd8..d97a143e2fd9c1 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1264,7 +1264,7 @@ static void addbmm_impl_( } auto adjusted_beta(beta); - for (int64_t batch = 0; batch < num_batches; ++batch) { + for (const auto batch : c10::irange(num_batches)) { result.addmm_(batch1[batch], batch2[batch], adjusted_beta, alpha); adjusted_beta = 1; // accumulate output once } @@ -1321,23 +1321,23 @@ inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const T int64_t grain_size = std::min(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1); parallel_for(0, bs, grain_size, [&](int64_t b_begin, int64_t b_end) { - for (int64_t b = b_begin; b < b_end; b++) { + for (const auto b : c10::irange(b_begin, b_end)) { auto r1 = r0[b]; auto s1 = s0[b]; auto m1 = m0[b]; - for (int64_t i = 0; i < is; i++) { + for (const auto i : c10::irange(is)) { auto r2 = r1[i]; auto s2 = s1[i]; - for (int64_t j = 0; j < js; j++) { + for (const auto j : c10::irange(js)) { scalar_t &r = r2[j]; if (is_bmm) { r = 0; - for (int64_t k = 0; k < ks; k++) { + for (const auto k : c10::irange(ks)) { r += s2[k] * m1[k][j]; } } else { r *= beta; - for (int64_t k = 0; k < ks; k++) { + for (const auto k : c10::irange(ks)) { r += alpha * s2[k] * m1[k][j]; } } @@ -2100,10 +2100,11 @@ void compute_T18_scale_square( auto mexp_scaled = at::native::compute_T18(a_scaled); auto s_cpu = (s.device().type() == at::kCPU) ? 
s : s.to(at::kCPU); - for (int64_t i = 0; i < mexp_scaled.size(0); ++i) { + for (const auto i : c10::irange(mexp_scaled.size(0))) { auto s_val = s_cpu.select(0, i).template item(); auto mexp = mexp_scaled.select(0, i); - for (int64_t p = 0; p < s_val; ++p) { + for (const auto p : c10::irange(s_val)) { + (void)p; //Suppress unused variable warning mexp = at::matmul(mexp, mexp); } mexp_out.select(0, i).copy_(mexp); @@ -2371,7 +2372,7 @@ Tensor& nuclear_norm_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tens // (e.g. [0, 1, 2, ..., ndim-1]) static std::vector make_dim_list(int64_t ndim) { std::vector dim_list(ndim); - for (int64_t ind = 0; ind < ndim; ind++) { + for (const auto ind : c10::irange(ndim)) { dim_list[ind] = ind; } return dim_list; @@ -2924,7 +2925,7 @@ struct KronImpl final { a_reshape = c10::SmallVector(2 * maxdim); b_reshape = c10::SmallVector(2 * maxdim); result_reshape = c10::SmallVector(maxdim); - for (int64_t i = 0; i < maxdim; i++) { + for (const auto i : c10::irange(maxdim)) { a_reshape[2 * i] = (i >= pad_self ? self.sizes()[i - pad_self] : 1); a_reshape[2 * i + 1] = 1; b_reshape[2 * i] = 1; @@ -2939,7 +2940,7 @@ struct KronImpl final { TORCH_INTERNAL_ASSERT(result.defined(), "Cannot call kron_out with an undefined result tensor as the out argument. Please allocate a Tensor before calling kron_out with it."); c10::SmallVector mul_shape(2 * maxdim); - for (int64_t i = 0; i < maxdim; i++) { + for (const auto i : c10::irange(maxdim)) { mul_shape[2 * i] = a_reshape[2 * i]; mul_shape[2 * i + 1] = b_reshape[2 * i + 1]; } diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index bbabca6bbfb8aa..c495fc83075654 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -169,7 +170,8 @@ void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const fu auto* b_batch_idx_ptr = data[0]; auto* a_batch_idx_ptr = data[1]; - for (int64_t elem = 0; elem < nelems; ++elem) { + for (const auto elem : c10::irange(nelems)) { + (void)elem; //Suppress unused variable warning auto b_curr_linear_batch_idx = *reinterpret_cast(b_batch_idx_ptr); auto a_curr_linear_batch_idx = *reinterpret_cast(a_batch_idx_ptr); @@ -332,7 +334,7 @@ static inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) { const int64_t ndim = self.ndimension(); std::vector perm; - for (int64_t i = 0; i < ndim; i++) { + for (const auto i : c10::irange(ndim)) { auto it = std::find(a.begin(), a.end(), i); if (it == a.end()) { perm.push_back(i); @@ -476,7 +478,7 @@ static inline std::vector create_dim_backshift_permutation(int64_t dim0 "duplicate or invalid dimensions"); std::vector permutation(ndim); int64_t cur_permuted_dim = 0; - for (int64_t dim_ind = 0; dim_ind < ndim; dim_ind++) { + for (const auto dim_ind : c10::irange(ndim)) { if ((dim_ind != dim0) && (dim_ind != dim1)) { permutation[cur_permuted_dim++] = dim_ind; } @@ -493,7 +495,7 @@ static inline std::vector create_dim_backshift_permutation(int64_t dim0 static inline std::vector create_reverse_permutation(std::vector permutation) { int64_t ndim = permutation.size(); std::vector reverse_permutation(ndim); - for (int64_t dim_ind = 0; dim_ind < ndim; dim_ind++) { + for (const auto dim_ind : c10::irange(ndim)) { reverse_permutation[permutation[dim_ind]] = dim_ind; } return reverse_permutation; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp 
index dbf0e2cc990950..19af04b9731de4 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -60,7 +61,7 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const std::vector tg_batch_offsets(batch_size); if (targets.dim() == 1) { // concatenated targets int64_t pos = 0; - for (int64_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { tg_batch_offsets[i] = pos; pos += target_lengths[i]; if (max_target_length < target_lengths[i]) @@ -72,7 +73,7 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const else { // batch x max_target_length // dim is 2 int64_t tg_batch_stride = targets.stride(0); - for (int64_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { tg_batch_offsets[i] = i * tg_batch_stride; if (max_target_length < target_lengths[i]) max_target_length = target_lengths[i]; @@ -84,7 +85,7 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const " (while checking arguments for ", c, ")"); } int64_t max_input_length = log_probs.size(0); - for (int64_t b = 0; b < batch_size; b++) { + for (const auto b : c10::irange(batch_size)) { TORCH_CHECK(input_lengths[b] <= max_input_length, "Expected input_lengths to have value at most ", max_input_length, ", but got value ", input_lengths[b], " (while checking arguments for ", c, ")"); @@ -103,7 +104,7 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const // first the default log_alpha.narrow(1, 0, 1).fill_(neginf); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; b++) { + for (const auto b : c10::irange(start, end)) { int64_t input_length = input_lengths[b]; int64_t target_length = target_lengths[b]; auto log_probs_a = log_probs_a_global[b]; @@ -116,7 +117,7 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const log_alpha_a[0][1] = log_probs_a[0][get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; // now the loop over the inputs - for (int64_t t=1; t()[b]; auto grad_a = grad_a_global[b]; if (zero_infinity && nll == std::numeric_limits::infinity()) { @@ -322,8 +323,8 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_ // this could be a great target for further vectorization. // grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16) scalar_t gr = grad_out.accessor()[b]; - for (int64_t t = 0; t < input_length; t++) { // or go for the full thing? - for (int64_t c = 0; c < num_labels; c++) { + for (const auto t : c10::irange(input_length)) { // or go for the full thing? 
+ for (const auto c : c10::irange(num_labels)) { scalar_t& res = grad_a[t][c]; scalar_t lp = log_probs_a[t][c]; res = (std::exp(lp)-std::exp(res + nll - lp)) * gr; diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index fa71663716f5d3..f59de5c8817a42 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace native { @@ -17,21 +18,21 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu( int64_t dim) { using accscalar_t = at::acc_type; accscalar_t sum = 0; - for (int64_t ddt = 0; ddt < dim; ddt++) { + for (const auto ddt : c10::irange(dim)) { int64_t target_idx = target_data[ddt]; if (target_idx < 0) { break; } is_target_data[target_idx] = 1; } - for (int64_t dt = 0; dt < dim; dt++) { + for (const auto dt : c10::irange(dim)) { int64_t target_idx = target_data[dt]; if (target_idx < 0) { break; } scalar_t input_target = input_data[target_idx]; - for (int64_t d = 0; d < dim; d++) { + for (const auto d : c10::irange(dim)) { if (!is_target_data[d]) { scalar_t z = 1 - input_target + input_data[d]; if (z > 0) { @@ -63,7 +64,8 @@ static void multilabel_margin_loss_forward_out_frame( accscalar_t sum = 0; - for (int64_t t = 0; t < nframe; t++) { + for (const auto t : c10::irange(nframe)) { + (void)t; //Suppress unused variable warning sum += multilabel_margin_loss_forward_inner_sum_cpu( input_data, target_data, is_target_data, dim); @@ -81,7 +83,7 @@ static void multilabel_margin_loss_forward_out_frame( } else { auto output_acc = output.accessor(); - for (int64_t t = 0; t < nframe; t++) { + for (const auto t : c10::irange(nframe)) { scalar_t sum = multilabel_margin_loss_forward_inner_sum_cpu( input_data, target_data, is_target_data, dim); @@ -171,15 +173,16 @@ static void multilabel_margin_loss_backward_out_frame( reduction == Reduction::Mean ? 1. / (nframe * dim) : 1. 
/ dim); scalar_t* grad_input_row_data = grad_input.data_ptr(); - for (int64_t t = 0; t < nframe; t++) { - for (int64_t dt = 0; dt < dim; dt++) { + for (const auto t : c10::irange(nframe)) { + (void)t; //Suppress unused variable warning + for (const auto dt : c10::irange(dim)) { int64_t target_idx = target_data[dt]; if (target_idx < 0) { break; } scalar_t input_target = input_data[target_idx]; - for (int64_t d = 0; d < dim; d++) { + for (const auto d : c10::irange(dim)) { if (!is_target_data[d]) { scalar_t z = 1 - input_target + input_data[d]; if (z > 0) { @@ -206,8 +209,8 @@ static void multilabel_margin_loss_backward_out_frame( } else { check_dim_size(grad_output, 1, 0, nframe); auto grad_output_acc = grad_output.accessor(); - for (int64_t t = 0; t < nframe; t++) { - for (int64_t d = 0; d < dim; d++) { + for (const auto t : c10::irange(nframe)) { + for (const auto d : c10::irange(dim)) { grad_input_data[t * dim + d] *= grad_output_acc[t]; } } diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index b65aaf9b6adce2..c7ab53f1d211b7 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace at { namespace native { @@ -18,7 +19,7 @@ inline scalar_t multi_margin_inner_sum_cpu( const int64_t target_idx) { const scalar_t input_target = input_data[target_idx]; scalar_t sum = 0; - for (int64_t d = 0; d < dim; d++) { + for (const auto d : c10::irange(dim)) { if (d == target_idx) { continue; } @@ -63,7 +64,7 @@ static inline void multi_margin_loss_cpu_kernel( // cannot be handled by TensorAccessor) if (reduction == Reduction::None && output.dim() > 0) { auto output_acc = output.accessor(); - for (int64_t t = 0; t < nframe; t++) { + for (const auto t : c10::irange(nframe)) { const auto idx = target_index_checked(target_data, t, dim); auto sum = multi_margin_inner_sum_cpu( input_data, weight_data, p, margin, dim, idx); @@ -73,7 +74,7 @@ static inline void multi_margin_loss_cpu_kernel( } else { accscalar_t sum = 0; auto output_acc = output.data_ptr(); - for (int64_t t = 0; t < nframe; t++) { + for (const auto t : c10::irange(nframe)) { const auto idx = target_index_checked(target_data, t, dim); sum += multi_margin_inner_sum_cpu( input_data, weight_data, p, margin, dim, idx); @@ -149,11 +150,11 @@ static void multi_margin_loss_backward_cpu_kernel( int64_t dim, int64_t reduction) { scalar_t* grad_input_row_data = grad_input_data; - for (int64_t t = 0; t < nframe; t++) { + for (const auto t : c10::irange(nframe)) { int64_t target_idx = target_index_checked(target_data, t, dim); scalar_t input_target = input_data[target_idx]; scalar_t grad_input_target = 0; - for (int64_t d = 0; d < dim; d++) { + for (const auto d : c10::irange(dim)) { scalar_t z = margin - input_target + input_data[d]; if (d == target_idx) { continue; @@ -186,8 +187,8 @@ static void multi_margin_loss_backward_cpu_kernel( } } else { auto grad_output_acc = grad_output.accessor(); - for (int64_t t = 0; t < nframe; t++) { - for (int64_t d = 0; d < dim; d++) { + for (const auto t : c10::irange(nframe)) { + for (const auto d : c10::irange(dim)) { grad_input_data[t * dim + d] *= grad_output_acc[t]; } } diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 78f982afcd871d..dfb4aced85c41f 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -9,6 +9,7 @@ #include #include +#include namespace at { namespace meta { @@ -155,7 +156,7 @@ 
static void nll_loss_out_frame( auto output_acc = output.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { const auto cur_target = target_acc[i]; if (cur_target == ignore_index) { @@ -215,7 +216,7 @@ static void nll_loss_out_frame( scalar_t weight_partial_sums[cascade_sum_num_levels] = {0}; // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) scalar_t loss_partial_sums[cascade_sum_num_levels] = {0}; - for (int64_t b = 0; b < batch_size; b++) { + for (const auto b : c10::irange(batch_size)) { const int64_t cur_target = target_data[b]; if (cur_target == ignore_index) { ++num_ignored; @@ -330,7 +331,7 @@ static void nll_loss_backward_out_frame( auto grad_input_acc = grad_input.accessor(); auto grad_output_acc = grad_output.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { auto cur_target = target_acc[i]; if (cur_target == ignore_index) { continue; diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 0e3e9e6fec77ba..d7ebf65231f1ed 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace at { namespace native { @@ -109,9 +110,9 @@ static void nll_loss2d_forward_out_frame( auto target_acc = target.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; b++) { - for (int64_t h = 0; h < H; h++) { - for (int64_t w = 0; w < W; w++) { + for (const auto b : c10::irange(start, end)) { + for (const auto h : c10::irange(H)) { + for (const auto w : c10::irange(W)) { const int64_t cur_target = (int64_t)target_acc[b][h][w]; if (cur_target == ignore_index) { @@ -176,8 +177,8 @@ static void nll_loss2d_forward_out_frame( const int64_t level_mask = level_step - 1; int64_t num_ignored = 0; - for (int64_t b = 0; b < batch_size; b++) { - for (int64_t elem = 0; elem < map_size; elem++) { + for (const auto b : c10::irange(batch_size)) { + for (const auto elem : c10::irange(map_size)) { const int64_t cur_target = target_data[b * map_size + elem]; if (cur_target == ignore_index) { ++num_ignored; @@ -286,9 +287,9 @@ static void nll_loss2d_backward_out_frame( auto target_acc = target.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; b++) { - for (int64_t h = 0; h < H; h++) { - for (int64_t w = 0; w < W; w++) { + for (const auto b : c10::irange(start, end)) { + for (const auto h : c10::irange(H)) { + for (const auto w : c10::irange(W)) { const int64_t cur_target = target_acc[b][h][w]; if (cur_target == ignore_index) { continue; @@ -329,8 +330,8 @@ static void nll_loss2d_backward_out_frame( : grad_output_value); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; b++) { - for (int64_t elem = 0; elem < map_size; elem++) { + for (const auto b : c10::irange(start, end)) { + for (const auto elem : c10::irange(map_size)) { const int64_t t = target_data[b * map_size + elem]; if (t != ignore_index) { diff --git a/aten/src/ATen/native/NNPACK.cpp b/aten/src/ATen/native/NNPACK.cpp index fa1d1d86c6930d..e83320e09fa6eb 100644 --- a/aten/src/ATen/native/NNPACK.cpp +++ b/aten/src/ATen/native/NNPACK.cpp @@ -60,6 +60,7 @@ bool _nnpack_available() { #include #include #include 
+#include namespace at { namespace native { @@ -238,7 +239,7 @@ Tensor _nnpack_spatial_convolution( const size_t input_size_per_batch = input_channels * input_size.width * input_size.height; const size_t output_size_per_batch = output_channels * output_size.width * output_size.height; - for (size_t batch = 0u; batch < batch_size; ++batch) { + for (const auto batch : c10::irange(0u, batch_size)) { const nnp_status status = nnp_convolution_inference( algorithm, nnp_convolution_transform_strategy_compute, diff --git a/aten/src/ATen/native/NamedTensor.cpp b/aten/src/ATen/native/NamedTensor.cpp index 1d5d8e4a4a6982..c987f72261ab47 100644 --- a/aten/src/ATen/native/NamedTensor.cpp +++ b/aten/src/ATen/native/NamedTensor.cpp @@ -100,7 +100,7 @@ Tensor refine_names(const Tensor& self, DimnameList names) { self_names.size(), " and ", names.size(), " respectively)."); check_names_valid_for(self, names); - for (size_t idx = 0; idx < self_names.size(); idx++) { + for (const auto idx : c10::irange(self_names.size())) { const auto& self_name = self_names[idx]; const auto& out_name = names[idx]; if (self_name == out_name || self_name.isWildcard()) { @@ -221,7 +221,7 @@ Tensor align_to(const Tensor& tensor, DimnameList order, int64_t ellipsis_idx) { }; // Fill in the non-ellipsis dimensions - for (auto order_idx = 0U; order_idx < order.size(); ++order_idx) { + for (const auto order_idx : c10::irange(0U, order.size())) { auto out_idx = order_idx; if (order_idx >= ellipsis_idx) { out_idx = order_idx + num_ellipsis_names; diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 25ae1a765e85ff..fdce903c0806d9 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -156,7 +157,7 @@ std::tuple batch_norm_cpu_update_stats_template( // Reduce all dimensions except dim=1 DimVector reduce_dims(ndim - 1); reduce_dims[0] = 0; - for (int64_t i = 2; i < ndim; ++i) { + for (const auto i : c10::irange(2, ndim)) { reduce_dims[i - 1] = i; } @@ -178,7 +179,7 @@ std::tuple batch_norm_cpu_update_stats_template( batch_norm_cpu_collect_stats_stub(kCPU, _mean, _var_sum, input); parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) { - for (int64_t f = b_begin; f < b_end; ++f) { + for (const auto f : c10::irange(b_begin, b_end)) { save_mean_a[f] = _mean_a[f]; save_var_transform_a[f] = VarTransform{}(_var_sum_a[f] / n, eps); @@ -206,7 +207,7 @@ std::tuple batch_norm_cpu_update_stats_template( parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) { TensorIterator iter(reduce_iter); - for (int64_t f = b_begin; f < b_end; ++f) { + for (const auto f : c10::irange(b_begin, b_end)) { // compute variance per input iter.unsafe_replace_operand(0, in_data + channel_stride * f); accscalar_t var_sum = 0; @@ -283,7 +284,7 @@ std::tuple batch_norm_backward_cpu_template( // Reduce all dimensions except dim=1 DimVector reduce_dims(ndim - 1); reduce_dims[0] = 0; - for (int64_t i = 2; i < ndim; ++i) { + for (const auto i : c10::irange(2, ndim)) { reduce_dims[i - 1] = i; } @@ -330,7 +331,7 @@ std::tuple batch_norm_backward_cpu_template( TensorIterator unary_iter_local(unary_iter); TensorIterator binary_iter_local(binary_iter); - for (int64_t f = b_begin; f < b_end; ++f) { + for (const auto f : c10::irange(b_begin, b_end)) { scalar_t w = weight.defined() ? 
weight_a[f] : 1; scalar_t mean, invstd; diff --git a/aten/src/ATen/native/PackedSequence.cpp b/aten/src/ATen/native/PackedSequence.cpp index 798672ccdeaeff..ec997d86aa1b59 100644 --- a/aten/src/ATen/native/PackedSequence.cpp +++ b/aten/src/ATen/native/PackedSequence.cpp @@ -77,7 +77,7 @@ std::tuple _pack_padded_sequence(const Tensor& _input, const Ten // more elements below in our column, we lower the counter (prev_l), and append the new // block to the output. int64_t prev_l = 0; - for (int64_t i = 0; i < batch_size; ++i) { + for (const auto i : c10::irange(batch_size)) { int64_t l = lengths[batch_size - 1 - i]; if (l > prev_l) { auto current_batch_size = batch_size - i; @@ -109,7 +109,7 @@ Tensor _pack_padded_sequence_backward(const Tensor& grad, at::IntArrayRef input_ int64_t offset = 0; int64_t max_seq_len = batch_sizes_t.size(0); int64_t * batch_sizes = batch_sizes_t.data_ptr(); - for (int64_t i = 0; i < max_seq_len; ++i) { + for (const auto i : c10::irange(max_seq_len)) { grad_input[i].slice(0, 0, batch_sizes[i]).copy_(grad.slice(0, offset, offset + batch_sizes[i])); offset += batch_sizes[i]; } @@ -170,7 +170,8 @@ std::tuple _pad_packed_sequence(const Tensor& data, const Tensor } int64_t dec = prev_batch_size - batch_size; if (dec > 0) { - for (int64_t j = 0; j < dec; ++j) { + for (const auto j : c10::irange(dec)) { + (void)j; //Suppress unused variable warning (*lengths--) = i; } } @@ -206,7 +207,7 @@ Tensor pad_sequence(TensorList sequences, bool batch_first, double padding_value out_dims.insert(out_dims.end(), trailing_dims.begin(), trailing_dims.end()); Tensor out = at::full(out_dims, padding_value, sequences[0].options()); - for (int64_t i = 0; i < sequences_size; i++) { + for (const auto i : c10::irange(sequences_size)) { const Tensor currseq = sequences[i]; const int64_t length_i = currseq.size(0); // use index notation to prevent duplicate references to the tensor diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index da774911b5737e..3db102ad855053 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -2,6 +2,7 @@ #include #include #include +#include #pragma once @@ -212,7 +213,7 @@ pool3d_shape_check( TORCH_CHECK(ndim == 4 || ndim == 5, fn_name, ": Expected 4D or 5D tensor for input, but got: ", input.sizes()); - for (int64_t i = 1; i < ndim; ++i) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(input.size(i) > 0, fn_name, "Expected input to have non-zero size for non-batch dimensions, but got", input.sizes(), " with dimension ", i, " being empty."); diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp index 0c4256f5272041..e3030f71d16517 100644 --- a/aten/src/ATen/native/QuantizedLinear.cpp +++ b/aten/src/ATen/native/QuantizedLinear.cpp @@ -206,9 +206,9 @@ void CalcColOffsetsTranspose( const int8_t* Bint8, int32_t B_zero_point, int32_t* col_offsets) { - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { int32_t sum = 0; - for (int j = 0; j < K; ++j) { + for (const auto j : c10::irange(K)) { sum += Bint8[i * K + j]; } col_offsets[i] = sum - B_zero_point * K; @@ -353,7 +353,7 @@ bool CheckAndSaturate(T max_val, T* element) { void HandleWeightsSaturation(int64_t N, float* weight) { const float kFp16Max = RawUint16ToFp16(0x7BFF); bool found_out_of_range = false; - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { if (CheckAndSaturate(kFp16Max, weight + i)) { found_out_of_range = true; }
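
For reference, every hunk above applies the same mechanical rewrite. The following standalone sketch (not part of the patch; it only assumes that c10/util/irange.h is on the include path, and the accumulator is a placeholder body) shows the three variants used throughout: a zero-based range, a half-open [start, end) range as used inside at::parallel_for callbacks, and a loop whose index is never read, where a void cast keeps -Wunused-variable quiet.

    #include <c10/util/irange.h>
    #include <cstdint>

    int64_t irange_pattern_sketch(int64_t n, int64_t start, int64_t end) {
      int64_t acc = 0;

      // Before: for (int64_t i = 0; i < n; ++i)
      for (const auto i : c10::irange(n)) {
        acc += i;
      }

      // Before: for (int64_t t = start; t < end; ++t)
      // (the form used inside at::parallel_for callbacks above)
      for (const auto t : c10::irange(start, end)) {
        acc += t;
      }

      // When the body never reads the index, the cast suppresses the
      // unused-variable warning, matching the (void)i / (void)p / (void)t
      // lines added in this patch.
      for (const auto i : c10::irange(n)) {
        (void)i; // Suppress unused variable warning
        ++acc;
      }

      return acc;
    }
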