diff --git a/.ci/pytorch/perf_test/compare_with_baseline.py b/.ci/pytorch/perf_test/compare_with_baseline.py
index f7b962632cd79f..c756df378729bf 100644
--- a/.ci/pytorch/perf_test/compare_with_baseline.py
+++ b/.ci/pytorch/perf_test/compare_with_baseline.py
@@ -59,12 +59,12 @@
 print("z-value: ", z_value)
 
 if z_value >= 3:
-    raise Exception('''\n
+    raise Exception(f'''\n
 z-value >= 3, there is high chance of perf regression.\n
 To reproduce this regression, run
-`cd .ci/pytorch/perf_test/ && bash {}.sh` on your local machine
+`cd .ci/pytorch/perf_test/ && bash {test_name}.sh` on your local machine
 and compare the runtime before/after your code change.
-'''.format(test_name))
+''')
 else:
     print("z-value < 3, no perf regression detected.")
     if args.update:
diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py
index cc253f36cbd1b7..b1026ac420c945 100755
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@@ -620,7 +620,7 @@ def get_ghstack_prs(repo: GitRepo, pr: "GitHubPR") -> List[Tuple["GitHubPR", str
     Get the open PRs in the stack that are below this PR. Throws error if any of the PRs are out of sync.
     """
     assert pr.is_ghstack_pr()
-    entire_stack: List[Tuple["GitHubPR", str]] = []
+    entire_stack: List[Tuple[GitHubPR, str]] = []
     # For ghstack, cherry-pick commits based from origin
     orig_ref = f"{repo.remote}/{re.sub(r'/head$', '/orig', pr.head_ref())}"
     rev_list = repo.revlist(f"{pr.default_branch()}..{orig_ref}")
diff --git a/.lintrunner.toml b/.lintrunner.toml
index 0160314749cb1a..a6c14e5f82a760 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -3086,6 +3086,6 @@ init_command = [
     'python3',
     'tools/linter/adapters/pip_init.py',
     '--dry-run={{DRYRUN}}',
-    'ruff==0.0.277',
+    'ruff==0.0.280',
 ]
 is_formatter = true
diff --git a/pyproject.toml b/pyproject.toml
index 5cfea348cae5aa..2b16f472f612f9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ ignore = [
     "B019", "B020", "B023", "B024", "B026",
     "B028", # No explicit `stacklevel` keyword argument found
-    "B904", "B905",
+    "B904",
     "E402",
     "C408", # C408 ignored because we like the dict keyword argument syntax
     "E501", # E501 is not flexible enough, we're using B950 instead
@@ -70,6 +70,7 @@ select = [
     # Not included in flake8
     "UP",
     "PERF",
+    "PGH004",
     "PLE",
     "TRY302",
 ]
diff --git a/test/cpp_api_parity/functional_impl_check.py b/test/cpp_api_parity/functional_impl_check.py
index c09aaccd8f76b6..828f57e7e69812 100644
--- a/test/cpp_api_parity/functional_impl_check.py
+++ b/test/cpp_api_parity/functional_impl_check.py
@@ -225,7 +225,7 @@ def build_cpp_tests(unit_test_class, print_cpp_source=False):
     assert len(unit_test_class.functional_test_params_map) > 0
     cpp_sources = TORCH_NN_COMMON_TEST_HARNESS + SAMPLE_FUNCTIONAL_CPP_SOURCE
     functions = []
-    for test_name, test_params in unit_test_class.functional_test_params_map.items():
+    for test_params in unit_test_class.functional_test_params_map.values():
         cpp_sources += generate_test_cpp_sources(test_params=test_params, template=TORCH_NN_FUNCTIONAL_TEST_FORWARD)
         functions.append(f'{test_params.functional_variant_name}_test_forward')
     if print_cpp_source:
diff --git a/test/cpp_api_parity/module_impl_check.py b/test/cpp_api_parity/module_impl_check.py
index 8ede07b59034a4..aa18798940ae28 100644
--- a/test/cpp_api_parity/module_impl_check.py
+++ b/test/cpp_api_parity/module_impl_check.py
@@ -292,7 +292,7 @@ def build_cpp_tests(unit_test_class, print_cpp_source=False):
     assert len(unit_test_class.module_test_params_map) > 0
     cpp_sources = TORCH_NN_COMMON_TEST_HARNESS + SAMPLE_MODULE_CPP_SOURCE
     functions = []
-    for test_name, test_params in unit_test_class.module_test_params_map.items():
+    for test_params in unit_test_class.module_test_params_map.values():
         cpp_sources += generate_test_cpp_sources(
             test_params=test_params, template=TORCH_NN_MODULE_TEST_FORWARD_BACKWARD)
         functions.append(f'{test_params.module_variant_name}_test_forward_backward')
diff --git a/test/distributed/fsdp/test_fsdp_optim_state.py b/test/distributed/fsdp/test_fsdp_optim_state.py
index 21b2cd129d6e2a..f324d1dbc9d894 100644
--- a/test/distributed/fsdp/test_fsdp_optim_state.py
+++ b/test/distributed/fsdp/test_fsdp_optim_state.py
@@ -1870,14 +1870,14 @@ def step():
         step()
         original_osd = deepcopy(optim.state_dict())
-        for param_id, state in original_osd["state"].items():
+        for state in original_osd["state"].values():
             # Add customized value
             state["value1"] = 2.74
             state["value2"] = None
 
         osd = FSDP.optim_state_dict(model, optim, optim_state_dict=original_osd)
         osd_to_load = FSDP.optim_state_dict_to_load(model, optim, osd)
-        for param_id, state in osd_to_load["state"].items():
+        for state in osd_to_load["state"].values():
             self.assertEqual(state["value1"], 2.74)
             self.assertEqual(state["value2"], None)
diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py
index 2e5197f3b5bb43..ca971a23fa2dc5 100644
--- a/test/distributions/test_distributions.py
+++ b/test/distributions/test_distributions.py
@@ -1898,8 +1898,8 @@ def rvs(self, n_sample):
         self._check_sampler_sampler(
             MixtureSameFamily(Categorical(probs=probs), Normal(loc, scale)),
             ScipyMixtureNormal(probs.numpy(), loc.numpy(), scale.numpy()),
-            '''MixtureSameFamily(Categorical(probs={}),
-            Normal(loc={}, scale={}))'''.format(probs, loc, scale))
+            f'''MixtureSameFamily(Categorical(probs={probs}),
+            Normal(loc={loc}, scale={scale}))''')
 
     def test_normal(self):
         loc = torch.randn(5, 5, requires_grad=True)
diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index 3cf250e6e5cfba..8c6830b0f4a45b 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -504,7 +504,7 @@ def __init__(
 
     def forward(self, init_features):
         features = [init_features]
-        for name, layer in self.items():
+        for layer in self.values():
             new_features = layer(features)
             features.append(new_features)
         return torch.cat(features, 1)
diff --git a/test/functorch/discover_coverage.py b/test/functorch/discover_coverage.py
index 0926783cff9c35..868f1694974be2 100644
--- a/test/functorch/discover_coverage.py
+++ b/test/functorch/discover_coverage.py
@@ -321,7 +321,7 @@ def get_all_tested_ops():
     overridable_outplace_we_care_about = get_public_overridable_outplace_we_care_about()
     op_to_opinfo = get_ops_covered_by_opinfos()
     result = set({})
-    for name, op in get_covered_ops(overridable_outplace_we_care_about).items():
+    for op in get_covered_ops(overridable_outplace_we_care_about).values():
         opinfos = op_to_opinfo[op]
         for opinfo in opinfos:
             result.add(opinfo.name)
@@ -332,7 +332,7 @@ def get_skipped_or_xfailed_ops_for(test_name):
     overridable_outplace_we_care_about = get_public_overridable_outplace_we_care_about()
     op_to_opinfo = get_ops_covered_by_opinfos()
     result = set({})
-    for name, op in get_covered_ops(overridable_outplace_we_care_about).items():
+    for op in get_covered_ops(overridable_outplace_we_care_about).values():
         opinfos = op_to_opinfo[op]
         for opinfo in opinfos:
             for decorator in opinfo.decorators:
diff --git a/test/inductor/test_kernel_benchmark.py b/test/inductor/test_kernel_benchmark.py
index 105b3180b2331e..93d6670d576fd2 100644
--- a/test/inductor/test_kernel_benchmark.py
+++ b/test/inductor/test_kernel_benchmark.py
@@ -27,7 +27,7 @@ def setUp(self):
 
     def get_compiled_module(self):
         compiled_module = None
-        for k, v in PyCodeCache.cache.items():
+        for v in PyCodeCache.cache.values():
             if hasattr(v, "benchmark_compiled_module"):
                 self.assertTrue(
                     compiled_module is None, "Found multiple compiled modules"
diff --git a/test/inductor/test_triton_wrapper.py b/test/inductor/test_triton_wrapper.py
index afac34a84fa5b5..ae0725a3de3cf8 100644
--- a/test/inductor/test_triton_wrapper.py
+++ b/test/inductor/test_triton_wrapper.py
@@ -12,7 +12,7 @@ class TestTritonWrapper(TestCase):
     def get_compiled_module(self):
         compiled_module = None
-        for k, v in PyCodeCache.cache.items():
+        for v in PyCodeCache.cache.values():
             if hasattr(v, "benchmark_compiled_module"):
                 self.assertTrue(
                     compiled_module is None, "Found multiple compiled modules"
diff --git a/test/jit/fixtures_srcs/test_upgrader_models_generation.py b/test/jit/fixtures_srcs/test_upgrader_models_generation.py
index ae11a49bd2373d..58267c1e0ea320 100644
--- a/test/jit/fixtures_srcs/test_upgrader_models_generation.py
+++ b/test/jit/fixtures_srcs/test_upgrader_models_generation.py
@@ -7,7 +7,7 @@ class TestUpgraderModelGeneration(TestCase):
     def test_all_modules(self):
-        for a_module, expect_operator in ALL_MODULES.items():
+        for a_module in ALL_MODULES.keys():
             module_name = type(a_module).__name__
             self.assertTrue(
                 isinstance(a_module, torch.nn.Module),
diff --git a/test/jit/test_list_dict.py b/test/jit/test_list_dict.py
index 57b2281070b3f1..85082b494e01c6 100644
--- a/test/jit/test_list_dict.py
+++ b/test/jit/test_list_dict.py
@@ -1964,7 +1964,7 @@ def __init__(self, configs):
                 self.configs = configs
 
             def forward(self, x):
-                for _id, config in self.configs.items():
+                for config in self.configs.values():
                     x += config.size
                 return x
diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py
index f7fd1f22a68a7a..1d241172762a22 100644
--- a/test/jit/test_tracer.py
+++ b/test/jit/test_tracer.py
@@ -2383,7 +2383,7 @@ def __init__(self):
             def forward(self, feature_map: Dict[str, List[Tensor]]) -> Tensor:
                 output = []
-                for i, j in feature_map.items():
+                for j in feature_map.values():
                     output.append(self.linear(j[0]))
                 return torch.stack(output)
diff --git a/test/quantization/eager/test_numeric_suite_eager.py b/test/quantization/eager/test_numeric_suite_eager.py
index 128f7cb96a06f2..a798745d6537d4 100644
--- a/test/quantization/eager/test_numeric_suite_eager.py
+++ b/test/quantization/eager/test_numeric_suite_eager.py
@@ -104,7 +104,7 @@ def compare_and_validate_results(float_model, q_model):
                 float_model.state_dict(), q_model.state_dict()
             )
             self.assertEqual(len(weight_dict), 1)
-            for k, v in weight_dict.items():
+            for v in weight_dict.values():
                 self.assertTrue(v["float"].shape == v["quantized"].shape)
 
         model_list = [AnnotatedConvModel(qengine), AnnotatedConvBnReLUModel(qengine)]
@@ -126,7 +126,7 @@ def compare_and_validate_results(float_model, q_model):
                 float_model.state_dict(), q_model.state_dict()
             )
            self.assertEqual(len(weight_dict), 1)
-            for k, v in weight_dict.items():
+            for v in weight_dict.values():
                 self.assertTrue(v["float"].shape == v["quantized"].shape)
 
         model_list = [AnnotatedSingleLayerLinearModel(qengine)]
@@ -148,7 +148,7 @@ def compare_and_validate_results(float_model, q_model):
                 float_model.state_dict(), q_model.state_dict()
             )
             self.assertEqual(len(weight_dict), 1)
-            for k, v in weight_dict.items():
+            for v in weight_dict.values():
                 self.assertTrue(len(v["float"]) == len(v["quantized"]))
                 for i, val in enumerate(v["quantized"]):
                     self.assertTrue(v["float"][i].shape == v["quantized"][i].shape)
@@ -172,7 +172,7 @@ def compare_and_validate_results(float_model, q_model):
                 float_model.state_dict(), q_model.state_dict()
             )
             self.assertEqual(len(weight_dict), 1)
-            for k, v in weight_dict.items():
+            for v in weight_dict.values():
                 self.assertTrue(len(v["float"]) == len(v["quantized"]))
                 for i, val in enumerate(v["quantized"]):
                     self.assertTrue(v["float"][i].shape == v["quantized"][i].shape)
@@ -194,7 +194,7 @@ def test_compare_model_stub_conv_static(self):
         def compare_and_validate_results(float_model, q_model, module_swap_list, data):
             ob_dict = compare_model_stub(float_model, q_model, module_swap_list, data)
             self.assertEqual(len(ob_dict), 1)
-            for k, v in ob_dict.items():
+            for v in ob_dict.values():
                 self.assertTrue(len(v["float"]) == len(v["quantized"]))
                 for i, val in enumerate(v["quantized"]):
                     self.assertTrue(v["float"][i].shape == v["quantized"][i].shape)
@@ -221,7 +221,7 @@ def test_compare_model_stub_linear_static(self):
         def compare_and_validate_results(float_model, q_model, module_swap_list, data):
             ob_dict = compare_model_stub(float_model, q_model, module_swap_list, data)
             self.assertEqual(len(ob_dict), 1)
-            for k, v in ob_dict.items():
+            for v in ob_dict.values():
                 self.assertTrue(len(v["float"]) == len(v["quantized"]))
                 for i, val in enumerate(v["quantized"]):
                     self.assertTrue(v["float"][i].shape == v["quantized"][i].shape)
@@ -246,7 +246,7 @@ def test_compare_model_stub_partial(self):
         def compare_and_validate_results(float_model, q_model, module_swap_list, data):
             ob_dict = compare_model_stub(float_model, q_model, module_swap_list, data)
             self.assertEqual(len(ob_dict), 1)
-            for k, v in ob_dict.items():
+            for v in ob_dict.values():
                 self.assertTrue(len(v["float"]) == len(v["quantized"]))
                 for i, val in enumerate(v["quantized"]):
                     self.assertTrue(v["float"][i].shape == v["quantized"][i].shape)
@@ -301,7 +301,7 @@ def test_compare_model_stub_functional_static(self):
         self.assertTrue(isinstance(q_model.myadd_relu, Shadow))
         self.assertTrue(isinstance(q_model.my_scalar_add, Shadow))
         self.assertTrue(isinstance(q_model.my_scalar_mul, Shadow))
-        for k, v in ob_dict.items():
+        for v in ob_dict.values():
             self.assertTrue(len(v["float"]) == len(v["quantized"]))
             for i, val in enumerate(v["quantized"]):
                 self.assertTrue(v["float"][i].shape == v["quantized"][i].shape)
@@ -315,7 +315,7 @@ def test_compare_model_stub_linear_dynamic(self):
         def compare_and_validate_results(float_model, q_model, module_swap_list, data):
             ob_dict = compare_model_stub(float_model, q_model, module_swap_list, data)
             self.assertEqual(len(ob_dict), 1)
-            for k, v in ob_dict.items():
+            for v in ob_dict.values():
                 self.assertTrue(len(v["float"]) == len(v["quantized"]))
                 for i, val in enumerate(v["quantized"]):
                     self.assertTrue(v["float"][i].shape == v["quantized"][i].shape)
@@ -344,7 +344,7 @@ def compare_and_validate_results(
                 float_model, q_model, module_swap_list, input, hidden
             )
             self.assertEqual(len(ob_dict), 1)
-            for k, v in ob_dict.items():
+            for v in ob_dict.values():
                 self.assertTrue(len(v["float"]) == len(v["quantized"]))
                 for i, val in enumerate(v["quantized"]):
                     self.assertTrue(v["float"][i].shape == v["quantized"][i].shape)
@@ -375,7 +375,7 @@ def compare_and_validate_results(float_model, q_model, data):
             expected_act_compare_dict_keys = {"conv.stats", "quant.stats"}
             self.assertTrue(act_compare_dict.keys() == expected_act_compare_dict_keys)
-            for k, v in act_compare_dict.items():
+            for v in act_compare_dict.values():
                 self.assertTrue(v["float"][0].shape == v["quantized"][0].shape)
 
         model_list = [AnnotatedConvModel(qengine), AnnotatedConvBnReLUModel(qengine)]
@@ -398,7 +398,7 @@ def compare_and_validate_results(float_model, q_model, data):
             expected_act_compare_dict_keys = {"fc1.quant.stats", "fc1.module.stats"}
             self.assertTrue(act_compare_dict.keys() == expected_act_compare_dict_keys)
-            for k, v in act_compare_dict.items():
+            for v in act_compare_dict.values():
                 self.assertTrue(len(v["float"]) == len(v["quantized"]))
                 for i, val in enumerate(v["quantized"]):
                     self.assertTrue(v["float"][i].shape == v["quantized"][i].shape)
@@ -434,7 +434,7 @@ def test_compare_model_outputs_functional_static(self):
             "quant.stats",
         }
         self.assertTrue(act_compare_dict.keys() == expected_act_compare_dict_keys)
-        for k, v in act_compare_dict.items():
+        for v in act_compare_dict.values():
             self.assertTrue(len(v["float"]) == len(v["quantized"]))
             for i, val in enumerate(v["quantized"]):
                 self.assertTrue(v["float"][i].shape == v["quantized"][i].shape)
@@ -451,7 +451,7 @@ def compare_and_validate_results(float_model, q_model, data):
             expected_act_compare_dict_keys = {"fc1.stats"}
             self.assertTrue(act_compare_dict.keys() == expected_act_compare_dict_keys)
-            for k, v in act_compare_dict.items():
+            for v in act_compare_dict.values():
                 self.assertTrue(len(v["float"]) == len(v["quantized"]))
                 for i, val in enumerate(v["quantized"]):
                     self.assertTrue(v["float"][i].shape == v["quantized"][i].shape)
@@ -480,7 +480,7 @@ def compare_and_validate_results(float_model, q_model, input, hidden):
             expected_act_compare_dict_keys = {"lstm.stats"}
             self.assertTrue(act_compare_dict.keys() == expected_act_compare_dict_keys)
-            for k, v in act_compare_dict.items():
+            for v in act_compare_dict.values():
                 self.assertTrue(len(v["float"]) == len(v["quantized"]))
                 for i, val in enumerate(v["quantized"]):
                     self.assertTrue(len(v["float"][i]) == len(v["quantized"][i]))
diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py
index f84e2048775346..191f927c7b217f 100644
--- a/test/quantization/fx/test_numeric_suite_fx.py
+++ b/test/quantization/fx/test_numeric_suite_fx.py
@@ -637,7 +637,7 @@ def test_op_relationship_mapping(self):
         # 4. go through the ops mapped to each QuantizeHandler type, and verify
         # correctness.
         def _op_in_base_sets_of_related_ops(op):
-            for name, ops in base_name_to_sets_of_related_ops.items():
+            for ops in base_name_to_sets_of_related_ops.values():
                 if op in ops:
                     return True
             return False
@@ -1829,7 +1829,7 @@ def test_extend_logger_results_with_comparison(self):
             results, 'fp32', 'int8',
             compute_cosine_similarity, 'cosine_similarity_int8_vs_fp32')
 
-        for layer_name, layer_results in results.items():
+        for layer_results in results.values():
             assert 'sqnr_int8_vs_fp32' in \
                 layer_results['weight']['int8'][0].keys()
             assert 'l2_error_int8_vs_fp32' in \
diff --git a/test/run_test.py b/test/run_test.py
index b81544d8b4d634..f676a0f4782c08 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -846,7 +846,7 @@ def run_doctests(test_module, test_directory, options):
     if enabled["qengine"] == "auto":
         try:
             # Is there a better check if quantization is enabled?
-            import torch.ao.nn.quantized as nnq  # NOQA
+            import torch.ao.nn.quantized as nnq  # NOQA: F401
 
             torch.backends.quantized.engine = "qnnpack"
             torch.backends.quantized.engine = "fbgemm"
@@ -857,9 +857,9 @@ def run_doctests(test_module, test_directory, options):
     if enabled["onnx"] == "auto":
         try:
-            import onnx  # NOQA
-            import onnxruntime  # NOQA
-            import onnxscript  # NOQA
+            import onnx  # NOQA: F401
+            import onnxruntime  # NOQA: F401
+            import onnxscript  # NOQA: F401
         except ImportError:
             exclude_module_list.append("torch.onnx.*")
             enabled["onnx"] = False
diff --git a/test/test_dispatch.py b/test/test_dispatch.py
index e98385a8ce3a25..cb485bda7af49c 100644
--- a/test/test_dispatch.py
+++ b/test/test_dispatch.py
@@ -782,11 +782,11 @@ def test_find_dangling_impls_ext(self):
         impls = C._dispatch_find_dangling_impls()
         self.assertEqual(1, len(impls))
         self.assertEqual(
-            '''\
+            f'''\
 name: __test::foo
 schema: (none)
-CPU: registered at {}:5 :: () -> () [ boxed unboxed ]
-'''.format(extension_path),
+CPU: registered at {extension_path}:5 :: () -> () [ boxed unboxed ]
+''',
             impls[0])
 
     def test_dispatch_print_registrations_for_dispatch_key_invalid(self):
diff --git a/test/test_fx.py b/test/test_fx.py
index 15daf07ac2435f..e7480dc4282b1f 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -3490,7 +3490,7 @@ def f_sum(x):
 
         def f_sum_dict(x):
             out = 0
-            for k, v in x.items():
+            for v in x.values():
                 out += v
             return out
@@ -4302,7 +4302,7 @@ def _get_functional(cls):
             try:
                 sig = inspect.signature(fn)
                 has_tensor_arg = False
-                for arg, param in sig.parameters.items():
+                for param in sig.parameters.values():
                     if isinstance(param.annotation, type) and issubclass(param.annotation, torch.Tensor):
                         has_tensor_arg = True
                 if not has_tensor_arg:
diff --git a/test/test_jit.py b/test/test_jit.py
index cd7312ad92ec0c..cfc5aa986f43b8 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -3497,12 +3497,12 @@ def test_sequence_parsing(self):
         ]
         for exp, result in tests:
             cu = torch.jit.CompilationUnit()
-            full = """
+            full = f"""
 def bar(x, y):
     return x + y
 def foo(x):
-    {}
-    """.format(exp)
+    {exp}
+    """
             if isinstance(result, str):
                 with self.assertRaisesRegex(RuntimeError, result):
                     cu.define(full)
@@ -4006,7 +4006,7 @@ def replace(e):
                 return e.getattr('name')
             return e
 
-        for k, v in result.items():
+        for v in result.values():
             for i in range(len(v)):
                 if isinstance(v[i], tuple):
                     n, v2 = v[i]
@@ -13065,10 +13065,10 @@ def test_method_casts_script(self):
         ]
 
         for cast_type in cast_types:
-            cu = torch.jit.CompilationUnit('''
+            cu = torch.jit.CompilationUnit(f'''
             def cast_to(x):
                 return x.{cast_type}()
-            '''.format(cast_type=cast_type))
+            ''')
 
             x = torch.rand(3, 4, 5) * 128
             cu_result = cu.cast_to(x)
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 9c85af4fbd6eaf..eb4f156182e978 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -5328,16 +5328,14 @@ def tracker(worker):
             elapsed_scipy_ms = 1000.0 * elapsed_scipy / repeat
             elapsed_general_scipy_ms = 1000.0 * elapsed_general_scipy / repeat
 
-            print('''
+            print(f'''
 CPU timings: torch.lobpcg vs scipy.sparse.linalg.lobpcg
 -------------------------------------------------------
               | standard | generalized | method
-torch.lobpcg  | {:10.2f}  | {:10.2f}  | ortho
-scipy_lobpcg  | {:10.2f}  | {:10.2f}  | N/A
--(input size: {:4}, eigenpairs:{:2}, units: ms per call)-
-            '''.format(elapsed_ortho_ms, elapsed_ortho_general_ms,
-                       elapsed_scipy_ms, elapsed_general_scipy_ms,
-                       m, k))
+torch.lobpcg  | {elapsed_ortho_ms:10.2f}  | {elapsed_ortho_general_ms:10.2f}  | ortho
+scipy_lobpcg  | {elapsed_scipy_ms:10.2f}  | {elapsed_general_scipy_ms:10.2f}  | N/A
+-(input size: {m:4}, eigenpairs:{k:2}, units: ms per call)-
+            ''')
 
             # Handling of very small tolerence
             tol = 1e-100
@@ -5378,14 +5376,14 @@ def tracker(worker):
                 iters2_general = -1
                 eq_err_general_scipy = -1
 
-            print('''\
-Handling of small tol={:6.0e}: torch.lobpcg vs scipy.sparse.linalg.lobpcg
+            print(f'''\
+Handling of small tol={tol:6.0e}: torch.lobpcg vs scipy.sparse.linalg.lobpcg
 ----------------------------------------------------------------------------
               | standard | generalized |  niter | method
-torch.lobpcg  | {:10.2e}  | {:10.2e}  | {:6} | ortho
-scipy_lobpcg  | {:10.2e}  | {:10.2e}  | {:6} | N/A
----(input size: {:4}, eigenpairs:{:2}, units: relative error, maxiter={:4})---
-'''.format(tol, eq_err, eq_err_general, iters1, eq_err_scipy, eq_err_general_scipy, iters2, m, k, niter))
+torch.lobpcg  | {eq_err:10.2e}  | {eq_err_general:10.2e}  | {iters1:6} | ortho
+scipy_lobpcg  | {eq_err_scipy:10.2e}  | {eq_err_general_scipy:10.2e}  | {iters2:6} | N/A
+---(input size: {m:4}, eigenpairs:{k:2}, units: relative error, maxiter={niter:4})---
+''')
 
     def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None,
                           transpose_out=False, activation=None):
         dtype = t.dtype
diff --git a/test/test_mkldnn_fusion.py b/test/test_mkldnn_fusion.py
index 59c6af1c56a61d..4858a27dec9145 100644
--- a/test/test_mkldnn_fusion.py
+++ b/test/test_mkldnn_fusion.py
@@ -204,7 +204,7 @@ def forward(self, x):
                 x = self.unary(x)
                 return x
 
-        for pointwise_name, pointwise_info in self._unary_list().items():
+        for pointwise_info in self._unary_list().values():
            options = itertools.product([[2, 3, 10], [2, 10]], [True, False])
            for input_shape, bias in options:
                with torch.no_grad():
@@ -233,7 +233,7 @@ def forward(self, x):
                 return x
 
         input_shapes = {2: (112, 112), 3: (55, 55, 55)}
-        for pointwise_name, pointwise_info in self._unary_list().items():
+        for pointwise_info in self._unary_list().values():
             for dim in [2, 3]:
                 channels_last = torch.channels_last if dim == 2 else torch.channels_last_3d
                 options = itertools.product([True, False], [1, 2], [1, 4], [torch.contiguous_format, channels_last])
@@ -347,7 +347,7 @@ def forward(self, x):
 
         input_shapes = {2: (28, 28)}
         kernel_size = 3
-        for pointwise_name, pointwise_info in self._unary_list().items():
+        for pointwise_info in self._unary_list().values():
             for dim in [2]:
                 channels_last = torch.channels_last if dim == 2 else torch.channels_last_3d
                 options = itertools.product([True, False], [1, 2], [1, 4], [torch.contiguous_format, channels_last], [False, True])
diff --git a/test/test_nn.py b/test/test_nn.py
index c29a031e4d3e7e..e7ac3aacacb38c 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -2890,7 +2890,7 @@ def _test_loss_equal_input_target_shape(self, cast):
         input = cast(torch.randn(3, 5))
         target = cast(torch.randn(5, 3))
 
-        for _name, fn in losses.items():
+        for fn in losses.values():
             self.assertRaises(Exception, lambda: fn(input, target))
 
     def test_loss_equal_input_target_shape(self):
@@ -5522,7 +5522,7 @@ def test_pointwise_loss_broadcast(self):
         }
 
         input = torch.randn(2, 1, requires_grad=True)
-        for _name, fn in losses.items():
+        for fn in losses.values():
             for requires_grad in [True, False]:
                 # When target.requires_grad=True, its impl is in Python, while the other is in TH.
                 target = torch.randn(2, 10, requires_grad=requires_grad)
diff --git a/test/test_ops.py b/test/test_ops.py
index 3b43a56bc4c36b..ca93ab161a9c95 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1325,7 +1325,7 @@
         # one or more tensors requiring grad
         def _tensor_requires_grad(x):
             if isinstance(x, dict):
-                for k, v in x.items():
+                for v in x.values():
                     if _tensor_requires_grad(v):
                         return True
             if isinstance(x, (list, tuple)):
diff --git a/test/test_serialization.py b/test/test_serialization.py
index 76739ed5c62493..82640442769ce2 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -3908,7 +3908,7 @@ def forward(self, input):
             state_dict = m.state_dict()
             torch.save(state_dict, f)
             result = torch.load(f, mmap=True)
-            for k, v in result.items():
+            for v in result.values():
                 self.assertTrue(v.is_cuda)
 
     def run(self, *args, **kwargs):
diff --git a/tools/code_analyzer/gen_oplist.py b/tools/code_analyzer/gen_oplist.py
index 0a9e2a1539b6a7..ec130ca40ce1f1 100644
--- a/tools/code_analyzer/gen_oplist.py
+++ b/tools/code_analyzer/gen_oplist.py
@@ -18,7 +18,7 @@
 def extract_all_operators(selective_builder: SelectiveBuilder) -> Set[str]:
     ops = []
-    for op_name, op in selective_builder.operators.items():
+    for op_name in selective_builder.operators.keys():
         ops.append(op_name)
     return set(ops)
@@ -74,7 +74,7 @@ def gen_supported_mobile_models(model_dicts: List[Any], output_dir: str) -> None
         if "debug_info" in model_dict:
             debug_info = json.loads(model_dict["debug_info"][0])
             if debug_info["is_new_style_rule"]:
-                for asset, asset_info in debug_info["asset_info"].items():
+                for asset_info in debug_info["asset_info"].values():
                     md5_hashes.update(asset_info["md5_hash"])
 
     supported_hashes = ""
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 1b2c31e9d2b997..4f894b044f3445 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -2285,7 +2285,7 @@ def one_layer_rnn_data(
     hh_bias = params[3] if has_biases else None
 
     step_output = []
-    hiddens: List["torch.Tensor"] = []
+    hiddens: List[torch.Tensor] = []
 
     last_batch_size = batch_sizes[-1] if reverse else batch_sizes[0]
     cur_hidden = hidden.narrow(0, 0, last_batch_size)
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index 5df8adae09cecb..8e67d7949715dc 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -832,7 +832,7 @@ def __init__(
     ):
         guards = output_graph.guards if output_graph else None
         self.valid = True
-        self._weakrefs: List["ReferenceType[object]"] = []
+        self._weakrefs: List[ReferenceType[object]] = []
         self._seen_ids: Set[int] = set()
         self.output_graph = output_graph
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index e4cd65d7de1a5f..08312cbb535c02 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -151,7 +151,7 @@ def increment_op_count(cnt):
 def print_time_report():
     total = 0
     total_by_key = {}
-    for frame, timings in frame_phase_timing.items():
+    for timings in frame_phase_timing.values():
         for key, timing in timings.items():
             total += timing
             if key not in total_by_key:
diff --git a/torch/_dynamo/variables/higher_order_ops.py b/torch/_dynamo/variables/higher_order_ops.py
index 95250fe0f39ac8..9827556bd8a5b5 100644
--- a/torch/_dynamo/variables/higher_order_ops.py
+++ b/torch/_dynamo/variables/higher_order_ops.py
@@ -312,7 +312,7 @@ def call_function(
         ):
             raise UserError(
                 UserErrorType.DYNAMIC_CONTROL_FLOW,
-                "Expected a list of tensors but got {actual_args}".format(
+                "Expected a list of tensors but got {actual_args}".format(  # noqa: UP032
                     actual_args=[
                         str(operand.python_type())
                         if isinstance(operand, VariableTracker)
diff --git a/torch/_export/__init__.py b/torch/_export/__init__.py
index ce5abf100db03a..771b18e64cf1c2 100644
--- a/torch/_export/__init__.py
+++ b/torch/_export/__init__.py
@@ -162,7 +162,7 @@ def export(
         **kwargs,
     )
 
-    params_buffers: "OrderedDict[str, Union[torch.Tensor, torch.nn.Parameter]]" = OrderedDict()
+    params_buffers: OrderedDict[str, Union[torch.Tensor, torch.nn.Parameter]] = OrderedDict()
     for name, param in gm_torch_level.named_parameters(recurse=True, remove_duplicate=False):
         params_buffers[name] = param
diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py
index 92b10455708020..ae40e3a12a812d 100644
--- a/torch/_inductor/codegen/wrapper.py
+++ b/torch/_inductor/codegen/wrapper.py
@@ -122,9 +122,9 @@ def __str__(self):
 class MemoryPlanningState:
     def __init__(self):
         super().__init__()
-        self.reuse_pool: Dict[
-            Any, List["FreeIfNotReusedLine"]
-        ] = collections.defaultdict(list)
+        self.reuse_pool: Dict[Any, List[FreeIfNotReusedLine]] = collections.defaultdict(
+            list
+        )
 
     def __contains__(self, key):
         return bool(self.reuse_pool.get(key, None))
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index e57e040ce616b5..20ca00603d17c2 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -452,7 +452,7 @@ def visit(value):
                 value.realize()
             return value
 
-        for key, value in self.env.items():
+        for value in self.env.values():
             try:
                 visit(value)
             except Exception:
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index 3ecc2ff00d63b1..20f7a286964f46 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -58,7 +58,7 @@ def fuse(node1: "BaseSchedulerNode", node2: "BaseSchedulerNode"):
 
 class BaseSchedulerNode:
     def __init__(self, scheduler: "Scheduler", node: ir.Buffer):
-        self.scheduler: "Scheduler" = scheduler
+        self.scheduler: Scheduler = scheduler
         self.node: ir.Buffer = node
         self.users: Optional[List[NodeUser]] = None
         self.inverse_users: List[BaseSchedulerNode] = []
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 4b93d25f547306..880c1902a9925c 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -14053,23 +14053,19 @@ def merge_dicts(*dicts):
     if hasattr(torch, unary_foreach_func_name):
         add_docstr(
             getattr(torch, unary_foreach_func_name),
-            r"""
-{}(self: List[Tensor]) -> List[Tensor]
+            rf"""
+{unary_foreach_func_name}(self: List[Tensor]) -> List[Tensor]
 
-Apply :func:`torch.{}` to each Tensor of the input list.
-            """.format(
-                unary_foreach_func_name, unary_base_func_name
-            ),
+Apply :func:`torch.{unary_base_func_name}` to each Tensor of the input list.
+            """,
         )
     unary_inplace_foreach_func_name = f"{unary_foreach_func_name}_"
     if hasattr(torch, unary_inplace_foreach_func_name):
         add_docstr(
             getattr(torch, unary_inplace_foreach_func_name),
-            r"""
-{}(self: List[Tensor]) -> None
+            rf"""
+{unary_inplace_foreach_func_name}(self: List[Tensor]) -> None
 
-Apply :func:`torch.{}` to each Tensor of the input list.
-            """.format(
-                unary_inplace_foreach_func_name, unary_base_func_name
-            ),
+Apply :func:`torch.{unary_base_func_name}` to each Tensor of the input list.
+            """,
         )
diff --git a/torch/ao/ns/fx/graph_matcher.py b/torch/ao/ns/fx/graph_matcher.py
index ed90f530ba2694..8db946ec707a71 100644
--- a/torch/ao/ns/fx/graph_matcher.py
+++ b/torch/ao/ns/fx/graph_matcher.py
@@ -134,7 +134,7 @@ def _recursively_add_node_arg_to_stack(self, arg: Any) -> None:
             for inner_arg in arg:
                 self._recursively_add_node_arg_to_stack(inner_arg)
         elif isinstance(arg, torch.fx.immutable_collections.immutable_dict):
-            for key, value in arg.items():
+            for value in arg.values():
                 self._recursively_add_node_arg_to_stack(value)
 
     def _is_matchable(self, node: Node) -> bool:
diff --git a/torch/ao/ns/fx/graph_passes.py b/torch/ao/ns/fx/graph_passes.py
index 3f4e156859024b..edd5284cf6eb6f 100644
--- a/torch/ao/ns/fx/graph_passes.py
+++ b/torch/ao/ns/fx/graph_passes.py
@@ -424,7 +424,7 @@ def _can_insert(node_a_arg, gm_a):
                     return False
             cur_idx += 1
 
-    for kwarg_name, kwarg_val in norm_kwargs.items():
+    for kwarg_val in norm_kwargs.values():
         # stitch the inputs from base graph
         if cur_idx == 0:
             pass
diff --git a/torch/ao/ns/fx/mappings.py b/torch/ao/ns/fx/mappings.py
index dbc9097cb0556c..b1a872056d16cf 100644
--- a/torch/ao/ns/fx/mappings.py
+++ b/torch/ao/ns/fx/mappings.py
@@ -471,7 +471,7 @@ def add_op_to_sets_of_related_ops(
     related_op: Optional[NSNodeTargetType],
 ) -> None:
     if related_op is not None:
-        for base_name, set_of_related_ops in base_name_to_sets_of_related_ops.items():
+        for set_of_related_ops in base_name_to_sets_of_related_ops.values():
             if related_op in set_of_related_ops:
                 set_of_related_ops.add(op)
                 return
diff --git a/torch/ao/ns/fx/n_shadows_utils.py b/torch/ao/ns/fx/n_shadows_utils.py
index fa328deb0f592a..dba4e133e66071 100644
--- a/torch/ao/ns/fx/n_shadows_utils.py
+++ b/torch/ao/ns/fx/n_shadows_utils.py
@@ -704,7 +704,7 @@ def create_add_loggers_graph(
     from torch.ao.ns._numeric_suite_fx import OutputLogger, OutputComparisonLogger
 
     def _get_subgraph_containing_node(node, subgraphs_dedup):
-        for name, subgraph in subgraphs_dedup.items():
+        for subgraph in subgraphs_dedup.values():
             if node in subgraph:
                 return subgraph
         return None
@@ -1289,7 +1289,7 @@ def print_n_shadows_summary(
         return
 
     results = []
-    for subgraph_name, subgraph_data in results_comparison.items():
+    for subgraph_data in results_comparison.values():
         mean_all_candidates = [
             candidate['cmp_mean']
             for candidate_name, candidate in subgraph_data['candidates'].items()
diff --git a/torch/ao/ns/fx/pattern_utils.py b/torch/ao/ns/fx/pattern_utils.py
index bb6d3f14431a3b..2925dfe012125f 100644
--- a/torch/ao/ns/fx/pattern_utils.py
+++ b/torch/ao/ns/fx/pattern_utils.py
@@ -26,7 +26,7 @@ def get_type_a_related_to_b(
     # TODO(future PR): add the rest of modules and ops here
     type_a_related_to_b: Set[Tuple[NSNodeTargetType, NSNodeTargetType]] = set()
 
-    for base_name, s in base_name_to_sets_of_related_ops.items():
+    for s in base_name_to_sets_of_related_ops.values():
         s_list = list(s)
         # add every bidirectional pair
         for idx_0 in range(0, len(s_list)):
@@ -70,7 +70,7 @@ def get_reversed_fusions() -> List[Tuple[NSFusionType, int]]:
     all_quant_patterns = _get_pattern_to_quantize_handlers(get_native_backend_config())
 
     default_base_op_idx = 0
-    for quant_pattern, _quant_handler in all_quant_patterns.items():
+    for quant_pattern in all_quant_patterns.keys():
         # TODO: this is a temporary hack to flatten the patterns from quantization so
         # that it works with the ns matcher function, maybe we should use `_is_match`
         # in torch.ao.quantization.fx.match_utils to match the patterns
diff --git a/torch/ao/ns/fx/utils.py b/torch/ao/ns/fx/utils.py
index 8d6f54ef9c148f..bf35a7e531e1ab 100644
--- a/torch/ao/ns/fx/utils.py
+++ b/torch/ao/ns/fx/utils.py
@@ -363,7 +363,7 @@ def rekey_logger_info_on_node_name_of_model(
     new_results = {}
     for old_layer_name, result_type_to_results in results.items():
         new_layer_name = None
-        for _result_type, model_name_to_results in result_type_to_results.items():
+        for model_name_to_results in result_type_to_results.values():
             for cur_model_name, list_of_results in model_name_to_results.items():
                 if cur_model_name == model_name:
                     assert len(list_of_results)
@@ -389,8 +389,8 @@ def maybe_add_missing_fqns(results: NSResultsType) -> None:
     # Check in the first result to find any model with fqn entries defined.
     model_name_with_fqns = None
-    for layer_name, result_type_to_results in results.items():
-        for result_type, model_name_to_results in result_type_to_results.items():
+    for result_type_to_results in results.values():
+        for model_name_to_results in result_type_to_results.values():
             for model_name, model_results in model_name_to_results.items():
                 if len(model_results) > 0:
                     if model_results[0]["fqn"] is not None:
@@ -400,8 +400,8 @@
                         break
 
     if model_name_with_fqns:
-        for layer_name, result_type_to_results in results.items():
-            for result_type, model_name_to_results in result_type_to_results.items():
+        for result_type_to_results in results.values():
+            for model_name_to_results in result_type_to_results.values():
                 ref_model_results = model_name_to_results[model_name_with_fqns]
                 for model_name, model_results in model_name_to_results.items():
                     if model_name == model_name_with_fqns:
diff --git a/torch/ao/quantization/fuse_modules.py b/torch/ao/quantization/fuse_modules.py
index 7c7ef1a88e83a7..77a95b7e0873b5 100644
--- a/torch/ao/quantization/fuse_modules.py
+++ b/torch/ao/quantization/fuse_modules.py
@@ -56,11 +56,11 @@ def fuse_known_modules(mod_list, is_qat, additional_fuser_method_mapping=None):
         fused = fuser_method(is_qat, *mod_list)
         # NOTE: forward hooks not processed in the two following for loops will be lost after the fusion
         # Move pre forward hooks of the base module to resulting fused module
-        for handle_id, pre_hook_fn in mod_list[0]._forward_pre_hooks.items():
+        for pre_hook_fn in mod_list[0]._forward_pre_hooks.values():
             fused.register_forward_pre_hook(pre_hook_fn)
         mod_list[0]._forward_pre_hooks.clear()
         # Move post forward hooks of the last module to resulting fused module
-        for handle_id, hook_fn in mod_list[-1]._forward_hooks.items():
+        for hook_fn in mod_list[-1]._forward_hooks.values():
             fused.register_forward_hook(hook_fn)
         mod_list[-1]._forward_hooks.clear()
         new_mod[0] = fused
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index aa9f1f7467f932..803a52c4b570c5 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -1370,7 +1370,7 @@ def insert_observers_for_model(
 
     # Step 1, set the observer or fake quantize module constructor for each node in the
     # matched_node_pattern
-    for node_name, match_res_with_qconfig in node_name_to_match_result_with_qconfig.items():
+    for match_res_with_qconfig in node_name_to_match_result_with_qconfig.values():
         last_node, matched_node_pattern, pattern, qhandler, qconfig = match_res_with_qconfig
         assert qhandler is not None
         _set_target_dtype_info_for_matched_node_pattern(
@@ -1425,7 +1425,7 @@ def insert_observers_for_model(
     # reset the counters and set of processed_nodes
     processed_nodes: Set[Node] = set()
-    for node_name, match_res_with_qconfig in node_name_to_match_result_with_qconfig.items():
+    for match_res_with_qconfig in node_name_to_match_result_with_qconfig.values():
         last_node, matched_node_pattern, pattern, qhandler, qconfig = match_res_with_qconfig
         is_supported_by_backend = _is_pattern_dtype_config_and_qconfig_supported_by_backend(
             pattern, matched_node_pattern, qconfig, backend_config)
@@ -1654,10 +1654,7 @@ def _run_prepare_fx_on_standalone_modules(
     not modify the graph, it just replaces the unobserved modules with their
     observed versions.
     """
-    for (
-        node_name,
-        (root_node, _, pattern, qhandler, qconfig),
-    ) in node_name_to_match_result_with_qconfig.items():
+    for (root_node, _, pattern, qhandler, qconfig) in node_name_to_match_result_with_qconfig.values():
         if qhandler is None:
             continue
         elif not qhandler.is_standalone_module():
diff --git a/torch/ao/quantization/fx/qconfig_mapping_utils.py b/torch/ao/quantization/fx/qconfig_mapping_utils.py
index d2e11cb2fd6f81..0b906a1777de01 100644
--- a/torch/ao/quantization/fx/qconfig_mapping_utils.py
+++ b/torch/ao/quantization/fx/qconfig_mapping_utils.py
@@ -217,13 +217,13 @@ def _compare_prepare_convert_qconfig_mappings(
     ]
     dict_names = [_OBJECT_TYPE_DICT_KEY, _MODULE_NAME_DICT_KEY, _MODULE_NAME_REGEX_DICT_KEY]
     for i in range(len(prepare_dicts)):
-        for name, qconfig in prepare_dicts[i].items():
-            assert name in convert_dicts[i], "Missing key {} {} in convert QConfigMapping \
-                when it was present in prepare".format(dict_names[i], name)
+        for name in prepare_dicts[i].keys():
+            assert name in convert_dicts[i], f"Missing key {dict_names[i]} {name} in convert QConfigMapping \
+                when it was present in prepare"
             assert convert_dicts[i][name] is None \
                 or qconfig_equals(prepare_dicts[i][name], convert_dicts[i][name]), \
-                "Expected convert QConfigMapping to have the same qconfig as prepare for key {} {}; \
-                prepare: {}; convert: {}".format(dict_names[i], name, prepare_dicts[i][name], convert_dicts[i][name])
+                f"Expected convert QConfigMapping to have the same qconfig as prepare for key {dict_names[i]} {name}; \
+                prepare: {prepare_dicts[i][name]}; convert: {convert_dicts[i][name]}"
 
 def _is_qconfig_supported_by_dtype_configs(qconfig: QConfig, dtype_configs: List[DTypeConfig]):
     for dtype_config in dtype_configs:
diff --git a/torch/ao/quantization/pt2e/qat_utils.py b/torch/ao/quantization/pt2e/qat_utils.py
index dae5e9092c4ffb..c4665bb24a2a57 100644
--- a/torch/ao/quantization/pt2e/qat_utils.py
+++ b/torch/ao/quantization/pt2e/qat_utils.py
@@ -556,7 +556,7 @@ def _fuse_conv_bn_qat(m: GraphModule) -> GraphModule:
             _get_conv_bn_getitem_nodes(r.replacements)
 
         # Step (3a): Copy over metadata for all three nodes in [conv - bn - getitem]
-        for match_pattern_node, original_node in _filter_nodes_map(r.nodes_map).items():
+        for original_node in _filter_nodes_map(r.nodes_map).values():
             if original_node.target == torch.ops.aten.convolution.default:
                 replacement_conv_node.meta = original_node.meta
                 original_to_replacement_node[original_node] = replacement_conv_node
diff --git a/torch/ao/quantization/pt2e/quantizer/x86_inductor_quantizer.py b/torch/ao/quantization/pt2e/quantizer/x86_inductor_quantizer.py
index cef306745dbb25..36f7785b162865 100644
--- a/torch/ao/quantization/pt2e/quantizer/x86_inductor_quantizer.py
+++ b/torch/ao/quantization/pt2e/quantizer/x86_inductor_quantizer.py
@@ -73,7 +73,7 @@ def _get_supported_x86_inductor_config_and_operators() -> List[OperatorConfig]:
     supported_config_and_operators: List[OperatorConfig] = []
     for quantization_config in [get_default_x86_inductor_quantization_config(), ]:
         ops = _supported_quantized_operators()
-        for op_string, pattern_list in ops.items():
+        for pattern_list in ops.values():
             supported_config_and_operators.append(
                 OperatorConfig(quantization_config, pattern_list)
             )
diff --git a/torch/ao/quantization/pt2e/quantizer/xnnpack_quantizer.py b/torch/ao/quantization/pt2e/quantizer/xnnpack_quantizer.py
index 3eeebe1a5a823e..bda553d03b01e8 100644
--- a/torch/ao/quantization/pt2e/quantizer/xnnpack_quantizer.py
+++ b/torch/ao/quantization/pt2e/quantizer/xnnpack_quantizer.py
@@ -116,7 +116,7 @@ def _get_supported_symmetric_config_and_operators() -> List[OperatorConfig]:
         get_symmetric_quantization_config(is_per_channel=True, is_qat=True),
     ]:
         ops = _supported_symmetric_quantized_operators()
-        for op_string, pattern_list in ops.items():
+        for pattern_list in ops.values():
             supported_config_and_operators.append(
                 OperatorConfig(quantization_config, pattern_list)
             )
@@ -517,7 +517,7 @@ def _annotate_linear(
     output_act_qspec = get_output_act_qspec(quantization_config)
     weight_qspec = get_weight_qspec(quantization_config)
     bias_qspec = get_bias_qspec(quantization_config)
-    for module_or_fn_type, partitions in module_partitions.items():
+    for partitions in module_partitions.values():
         for p in partitions:
             act_nodes = [
                 n
diff --git a/torch/distributed/_shard/sharded_tensor/__init__.py b/torch/distributed/_shard/sharded_tensor/__init__.py
index 18d5d513202b05..bb0271ca1826bb 100644
--- a/torch/distributed/_shard/sharded_tensor/__init__.py
+++ b/torch/distributed/_shard/sharded_tensor/__init__.py
@@ -408,7 +408,7 @@ def pre_load_state_dict_hook(module, state_dict, prefix, local_metadata, strict,
     Pre-load state dict hook to add ShardedTensor to the module.
     """
     for submodule_name, submodule in module.named_modules():
-        for attr_name, attr in submodule.__dict__.items():
+        for attr_name in submodule.__dict__.keys():
             mod_prefix = prefix + submodule_name
             key = mod_prefix + ('.' if mod_prefix else '') + attr_name
             if key in state_dict:
diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index 25a07b6493900c..6bb8ed98b4684b 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -446,7 +446,7 @@ def _flatten_optim_state_dict(
         for fqn in fqns:
             if not unflat_osd_state[fqn]:
                 continue
-            for state_name, param_state in unflat_osd_state[fqn].items():
+            for state_name in unflat_osd_state[fqn].keys():
                 unflat_osd_state[fqn][state_name] = _broadcast_state(
                     fsdp_state, unflat_osd_state[fqn][state_name], group=group
                 )
diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py
index bc6910e98c47a9..71ab76682814ab 100644
--- a/torch/distributions/distribution.py
+++ b/torch/distributions/distribution.py
@@ -314,7 +314,7 @@ def _get_checked_instance(self, cls, _instance=None):
     def __repr__(self) -> str:
         param_names = [k for k, _ in self.arg_constraints.items() if k in self.__dict__]
-        args_string = ', '.join(['{}: {}'.format(p, self.__dict__[p]
+        args_string = ', '.join(['{}: {}'.format(p, self.__dict__[p]  # noqa: UP032
                                 if self.__dict__[p].numel() == 1
                                 else self.__dict__[p].size()) for p in param_names])
         return self.__class__.__name__ + '(' + args_string + ')'
diff --git a/torch/fx/_symbolic_trace.py b/torch/fx/_symbolic_trace.py
index cef8553e56d2fa..b0cf86c8c5db64 100644
--- a/torch/fx/_symbolic_trace.py
+++ b/torch/fx/_symbolic_trace.py
@@ -739,7 +739,7 @@ def trace(
             self.root = torch.nn.Module()
             fn = root
 
-        tracer_cls: Optional[Type["Tracer"]] = getattr(self, "__class__", None)
+        tracer_cls: Optional[Type[Tracer]] = getattr(self, "__class__", None)
         self.graph = Graph(tracer_cls=tracer_cls)
         if hasattr(fn, '__code__'):
             code = fn.__code__
diff --git a/torch/fx/experimental/partitioner_utils.py b/torch/fx/experimental/partitioner_utils.py
index eb306b9581e337..b56cf0102f696d 100644
--- a/torch/fx/experimental/partitioner_utils.py
+++ b/torch/fx/experimental/partitioner_utils.py
@@ -12,8 +12,8 @@ class Partition:
     def __init__(self, partition_id: int) -> None:
         self.nodes: Set[Node] = set()
         self.partition_id = partition_id
-        self.parents: Set["Partition"] = set()
-        self.children: Set["Partition"] = set()
+        self.parents: Set[Partition] = set()
+        self.children: Set[Partition] = set()
         self.bfs_level: int = -1
         self.used_mem_bytes: int = 0
         self.logical_device_ids: List[int] = []
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 9f95c0390098df..ce57ac4856feb5 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -1927,26 +1927,26 @@ def __init__(
         self.guards: List[ShapeGuard] = []
         # Maps symbolic ints to their original concrete values
         # Currently populated from tensors
-        self.var_to_val: Dict["sympy.Symbol", "sympy.Integer"] = {}
+        self.var_to_val: Dict[sympy.Symbol, sympy.Integer] = {}
         # Maps symbolic ints to their min/max range. These ranges
         # are conservative: the int MUST fall in the range, but the
         # range may contain ints which may not actually appear in
         # practice
-        self.var_to_range: Dict["sympy.Symbol", ValueRanges] = {}
-        self.var_to_sources: Dict["sympy.Symbol", List[Source]] = {}
-        self.var_to_stack: Dict["sympy.Symbol", traceback.StackSummary] = {}
+        self.var_to_range: Dict[sympy.Symbol, ValueRanges] = {}
+        self.var_to_sources: Dict[sympy.Symbol, List[Source]] = {}
+        self.var_to_stack: Dict[sympy.Symbol, traceback.StackSummary] = {}
         # Maps symbolic ints to the guards that refine their lower/upper
         # bound. If one of them is None, it means that there are no guards
         # that refine that respective bound.
-        self.var_to_guards: Dict["sympy.Symbol", Tuple[Optional[ShapeGuard], Optional[ShapeGuard]]] = {}
+        self.var_to_guards: Dict[sympy.Symbol, Tuple[Optional[ShapeGuard], Optional[ShapeGuard]]] = {}
         # Maps from sympy ints to expressions representing them
         # Populated from equality guards (i.e. a.shape[0] == b.shape[0])
-        self.replacements: Dict["sympy.Symbol", "sympy.Expr"] = {}  #
+        self.replacements: Dict[sympy.Symbol, sympy.Expr] = {}  #
         # Set holds a % b expressions that evaluate to 0.
-        self.divisible: Set["sympy.Expr"] = set()
+        self.divisible: Set[sympy.Expr] = set()
         # Duck-shaping says that if two input tensors have the same size,
         # they get assigned the same symbolic variable
-        self.val_to_var: Dict[int, "sympy.Expr"] = {}
+        self.val_to_var: Dict[int, sympy.Expr] = {}
         if specialize_zero_one:
             self.val_to_var = {0: sympy.Integer(0), 1: sympy.Integer(1)}
         self.unbacked_symfloat_counter = itertools.count()
diff --git a/torch/fx/node.py b/torch/fx/node.py
index e7a8105da4c867..023e5761b60c0c 100644
--- a/torch/fx/node.py
+++ b/torch/fx/node.py
@@ -198,7 +198,7 @@ def __init__(self, graph: 'Graph', name: str, op: str, target: 'Target',
         # would appear once here, but represents two uses.
         #
         # Is a dict to act as an "ordered set". Keys are significant, value dont-care
-        self.users : Dict['Node', None] = {}
+        self.users : Dict[Node, None] = {}
         # Type expression representing the output value of this node.
         # This should contain the same class of Type objects that would appear
         # as type annotations for function inputs/outputs.
diff --git a/torch/fx/passes/graph_drawer.py b/torch/fx/passes/graph_drawer.py
index bcc601d0cea6ab..96c59c49e08d6e 100644
--- a/torch/fx/passes/graph_drawer.py
+++ b/torch/fx/passes/graph_drawer.py
@@ -235,7 +235,7 @@ def _tensor_meta_to_label(self, tm) -> str:
             return result
         elif isinstance(tm, dict):
             result = ""
-            for k, v in tm.items():
+            for v in tm.values():
                 result += self._tensor_meta_to_label(v)
             return result
         elif isinstance(tm, tuple):
diff --git a/torch/fx/passes/tools_common.py b/torch/fx/passes/tools_common.py
index 0af6de5508a822..42032b4b6cad1f 100644
--- a/torch/fx/passes/tools_common.py
+++ b/torch/fx/passes/tools_common.py
@@ -164,7 +164,7 @@ def __call__(self) -> Dict[torch.fx.Node, NodeSet]:
             if node not in self.acc_nodes:
                 continue
 
-            fusion_group: "FxNetAccFusionsFinder.FusionGroup" = self.FusionGroup(
+            fusion_group: FxNetAccFusionsFinder.FusionGroup = self.FusionGroup(
                 top_node_idx=self.nodes.index(node),
                 nodes={node},
                 inputs=set(node.all_input_nodes),
diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py
index ed020332fc5597..e842f024daff01 100644
--- a/torch/jit/_recursive.py
+++ b/torch/jit/_recursive.py
@@ -112,13 +112,13 @@ def _get_valid_constant(attr, v, owner_type):
     elif isinstance(v, (tuple, list)):
         return tuple(_get_valid_constant(attr, x, owner_type) for x in v)
     constants = ", ".join(torch.typename(typ) for typ in _constant_types)
-    raise TypeError(textwrap.dedent("""
-        '{}' object in attribute '{}.{}' is not a valid constant.
+    raise TypeError(textwrap.dedent(f"""
+        '{torch.typename(type(v))}' object in attribute '{owner_type}.{attr}' is not a valid constant.
         Valid constants are:
         1. a nn.ModuleList
-        2. a value of type {{{}}}
+        2. a value of type {{{constants}}}
         3. a list or tuple of (2)
-        """.format(torch.typename(type(v)), owner_type, attr, constants)))
+        """))
 
 
 class SourceContext(torch._C._jit_tree_views.SourceRangeFactory):
@@ -509,7 +509,7 @@ def create_script_module_impl(nn_module, concrete_type, stubs_fn):
     def init_fn(script_module):
         # Initialize the ScriptModule:
         # 1. Copy the attributes/parameters/buffers from the original `nn_module` to the new ScriptModule.
-        for name, (attr_type, is_param) in concrete_type.get_attributes().items():
+        for name in concrete_type.get_attributes().keys():
             orig_value = getattr(nn_module, name)
             orig_value = orig_value.value if isinstance(orig_value, torch.jit.Attribute) else orig_value
             cpp_module.setattr(name, orig_value)
diff --git a/torch/masked/maskedtensor/core.py b/torch/masked/maskedtensor/core.py
index 42321bb959c4f5..b397af48d31c27 100644
--- a/torch/masked/maskedtensor/core.py
+++ b/torch/masked/maskedtensor/core.py
@@ -83,7 +83,7 @@ def _helper(a, map_fn):
     for a in args:
         impl_args.append(_helper(a, map_fn))
     impl_kwargs = {}
-    for k, v in kwargs.items():
+    for k in kwargs.keys():
         impl_kwargs[k] = _helper(a, map_fn)
     return impl_args, impl_kwargs
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index 2e6eaec90f4c26..d1efe56a53e4af 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -58,7 +58,7 @@ def __init__(self, hook: Callable, module: Optional["Module"] = None):
         self.with_module: bool = False
 
         if module is not None:
-            self.module: weakref.ReferenceType["Module"] = weakref.ref(module)
+            self.module: weakref.ReferenceType[Module] = weakref.ref(module)
             self.with_module = True
 
     def __call__(self, *args: Any, **kwargs: Any) -> Any:
diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py
index 211c4e65768f40..fdaca358e8b8ed 100644
--- a/torch/nn/modules/rnn.py
+++ b/torch/nn/modules/rnn.py
@@ -59,7 +59,7 @@ def __init__(self, mode: str, input_size: int, hidden_size: int,
         self.dropout = float(dropout)
         self.bidirectional = bidirectional
         self.proj_size = proj_size
-        self._flat_weight_refs: List[Optional[weakref.ReferenceType["Parameter"]]] = []
+        self._flat_weight_refs: List[Optional[weakref.ReferenceType[Parameter]]] = []
         num_directions = 2 if bidirectional else 1
 
         if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \
diff --git a/torch/nn/utils/prune.py b/torch/nn/utils/prune.py
index 086f08f0c18b13..1e16f11b6b626e 100644
--- a/torch/nn/utils/prune.py
+++ b/torch/nn/utils/prune.py
@@ -107,10 +107,8 @@ def _get_composite_method(cls, module, name, *args, **kwargs):
             found += 1
     assert (
         found <= 1
-    ), "Avoid adding multiple pruning hooks to the\
-        same tensor {} of module {}. Use a PruningContainer.".format(
-        name, module
-    )
+    ), f"Avoid adding multiple pruning hooks to the\
+        same tensor {name} of module {module}. Use a PruningContainer."
 
     for k in hooks_to_remove:
         del module._forward_pre_hooks[k]
@@ -264,7 +262,7 @@ class PruningContainer(BasePruningMethod):
     """
 
    def __init__(self, *args):
-        self._pruning_methods: Tuple["BasePruningMethod", ...] = tuple()
+        self._pruning_methods: Tuple[BasePruningMethod, ...] = tuple()
         if not isinstance(args, Iterable):  # only 1 item
             self._tensor_name = args._tensor_name
             self.add_pruning_method(args)
diff --git a/torch/nn/utils/spectral_norm.py b/torch/nn/utils/spectral_norm.py
index b9b9dbf9b28805..ea29d095ea1c0f 100644
--- a/torch/nn/utils/spectral_norm.py
+++ b/torch/nn/utils/spectral_norm.py
@@ -115,7 +115,7 @@ def _solve_v_and_rescale(self, weight_mat, u, target_sigma):
     @staticmethod
     def apply(module: Module, name: str, n_power_iterations: int, dim: int, eps: float) -> 'SpectralNorm':
-        for k, hook in module._forward_pre_hooks.items():
+        for hook in module._forward_pre_hooks.values():
             if isinstance(hook, SpectralNorm) and hook.name == name:
                 raise RuntimeError(f"Cannot register two spectral_norm hooks on the same parameter {name}")
diff --git a/torch/nn/utils/weight_norm.py b/torch/nn/utils/weight_norm.py
index 719cf36a133836..d54c34a3e9f205 100644
--- a/torch/nn/utils/weight_norm.py
+++ b/torch/nn/utils/weight_norm.py
@@ -29,7 +29,7 @@ def compute_weight(self, module: Module) -> Any:
     def apply(module, name: str, dim: int) -> 'WeightNorm':
         warnings.warn("torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.")
-        for k, hook in module._forward_pre_hooks.items():
+        for hook in module._forward_pre_hooks.values():
             if isinstance(hook, WeightNorm) and hook.name == name:
                 raise RuntimeError(f"Cannot register two weight_norm hooks on the same parameter {name}")
diff --git a/torch/optim/adadelta.py b/torch/optim/adadelta.py
index a38337426313db..cbceec8abc3f45 100644
--- a/torch/optim/adadelta.py
+++ b/torch/optim/adadelta.py
@@ -148,7 +148,7 @@ def step(self, closure=None):
        \end{aligned}

    For further details regarding the algorithm we refer to `ADADELTA: An Adaptive Learning Rate Method`_.
-    """ + r"""
+    """ + fr"""
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
@@ -159,14 +159,14 @@ def step(self, closure=None):
        lr (float, optional): coefficient that scale delta before it is applied
            to the parameters (default: 1.0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-        {foreach}
-        {maximize}
-        {differentiable}
+        {_foreach_doc}
+        {_maximize_doc}
+        {_differentiable_doc}

    .. _ADADELTA\: An Adaptive Learning Rate Method:
        https://arxiv.org/abs/1212.5701

-    """.format(foreach=_foreach_doc, maximize=_maximize_doc, differentiable=_differentiable_doc)
+    """


def adadelta(
diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py
index c1e981809c4ecf..1fd3012cdb7d1f 100644
--- a/torch/optim/adagrad.py
+++ b/torch/optim/adagrad.py
@@ -160,7 +160,7 @@ def step(self, closure=None):
    For further details regarding the algorithm we refer to `Adaptive Subgradient Methods for Online Learning
    and Stochastic Optimization`_.
-    """ + r"""
+    """ + fr"""
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
@@ -169,14 +169,14 @@ def step(self, closure=None):
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)
-        {foreach}
-        {maximize}
-        {differentiable}
+        {_foreach_doc}
+        {_maximize_doc}
+        {_differentiable_doc}

    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html

-    """.format(foreach=_foreach_doc, maximize=_maximize_doc, differentiable=_differentiable_doc)
+    """


def adagrad(
diff --git a/torch/optim/adam.py b/torch/optim/adam.py
index 687d45534ac5c2..e267974ca37ef4 100644
--- a/torch/optim/adam.py
+++ b/torch/optim/adam.py
@@ -215,7 +215,7 @@ def step(self, closure=None):
        \end{aligned}

    For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_.
-    """ + r"""
+    """ + fr"""
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
@@ -228,18 +228,17 @@ def step(self, closure=None):
        amsgrad (bool, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
-        {foreach}
-        {maximize}
-        {capturable}
-        {differentiable}
-        {fused}
+        {_foreach_doc}
+        {_maximize_doc}
+        {_capturable_doc}
+        {_differentiable_doc}
+        {_fused_doc}

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ

-    """.format(foreach=_foreach_doc, maximize=_maximize_doc, capturable=_capturable_doc,
-               differentiable=_differentiable_doc, fused=_fused_doc)
+    """


def adam(params: List[Tensor],
diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py
index 1ee927274558f1..f8d1fb0178fdae 100644
--- a/torch/optim/adamax.py
+++ b/torch/optim/adamax.py
@@ -156,7 +156,7 @@ def step(self, closure=None):
        \end{aligned}

    For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_.
-    """ + r"""
+    """ + fr"""
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
@@ -166,14 +166,14 @@ def step(self, closure=None):
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-        {foreach}
-        {maximize}
-        {differentiable}
+        {_foreach_doc}
+        {_maximize_doc}
+        {_differentiable_doc}

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980

-    """.format(foreach=_foreach_doc, maximize=_maximize_doc, differentiable=_differentiable_doc)
+    """


def adamax(
diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py
index 7a97e5d6a91363..73fd60ae7f7c85 100644
--- a/torch/optim/adamw.py
+++ b/torch/optim/adamw.py
@@ -245,7 +245,7 @@ def step(self, closure=None):
        \end{aligned}

    For further details regarding the algorithm we refer to `Decoupled Weight Decay Regularization`_.
-    """ + r"""
+    """ + fr"""
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
@@ -258,21 +258,17 @@ def step(self, closure=None):
        amsgrad (bool, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
-        {maximize}
-        {foreach}
-        {capturable}
-        {differentiable}
-        {fused}
+        {_maximize_doc}
+        {_foreach_doc}
+        {_capturable_doc}
+        {_differentiable_doc}
+        {_fused_doc}

    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ

-    """.format(maximize=_maximize_doc,
-               foreach=_foreach_doc,
-               fused=_fused_doc,
-               capturable=_capturable_doc,
-               differentiable=_differentiable_doc)
+    """


def adamw(
diff --git a/torch/optim/asgd.py b/torch/optim/asgd.py
index e483e1c31fbc7c..5e140b0ca2ad7a 100644
--- a/torch/optim/asgd.py
+++ b/torch/optim/asgd.py
@@ -136,7 +136,7 @@ def step(self, closure=None):
        return loss


-ASGD.__doc__ = r"""Implements Averaged Stochastic Gradient Descent.
+ASGD.__doc__ = fr"""Implements Averaged Stochastic Gradient Descent.

    It has been proposed in `Acceleration of stochastic approximation by
    averaging`_.
@@ -149,14 +149,14 @@ def step(self, closure=None):
        alpha (float, optional): power for eta update (default: 0.75)
        t0 (float, optional): point at which to start averaging (default: 1e6)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-        {foreach}
-        {maximize}
-        {differentiable}
+        {_foreach_doc}
+        {_maximize_doc}
+        {_differentiable_doc}

    .. _Acceleration of stochastic approximation by averaging:
        https://dl.acm.org/citation.cfm?id=131098

-    """.format(foreach=_foreach_doc, maximize=_maximize_doc, differentiable=_differentiable_doc)
+    """


def asgd(
diff --git a/torch/optim/nadam.py b/torch/optim/nadam.py
index aeb3fc8b77dd2c..4278bb32bfd5b1 100644
--- a/torch/optim/nadam.py
+++ b/torch/optim/nadam.py
@@ -136,7 +136,7 @@ def step(self, closure=None):
        \end{aligned}

    For further details regarding the algorithm we refer to `Incorporating Nesterov Momentum into Adam`_.
-    """ + r"""
+    """ + fr"""
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
@@ -147,13 +147,13 @@ def step(self, closure=None):
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        momentum_decay (float, optional): momentum momentum_decay (default: 4e-3)
-        {foreach}
-        {differentiable}
+        {_foreach_doc}
+        {_differentiable_doc}

    .. _Incorporating Nesterov Momentum into Adam:
        https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ

-    """.format(foreach=_foreach_doc, differentiable=_differentiable_doc)
+    """


def nadam(params: List[Tensor],
diff --git a/torch/optim/radam.py b/torch/optim/radam.py
index 120620ab949cc1..4e5742636edfea 100644
--- a/torch/optim/radam.py
+++ b/torch/optim/radam.py
@@ -159,7 +159,7 @@ def step(self, closure=None):
    This implementation uses the same weight_decay implementation as Adam (were the weight_decay is applied
    to the gradient) and not the one from AdamW (were weight_decay is applied to the update). This
    is different from the `author's implementation`_.
-    """ + r"""
+    """ + fr"""
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
@@ -169,15 +169,15 @@ def step(self, closure=None):
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-        {foreach}
-        {differentiable}
+        {_foreach_doc}
+        {_differentiable_doc}

    .. _On the variance of the adaptive learning rate and beyond:
        https://arxiv.org/abs/1908.03265
    ..
_author's implementation: https://github.com/LiyuanLucasLiu/RAdam - """.format(foreach=_foreach_doc, differentiable=_differentiable_doc) + """ def radam( diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py index cec27d95506840..df64a9b44ca993 100644 --- a/torch/optim/rmsprop.py +++ b/torch/optim/rmsprop.py @@ -176,7 +176,7 @@ def step(self, closure=None): learning rate is thus :math:`\gamma/(\sqrt{v} + \epsilon)` where :math:`\gamma` is the scheduled learning rate and :math:`v` is the weighted moving average of the squared gradient. - """ + r""" + """ + fr""" Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups @@ -188,11 +188,11 @@ def step(self, closure=None): centered (bool, optional) : if ``True``, compute the centered RMSProp, the gradient is normalized by an estimation of its variance weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - {foreach} - {maximize} - {differentiable} + {_foreach_doc} + {_maximize_doc} + {_differentiable_doc} - """.format(foreach=_foreach_doc, maximize=_maximize_doc, differentiable=_differentiable_doc) + """ def rmsprop( diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py index 93e7241010500a..04c70d057224b1 100644 --- a/torch/optim/rprop.py +++ b/torch/optim/rprop.py @@ -153,7 +153,7 @@ def step(self, closure=None): For further details regarding the algorithm we refer to the paper `A Direct Adaptive Method for Faster Backpropagation Learning: The RPROP Algorithm `_. - """ + r""" + """ + fr""" Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups @@ -163,11 +163,11 @@ def step(self, closure=None): (default: (0.5, 1.2)) step_sizes (Tuple[float, float], optional): a pair of minimal and maximal allowed step sizes (default: (1e-6, 50)) - {foreach} - {maximize} - {differentiable} + {_foreach_doc} + {_maximize_doc} + {_differentiable_doc} - """.format(foreach=_foreach_doc, maximize=_maximize_doc, differentiable=_differentiable_doc) + """ def rprop( params: List[Tensor], diff --git a/torch/optim/sgd.py b/torch/optim/sgd.py index 1f679ffb994f1a..326186dc5d1f17 100644 --- a/torch/optim/sgd.py +++ b/torch/optim/sgd.py @@ -127,7 +127,7 @@ def step(self, closure=None): Nesterov momentum is based on the formula from `On the importance of initialization and momentum in deep learning`__. - """ + r""" + """ + fr""" Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups @@ -136,10 +136,10 @@ def step(self, closure=None): weight_decay (float, optional): weight decay (L2 penalty) (default: 0) dampening (float, optional): dampening for momentum (default: 0) nesterov (bool, optional): enables Nesterov momentum (default: False) - {maximize} - {foreach} - {differentiable} - """.format(maximize=_maximize_doc, foreach=_foreach_doc, differentiable=_differentiable_doc) + r""" + {_maximize_doc} + {_foreach_doc} + {_differentiable_doc} + """ + r""" Example: >>> # xdoctest: +SKIP diff --git a/torch/optim/sparse_adam.py b/torch/optim/sparse_adam.py index c68441cb389c04..5f088a05b5d8ed 100644 --- a/torch/optim/sparse_adam.py +++ b/torch/optim/sparse_adam.py @@ -97,7 +97,7 @@ def step(self, closure=None): return loss -SparseAdam.__doc__ = r"""SparseAdam implements a masked version of the Adam algorithm +SparseAdam.__doc__ = fr"""SparseAdam implements a masked version of the Adam algorithm suitable for sparse gradients. 
Currently, due to implementation constraints (explained below), SparseAdam is only intended for a narrow subset of use cases, specifically parameters of a dense layout with gradients of a sparse layout. This occurs in a @@ -150,9 +150,9 @@ def step(self, closure=None): running averages of gradient and its square (default: (0.9, 0.999)) eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) - {maximize} + {_maximize_doc} .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 - """.format(maximize=_maximize_doc) + """ diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py index ebd24383e0b53f..684b9f4fcbfa5d 100644 --- a/torch/package/package_exporter.py +++ b/torch/package/package_exporter.py @@ -998,7 +998,7 @@ def _write(self, filename, str_or_bytes): def _validate_dependency_graph(self): # 1. Check the graph for any errors inserted during dependency analysis. - for module_name, attrs in self.dependency_graph.nodes.items(): + for attrs in self.dependency_graph.nodes.values(): if "error" in attrs: raise PackagingError(self.dependency_graph, debug=self.debug) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 891c878cc5f059..91fa699a6d22a7 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -1136,7 +1136,7 @@ class precisionOverride: def __init__(self, d): assert isinstance(d, dict), "precisionOverride not given a dtype : precision dict!" - for dtype, prec in d.items(): + for dtype in d.keys(): assert isinstance(dtype, torch.dtype), f"precisionOverride given unknown dtype {dtype}" self.d = d diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index aad7679c544611..2c3482b46700a1 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -152,7 +152,7 @@ def decorator(func): @wraps(func) def wrapper(*args, **kwargs): try: - from transformers import ( # noqa: Unused + from transformers import ( # noqa: F401 AutoModelForMaskedLM, BertConfig, ) @@ -760,7 +760,7 @@ def _join_processes(self, fn) -> None: self._check_return_codes(elapsed_time) finally: # Close all pipes - for pid, pipe in self.pid_to_pipe.items(): + for pipe in self.pid_to_pipe.values(): pipe.close() def _check_no_test_errors(self, elapsed_time) -> None: diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index f2ef164d96ff8f..bc3bbe2ebcf209 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -264,7 +264,7 @@ def my_complex_tensor_function(list_input, tensor_class_input, dict_input): res = list_input[0] for t in list_input: res += t - for k, v in dict_input.items(): + for v in dict_input.values(): res += v complex_tensors = tensor_class_input.tensors return (res, complex_tensors[0], complex_tensors[1], complex_tensors[2]) diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py index bdeace15a71d3c..f96c2fb436be9d 100644 --- a/torch/testing/_internal/jit_utils.py +++ b/torch/testing/_internal/jit_utils.py @@ -868,7 +868,7 @@ def get_traced_sample_variant_pairs(device, dtype, op): return outputs for sample in samples: - for func_type, variant in variants.items(): + for variant in variants.values(): if variant is None: continue diff 
--git a/torch/utils/_pytree.py b/torch/utils/_pytree.py index 3ecdc06d9bf77d..2c7e969b829976 100644 --- a/torch/utils/_pytree.py +++ b/torch/utils/_pytree.py @@ -284,7 +284,7 @@ def tree_flatten(pytree: PyTree) -> Tuple[List[Any], TreeSpec]: # Recursively flatten the children result : List[Any] = [] - children_specs : List['TreeSpec'] = [] + children_specs : List[TreeSpec] = [] for child in child_pytrees: flat, child_spec = tree_flatten(child) result += flat diff --git a/torch/utils/benchmark/utils/common.py b/torch/utils/benchmark/utils/common.py index c1636ddb78a2bf..b8134c599d66cb 100644 --- a/torch/utils/benchmark/utils/common.py +++ b/torch/utils/benchmark/utils/common.py @@ -233,7 +233,7 @@ def merge(measurements: Iterable["Measurement"]) -> List["Measurement"]: Merge will extrapolate times to `number_per_run=1` and will not transfer any metadata. (Since it might differ between replicates) """ - grouped_measurements: DefaultDict[TaskSpec, List["Measurement"]] = collections.defaultdict(list) + grouped_measurements: DefaultDict[TaskSpec, List[Measurement]] = collections.defaultdict(list) for m in measurements: grouped_measurements[m.task_spec].append(m) diff --git a/torch/utils/data/datapipes/utils/common.py b/torch/utils/data/datapipes/utils/common.py index cfa6cd95b524dc..f1650ffb5b60fd 100644 --- a/torch/utils/data/datapipes/utils/common.py +++ b/torch/utils/data/datapipes/utils/common.py @@ -322,7 +322,7 @@ def close_streams(cls, v, depth=0): else: # Traverse only simple structures if isinstance(v, dict): - for kk, vv in v.items(): + for vv in v.values(): cls.close_streams(vv, depth=depth + 1) elif isinstance(v, (list, tuple)): for vv in v: diff --git a/torch/utils/dlpack.py b/torch/utils/dlpack.py index a987bca6dcd51b..6bfa4b9f85bd6f 100644 --- a/torch/utils/dlpack.py +++ b/torch/utils/dlpack.py @@ -107,7 +107,7 @@ def from_dlpack(ext_tensor: Any) -> 'torch.Tensor': # attribute, but it is not documented # The array API specify that the default legacy stream must be passed # with a value of 1 for CUDA - # https://data-apis.org/array-api/latest/API_specification/array_object.html?dlpack-self-stream-none#dlpack-self-stream-none # NOQA + # https://data-apis.org/array-api/latest/API_specification/array_object.html?dlpack-self-stream-none#dlpack-self-stream-none is_cuda = device[0] == DLDeviceType.kDLGPU # Since pytorch is not using PTDS by default, lets directly pass # the legacy stream diff --git a/torch/utils/tensorboard/_utils.py b/torch/utils/tensorboard/_utils.py index 3715b7504ff049..2b959726fd2622 100644 --- a/torch/utils/tensorboard/_utils.py +++ b/torch/utils/tensorboard/_utils.py @@ -98,10 +98,8 @@ def convert_to_HWC(tensor, input_format): # tensor: numpy array ), f"You can not use the same dimension shordhand twice. input_format: {input_format}" assert len(tensor.shape) == len( input_format - ), "size of input tensor and input format are different. \ - tensor shape: {}, input_format: {}".format( - tensor.shape, input_format - ) + ), f"size of input tensor and input format are different. 
\ + tensor shape: {tensor.shape}, input_format: {input_format}" input_format = input_format.upper() if len(input_format) == 4: diff --git a/torchgen/native_function_generation.py b/torchgen/native_function_generation.py index 653d7b295629cb..ea38fa3fdcb42b 100644 --- a/torchgen/native_function_generation.py +++ b/torchgen/native_function_generation.py @@ -374,7 +374,7 @@ def add_generated_native_functions( # First we group of NaitveFunctions by schema kind, # then we detect which ones are missing and generate them. pre_grouped_native_functions = pre_group_native_functions(rs) - for k, d in pre_grouped_native_functions.items(): + for d in pre_grouped_native_functions.values(): has_functional = SchemaKind.functional in d has_inplace = SchemaKind.inplace in d has_mutable = SchemaKind.mutable in d diff --git a/torchgen/operator_versions/gen_mobile_upgraders.py b/torchgen/operator_versions/gen_mobile_upgraders.py index 13910db85c9871..dab15685804ea2 100644 --- a/torchgen/operator_versions/gen_mobile_upgraders.py +++ b/torchgen/operator_versions/gen_mobile_upgraders.py @@ -307,7 +307,7 @@ def get_upgrader_bytecode_function_to_index_map( upgrader_bytecode_function_to_index_map = {} index = 0 for upgrader_bytecode in upgrader_dict: - for upgrader_name, bytecode in upgrader_bytecode.items(): + for upgrader_name in upgrader_bytecode.keys(): if upgrader_name in EXCLUE_UPGRADER_SET: continue upgrader_bytecode_function_to_index_map[upgrader_name] = index
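
The torch/optim hunks above all apply the same refactor: the trailing .format(foreach=_foreach_doc, maximize=_maximize_doc, ...) call is dropped and the shared doc fragments are interpolated directly from an fr-string. A minimal sketch of that pattern follows; the fragment names mirror the _foreach_doc/_maximize_doc identifiers in the diff, but their text here is placeholder and MyOptimizer is a hypothetical stand-in.

    # Sketch only: fragment text is illustrative, not PyTorch's actual wording.
    _foreach_doc = "foreach (bool, optional): whether to use the multi-tensor implementation"
    _maximize_doc = "maximize (bool, optional): maximize the objective instead of minimizing"

    class MyOptimizer:
        pass

    # Before: a raw template plus .format(), so every fragment is named twice.
    MyOptimizer.__doc__ = r"""Implements MyOptimizer.

        Args:
            {foreach}
            {maximize}
        """.format(foreach=_foreach_doc, maximize=_maximize_doc)

    # After: an fr-string pulls in the module-level fragments by name and keeps
    # raw-string escaping for the math notation used in the real docstrings.
    MyOptimizer.__doc__ = fr"""Implements MyOptimizer.

        Args:
            {_foreach_doc}
            {_maximize_doc}
        """

The concatenated form in the hunks themselves (""" + fr""") keeps the LaTeX portion of each docstring, with its literal {aligned} braces, out of the f-string so those braces are not misread as replacement fields.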
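
Many of the other hunks (spectral_norm, weight_norm, package_exporter, common_distributed, jit_utils, the torchgen files, and so on) make one small cleanup: when a dict is iterated but only the keys or only the values are used, .items() is replaced by .keys() or .values() so no throwaway variable is bound. This is the kind of cleanup perflint-style checks report (PERF102 in ruff, if I recall the code correctly). A tiny sketch with hypothetical data:

    hooks = {1: "spectral_norm", 2: "weight_norm"}  # hypothetical sample dict

    # Before: the key is bound but never used.
    for k, hook in hooks.items():
        print(hook)

    # After: iterate the values directly.
    for hook in hooks.values():
        print(hook)

    # Symmetrically, when only the key matters (.keys() or plain iteration both work):
    for name in hooks.keys():
        print(name)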
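
A second recurring cleanup drops the quotes around names used inside typing subscripts, e.g. Tuple["BasePruningMethod", ...] becomes Tuple[BasePruningMethod, ...] and List['TreeSpec'] becomes List[TreeSpec]. The quotes only matter for true forward references; once the class is already defined by the time the annotation is evaluated (or the annotation is a local one that is never evaluated at runtime), they are just noise. A small sketch, using TreeSpec as a stand-in class:

    from typing import List, Tuple

    class TreeSpec:  # stand-in; any class defined before the annotation works
        pass

    # Before: quoted "forward reference", although TreeSpec already exists here.
    def flatten_before(spec: TreeSpec) -> List[Tuple["TreeSpec", str]]:
        return []

    # After: unquoted; the name resolves normally when the signature is evaluated.
    def flatten_after(spec: TreeSpec) -> List[Tuple[TreeSpec, str]]:
        return []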
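
The common_distributed.py hunk also fixes a suppression comment: "Unused" is not a recognized lint code, so it is replaced with the concrete flake8/ruff code F401 (imported but unused). A hedged sketch of the pattern; BertConfig matches the import in the hunk, while the HAS_TRANSFORMERS flag and surrounding scaffolding are hypothetical:

    try:
        # The import is intentionally unused; it only probes whether transformers is installed.
        from transformers import BertConfig  # noqa: F401
        HAS_TRANSFORMERS = True
    except ImportError:
        HAS_TRANSFORMERS = False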