From aae0594cda64d9cdb8e7dcf23037d70e0270777e Mon Sep 17 00:00:00 2001
From: Jackmin801 <ongjackm@gmail.com>
Date: Fri, 10 Jan 2025 05:08:40 +0000
Subject: [PATCH 1/6] pccl dependency

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index e4e0786e..e4c95edf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
     "pyarrow",
     "toposolve",
     "psutil",
+    "pccl @ git+ssh://git@github.com/PrimeIntellect-ai/pccl.git@main#egg=pccl&subdirectory=python/framework",
 ]
 
 [project.optional-dependencies]

From 4c401d271c77f0bf0150b7c1e8f6e12cb49821d6 Mon Sep 17 00:00:00 2001
From: Jackmin801 <ongjackm@gmail.com>
Date: Fri, 10 Jan 2025 05:08:53 +0000
Subject: [PATCH 2/6] uv lock

---
 uv.lock | 87 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 47 insertions(+), 40 deletions(-)

diff --git a/uv.lock b/uv.lock
index b94a8576..51dfdd06 100644
--- a/uv.lock
+++ b/uv.lock
@@ -5,11 +5,7 @@ resolution-markers = [
     "python_full_version < '3.11' and sys_platform != 'linux'",
     "python_full_version == '3.11.*' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and sys_platform != 'linux'",
-    "python_full_version < '3.11' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version < '3.11' and sys_platform != 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
     "python_full_version == '3.12.*' and sys_platform != 'linux'",
     "python_full_version >= '3.13' and sys_platform == 'linux'",
     "python_full_version >= '3.13' and sys_platform != 'linux'",
@@ -328,7 +324,7 @@ name = "click"
 version = "8.1.7"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "colorama", marker = "platform_system == 'Windows'" },
+    { name = "colorama", marker = "sys_platform == 'win32'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/96/d3/f04c7bfcf5c1862a2a5b845c6b2b360488cf47af55dfa79c98f6a6bf98b5/click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de", size = 336121 }
 wheels = [
@@ -764,6 +760,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 },
 ]
 
+[[package]]
+name = "ipaddress"
+version = "1.0.23"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b9/9a/3e9da40ea28b8210dd6504d3fe9fe7e013b62bf45902b458d1cdc3c34ed9/ipaddress-1.0.23.tar.gz", hash = "sha256:b7f8e0369580bb4a24d5ba1d7cc29660a4a6987763faf1d8a8046830e020e7e2", size = 32958 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c2/f8/49697181b1651d8347d24c095ce46c7346c37335ddc7d255833e7cde674d/ipaddress-1.0.23-py2.py3-none-any.whl", hash = "sha256:6e0f4a39e66cb5bb9a137b00276a2eff74f93b71dcbdad6f10ff7df9d3557fcc", size = 18159 },
+]
+
 [[package]]
 name = "jinja2"
 version = "3.1.4"
@@ -1084,6 +1089,8 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824 },
     { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519 },
     { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741 },
+    { url = "https://files.pythonhosted.org/packages/ea/89/38df130f2c799090c978b366cfdf5b96d08de5b29a4a293df7f7429fa50b/multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435", size = 132628 },
+    { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351 },
 ]
 
 [[package]]
@@ -1247,7 +1254,6 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771 },
     { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 },
-    { url = "https://files.pythonhosted.org/packages/e2/2a/4f27ca96232e8b5269074a72e03b4e0d43aa68c9b965058b1684d07c6ff8/nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc", size = 396895858 },
 ]
 
 [[package]]
@@ -1257,7 +1263,6 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556 },
     { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 },
-    { url = "https://files.pythonhosted.org/packages/f3/79/8cf313ec17c58ccebc965568e5bcb265cdab0a1df99c4e674bb7a3b99bfe/nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922", size = 9938035 },
 ]
 
 [[package]]
@@ -1267,7 +1272,6 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372 },
     { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 },
-    { url = "https://files.pythonhosted.org/packages/7c/30/8c844bfb770f045bcd8b2c83455c5afb45983e1a8abf0c4e5297b481b6a5/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec", size = 19751955 },
 ]
 
 [[package]]
@@ -1277,7 +1281,6 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177 },
     { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 },
-    { url = "https://files.pythonhosted.org/packages/a8/8b/450e93fab75d85a69b50ea2d5fdd4ff44541e0138db16f9cd90123ef4de4/nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e", size = 878808 },
 ]
 
 [[package]]
@@ -1285,11 +1288,10 @@ name = "nvidia-cudnn-cu12"
 version = "9.1.0.70"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12" },
+    { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 },
-    { url = "https://files.pythonhosted.org/packages/3f/d0/f90ee6956a628f9f04bf467932c0a25e5a7e706a684b896593c06c82f460/nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a", size = 679925892 },
 ]
 
 [[package]]
@@ -1297,12 +1299,11 @@ name = "nvidia-cufft-cu12"
 version = "11.2.1.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12" },
+    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 },
     { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 },
-    { url = "https://files.pythonhosted.org/packages/f6/ee/3f3f8e9874f0be5bbba8fb4b62b3de050156d159f8b6edc42d6f1074113b/nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = "sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b", size = 210576476 },
 ]
 
 [[package]]
@@ -1312,7 +1313,6 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811 },
     { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 },
-    { url = "https://files.pythonhosted.org/packages/1c/22/2573503d0d4e45673c263a313f79410e110eb562636b0617856fdb2ff5f6/nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771", size = 55799918 },
 ]
 
 [[package]]
@@ -1320,14 +1320,13 @@ name = "nvidia-cusolver-cu12"
 version = "11.6.1.9"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12" },
-    { name = "nvidia-cusparse-cu12" },
-    { name = "nvidia-nvjitlink-cu12" },
+    { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 },
     { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 },
-    { url = "https://files.pythonhosted.org/packages/f2/be/d435b7b020e854d5d5a682eb5de4328fd62f6182507406f2818280e206e2/nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c", size = 125224015 },
 ]
 
 [[package]]
@@ -1335,12 +1334,11 @@ name = "nvidia-cusparse-cu12"
 version = "12.3.1.170"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12" },
+    { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 },
     { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 },
-    { url = "https://files.pythonhosted.org/packages/a2/e0/3155ca539760a8118ec94cc279b34293309bcd14011fc724f87f31988843/nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f", size = 204684315 },
 ]
 
 [[package]]
@@ -1358,7 +1356,6 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 },
     { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 },
-    { url = "https://files.pythonhosted.org/packages/81/19/0babc919031bee42620257b9a911c528f05fb2688520dcd9ca59159ffea8/nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1", size = 95336325 },
 ]
 
 [[package]]
@@ -1368,7 +1365,6 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417 },
     { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 },
-    { url = "https://files.pythonhosted.org/packages/54/1b/f77674fbb73af98843be25803bbd3b9a4f0a96c75b8d33a2854a5c7d2d77/nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485", size = 66307 },
 ]
 
 [[package]]
@@ -1433,6 +1429,17 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d3/5e/76a9d08b4b4e4583f269cb9f64de267f9aeae0dacef23307f53a14211716/pathvalidate-3.2.1-py3-none-any.whl", hash = "sha256:9a6255eb8f63c9e2135b9be97a5ce08f10230128c4ae7b3e935378b82b22c4c9", size = 23833 },
 ]
 
+[[package]]
+name = "pccl"
+version = "0.1.0"
+source = { git = "ssh://git@github.com/PrimeIntellect-ai/pccl.git?subdirectory=python%2Fframework&rev=main#4b56405257ebbe95f216570edd5f1293eb019dd6" }
+dependencies = [
+    { name = "cffi" },
+    { name = "ipaddress" },
+    { name = "pycparser" },
+    { name = "torch" },
+]
+
 [[package]]
 name = "peft"
 version = "0.13.2"
@@ -1477,7 +1484,7 @@ name = "portalocker"
 version = "3.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pywin32", marker = "platform_system == 'Windows'" },
+    { name = "pywin32", marker = "sys_platform == 'win32'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/7e/57/b969aed128768558255822e75b402a19530bd63321f637d42f4724abc1ed/portalocker-3.0.0.tar.gz", hash = "sha256:21f535de2e7a82c94c130c054adb5c7421d480d5619d61073996e2f89bcb879b", size = 41961 }
 wheels = [
@@ -1532,8 +1539,6 @@ version = "6.0.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/18/c7/8c6872f7372eb6a6b2e4708b88419fb46b857f7a2e1892966b851cc79fc9/psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2", size = 508067 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/c5/66/78c9c3020f573c58101dc43a44f6855d01bbbd747e24da2f0c4491200ea3/psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35", size = 249766 },
-    { url = "https://files.pythonhosted.org/packages/e1/3f/2403aa9558bea4d3854b0e5e567bc3dd8e9fbc1fc4453c0aa9aafeb75467/psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1", size = 253024 },
     { url = "https://files.pythonhosted.org/packages/0b/37/f8da2fbd29690b3557cca414c1949f92162981920699cd62095a984983bf/psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0", size = 250961 },
     { url = "https://files.pythonhosted.org/packages/35/56/72f86175e81c656a01c4401cd3b1c923f891b31fbcebe98985894176d7c9/psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0", size = 287478 },
     { url = "https://files.pythonhosted.org/packages/19/74/f59e7e0d392bc1070e9a70e2f9190d652487ac115bb16e2eff6b22ad1d24/psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd", size = 290455 },
@@ -2380,21 +2385,21 @@ dependencies = [
     { name = "fsspec" },
     { name = "jinja2" },
     { name = "networkx" },
-    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "setuptools", marker = "python_full_version >= '3.12'" },
     { name = "sympy" },
-    { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and platform_system == 'Linux'" },
+    { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "typing-extensions" },
 ]
 wheels = [
@@ -2439,7 +2444,7 @@ name = "tqdm"
 version = "4.66.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "colorama", marker = "platform_system == 'Windows'" },
+    { name = "colorama", marker = "sys_platform == 'win32'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/58/83/6ba9844a41128c62e810fddddd72473201f3eacde02046066142a2d96cc5/tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad", size = 169504 }
 wheels = [
@@ -2485,7 +2490,7 @@ name = "triton"
 version = "3.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "filelock", marker = "python_full_version < '3.13'" },
+    { name = "filelock", marker = "python_full_version < '3.13' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/98/29/69aa56dc0b2eb2602b553881e34243475ea2afd9699be042316842788ff5/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8", size = 209460013 },
@@ -2743,6 +2748,7 @@ dependencies = [
     { name = "fsspec", extra = ["gcs"] },
     { name = "ninja" },
     { name = "numpy" },
+    { name = "pccl" },
     { name = "psutil" },
     { name = "pyarrow" },
     { name = "pydantic-config" },
@@ -2781,6 +2787,7 @@ requires-dist = [
     { name = "lm-eval", marker = "extra == 'all'" },
     { name = "ninja" },
     { name = "numpy" },
+    { name = "pccl", git = "ssh://git@github.com/PrimeIntellect-ai/pccl.git?subdirectory=python%2Fframework&rev=main" },
     { name = "psutil" },
     { name = "pyarrow" },
     { name = "pydantic-config", git = "https://github.com/samsja/pydantic_config.git?rev=74c94ee" },

From 71848cfcdf0e34d60249b8cc492b1bf0be4b3a0e Mon Sep 17 00:00:00 2001
From: Jackmin801 <ongjackm@gmail.com>
Date: Fri, 10 Jan 2025 05:36:54 +0000
Subject: [PATCH 3/6] map changes from torchft PR

---
 src/zeroband/diloco.py | 28 ++++++++--------
 src/zeroband/train.py  | 73 ++++++++++--------------------------------
 2 files changed, 31 insertions(+), 70 deletions(-)

diff --git a/src/zeroband/diloco.py b/src/zeroband/diloco.py
index 2e387055..593e026c 100644
--- a/src/zeroband/diloco.py
+++ b/src/zeroband/diloco.py
@@ -1,15 +1,16 @@
+import os
+from torch.distributed.device_mesh import init_device_mesh
 import re
 import time
 from pydantic_config import BaseConfig
 import torch
 from torch import nn
-from zeroband.collectives import Compression, all_reduce
-from zeroband.comms import ElasticDeviceMesh
+from zeroband.collectives import Compression
 from zeroband.utils.world_info import get_world_info
 from zeroband.utils.logging import get_logger
-import torch.distributed as dist
 from torch.distributed._tensor.api import DTensor
 from functools import lru_cache
+from pccl import Communicator, ReduceOp
 
 
 class DilocoConfig(BaseConfig):
@@ -59,7 +60,7 @@ def __init__(
         self,
         config: DilocoConfig,
         model: nn.Module,
-        elastic_device_mesh: ElasticDeviceMesh,
+        comm: Communicator,
     ):
         self.config = config
 
@@ -67,11 +68,13 @@ def __init__(
             from zeroband.C.collectives import ring_allreduce as _  # noqa: F401
             # just force compilation
 
-        self.elastic_device_mesh = elastic_device_mesh
+        self.comm = comm
 
         self._logger = get_logger()
         self.world_info = get_world_info()
 
+        self.cpu_local_mesh = init_device_mesh("cpu", mesh_shape=(int(os.environ["LOCAL_WORLD_SIZE"]),))
+
         self._init_offloaded_optimizer(model=model)
 
     @torch.no_grad()
@@ -89,14 +92,11 @@ def sync_pseudo_gradient(self, model: nn.Module, fake: bool = False, flag: str =
         """
         _start_time = time.perf_counter()
 
-        self.elastic_device_mesh.maybe_reinit_global_pg(admit_joiners=False)
-        world_size_post_init = self.elastic_device_mesh.global_pg.size()
-
-        world_size = world_size_post_init
+        self.comm.update_topology()
+        world_size = self.comm.get_attribute()
 
         self._logger.debug("sync pseudo gradient %s with world size %d", " fake" if fake else "", world_size)
 
-        global_pg = self.elastic_device_mesh.global_pg
         for i in range(self.config.retry_all_reduce):
             for param_offloaded, param in zip(self.param_list_cpu, model.parameters()):
                 if fake:
@@ -114,7 +114,7 @@ def sync_pseudo_gradient(self, model: nn.Module, fake: bool = False, flag: str =
                 # all_reduce(self.config.compression, self.offloaded_grad_flat_tensor, dist.ReduceOp.SUM, global_pg)
                 for j, tensor_group in enumerate(self._offloaded_grad_grouped_tensor):
                     t0 = time.perf_counter()
-                    all_reduce(self.config.compression, tensor_group, dist.ReduceOp.SUM, global_pg)
+                    self.comm.all_reduce(tensor_group, tensor_group, ReduceOp.SUM)
                     self._logger.debug(
                         f"{j}/{len(self._offloaded_grad_grouped_tensor)} all reduce bucket done in {time.perf_counter() - t0:.6f} seconds, numel: {tensor_group.numel()}"
                     )
@@ -125,7 +125,7 @@ def sync_pseudo_gradient(self, model: nn.Module, fake: bool = False, flag: str =
                 break
             except Exception as e:
                 self._logger.error(f"Error syncing pseudo gradient: {e}, retry {i+1}/{self.config.retry_all_reduce}")
-                global_pg = self.elastic_device_mesh.get_global_pg(maybe_reinit=True)
+                self.comm.update_topology()
         else:
             self._logger.error(
                 "Failed to sync pseudo gradient after %d retries. Resorting to calculating pseudo-gradient without reduce",
@@ -181,14 +181,14 @@ def get_offloaded_param(self, model: nn.Module) -> list[nn.Parameter]:
             offloaded_param = nn.Parameter(
                 DTensor.from_local(
                     data_tensor,
-                    device_mesh=self.elastic_device_mesh.cpu_local_mesh,
+                    device_mesh=self.cpu_local_mesh,
                     placements=param.data.placements,
                 )
             )
 
             offloaded_param.grad = DTensor.from_local(
                 grad_tensor,
-                device_mesh=self.elastic_device_mesh.cpu_local_mesh,
+                device_mesh=self.cpu_local_mesh,
                 placements=param.data.placements,
             )
             # here we pre-allocate the grad DTensor on cpu.
diff --git a/src/zeroband/train.py b/src/zeroband/train.py
index 9f4a5f5c..e272d53a 100644
--- a/src/zeroband/train.py
+++ b/src/zeroband/train.py
@@ -3,6 +3,7 @@
 import time
 from pydantic import model_validator
 from multiprocessing.process import _children
+from torch.distributed.device_mesh import init_device_mesh
 
 import torch
 from pydantic_config import parse_argv, BaseConfig
@@ -16,7 +17,6 @@
 import torch.distributed as dist
 from zeroband import utils
 from zeroband.diloco import Diloco, DilocoConfig
-from zeroband.comms import ElasticDeviceMesh
 from zeroband.loss import cross_entropy_max_z_loss
 from zeroband.models.llama.model import AttnFnType, create_block_mask_from_seqlens
 
@@ -38,6 +38,8 @@
 from zeroband.checkpoint import CkptConfig, CkptManager, TrainingProgress
 from zeroband.lr_scheduler import get_scheduler
 
+from pccl import Communicator, Attribute
+
 
 class OptimConfig(BaseConfig):
     lr: float = 4e-4
@@ -212,9 +214,9 @@ def train(config: Config):
         num = 1 if isinstance(config.train.ac_ckpt, bool) else config.train.ac_ckpt
         apply_ac_ckpt(model, num)
 
-    elastic_device_mesh = ElasticDeviceMesh(
-        enable=config.diloco is not None, live_recovery_rank_src=config.ckpt.live_recovery_rank_src
-    )
+    dist.init_process_group(backend="cpu:gloo,cuda:nccl")
+    comm = Communicator(os.environ["PCCL_MASTER_ADDR"], peer_group=dist.get_rank())
+    cuda_local_mesh = init_device_mesh("cuda", mesh_shape=(int(os.environ["LOCAL_WORLD_SIZE"]),))
 
     mp_policy = MixedPrecisionPolicy(
         param_dtype=torch.bfloat16, reduce_dtype=torch.float32 if config.train.reduce_fp32 else None
@@ -228,13 +230,13 @@ def train(config: Config):
         fully_shard(
             transformer_block,
             mp_policy=mp_policy,
-            mesh=elastic_device_mesh.cuda_local_mesh,
+            mesh=cuda_local_mesh,
             reshard_after_forward=reshard_after_forward,
         )
     fully_shard(
         model,
         mp_policy=mp_policy,
-        mesh=elastic_device_mesh.cuda_local_mesh,
+        mesh=cuda_local_mesh,
         reshard_after_forward=config.train.reshard_after_forward,
     )
     logger.debug("model fsdped")
@@ -248,7 +250,7 @@ def train(config: Config):
     )
 
     if config.diloco is not None:
-        diloco = Diloco(config.diloco, model, elastic_device_mesh)
+        diloco = Diloco(config.diloco, model, comm)
 
     scheduler = get_scheduler(
         sched_type=config.optim.sched_type,
@@ -312,7 +314,7 @@ def train(config: Config):
 
     logger.info("starting training")
 
-    need_live_recovery = config.ckpt.live_recovery_rank_src is not None
+    first_step = True
     while True:
         if num_inner_steps > 1:
             # if we don't use diloco we don't print the outer step logs
@@ -320,47 +322,9 @@ def train(config: Config):
 
         time_start_outer = time.perf_counter()
 
-        if config.diloco is not None:
-            # this is a patch for now to allow live recovery worker to not affect the all reduce at all
-
-            if not need_live_recovery:
-                elastic_device_mesh.maybe_reinit_global_pg(admit_joiners=True)
-
-                maybe_dest_rank = elastic_device_mesh.live_recovery.should_send_ckpt_to()
-                if maybe_dest_rank is not None:
-                    logger.info(f"Start live recovery to rank {maybe_dest_rank}")
-                    ckpt_manager.send_ckpt_to_peer(elastic_device_mesh.global_pg, maybe_dest_rank, blocking=True)
-
-                    elastic_device_mesh.live_recovery.reset()
-            else:
-                ## receiving
-                time_start_live_recovery = time.perf_counter()
-                logger.info(f"Start live recovery from rank {config.ckpt.live_recovery_rank_src}")
-
-                ## we create grad buffer and opts stats mamnually, the value will be overwritten by the ckpt but we need the DTensor to be correctly init before loading it
-
-                diloco.outer_optimizer.step()  # need to step to init the DTensor stats
-
-                ckpt_manager.recv_ckpt_from_peer(elastic_device_mesh.global_pg)
-
-                log_hash_training_state(
-                    config,
-                    model,
-                    inner_optimizer,
-                    diloco,
-                    metric_logger,
-                    step=training_progress.step,
-                    id="live_reco_recv",
-                )
-                need_live_recovery = False
-
-                if config.ckpt.remote_data_load:
-                    ckpt_manager.remote_data_load()
-
-                logger.info("live recovery done in %f", time.perf_counter() - time_start_live_recovery)
-
-        # at the beginning of the inner steps we allow joiner to arrive.
-        # We maybe reinit before the all reduce but only to allow leaving, not to join anymore
+        if not first_step:
+            comm.update_topology()
+        first_step = False
 
         if world_info.rank == 0 and config.monitor is not None:
             monitor.set_stage("inner_loop")
@@ -409,9 +373,9 @@ def train(config: Config):
                 else:
                     loss_batch += loss.clone().detach()
 
-            dist.all_reduce(tensor=loss_batch, op=dist.ReduceOp.AVG, group=elastic_device_mesh.local_pg)
+            dist.all_reduce(tensor=loss_batch, op=dist.ReduceOp.AVG)
             if config.optim.z_loss:
-                dist.all_reduce(tensor=z_loss_batch, op=dist.ReduceOp.AVG, group=elastic_device_mesh.local_pg)
+                dist.all_reduce(tensor=z_loss_batch, op=dist.ReduceOp.AVG)
 
             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
             inner_optimizer.step()
@@ -432,7 +396,7 @@ def train(config: Config):
             else:
                 # we count the total tokens with respect to all diloco workers
                 # might need to tweak this as some worker might fail to join the all reduce later
-                training_progress.total_tokens += new_tokens * elastic_device_mesh.global_pg.size()
+                training_progress.total_tokens += new_tokens * comm.get_attribute(Attribute.CURRENT_WORLD_SIZE)
 
             metrics = {
                 "Loss": loss_batch.item(),
@@ -458,7 +422,7 @@ def train(config: Config):
                 log += f", tokens_per_second: {tokens_per_second:.2f}, mfu: {metrics['mfu']:.2f}"
 
             if config.diloco is not None:
-                metrics["num_peers"] = elastic_device_mesh.global_pg.size()
+                metrics["num_peers"] = comm.get_attribute(Attribute.CURRENT_WORLD_SIZE)
                 log += f", diloco_peers: {metrics['num_peers']}"
 
             if world_info.rank == 0:
@@ -531,9 +495,6 @@ def train(config: Config):
             monitor.finish()
 
     ckpt_manager.wait_for_blocking_job()
-
-    del elastic_device_mesh  # allow to clean up for smoother tests transition
-
     logger.info("Training finished, exiting ...")
 
 

From 3eda0d24fa629f079c028ce98875bc8bcaa51dde Mon Sep 17 00:00:00 2001
From: Jackmin801 <ongjackm@gmail.com>
Date: Fri, 10 Jan 2025 08:20:17 +0000
Subject: [PATCH 4/6] fix: missing connect

---
 src/zeroband/train.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/zeroband/train.py b/src/zeroband/train.py
index e272d53a..ee6e891d 100644
--- a/src/zeroband/train.py
+++ b/src/zeroband/train.py
@@ -216,6 +216,7 @@ def train(config: Config):
 
     dist.init_process_group(backend="cpu:gloo,cuda:nccl")
     comm = Communicator(os.environ["PCCL_MASTER_ADDR"], peer_group=dist.get_rank())
+    comm.connect()
     cuda_local_mesh = init_device_mesh("cuda", mesh_shape=(int(os.environ["LOCAL_WORLD_SIZE"]),))
 
     mp_policy = MixedPrecisionPolicy(
@@ -325,6 +326,7 @@ def train(config: Config):
         if not first_step:
             comm.update_topology()
         first_step = False
+        print("Hello")
 
         if world_info.rank == 0 and config.monitor is not None:
             monitor.set_stage("inner_loop")

From 9addd21b80cbfe326508676b0569c2833af79666 Mon Sep 17 00:00:00 2001
From: Jackmin801 <ongjackm@gmail.com>
Date: Fri, 10 Jan 2025 08:57:32 +0000
Subject: [PATCH 5/6] TEMP: pin debug train

---
 src/zeroband/train.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/zeroband/train.py b/src/zeroband/train.py
index ee6e891d..1fac2d99 100644
--- a/src/zeroband/train.py
+++ b/src/zeroband/train.py
@@ -38,7 +38,7 @@
 from zeroband.checkpoint import CkptConfig, CkptManager, TrainingProgress
 from zeroband.lr_scheduler import get_scheduler
 
-from pccl import Communicator, Attribute
+from pccl import Attribute
 
 
 class OptimConfig(BaseConfig):
@@ -195,6 +195,7 @@ def train(config: Config):
         seq_length=config.data.seq_length,
         attn_fn=config.train.attn_fn,
     )
+    print(model)
 
     model = model.to(world_info.local_rank)
     logger.debug("model loaded")
@@ -215,9 +216,12 @@ def train(config: Config):
         apply_ac_ckpt(model, num)
 
     dist.init_process_group(backend="cpu:gloo,cuda:nccl")
-    comm = Communicator(os.environ["PCCL_MASTER_ADDR"], peer_group=dist.get_rank())
-    comm.connect()
+    if config.diloco is not None:
+        pass
+        # comm = Communicator(os.environ["PCCL_MASTER_ADDR"], peer_group=dist.get_rank())
+        # comm.connect()
     cuda_local_mesh = init_device_mesh("cuda", mesh_shape=(int(os.environ["LOCAL_WORLD_SIZE"]),))
+    print(cuda_local_mesh)
 
     mp_policy = MixedPrecisionPolicy(
         param_dtype=torch.bfloat16, reduce_dtype=torch.float32 if config.train.reduce_fp32 else None
@@ -251,7 +255,7 @@ def train(config: Config):
     )
 
     if config.diloco is not None:
-        diloco = Diloco(config.diloco, model, comm)
+        diloco = Diloco(config.diloco, model, None)
 
     scheduler = get_scheduler(
         sched_type=config.optim.sched_type,
@@ -323,15 +327,15 @@ def train(config: Config):
 
         time_start_outer = time.perf_counter()
 
-        if not first_step:
+        if not first_step and config.diloco is not None:
             comm.update_topology()
         first_step = False
-        print("Hello")
 
         if world_info.rank == 0 and config.monitor is not None:
             monitor.set_stage("inner_loop")
 
         for inner_step in range(num_inner_steps):
+            print("Starting inner step")
             loss_batch = 0
             z_loss_batch = 0
 
@@ -348,11 +352,14 @@ def train(config: Config):
                     block_mask = create_block_mask_from_seqlens(seqlens) if seqlens is not None else None
                 else:
                     block_mask = None
+                print("Starting inner step")
 
+                print("Model forward!")
                 logits = model(tokens=input_ids, block_mask=block_mask).contiguous()
                 flatten_logits = rearrange(logits, "b seq vocab -> (b seq) vocab")
                 flatten_labels = rearrange(labels, "b seq -> (b seq)")
 
+                print("Mid inner step")
                 if config.optim.z_loss:
                     ce_loss, z_loss = cross_entropy_max_z_loss(
                         flatten_logits, flatten_labels, config.optim.z_loss_weight
@@ -368,16 +375,19 @@ def train(config: Config):
                     loss = F.cross_entropy(flatten_logits, flatten_labels) / gradient_accumulation_steps
                     del logits
                     loss.backward()
+                print("End? inner step")
 
                 if config.optim.z_loss:
                     loss_batch += ce_loss.clone().detach()
                     z_loss_batch += z_loss.clone().detach()
                 else:
                     loss_batch += loss.clone().detach()
+                print("Z loss")
 
             dist.all_reduce(tensor=loss_batch, op=dist.ReduceOp.AVG)
             if config.optim.z_loss:
                 dist.all_reduce(tensor=z_loss_batch, op=dist.ReduceOp.AVG)
+            print("Hi")
 
             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
             inner_optimizer.step()
@@ -398,7 +408,9 @@ def train(config: Config):
             else:
                 # we count the total tokens with respect to all diloco workers
                 # might need to tweak this as some worker might fail to join the all reduce later
+                print("Get attr")
                 training_progress.total_tokens += new_tokens * comm.get_attribute(Attribute.CURRENT_WORLD_SIZE)
+                print("Post Get attr")
 
             metrics = {
                 "Loss": loss_batch.item(),

From 7555284e423ee93dc6a4d5d6de9b6922f7e1fd2c Mon Sep 17 00:00:00 2001
From: Jackmin801 <ongjackm@gmail.com>
Date: Sat, 11 Jan 2025 08:48:43 +0000
Subject: [PATCH 6/6] TEMP: dev

---
 configs/150M/3090.toml    | 10 ++++++++--
 configs/debug/diloco.toml |  4 ----
 master.py                 | 13 +++++++++++++
 meow.sh                   | 19 +++++++++++++++++++
 meow1.sh                  | 18 ++++++++++++++++++
 src/zeroband/train.py     |  7 +++----
 6 files changed, 61 insertions(+), 10 deletions(-)
 create mode 100644 master.py
 create mode 100644 meow.sh
 create mode 100644 meow1.sh

diff --git a/configs/150M/3090.toml b/configs/150M/3090.toml
index 761d1b66..078d55bf 100644
--- a/configs/150M/3090.toml
+++ b/configs/150M/3090.toml
@@ -7,7 +7,13 @@ micro_bs = 16 # change this base on the gpu
 reshard_after_forward = true
 
 [optim]
-batch_size = 512
+batch_size = 64
 warmup_steps = 1000
 total_steps = 88_000
-lr = 4e-4
\ No newline at end of file
+lr = 4e-4
+
+[data]
+fake = true
+
+[diloco]
+inner_steps = 20
diff --git a/configs/debug/diloco.toml b/configs/debug/diloco.toml
index c98e4603..6fbe7349 100644
--- a/configs/debug/diloco.toml
+++ b/configs/debug/diloco.toml
@@ -13,7 +13,3 @@ total_steps = 4
 
 [data]
 fake = true
-
-[diloco]
-inner_steps = 5
-
diff --git a/master.py b/master.py
new file mode 100644
index 00000000..4bfd909b
--- /dev/null
+++ b/master.py
@@ -0,0 +1,18 @@
+"""Launch a standalone PCCL master node for local development runs."""
+
+from pccl import MasterNode
+
+# Address the master listens on. The port matches the PCCL_MASTER_ADDR
+# (127.0.0.1:48148) exported by the meow.sh / meow1.sh launch scripts.
+HOST: str = "0.0.0.0:48148"
+
+
+def main() -> None:
+    """Create a ``MasterNode`` listening on ``HOST`` and run it."""
+    print(f"Starting master node on {HOST}")
+    master: MasterNode = MasterNode(listen_address=HOST)
+    master.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/meow.sh b/meow.sh
new file mode 100644
index 00000000..4923195d
--- /dev/null
+++ b/meow.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Dev launcher: starts a 2-process torchrun training job on GPUs 0 and 1.
+# PCCL_MASTER_ADDR points at the port master.py listens on (48148).
+
+export PCCL_LOG_LEVEL=DEBUG
+export WANDB_MODE=disabled
+export PCCL_MASTER_ADDR=127.0.0.1:48148
+
+export CUDA_VISIBLE_DEVICES=0,1
+#export GLOBAL_RANK=0
+#export GLOBAL_UNIQUE_ID=A0
+#export REPLICA_GROUP_ID=A0
+
+#export GLOO_SOCKET_IFNAME=tailscale0
+export ZERO_BAND_LOG_LEVEL=DEBUG
+export ZERO_BAND_LOG_ALL_RANK=true
+
+# The rendezvous port differs from the one in meow1.sh (10002), presumably
+# so that both launchers can run side by side on the same host.
+uv run torchrun --nproc_per_node=2 \
+	--rdzv-endpoint localhost:10001 \
+	src/zeroband/train.py \
+	@configs/150M/3090.toml \
+	--no-wandb-resume
+	#--ckpt.live_recovery_rank_src 0
diff --git a/meow1.sh b/meow1.sh
new file mode 100644
index 00000000..791fbe74
--- /dev/null
+++ b/meow1.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Dev launcher: starts a 2-process torchrun training job on GPUs 2 and 3.
+# PCCL_MASTER_ADDR points at the port master.py listens on (48148).
+# NOTE(review): unlike meow.sh this does not export PCCL_LOG_LEVEL - confirm intended.
+export WANDB_MODE=disabled
+export PCCL_MASTER_ADDR=127.0.0.1:48148
+
+export CUDA_VISIBLE_DEVICES=2,3
+#export GLOBAL_RANK=0
+#export GLOBAL_UNIQUE_ID=A0
+#export REPLICA_GROUP_ID=A0
+
+#export GLOO_SOCKET_IFNAME=tailscale0
+export ZERO_BAND_LOG_LEVEL=DEBUG
+export ZERO_BAND_LOG_ALL_RANK=true
+
+# The rendezvous port differs from the one in meow.sh (10001), presumably
+# so that both launchers can run side by side on the same host.
+uv run torchrun --nproc_per_node=2 \
+	--rdzv-endpoint localhost:10002 \
+	src/zeroband/train.py \
+	@configs/150M/3090.toml \
+	--no-wandb-resume
+	#--ckpt.live_recovery_rank_src 0
diff --git a/src/zeroband/train.py b/src/zeroband/train.py
index 1fac2d99..f9fc6089 100644
--- a/src/zeroband/train.py
+++ b/src/zeroband/train.py
@@ -38,7 +38,7 @@
 from zeroband.checkpoint import CkptConfig, CkptManager, TrainingProgress
 from zeroband.lr_scheduler import get_scheduler
 
-from pccl import Attribute
+from pccl import Attribute, Communicator
 
 
 class OptimConfig(BaseConfig):
@@ -217,9 +217,8 @@ def train(config: Config):
 
     dist.init_process_group(backend="cpu:gloo,cuda:nccl")
     if config.diloco is not None:
-        pass
-        # comm = Communicator(os.environ["PCCL_MASTER_ADDR"], peer_group=dist.get_rank())
-        # comm.connect()
+        comm = Communicator(os.environ["PCCL_MASTER_ADDR"], peer_group=dist.get_rank())
+        comm.connect()
     cuda_local_mesh = init_device_mesh("cuda", mesh_shape=(int(os.environ["LOCAL_WORLD_SIZE"]),))
     print(cuda_local_mesh)