From dbddd9f6fd5c6bd5a14407b21dc21f343012e0d0 Mon Sep 17 00:00:00 2001
From: Sami Jaghouar
Date: Thu, 19 Dec 2024 09:49:04 +0000
Subject: [PATCH] remove useless memory stuff

---
 src/zeroband/train.py          | 14 ----------
 src/zeroband/utils/__init__.py | 50 ----------------------------------
 2 files changed, 64 deletions(-)

diff --git a/src/zeroband/train.py b/src/zeroband/train.py
index 15d0cd93..a7f6d486 100644
--- a/src/zeroband/train.py
+++ b/src/zeroband/train.py
@@ -2,7 +2,6 @@
 from typing import Literal
 import time
 import warnings
-import psutil
 from pydantic import model_validator
 from multiprocessing.process import _children
@@ -24,7 +23,6 @@
 from zeroband.utils import (
     FakeTokenizer,
-    GPUMemoryMonitor,
     PerfCounter,
     get_module_signature,
     get_optimizer_signature,
@@ -73,7 +71,6 @@ class TrainConfig(BaseConfig):
     log_model_hash: bool = False
 
-    memory_monitor: bool = False
     memory_profiler: MemoryProfilerConfig | None = None
     sequence_packing: bool = True
@@ -310,8 +307,6 @@ def train(config: Config):
             config, model, inner_optimizer, diloco, metric_logger, step=training_progress.step, id="resume"
         )
 
-    if config.train.memory_monitor:
-        gpu_mem_monitor = GPUMemoryMonitor()
     if config.train.memory_profiler is not None:
         memory_profiler = MemoryProfiler(config.train.memory_profiler.freq, config.train.memory_profiler.snapshot_dir)
@@ -447,7 +442,6 @@ def train(config: Config):
             # we count the total tokens with respect to all diloco workers
             # might need to tweak this as some worker might fail to join the all reduce later
             training_progress.total_tokens += new_tokens * elastic_device_mesh.global_pg.size()
-            remaining_cpu_ram = psutil.virtual_memory().available / (1024 * 1024 * 1024)
 
             metrics = {
                 "Loss": loss_batch.item(),
@@ -456,16 +450,11 @@ def train(config: Config):
                 "Perplexity": torch.exp(loss_batch).item(),
                 "total_tokens": training_progress.total_tokens,
                 "time": time.time(),
-                "remaining_cpu_ram": remaining_cpu_ram,
             }
 
             if config.optim.z_loss:
                 metrics["z_loss"] = z_loss_batch.item()
 
-            if config.train.memory_monitor:
-                peak_gpu_stats = gpu_mem_monitor.get_peak_stats()
-                metrics.update(peak_gpu_stats)
-
             log = f"step: {training_progress.step}, loss: {loss_batch.item():.4f}"
 
             tokens_per_second = perf_counter.get_tokens_per_second()
@@ -539,9 +528,6 @@ def train(config: Config):
                 }
             )
 
-        if config.train.memory_monitor:
-            logger.info(f"outer step peak gpu stats: {gpu_mem_monitor.format_peak_states()}")
-
         if training_progress.step >= config.optim.total_steps:
             # we only allow to break outisde of the inner loop.
             # This avoid ending the training in the middle of a the inner loop
diff --git a/src/zeroband/utils/__init__.py b/src/zeroband/utils/__init__.py
index 1bb454fb..c0ea3699 100644
--- a/src/zeroband/utils/__init__.py
+++ b/src/zeroband/utils/__init__.py
@@ -1,13 +1,10 @@
 import hashlib
 import socket
 import time
-from typing import Any
 
 import torch
 from torch.distributed.fsdp import ShardingStrategy
 from torch.distributed._tensor.api import DTensor
 
-from zeroband.utils.logging import get_logger
-
 __all__ = ["get_sharding_strategy", "get_peak_flops", "get_num_flop_per_token", "get_num_params"]
@@ -165,53 +162,6 @@ def get_tensor_list_signature(tensor_list: list[torch.Tensor]) -> str:
     return hashlib.md5(str(tensors).encode("utf-8")).hexdigest()
 
 
-class GPUMemoryMonitor:
-    # inspired from https://github.com/pytorch/torchtitan/blob/eef8bb2b1b6f0875ab0581079e1511d51654910e/torchtitan/metrics.py#L32
-    def __init__(self, device: str = "cuda"):
-        self.device = torch.device(device)  # device object
-        self.device_capacity = torch.cuda.get_device_properties(self.device).total_memory
-        self.device_capacity_gib = self._to_gib(self.device_capacity)
-        torch.cuda.reset_peak_memory_stats()
-        torch.cuda.empty_cache()
-
-        self._logger = get_logger()
-
-    def _to_gib(self, memory_in_bytes):
-        # NOTE: GiB (gibibyte) is 1024, vs GB is 1000
-        _gib_in_bytes = 1024 * 1024 * 1024
-        memory_in_gib = memory_in_bytes / _gib_in_bytes
-        return memory_in_gib
-
-    def _to_pct(self, memory):
-        return 100 * memory / self.device_capacity
-
-    def get_peak_stats(self) -> dict[str, Any]:
-        cuda_info = torch.cuda.memory_stats(self.device)
-
-        max_active = cuda_info["active_bytes.all.peak"]
-        max_active_gib = self._to_gib(max_active)
-        max_active_pct = self._to_pct(max_active)
-
-        max_reserved = cuda_info["reserved_bytes.all.peak"]
-        max_reserved_gib = self._to_gib(max_reserved)
-        max_reserved_pct = self._to_pct(max_reserved)
-
-        return {
-            "gpu_max_active_gib": max_active_gib,
-            "gpu_max_active_pct": max_active_pct,
-            "gpu_max_reserved_gib": max_reserved_gib,
-            "gpu_max_reserved_pct": max_reserved_pct,
-        }
-
-    def reset_peak_stats(self):
-        torch.cuda.reset_peak_memory_stats()
-
-    def format_peak_states(self, peak_stats: dict[str, Any] | None = None) -> str:
-        if peak_stats is None:
-            peak_stats = self.get_peak_stats()
-        return f"Active {peak_stats['gpu_max_active_gib']:.2f} GiB ({peak_stats['gpu_max_active_pct']:.2f}%), Reserved {peak_stats['gpu_max_reserved_gib']:.2f} GiB ({peak_stats['gpu_max_reserved_pct']:.2f}%)"
-
-
 def get_random_available_port_list(num_port):
     # https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number
     ports = []