Skip to content

Commit

Permalink
Merge pull request #96 from lfwa/dev
Browse files Browse the repository at this point in the history
Fix DRAM measurements and improve logging
  • Loading branch information
Snailed authored Feb 6, 2025
2 parents bac7c10 + ea007b1 commit c586de4
Show file tree
Hide file tree
Showing 9 changed files with 81 additions and 54 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:

jobs:
test:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: [3.7, 3.8, 3.9, '3.10', '3.11', '3.12']
Expand All @@ -30,7 +30,7 @@ jobs:

lint:
needs: test
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: [3.7, 3.8, 3.9, '3.10']
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ on:

jobs:
test:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: ['3.7', '3.8','3.9', '3.10', '3.11', '3.12']
python-version: ['3.7.17', '3.8.18','3.9.21', '3.10.16', '3.11.11', '3.12.9'] # Latest version available for each. Ref: https://raw.githubusercontent.com/actions/python-versions/main/versions-manifest.json

steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
24 changes: 14 additions & 10 deletions carbontracker/components/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
)
from carbontracker.components.handler import Handler
from typing import Iterable, List, Union, Type, Sized
from carbontracker.loggerutil import Logger
import os

COMPONENTS = [
{
Expand Down Expand Up @@ -43,7 +45,7 @@ def handlers_by_name(name) -> List[Type[Handler]]:


class Component:
def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool):
def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool, logger: Logger):
self.name = name
if name not in component_names():
raise exceptions.ComponentNameError(
Expand All @@ -54,6 +56,7 @@ def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool):
)
self.power_usages: List[List[float]] = []
self.cur_epoch: int = -1 # Sentry
self.logger = logger

@property
def handler(self) -> Handler:
Expand Down Expand Up @@ -97,18 +100,19 @@ def collect_power_usage(self, epoch: int):
self.power_usages.append([])
try:
self.power_usages[-1] += self.handler.power_usage()
except exceptions.IntelRaplPermissionError:
except exceptions.IntelRaplPermissionError as e:
energy_paths = " and ".join(e.file_names)
commands = ["sudo chmod +r " + energy_path for energy_path in e.file_names]
# Only raise error if no measurements have been collected.
if not self.power_usages[-1]:
print(
"No sudo access to read Intel's RAPL measurements from the energy_uj file."
"\nSee issue: https://github.com/lfwa/carbontracker/issues/40"
)
self.logger.err_critical(
r"Could not read CPU/DRAM energy consumption due to lack of read-permissions.\n\tPlease run the following command(s): \n\t\t" + r"\n\t\t".join(commands)
)
# Append zero measurement to avoid further errors.
self.power_usages.append([0])
except exceptions.GPUPowerUsageRetrievalError:
if not self.power_usages[-1]:
print(
self.logger.err_critical(
"GPU model does not support retrieval of power usages in NVML."
"\nSee issue: https://github.com/lfwa/carbontracker/issues/36"
)
Expand Down Expand Up @@ -154,16 +158,16 @@ def shutdown(self):


def create_components(
    components: str, pids: Iterable[int], devices_by_pid: bool, logger: Logger
) -> List[Component]:
    """Instantiate one ``Component`` per requested component name.

    Args:
        components: Either ``"all"`` or a comma-separated list of component
            names. Surrounding whitespace, embedded spaces and letter case
            are ignored.
        pids: Forwarded unchanged to every ``Component``.
        devices_by_pid: Forwarded unchanged to every ``Component``.
        logger: Forwarded unchanged to every ``Component``.

    Returns:
        The created components, in the order the names were resolved.
    """
    # Normalise once so "CPU, GPU" and "cpu,gpu" select the same components.
    requested = components.strip().replace(" ", "").lower()

    if requested == "all":
        selected_names = component_names()
    else:
        selected_names = requested.split(",")

    return [
        Component(
            name=comp_name,
            pids=pids,
            devices_by_pid=devices_by_pid,
            logger=logger,
        )
        for comp_name in selected_names
    ]
20 changes: 12 additions & 8 deletions carbontracker/components/cpu/intel.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,15 @@ def _read_energy(self, path: str) -> int:

def _get_measurements(self):
measurements = []
permission_errors = []
for package in self._rapl_devices:
try:
power_usage = self._read_energy(os.path.join(RAPL_DIR, package))
measurements.append(power_usage)
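# Without sufficient privileges (e.g. no sudo access) the energy_uj file
# cannot be read and a PermissionError is raised. Record the offending path
# and keep going so all unreadable files are reported together after the loop.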
except PermissionError:
raise exceptions.IntelRaplPermissionError()
permission_errors += [os.path.join(RAPL_DIR, package, "energy_uj")]

except FileNotFoundError:
# check cpu/gpu/dram
Expand All @@ -79,12 +80,15 @@ def _get_measurements(self):
)

measurements.append(total_power_usage)

if permission_errors:
raise exceptions.IntelRaplPermissionError(permission_errors)
return measurements

def _convert_rapl_name(self, name, pattern) -> Union[None, str]:
if re.match(pattern, name):
return "cpu:" + name[-1]
def _convert_rapl_name(self, package, name, pattern) -> Union[None, str]:
match = re.match(pattern, package)
name = name if "package" not in name else "cpu"
if match:
return name + ":" + match.group(1)

def init(self):
# Get amount of intel-rapl folders
Expand All @@ -93,15 +97,15 @@ def init(self):
self._devices: List[str] = []
self._rapl_devices: List[str] = []
self.parts_pattern = re.compile(r"intel-rapl:(\d):(\d)")
devices_pattern = re.compile("intel-rapl:.")
devices_pattern = re.compile(r"intel-rapl:(\d)(:\d)?")

for package in packages:
if re.fullmatch(devices_pattern, package):
with open(os.path.join(RAPL_DIR, package, "name"), "r") as f:
name = f.read().strip()
if name != "psys":
if name != "psys" and ("package" in name or "dram" in name):
self._rapl_devices.append(package)
rapl_name = self._convert_rapl_name(package, devices_pattern)
rapl_name = self._convert_rapl_name(package, name, devices_pattern)
if rapl_name is not None:
self._devices.append(rapl_name)

Expand Down
5 changes: 4 additions & 1 deletion carbontracker/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List, Optional

class NoComponentsAvailableError(Exception):
def __init__(
self,
Expand All @@ -23,7 +25,8 @@ def __init__(self, expected_unit, received_unit, message):
class IntelRaplPermissionError(Exception):
    """Raised when an Intel RAPL permission error occurs.

    Attributes are documented inline; ``str()`` of the exception lists the
    files that could not be read so that an uncaught error is still
    actionable.
    """

    def __init__(self, file_names: Optional[List[str]] = None):
        """Create the error.

        Args:
            file_names: Paths of the files that could not be read. Optional
                so that the exception can still be raised without arguments;
                in that case ``file_names`` is an empty list.
        """
        # Copy into a fresh list so later mutation by the caller does not
        # change what the exception reports.
        self.file_names: List[str] = list(file_names) if file_names else []
        # Pass the list on to Exception so that ``args`` is populated and the
        # exception can be re-created from ``args`` (copy/pickle).
        super().__init__(self.file_names)

    def __str__(self) -> str:
        if not self.file_names:
            return "Permission denied when reading Intel RAPL energy files."
        return (
            "Permission denied when reading Intel RAPL energy file(s): "
            + ", ".join(self.file_names)
        )


class GPUPowerUsageRetrievalError(Exception):
Expand Down
2 changes: 1 addition & 1 deletion carbontracker/tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def __init__(
self.tracker = CarbonTrackerThread(
delete=self._delete,
components=component.create_components(
components=components, pids=pids, devices_by_pid=devices_by_pid
components=components, pids=pids, devices_by_pid=devices_by_pid, logger=self.logger
),
logger=self.logger,
ignore_errors=ignore_errors,
Expand Down
28 changes: 22 additions & 6 deletions tests/components/test_intel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_available(self, mock_listdir, mock_exists):
mock_exists.return_value = True
mock_listdir.return_value = ["some_directory"]

component = Component(name='cpu', pids=[], devices_by_pid={})
component = Component(name='cpu', pids=[], devices_by_pid={}, logger=None)
self.assertTrue(component.available())

@patch("os.path.exists")
Expand All @@ -21,6 +21,7 @@ def test_available(self, mock_listdir, mock_exists):
def test_devices(self, mock_file, mock_listdir, mock_exists):
mock_exists.return_value = True
mock_listdir.side_effect = [["intel-rapl:0", "intel-rapl:1"], ["name"], ["name"]]
mock_file.return_value.read.side_effect = ["package-0", "package-1"]

cpu = IntelCPU(pids=[], devices_by_pid={})
cpu.init()
Expand All @@ -34,7 +35,7 @@ def test_available_false(self, mock_available, mock_listdir, mock_exists):
mock_exists.return_value = False
mock_listdir.return_value = []

cpu = Component(name='cpu', pids=[], devices_by_pid={})
cpu = Component(name='cpu', pids=[], devices_by_pid={}, logger=None)
self.assertFalse(cpu.available())

@patch("time.sleep")
Expand Down Expand Up @@ -72,14 +73,18 @@ def test__read_energy(self, mock_file):
@patch("builtins.open", new_callable=mock_open)
def test__get_measurements(self, mock_file, mock_listdir, mock_exists):
mock_exists.return_value = True
mock_listdir.return_value = ["intel-rapl:0", "intel-rapl:1"]
mock_file.return_value.read.return_value = "1000000"
# Simulate true RAPL zone hierarchy
mock_listdir.return_value = ["intel-rapl:0", "intel-rapl:0:0", "intel-rapl:0:1", "intel-rapl:0:2","intel-rapl:1"]
#mock_file.return_value.read.return_value = "1000000"
mock_file.return_value.read.side_effect = ["package-0", "cores", "uncores", "dram", "psys", "1000000", "99999", "88", "88", "88"]

cpu = IntelCPU(pids=[], devices_by_pid={})
cpu.init()

measurements = cpu._get_measurements()
self.assertEqual(measurements, [1000000, 1000000])
self.assertEqual(measurements, [1000000, 99999])
self.assertEqual(cpu._rapl_devices, ["intel-rapl:0", "intel-rapl:0:2"]) # Only package and dram zones are considered, the rest are included in package
self.assertEqual(cpu._devices, ["cpu:0", "dram:0"])

@patch("os.listdir")
@patch("builtins.open", new_callable=mock_open, read_data="cpu")
Expand All @@ -89,14 +94,25 @@ def test__convert_rapl_name(self, mock_file, mock_listdir):
cpu = IntelCPU(pids=[], devices_by_pid={})
cpu.init()

self.assertEqual(cpu._convert_rapl_name("intel-rapl:0", re.compile("intel-rapl:.")), "cpu:0")
self.assertEqual(cpu._convert_rapl_name("intel-rapl:0", "package-0", re.compile(r"intel-rapl:(\d)(:\d)?")), "cpu:0")

@patch("os.listdir")
@patch("builtins.open", new_callable=mock_open, read_data="cpu")
def test__convert_rapl_name_dram(self, mock_file, mock_listdir):
    """A zone named "dram" keeps its name and takes the index from the path."""
    mock_listdir.return_value = ["intel-rapl:0", "intel-rapl:1"]
    zone_pattern = re.compile(r"intel-rapl:(\d)(:\d)?")

    handler = IntelCPU(pids=[], devices_by_pid={})
    handler.init()

    label = handler._convert_rapl_name("intel-rapl:1", "dram", zone_pattern)
    self.assertEqual(label, "dram:1")

@patch("os.path.exists")
@patch("os.listdir")
@patch("builtins.open", new_callable=mock_open, read_data="cpu")
def test_init(self, mock_file, mock_listdir, mock_exists):
mock_exists.return_value = True
mock_listdir.return_value = ["intel-rapl:0", "intel-rapl:1"]
mock_file.return_value.read.side_effect = ["package-0", "package-1"]

cpu = IntelCPU(pids=[], devices_by_pid={})
cpu.init()
Expand Down
Loading

0 comments on commit c586de4

Please sign in to comment.