Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix DRAM measurements and improve logging #96

Merged
merged 10 commits
Feb 6, 2025
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:

jobs:
test:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: [3.7, 3.8, 3.9, '3.10', '3.11', '3.12']
Expand All @@ -30,7 +30,7 @@ jobs:

lint:
needs: test
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: [3.7, 3.8, 3.9, '3.10']
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ on:

jobs:
test:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: ['3.7', '3.8','3.9', '3.10', '3.11', '3.12']
python-version: ['3.7.17', '3.8.18','3.9.21', '3.10.16', '3.11.11', '3.12.9'] # Latest version available for each. Ref: https://raw.githubusercontent.com/actions/python-versions/main/versions-manifest.json

steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
24 changes: 14 additions & 10 deletions carbontracker/components/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
)
from carbontracker.components.handler import Handler
from typing import Iterable, List, Union, Type, Sized
from carbontracker.loggerutil import Logger
import os

COMPONENTS = [
{
Expand Down Expand Up @@ -43,7 +45,7 @@ def handlers_by_name(name) -> List[Type[Handler]]:


class Component:
def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool):
def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool, logger: Logger):
self.name = name
if name not in component_names():
raise exceptions.ComponentNameError(
Expand All @@ -54,6 +56,7 @@ def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool):
)
self.power_usages: List[List[float]] = []
self.cur_epoch: int = -1 # Sentry
self.logger = logger

@property
def handler(self) -> Handler:
Expand Down Expand Up @@ -97,18 +100,19 @@ def collect_power_usage(self, epoch: int):
self.power_usages.append([])
try:
self.power_usages[-1] += self.handler.power_usage()
except exceptions.IntelRaplPermissionError:
except exceptions.IntelRaplPermissionError as e:
energy_paths = " and ".join(e.file_names)
commands = ["sudo chmod +r " + energy_path for energy_path in e.file_names]
# Only raise error if no measurements have been collected.
if not self.power_usages[-1]:
print(
"No sudo access to read Intel's RAPL measurements from the energy_uj file."
"\nSee issue: https://github.com/lfwa/carbontracker/issues/40"
)
self.logger.err_critical(
r"Could not read CPU/DRAM energy consumption due to lack of read-permissions.\n\tPlease run the following command(s): \n\t\t" + r"\n\t\t".join(commands)
)
# Append zero measurement to avoid further errors.
self.power_usages.append([0])
except exceptions.GPUPowerUsageRetrievalError:
if not self.power_usages[-1]:
print(
self.logger.err_critical(
"GPU model does not support retrieval of power usages in NVML."
"\nSee issue: https://github.com/lfwa/carbontracker/issues/36"
)
Expand Down Expand Up @@ -154,16 +158,16 @@ def shutdown(self):


def create_components(
components: str, pids: Iterable[int], devices_by_pid: bool
components: str, pids: Iterable[int], devices_by_pid: bool, logger: Logger
) -> List[Component]:
components = components.strip().replace(" ", "").lower()
if components == "all":
return [
Component(name=comp_name, pids=pids, devices_by_pid=devices_by_pid)
Component(name=comp_name, pids=pids, devices_by_pid=devices_by_pid, logger=logger)
for comp_name in component_names()
]
else:
return [
Component(name=comp_name, pids=pids, devices_by_pid=devices_by_pid)
Component(name=comp_name, pids=pids, devices_by_pid=devices_by_pid, logger=logger)
for comp_name in components.split(",")
]
20 changes: 12 additions & 8 deletions carbontracker/components/cpu/intel.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,15 @@ def _read_energy(self, path: str) -> int:

def _get_measurements(self):
measurements = []
permission_errors = []
for package in self._rapl_devices:
try:
power_usage = self._read_energy(os.path.join(RAPL_DIR, package))
measurements.append(power_usage)
# If there is no sudo access, we cannot read the energy_uj file.
# Permission denied error is raised.
except PermissionError:
raise exceptions.IntelRaplPermissionError()
permission_errors += [os.path.join(RAPL_DIR, package, "energy_uj")]

except FileNotFoundError:
# check cpu/gpu/dram
Expand All @@ -79,12 +80,15 @@ def _get_measurements(self):
)

measurements.append(total_power_usage)

if permission_errors:
raise exceptions.IntelRaplPermissionError(permission_errors)
return measurements

def _convert_rapl_name(self, name, pattern) -> Union[None, str]:
if re.match(pattern, name):
return "cpu:" + name[-1]
def _convert_rapl_name(self, package, name, pattern) -> Union[None, str]:
match = re.match(pattern, package)
name = name if "package" not in name else "cpu"
if match:
return name + ":" + match.group(1)

def init(self):
# Get amount of intel-rapl folders
Expand All @@ -93,15 +97,15 @@ def init(self):
self._devices: List[str] = []
self._rapl_devices: List[str] = []
self.parts_pattern = re.compile(r"intel-rapl:(\d):(\d)")
devices_pattern = re.compile("intel-rapl:.")
devices_pattern = re.compile(r"intel-rapl:(\d)(:\d)?")

for package in packages:
if re.fullmatch(devices_pattern, package):
with open(os.path.join(RAPL_DIR, package, "name"), "r") as f:
name = f.read().strip()
if name != "psys":
if name != "psys" and ("package" in name or "dram" in name):
self._rapl_devices.append(package)
rapl_name = self._convert_rapl_name(package, devices_pattern)
rapl_name = self._convert_rapl_name(package, name, devices_pattern)
if rapl_name is not None:
self._devices.append(rapl_name)

Expand Down
5 changes: 4 additions & 1 deletion carbontracker/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List

class NoComponentsAvailableError(Exception):
def __init__(
self,
Expand All @@ -23,7 +25,8 @@ def __init__(self, expected_unit, received_unit, message):
class IntelRaplPermissionError(Exception):
"""Raised when an Intel RAPL permission error occurs."""

pass
def __init__(self, file_names: List[str]):
self.file_names = file_names


class GPUPowerUsageRetrievalError(Exception):
Expand Down
2 changes: 1 addition & 1 deletion carbontracker/tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def __init__(
self.tracker = CarbonTrackerThread(
delete=self._delete,
components=component.create_components(
components=components, pids=pids, devices_by_pid=devices_by_pid
components=components, pids=pids, devices_by_pid=devices_by_pid, logger=self.logger
),
logger=self.logger,
ignore_errors=ignore_errors,
Expand Down
28 changes: 22 additions & 6 deletions tests/components/test_intel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_available(self, mock_listdir, mock_exists):
mock_exists.return_value = True
mock_listdir.return_value = ["some_directory"]

component = Component(name='cpu', pids=[], devices_by_pid={})
component = Component(name='cpu', pids=[], devices_by_pid={}, logger=None)
self.assertTrue(component.available())

@patch("os.path.exists")
Expand All @@ -21,6 +21,7 @@ def test_available(self, mock_listdir, mock_exists):
def test_devices(self, mock_file, mock_listdir, mock_exists):
mock_exists.return_value = True
mock_listdir.side_effect = [["intel-rapl:0", "intel-rapl:1"], ["name"], ["name"]]
mock_file.return_value.read.side_effect = ["package-0", "package-1"]

cpu = IntelCPU(pids=[], devices_by_pid={})
cpu.init()
Expand All @@ -34,7 +35,7 @@ def test_available_false(self, mock_available, mock_listdir, mock_exists):
mock_exists.return_value = False
mock_listdir.return_value = []

cpu = Component(name='cpu', pids=[], devices_by_pid={})
cpu = Component(name='cpu', pids=[], devices_by_pid={}, logger=None)
self.assertFalse(cpu.available())

@patch("time.sleep")
Expand Down Expand Up @@ -72,14 +73,18 @@ def test__read_energy(self, mock_file):
@patch("builtins.open", new_callable=mock_open)
def test__get_measurements(self, mock_file, mock_listdir, mock_exists):
mock_exists.return_value = True
mock_listdir.return_value = ["intel-rapl:0", "intel-rapl:1"]
mock_file.return_value.read.return_value = "1000000"
# Simulate true RAPL zone hierarchy
mock_listdir.return_value = ["intel-rapl:0", "intel-rapl:0:0", "intel-rapl:0:1", "intel-rapl:0:2","intel-rapl:1"]
#mock_file.return_value.read.return_value = "1000000"
mock_file.return_value.read.side_effect = ["package-0", "cores", "uncores", "dram", "psys", "1000000", "99999", "88", "88", "88"]

cpu = IntelCPU(pids=[], devices_by_pid={})
cpu.init()

measurements = cpu._get_measurements()
self.assertEqual(measurements, [1000000, 1000000])
self.assertEqual(measurements, [1000000, 99999])
self.assertEqual(cpu._rapl_devices, ["intel-rapl:0", "intel-rapl:0:2"]) # Only package and dram zones are considered, the rest are included in package
self.assertEqual(cpu._devices, ["cpu:0", "dram:0"])

@patch("os.listdir")
@patch("builtins.open", new_callable=mock_open, read_data="cpu")
Expand All @@ -89,14 +94,25 @@ def test__convert_rapl_name(self, mock_file, mock_listdir):
cpu = IntelCPU(pids=[], devices_by_pid={})
cpu.init()

self.assertEqual(cpu._convert_rapl_name("intel-rapl:0", re.compile("intel-rapl:.")), "cpu:0")
self.assertEqual(cpu._convert_rapl_name("intel-rapl:0", "package-0", re.compile(r"intel-rapl:(\d)(:\d)?")), "cpu:0")

@patch("os.listdir")
@patch("builtins.open", new_callable=mock_open, read_data="cpu")
def test__convert_rapl_name_dram(self, mock_file, mock_listdir):
mock_listdir.return_value = ["intel-rapl:0", "intel-rapl:1"]

cpu = IntelCPU(pids=[], devices_by_pid={})
cpu.init()

self.assertEqual(cpu._convert_rapl_name("intel-rapl:1", "dram", re.compile(r"intel-rapl:(\d)(:\d)?")), "dram:1")

@patch("os.path.exists")
@patch("os.listdir")
@patch("builtins.open", new_callable=mock_open, read_data="cpu")
def test_init(self, mock_file, mock_listdir, mock_exists):
mock_exists.return_value = True
mock_listdir.return_value = ["intel-rapl:0", "intel-rapl:1"]
mock_file.return_value.read.side_effect = ["package-0", "package-1"]

cpu = IntelCPU(pids=[], devices_by_pid={})
cpu.init()
Expand Down
Loading