Move analytics scripts from builder #6111

Merged · 1 commit · Dec 24, 2024
125 changes: 125 additions & 0 deletions tools/analytics/cubinsizes.py
@@ -0,0 +1,125 @@
#!/usr/bin/env python3
# Tool for analyzing sizes of CUDA kernels for various GPU architectures
import os
import struct
import subprocess
import sys
from tempfile import TemporaryDirectory
from typing import Dict


# Try to auto-import elftools
try:
    from elftools.elf.elffile import ELFFile
except ModuleNotFoundError:
    print('elftools module not found, trying to install it from pip')
    from pip._internal import main as pip_main
    try:
        pip_main(["install", "pyelftools", "--user"])
    except SystemExit:
        print(f'PIP installation failed, please install it manually by invoking "{sys.executable} -mpip install pyelftools --user"')
        sys.exit(-1)
    from elftools.elf.elffile import ELFFile


# From https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


def compute_cubin_sizes(file_name, section_name='.nv_fatbin', debug=False):
    with open(file_name, 'rb') as f:
        elf_file = ELFFile(f)
        nv_fatbin = elf_file.get_section_by_name(section_name)
        if nv_fatbin is None:
            return {}
        data = nv_fatbin.data()
        idx, offs = 0, 0
        elf_sizes = {}
        # The section is a sequence of fatbin containers, each holding one or
        # more PTX/cubin entries; accumulate per-architecture sizes across all
        while offs < len(data):
            (magic, version, header_size, fatbin_size) = struct.unpack('IHHL', data[offs: offs + 16])
            if magic != 0xba55ed50 or version != 1:
                raise RuntimeError(f"Unexpected fatbin magic {hex(magic)} or version {version}")
            if debug:
                print(f"Found fatbin at {offs} header_size={header_size} fatbin_size={fatbin_size}")
            offs += header_size
            fatbin_end = offs + fatbin_size
            while offs < fatbin_end:
                (kind, version, hdr_size, elf_size, empty, code_ver, sm_ver) = struct.unpack('HHILLIH', data[offs: offs + 30])
                if version != 0x0101 or kind not in [1, 2]:
                    raise RuntimeError(f"Unexpected cubin version {hex(version)} or kind {kind}")
                # kind 1 is PTX, kind 2 is SASS compiled for a specific SM arch
                sm_ver = f'{"ptx" if kind == 1 else "sm"}_{sm_ver}'
                if debug:
                    print(f"  {idx}: elf_size={elf_size} code_ver={hex(code_ver)} sm={sm_ver}")
                if sm_ver not in elf_sizes:
                    elf_sizes[sm_ver] = 0
                elf_sizes[sm_ver] += elf_size
                idx, offs = idx + 1, offs + hdr_size + elf_size
            offs = fatbin_end
        return elf_sizes
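The two `struct.unpack` calls above assume the native-endian fatbin layout: a 16-byte container header followed by 30-byte per-entry headers. A quick sanity check of those sizes on a typical 64-bit Linux build (where `L` is 8 bytes):

    import struct
    # fatbin container header: magic (I), version (H), header_size (H), fatbin_size (L)
    assert struct.calcsize('IHHL') == 16
    # per-entry header: kind, version, hdr_size, elf_size, empty, code_ver, sm_ver
    assert struct.calcsize('HHILLIH') == 30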


class ArFileCtx:
    def __init__(self, ar_name: str) -> None:
        self.ar_name = os.path.abspath(ar_name)
        self._tmpdir = TemporaryDirectory()

    def __enter__(self) -> str:
        self._pwd = os.getcwd()
        rc = self._tmpdir.__enter__()
        os.chdir(rc)  # extract into the temporary directory, not the caller's cwd
        subprocess.check_call(['ar', 'x', self.ar_name])
        return rc

    def __exit__(self, ex, value, tb) -> None:
        os.chdir(self._pwd)
        return self._tmpdir.__exit__(ex, value, tb)


def dict_add(rc: Dict[str, int], b: Dict[str, int]) -> Dict[str, int]:
    for key, val in b.items():
        rc[key] = rc.get(key, 0) + val
    return rc


def main():
    if sys.platform != 'linux':
        print('This script only works with Linux ELF files')
        return
    if len(sys.argv) < 2:
        print(f"{sys.argv[0]} invoked without any arguments, trying to infer location of libtorch_cuda")
        import torch
        fname = os.path.join(os.path.dirname(torch.__file__), 'lib', 'libtorch_cuda.so')
    else:
        fname = sys.argv[1]

    if not os.path.exists(fname):
        print(f"Can't find {fname}")
        sys.exit(-1)

    section_names = ['.nv_fatbin', '__nv_relfatbin']
    results = {name: {} for name in section_names}
    print(f"Analyzing {fname}")
    if os.path.splitext(fname)[1] == '.a':
        # Static archive: extract the object files and sum sizes over all of them
        with ArFileCtx(fname):
            for fname in os.listdir("."):
                if not fname.endswith(".o"):
                    continue
                for section_name in section_names:
                    elf_sizes = compute_cubin_sizes(fname, section_name)
                    dict_add(results[section_name], elf_sizes)
    else:
        for section_name in section_names:
            dict_add(results[section_name], compute_cubin_sizes(fname, section_name))

    for section_name in section_names:
        elf_sizes = results[section_name]
        print(f"{section_name} size {sizeof_fmt(sum(elf_sizes.values()))}")
        for (sm_ver, total_size) in elf_sizes.items():
            print(f"  {sm_ver}: {sizeof_fmt(total_size)}")


if __name__ == '__main__':
    main()
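A minimal sketch of driving the tool from Python instead of the command line (assuming the script is importable as a module; the library path is illustrative):

    from cubinsizes import compute_cubin_sizes, sizeof_fmt

    sizes = compute_cubin_sizes('torch/lib/libtorch_cuda.so')  # hypothetical path
    for sm_ver, total in sizes.items():
        print(sm_ver, sizeof_fmt(total))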
163 changes: 163 additions & 0 deletions tools/analytics/download_count_wheels.py
@@ -0,0 +1,163 @@
from collections import defaultdict
from datetime import datetime, timedelta, timezone
import gzip
import os
import re
import urllib.parse

from tqdm import tqdm
import boto3

S3 = boto3.resource('s3')
CLIENT = boto3.client('s3')
BUCKET = S3.Bucket('pytorch')

class CacheEntry:
    _size = None

    def __init__(self, download_uri: str):
        self.download_uri = download_uri
        self.bytes_sent = 0

    @property
    def os_type(self) -> str:
        os_type = "linux"
        if "win" in self.download_uri:
            os_type = "windows"
        elif "macosx" in self.download_uri:
            os_type = "macos"
        return os_type

    @property
    def target_arch(self) -> str:
        target_arch = "cpu"
        result = re.search(r"cu[0-9]+", self.download_uri)
        if result:
            target_arch = result[0]
        return target_arch

    @property
    def package_name(self) -> str:
        filename_contents = os.path.basename(self.download_uri).split('-')
        return filename_contents[0]

    @property
    def package_version(self) -> str:
        if "dev" in self.download_uri:
            results = re.search(
                r"[0-9]+\.[0-9]+\.[0-9]+\.dev[0-9]+",
                self.download_uri
            )
        else:
            results = re.search(
                r"[0-9]+\.[0-9]+\.[0-9]+", self.download_uri
            )
        if not results:
            raise Exception(f"Unable to parse version from {self.download_uri}")
        return results[0]

    @property
    def size(self) -> int:
        if self._size is None:
            for key in BUCKET.objects.filter(
                Prefix=self.download_uri.lstrip("/")
            ):
                self._size = key.size
            if self._size is None:
                raise Exception(
                    f"No object found for prefix {self.download_uri}"
                )
        return self._size

    @property
    def downloads(self):
        return self.bytes_sent // self.size
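Download counts are estimated rather than measured directly: total bytes served for a URI are divided by the artifact's size, so partial and resumed downloads average out. A worked example with hypothetical numbers:

    size = 750 * 1024 ** 2       # a 750 MiB wheel (hypothetical)
    bytes_sent = 40 * 1024 ** 3  # ~40 GiB served for that URI
    print(bytes_sent // size)    # 54 estimated full downloads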

def parse_logs(log_directory: str) -> dict:
    bytes_cache = {}
    for (dirpath, _, filenames) in os.walk(log_directory):
        for filename in tqdm(filenames):
            with gzip.open(os.path.join(dirpath, filename), 'r') as gf:
                string = gf.read().decode("utf-8")
            # Skip the two CloudFront header lines (#Version and #Fields)
            entries = string.splitlines()[2:]
            for entry in entries:
                columns = entry.split('\t')
                bytes_sent = int(columns[3])
                download_uri = urllib.parse.unquote(
                    urllib.parse.unquote(columns[7])
                )
                status = columns[8]
                # Only count successful (2xx) responses for wheel/zip artifacts
                if not all([
                    status.startswith("2"),
                    download_uri.endswith((".whl", ".zip"))
                ]):
                    continue
                if not bytes_cache.get(download_uri):
                    bytes_cache[download_uri] = CacheEntry(download_uri)
                bytes_cache[download_uri].bytes_sent += bytes_sent
    return bytes_cache
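The hard-coded column indices follow the standard CloudFront access-log field order (date, time, edge location, sc-bytes, client IP, method, host, cs-uri-stem, sc-status, ...), so column 3 is bytes sent, 7 the request URI, and 8 the HTTP status. A synthetic log entry illustrating the assumption:

    entry = "2024-12-23\t01:23:45\tIAD89-C1\t104857600\t203.0.113.5\tGET\tdownload.pytorch.org\t/whl/torch-2.1.0-cp310-linux_x86_64.whl\t200"
    columns = entry.split('\t')
    assert int(columns[3]) == 104857600  # bytes sent
    assert columns[7].endswith(".whl")   # download URI
    assert columns[8].startswith("2")    # 2xx status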

def output_results(bytes_cache: dict) -> None:
    os_results = defaultdict(int)
    arch_results = defaultdict(int)
    package_results = defaultdict(lambda: defaultdict(int))
    for _, val in tqdm(bytes_cache.items()):
        try:
            os_results[val.os_type] += val.downloads
            arch_results[val.target_arch] += val.downloads
            package_results[val.package_name][val.package_version] += (
                val.downloads
            )
        except Exception:
            pass
    print("=-=-= Results =-=-=")
    print("=-=-= OS =-=-=")
    total_os_num = sum(os_results.values())
    for os_type, num in os_results.items():
        print(
            f"\t* {os_type}: {num} ({(num/total_os_num)*100:.2f}%)"
        )

    print("=-=-= ARCH =-=-=")
    total_arch_num = sum(arch_results.values())
    for arch_type, num in arch_results.items():
        print(
            f"\t* {arch_type}: {num} ({(num/total_arch_num) * 100:.2f}%)"
        )

    print("=-=-= By Package =-=-=")
    for package_name, upper_val in package_results.items():
        print(f"=-=-= {package_name} =-=-=")
        total_package_num = sum(upper_val.values())
        for package_version, num in upper_val.items():
            print(
                f"\t* {package_version}: {num} ({(num/total_package_num) * 100:.2f}%)"
            )

def download_logs(log_directory: str, since: float):
    # NB: `since` is currently unused; the window is always the previous UTC day
    dt_now = datetime.now(timezone.utc)
    dt_end = datetime(dt_now.year, dt_now.month, dt_now.day, tzinfo=timezone.utc)
    dt_start = dt_end - timedelta(days=1, hours=1)  # 1 hour of padding to account for potentially missed logs due to timing
    for key in tqdm(BUCKET.objects.filter(Prefix='cflogs')):
        remote_fname = key.key
        local_fname = os.path.join(log_directory, remote_fname)
        # Only download things from yesterday
        dt_modified = key.last_modified.replace(tzinfo=timezone.utc)
        if dt_start >= dt_modified or dt_end < dt_modified:
            continue
        # TODO: Do this in parallel
        if not os.path.exists(local_fname):
            dirname = os.path.dirname(local_fname)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            CLIENT.download_file("pytorch", remote_fname, local_fname)


if __name__ == "__main__":
    print("Downloading logs")
    download_logs('cache', 1)
    print("Parsing logs")
    cache = parse_logs('cache/cflogs/')
    print("Calculating results")
    output_results(cache)
99 changes: 99 additions & 0 deletions tools/analytics/duplicates_analyze.py
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
from typing import Dict, List
from subprocess import check_output
import os
import sys


def get_defined_symbols(fname: str, verbose: bool = False) -> Dict[str, int]:
    if verbose:
        print(f"Processing {fname}...", end='', flush=True)
    if sys.platform == 'darwin':
        # macOS nm has no --print-size; estimate each symbol's size as the
        # distance to the next symbol's address (4 bytes for the last one)
        lines = check_output(['nm', '--defined-only', '-n', fname]).decode('ascii').split("\n")[:-1]
        rc = {}
        for idx, line in enumerate(lines):
            addr, stype, name = line.split(' ')
            size = 4 if idx + 1 == len(lines) else (int(lines[idx + 1].split(' ')[0], 16) - int(addr, 16))
            rc[name] = size
    else:
        lines = check_output(['nm', '--print-size', '--defined-only', fname]).decode('ascii').split('\n')
        rc = {e[3]: int(e[1], 16) for e in [line.split() for line in lines] if len(e) == 4}
    if verbose:
        print("done")
    return rc
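On Linux the dict comprehension relies on `nm --print-size` emitting four whitespace-separated fields per symbol (address, size, type, name); lines with fewer fields are skipped. For example:

    line = "0000000000401000 000000000000002d T main"
    e = line.split()
    assert (e[3], int(e[1], 16)) == ("main", 45)  # symbol name -> size in bytes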


def get_deps(fname: str) -> List[str]:
    if sys.platform == 'darwin':
        rc = []
        lines = check_output(['otool', '-l', fname]).decode('ascii').split("\n")[1:-1]
        for idx, line in enumerate(lines):
            if line.strip() != 'cmd LC_LOAD_DYLIB':
                continue
            path = lines[idx + 2].strip()
            assert path.startswith('name')
            rc.append(os.path.basename(path.split(' ')[1]))
        return rc
    lines = check_output(['readelf', '--dynamic', fname]).decode('ascii').split('\n')
    return [line.split('[')[1][:-1] for line in lines if '(NEEDED)' in line]


def humansize(size):
    if size < 1024:
        return f"{size} bytes"
    if size < 1024**2:
        return f"{int(size/1024)} Kb"
    if size < 1024**3:
        return f"{size/(1024.0**2):.2f} Mb"
    return f"{size/(1024.0**3):.2f} Gb"


def print_sizes(libname, depth: int = 2) -> None:
    libs = [libname]
    symbols = {os.path.basename(libname): get_defined_symbols(libname, verbose=True)}
    # Transitively collect dependencies that live next to the library, up to `depth` levels
    for _ in range(depth):
        for lib in libs:
            dirname = os.path.dirname(lib)
            for dep in get_deps(lib):
                path = os.path.join(dirname, dep)
                if not os.path.exists(path):
                    continue
                if path not in libs:
                    libs.append(path)
                    symbols[dep] = get_defined_symbols(path, verbose=True)

    for lib in libs:
        lib_symbols = symbols[os.path.basename(lib)]
        lib_keys = set(lib_symbols.keys())
        rc = f"{lib} symbols size {humansize(sum(lib_symbols.values()))}"
        for dep in get_deps(lib):
            if dep not in symbols:
                continue
            dep_overlap = lib_keys.intersection(set(symbols[dep].keys()))
            overlap_size = sum(lib_symbols[k] for k in dep_overlap)
            if overlap_size > 0:
                rc += f" {dep} overlap is {humansize(overlap_size)}"
        print(rc)


def print_symbols_overlap(libname1: str, libname2: str) -> None:
    sym1 = get_defined_symbols(libname1, verbose=True)
    sym2 = get_defined_symbols(libname2, verbose=True)
    sym1_size = sum(sym1.values())
    sym2_size = sum(sym2.values())
    sym_overlap = set(sym1.keys()).intersection(set(sym2.keys()))
    overlap_size = sum(sym1[s] for s in sym_overlap)
    if overlap_size == 0:
        print(f"{libname1} symbols size {humansize(sym1_size)} does not overlap with {libname2}")
        return
    print(f"{libname1} symbols size {humansize(sym1_size)} overlap {humansize(overlap_size)} ({100.0 * overlap_size/sym1_size :.2f}%)")
    for sym in sym_overlap:
        print(sym)


if __name__ == '__main__':
    if len(sys.argv) == 3:
        print_symbols_overlap(sys.argv[1], sys.argv[2])
    else:
        print_sizes(sys.argv[1] if len(sys.argv) > 1 else "lib/libtorch_cuda.so")
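A sketch of the two modes from Python (assuming the script is importable; paths are illustrative): one library prints per-library symbol sizes plus overlap with each dependency, two libraries print the symbols they define in common:

    from duplicates_analyze import print_sizes, print_symbols_overlap

    print_sizes("torch/lib/libtorch_cuda.so", depth=2)  # sizes + dependency overlap
    print_symbols_overlap("torch/lib/libtorch_cpu.so",  # symbols defined in both
                          "torch/lib/libtorch_cuda.so")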