diff --git a/tools/analytics/cubinsizes.py b/tools/analytics/cubinsizes.py new file mode 100755 index 0000000000..33875057a7 --- /dev/null +++ b/tools/analytics/cubinsizes.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +# Tool for analyzing sizes of CUDA kernels for various GPU architectures +import os +import struct +import subprocess +import sys +from tempfile import TemporaryDirectory +from typing import Dict + + +# Try to auto-import elftools +try: + from elftools.elf.elffile import ELFFile +except ModuleNotFoundError: + print(f'elftools module not found, trying to install it from pip') + from pip._internal import main as pip_main + try: + pip_main(["install", "pyelftools", "--user"]) + except SystemExit: + print(f'PIP installation failed, please install it manually by invoking "{sys.executable} -mpip install pyelftools --user"') + sys.exit(-1) + from elftools.elf.elffile import ELFFile + + +# From https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size +def sizeof_fmt(num, suffix='B'): + for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: + if abs(num) < 1024.0: + return "%3.1f%s%s" % (num, unit, suffix) + num /= 1024.0 + return "%.1f%s%s" % (num, 'Yi', suffix) + + +def compute_cubin_sizes(file_name, section_name='.nv_fatbin', debug=False): + with open(file_name, 'rb') as f: + elf_file = ELFFile(f) + nv_fatbin = elf_file.get_section_by_name(section_name) + if nv_fatbin is None: + return {} + data = nv_fatbin.data() + idx, offs = 0, 0 + elf_sizes = {} + while offs < len(data): + (magic, version, header_size, fatbin_size) = struct.unpack('IHHL', data[offs: offs + 16]) + if magic != 0xba55ed50 or version != 1: + raise RuntimeError(f"Unexpected fatbin magic {hex(magic)} or version {version}") + if debug: + print(f"Found fatbin at {offs} header_size={header_size} fatbin_size={fatbin_size}") + offs += header_size + fatbin_end = offs + fatbin_size + while offs < fatbin_end: + (kind, version, hdr_size, elf_size, empty, code_ver, sm_ver) = struct.unpack('HHILLIH', data[offs: offs + 30]) + if version != 0x0101 or kind not in [1, 2]: + raise RuntimeError(f"Unexpected cubin version {hex(version)} or kind {kind}") + sm_ver = f'{"ptx" if kind == 1 else "sm"}_{sm_ver}' + if debug: + print(f" {idx}: elf_size={elf_size} code_ver={hex(code_ver)} sm={sm_ver}") + if sm_ver not in elf_sizes: + elf_sizes[sm_ver] = 0 + elf_sizes[sm_ver] += elf_size + idx, offs = idx + 1, offs + hdr_size + elf_size + offs = fatbin_end + return elf_sizes + + +class ArFileCtx: + def __init__(self, ar_name: str) -> None: + self.ar_name = os.path.abspath(ar_name) + self._tmpdir = TemporaryDirectory() + + def __enter__(self) -> str: + self._pwd = os.getcwd() + rc = self._tmpdir.__enter__() + subprocess.check_call(['ar', 'x', self.ar_name]) + return rc + + def __exit__(self, ex, value, tb) -> None: + os.chdir(self._pwd) + return self._tmpdir.__exit__(ex, value, tb) + + +def dict_add(rc: Dict[str, int], b: Dict[str, int]) -> Dict[str, int]: + for key, val in b.items(): + rc[key] = (rc[key] if key in rc else 0) + val + return rc + + +def main(): + if sys.platform != 'linux': + print('This script only works with Linux ELF files') + return + if len(sys.argv) < 2: + print(f"{sys.argv[0]} invoked without any arguments trying to infer location of libtorch_cuda") + import torch + fname = os.path.join(os.path.dirname(torch.__file__), 'lib', 'libtorch_cuda.so') + else: + fname = sys.argv[1] + + if not os.path.exists(fname): + print(f"Can't find {fname}") + sys.exit(-1) + + section_names 
= ['.nv_fatbin', '__nv_relfatbin'] + results = {name: {} for name in section_names} + print(f"Analyzing {fname}") + if os.path.splitext(fname)[1] == '.a': + with ArFileCtx(fname): + for fname in os.listdir("."): + if not fname.endswith(".o"): continue + for section_name in section_names: + elf_sizes = compute_cubin_sizes(fname, section_name) + dict_add(results[section_name], elf_sizes) + else: + for section_name in ['.nv_fatbin', '__nv_relfatbin']: + dict_add(results[section_name], compute_cubin_sizes(fname, section_name)) + + for section_name in section_names: + elf_sizes = results[section_name] + print(f"{section_name} size {sizeof_fmt(sum(elf_sizes.values()))}") + for (sm_ver, total_size) in elf_sizes.items(): + print(f" {sm_ver}: {sizeof_fmt(total_size)}") + + +if __name__ == '__main__': + main() diff --git a/tools/analytics/download_count_wheels.py b/tools/analytics/download_count_wheels.py new file mode 100644 index 0000000000..d3562fb16c --- /dev/null +++ b/tools/analytics/download_count_wheels.py @@ -0,0 +1,163 @@ +from collections import defaultdict +from datetime import datetime, timedelta, timezone +import gzip +import os +import re +import urllib + +from tqdm import tqdm +import boto3 + +S3 = boto3.resource('s3') +CLIENT = boto3.client('s3') +BUCKET = S3.Bucket('pytorch') + +class CacheEntry: + _size = None + + def __init__(self, download_uri: str): + self.download_uri = download_uri + self.bytes_sent = 0 + + @property + def os_type(self) -> str: + os_type = "linux" + if "win" in self.download_uri: + os_type = "windows" + elif "macosx" in self.download_uri: + os_type = "macos" + return os_type + + @property + def target_arch(self) -> str: + target_arch = "cpu" + result = re.search(r"cu[0-9]+", self.download_uri) + if result: + target_arch = result[0] + return target_arch + + @property + def package_name(self) -> str: + filename_contents = os.path.basename(self.download_uri).split('-') + return filename_contents[0] + + @property + def package_version(self) -> str: + if "dev" in self.download_uri: + results = re.search( + r"[0-9]+\.[0-9]+\.[0-9]+\.dev[0-9]+", + self.download_uri + ) + else: + results = re.search( + r"[0-9]+\.[0-9]+\.[0-9]+", self.download_uri + ) + if not results: + raise Exception("Wtf there's no version o.O") + return results[0] + + @property + def size(self) -> int: + if self._size is None: + for key in BUCKET.objects.filter( + Prefix=self.download_uri.lstrip("/") + ): + self._size = key.size + if self._size is None: + raise Exception( + f"No object found for prefix {self.download_uri}" + ) + return self._size + + @property + def downloads(self): + return self.bytes_sent // self.size + +def parse_logs(log_directory: str) -> dict: + bytes_cache = {} + for (dirpath, _, filenames) in os.walk(log_directory): + for filename in tqdm(filenames): + with gzip.open(os.path.join(dirpath, filename), 'r') as gf: + string = gf.read().decode("utf-8") + entries = [] + entries += string.splitlines()[2:] + for entry in entries: + columns = entry.split('\t') + bytes_sent = int(columns[3]) + download_uri = urllib.parse.unquote( + urllib.parse.unquote(columns[7]) + ) + status = columns[8] + if not all([ + status.startswith("2"), + download_uri.endswith((".whl", ".zip")) + ]): + continue + if not bytes_cache.get(download_uri): + bytes_cache[download_uri] = CacheEntry(download_uri) + bytes_cache[download_uri].bytes_sent += bytes_sent + return bytes_cache + +def output_results(bytes_cache: dict) -> None: + os_results = defaultdict(int) + arch_results = defaultdict(int) + 
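+    # package_results is a two-level mapping: package name -> package version ->
+    # estimated download count (bytes_sent // object size), so the per-version
+    # percentages printed below are relative to each package's own total.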
package_results = defaultdict(lambda: defaultdict(int)) + for _, val in tqdm(bytes_cache.items()): + try: + os_results[val.os_type] += val.downloads + arch_results[val.target_arch] += val.downloads + package_results[val.package_name][val.package_version] += ( + val.downloads + ) + except Exception: + pass + print("=-=-= Results =-=-=") + print("=-=-= OS =-=-=") + total_os_num = sum(os_results.values()) + for os_type, num in os_results.items(): + print( + f"\t* {os_type}: {num} ({(num/total_os_num)*100:.2f}%)" + ) + + print("=-=-= ARCH =-=-=") + total_arch_num = sum(arch_results.values()) + for arch_type, num in arch_results.items(): + print( + f"\t* {arch_type}: {num} ({(num/total_arch_num) * 100:.2f}%)" + ) + + print("=-=-= By Package =-=-=") + for package_name, upper_val in package_results.items(): + print(f"=-=-= {package_name} =-=-=") + total_package_num = sum(upper_val.values()) + for package_version, num in upper_val.items(): + print( + f"\t* {package_version}: {num} ({(num/total_package_num) * 100:.2f}%)" + ) + +def download_logs(log_directory: str, since: float): + dt_now = datetime.now(timezone.utc) + dt_end = datetime(dt_now.year, dt_now.month, dt_now.day, tzinfo=timezone.utc) + dt_start = dt_end - timedelta(days=1, hours=1) # Add 1 hour padding to account for potentially missed logs due to timing + for key in tqdm(BUCKET.objects.filter(Prefix='cflogs')): + remote_fname = key.key + local_fname = os.path.join(log_directory, remote_fname) + # Only download things from yesterday + dt_modified = key.last_modified.replace(tzinfo=timezone.utc) + if dt_start >= dt_modified or dt_end < dt_modified: + continue + # TODO: Do this in parallel + if not os.path.exists(local_fname): + dirname = os.path.dirname(local_fname) + if not os.path.exists(dirname): + os.makedirs(dirname) + CLIENT.download_file("pytorch", remote_fname, local_fname) + + +if __name__ == "__main__": + print("Downloading logs") + download_logs('cache', 1) + print("Parsing logs") + cache = parse_logs('cache/cflogs/') + print("Calculating results") + output_results(cache) diff --git a/tools/analytics/duplicates_analyze.py b/tools/analytics/duplicates_analyze.py new file mode 100755 index 0000000000..8fdc3af22b --- /dev/null +++ b/tools/analytics/duplicates_analyze.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +from typing import Dict, List +from subprocess import check_output +import os +import sys + + +def get_defined_symbols(fname: str, verbose: bool = False) -> Dict[str, int]: + if verbose: + print(f"Processing {fname}...", end='', flush=True) + if sys.platform == 'darwin': + lines = check_output(['nm', '--defined-only', '-n', fname]).decode('ascii').split("\n")[:-1] + rc = {} + for idx, line in enumerate(lines): + addr, stype, name = line.split(' ') + size = 4 if idx + 1 == len(lines) else (int(lines[idx + 1].split(' ')[0], 16) - int(addr, 16)) + rc[name] = size + else: + lines = check_output(['nm', '--print-size', '--defined-only', fname]).decode('ascii').split('\n') + rc = {e[3]: int(e[1], 16) for e in [line.split() for line in lines] if len(e) == 4} + if verbose: + print("done") + return rc + + +def get_deps(fname: str) -> List[str]: + if sys.platform == 'darwin': + rc = [] + lines = check_output(['otool', '-l', fname]).decode('ascii').split("\n")[1:-1] + for idx, line in enumerate(lines): + if line.strip() != 'cmd LC_LOAD_DYLIB': + continue + path = lines[idx + 2].strip() + assert path.startswith('name') + rc.append(os.path.basename(path.split(' ')[1])) + return rc + lines = check_output(['readelf', '--dynamic', 
fname]).decode('ascii').split('\n') + return [line.split('[')[1][:-1] for line in lines if '(NEEDED)' in line] + + +def humansize(size): + if size < 1024: + return f"{size} bytes" + if size < 1024**2: + return f"{int(size/1024)} Kb" + if size < 1024**3: + return f"{size/(1024.0**2):.2f} Mb" + return f"{size/(1024.0**3):.2f} Gb" + + +def print_sizes(libname, depth: int = 2) -> None: + libs = [libname] + depth = 2 + symbols = {os.path.basename(libname): get_defined_symbols(libname, verbose=True)} + for _ in range(depth): + for lib in libs: + dirname = os.path.dirname(lib) + for dep in get_deps(lib): + path = os.path.join(dirname, dep) + if not os.path.exists(path): + continue + if path not in libs: + libs.append(path) + symbols[dep] = get_defined_symbols(path, verbose=True) + + for lib in libs: + lib_symbols = symbols[os.path.basename(lib)] + lib_keys = set(lib_symbols.keys()) + rc = f"{lib} symbols size {humansize(sum(lib_symbols.values()))}" + for dep in get_deps(lib): + if dep not in symbols: + continue + dep_overlap = lib_keys.intersection(set(symbols[dep].keys())) + overlap_size = sum(lib_symbols[k] for k in dep_overlap) + if overlap_size > 0: + rc += f" {dep} overlap is {humansize(overlap_size)}" + print(rc) + + +def print_symbols_overlap(libname1: str, libname2: str) -> None: + sym1 = get_defined_symbols(libname1, verbose=True) + sym2 = get_defined_symbols(libname2, verbose=True) + sym1_size = sum(sym1.values()) + sym2_size = sum(sym2.values()) + sym_overlap = set(sym1.keys()).intersection(set(sym2.keys())) + overlap_size = sum(sym1[s] for s in sym_overlap) + if overlap_size == 0: + print(f"{libname1} symbols size {humansize(sym1_size)} does not overlap with {libname2}") + return + print(f"{libname1} symbols size {humansize(sym1_size)} overlap {humansize(overlap_size)} ({100.0 * overlap_size/sym1_size :.2f}%)") + for sym in sym_overlap: + print(sym) + + +if __name__ == '__main__': + if len(sys.argv) == 3: + print_symbols_overlap(sys.argv[1], sys.argv[2]) + else: + print_sizes(sys.argv[1] if len(sys.argv) > 1 else "lib/libtorch_cuda.so") diff --git a/tools/analytics/github_analyze.py b/tools/analytics/github_analyze.py new file mode 100755 index 0000000000..c6da4d8ca8 --- /dev/null +++ b/tools/analytics/github_analyze.py @@ -0,0 +1,525 @@ +#!/usr/bin/env python3 + +from datetime import datetime, timedelta +from typing import Any, Dict, List, Iterable, Optional, Union +from urllib.request import urlopen, Request +from urllib.error import HTTPError +import json +import enum +import os + +class IssueState(enum.Enum): + OPEN = "open" + CLOSED = "closed" + ALL = "all" + + def __str__(self): + return self.value + + +class GitCommit: + commit_hash: str + title: str + body: str + author: str + author_date: datetime + commit_date: Optional[datetime] + pr_url: str + + def __init__(self, + commit_hash: str, + author: str, + author_date: datetime, + title: str, + body: str, + pr_url : str, + commit_date: Optional[datetime] = None) -> None: + self.commit_hash = commit_hash + self.author = author + self.author_date = author_date + self.commit_date = commit_date + self.title = title + self.body = body + self.pr_url = pr_url + + def __contains__(self, item: Any) -> bool: + return item in self.body or item in self.title + + def is_issue_mentioned(self, issue_url: str) -> bool: + if issue_url in self: + return True + if "/pull/" in issue_url: + return False + issue_hash = f"#{issue_url.split('issues/')[1]}" + if "fixes" in self.title.lower() and issue_hash in self.title: + return True + return 
any("fixes" in line.lower() and issue_hash in line for line in self.body.split("\n")) + + +def get_revert_revision(commit: GitCommit) -> Optional[str]: + import re + body_rc = re.search("Original Phabricator Diff: (D\\d+)", commit.body) + + if commit.title.startswith("Back out \"") and body_rc is not None: + return body_rc.group(1) + + rc = re.match("Revert (D\\d+):", commit.title) + if rc is None: + return None + return rc.group(1) + + +def get_diff_revision(commit: GitCommit) -> Optional[str]: + import re + rc = re.search("\\s*Differential Revision: (D\\d+)", commit.body) + if rc is None: + return None + return rc.group(1) + + +def get_ghf_revert_revision(commit: GitCommit) -> Optional[str]: + import re + rc = re.search("\\s*This reverts commit ([0-9a-f]+).", commit.body) + if all([ + commit.title.startswith("Revert"), + commit.author == "PyTorch MergeBot ", + rc is not None + ]): + return rc.group(1) + return None + + +def is_revert(commit: GitCommit) -> bool: + return get_revert_revision(commit) is not None or get_ghf_revert_revision(commit) is not None + + +def parse_medium_format(lines: Union[str, List[str]]) -> GitCommit: + """ + Expect commit message generated using `--format=medium --date=unix` format, i.e.: + commit + Author: + Date: + + + + <full commit message> + + """ + if isinstance(lines, str): + lines = lines.split("\n") + # TODO: Handle merge commits correctly + if len(lines) > 1 and lines[1].startswith("Merge:"): + del lines[1] + assert len(lines) > 5 + assert lines[0].startswith("commit") + assert lines[1].startswith("Author: ") + assert lines[2].startswith("Date: ") + assert len(lines[3]) == 0 + return GitCommit(commit_hash=lines[0].split()[1].strip(), + author=lines[1].split(":", 1)[1].strip(), + author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())), + title=lines[4].strip(), + body="\n".join(lines[5:]), + ) + + +def parse_fuller_format(lines: Union[str, List[str]]) -> GitCommit: + """ + Expect commit message generated using `--format=fuller --date=unix` format, i.e.: + commit <sha1> + Author: <author> + AuthorDate: <author date> + Commit: <committer> + CommitDate: <committer date> + + <title line> + + <full commit message> + + """ + if isinstance(lines, str): + lines = lines.split("\n") + # TODO: Handle merge commits correctly + if len(lines) > 1 and lines[1].startswith("Merge:"): + del lines[1] + assert len(lines) > 7 + assert lines[0].startswith("commit") + assert lines[1].startswith("Author: ") + assert lines[2].startswith("AuthorDate: ") + assert lines[3].startswith("Commit: ") + assert lines[4].startswith("CommitDate: ") + assert len(lines[5]) == 0 + + prUrl = "" + for line in lines: + if "Pull Request resolved:" in line: + prUrl = line.split("Pull Request resolved:")[1].strip() + break + + return GitCommit(commit_hash=lines[0].split()[1].strip(), + author=lines[1].split(":", 1)[1].strip(), + author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())), + commit_date=datetime.fromtimestamp(int(lines[4].split(":", 1)[1].strip())), + title=lines[6].strip(), + body="\n".join(lines[7:]), + pr_url=prUrl, + ) + + +def _check_output(items: List[str], encoding='utf-8') -> str: + from subprocess import check_output + return check_output(items).decode(encoding) + + +def get_git_remotes(path: str) -> Dict[str, str]: + keys = _check_output(["git", "-C", path, "remote"]).strip().split("\n") + return {key: _check_output(["git", "-C", path, "remote", "get-url", key]).strip() for key in keys} + + +class GitRepo: + def __init__(self, path, 
remote='upstream'): + self.repo_dir = path + self.remote = remote + + def _run_git_cmd(self, *args) -> str: + return _check_output(['git', '-C', self.repo_dir] + list(args)) + + def _run_git_log(self, revision_range) -> List[GitCommit]: + log = self._run_git_cmd('log', '--format=fuller', + '--date=unix', revision_range, '--', '.').split("\n") + rc: List[GitCommit] = [] + cur_msg: List[str] = [] + for line in log: + if line.startswith("commit"): + if len(cur_msg) > 0: + rc.append(parse_fuller_format(cur_msg)) + cur_msg = [] + cur_msg.append(line) + if len(cur_msg) > 0: + rc.append(parse_fuller_format(cur_msg)) + return rc + + def get_commit_list(self, from_ref, to_ref) -> List[GitCommit]: + return self._run_git_log(f"{self.remote}/{from_ref}..{self.remote}/{to_ref}") + + def get_ghstack_orig_branches(self) -> List[str]: + return [x.strip() for x in self._run_git_cmd("branch", "--remotes", "--list", self.remote + "/gh/*/orig").strip().split("\n")] + + def show_ref(self, ref) -> str: + return self._run_git_cmd("show-ref", ref).split(" ")[0] + + def merge_base(self, ref1, ref2) -> str: + return self._run_git_cmd("merge-base", ref1, ref2).strip() + + def rev_list(self, ref): + return self._run_git_cmd("rev-list", f"{self.remote}/main..{ref}").strip().split() + + +def build_commit_dict(commits: List[GitCommit]) -> Dict[str, GitCommit]: + rc = {} + for commit in commits: + assert commit.commit_hash not in rc + rc[commit.commit_hash] = commit + return rc + + +def fetch_json(url: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]: + headers = {'Accept': 'application/vnd.github.v3+json'} + token = os.environ.get("GITHUB_TOKEN") + if token is not None and url.startswith('https://api.github.com/'): + headers['Authorization'] = f'token {token}' + if params is not None and len(params) > 0: + url += '?' 
+ '&'.join(f"{name}={val}" for name, val in params.items()) + try: + with urlopen(Request(url, headers=headers)) as data: + return json.load(data) + except HTTPError as err: + if err.code == 403 and all(key in err.headers for key in ['X-RateLimit-Limit', 'X-RateLimit-Used']): + print(f"Rate limit exceeded: {err.headers['X-RateLimit-Used']}/{err.headers['X-RateLimit-Limit']}") + raise + +def fetch_multipage_json(url: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]: + if params is None: + params = {} + assert "page" not in params + page_idx, rc, prev_len, params = 1, [], -1, params.copy() + while len(rc) > prev_len: + prev_len = len(rc) + params["page"] = page_idx + page_idx += 1 + rc += fetch_json(url, params) + return rc + + +def gh_get_milestones(org='pytorch', project='pytorch', state: IssueState = IssueState.OPEN) -> List[Dict[str, Any]]: + url = f'https://api.github.com/repos/{org}/{project}/milestones' + return fetch_multipage_json(url, {"state": state}) + +def gh_get_milestone_issues(org: str, project: str, milestone_idx: int, state: IssueState = IssueState.OPEN): + url = f'https://api.github.com/repos/{org}/{project}/issues' + return fetch_multipage_json(url, {"milestone": milestone_idx, "state": state}) + + +def gh_get_ref_statuses(org: str, project: str, ref: str) -> Dict[str, Any]: + url = f'https://api.github.com/repos/{org}/{project}/commits/{ref}/status' + params = {"page": 1, "per_page": 100} + nrc = rc = fetch_json(url, params) + while "statuses" in nrc and len(nrc["statuses"]) == 100: + params["page"] += 1 + nrc = fetch_json(url, params) + if "statuses" in nrc: + rc["statuses"] += nrc["statuses"] + return rc + +def get_issue_comments(org: str, project: str, issue_number : int): + url = f'https://api.github.com/repos/{org}/{project}/issues/{issue_number}/comments' + + return fetch_multipage_json(url) + +def extract_statuses_map(json: Dict[str, Any]): + return {s["context"]: s["state"] for s in json["statuses"]} + + +class PeriodStats: + commits: int + reverts: int + authors: int + date: datetime + + def __init__(self, date: datetime, commits: int, reverts: int, authors: int) -> None: + self.date = date + self.commits = commits + self.reverts = reverts + self.authors = authors + + +def get_monthly_stats(commits: List[GitCommit]) -> Iterable[PeriodStats]: + y, m, total, reverts, authors = None, None, 0, 0, set() + for commit in commits: + commit_date = commit.commit_date if commit.commit_date is not None else commit.author_date + if y != commit_date.year or m != commit_date.month: + if y is not None: + yield PeriodStats(datetime(y, m, 1), total, reverts, len(authors)) + y, m, total, reverts, authors = commit_date.year, commit_date.month, 0, 0, set() + if is_revert(commit): + reverts += 1 + total += 1 + authors.add(commit.author) + + +def print_monthly_stats(commits: List[GitCommit]) -> None: + stats = list(get_monthly_stats(commits)) + for idx, stat in enumerate(stats): + y = stat.date.year + m = stat.date.month + total, reverts, authors = stat.commits, stat.reverts, stat.authors + reverts_ratio = 100.0 * reverts / total + if idx + 1 < len(stats): + commits_growth = 100.0 * (stat.commits / stats[idx + 1].commits - 1) + else: + commits_growth = float('nan') + print(f"{y}-{m:02d}: commits {total} ({commits_growth:+.1f}%) reverts {reverts} ({reverts_ratio:.1f}%) authors {authors}") + + +def print_reverts(commits: List[GitCommit]) -> None: + for commit in commits: + if not is_revert(commit): + continue + print(f"{commit.commit_date} {commit.title} 
{commit.commit_hash} {commit.body}") + + +def analyze_reverts(commits: List[GitCommit]): + for idx, commit in enumerate(commits): + revert_id = get_revert_revision(commit) + if revert_id is None: + continue + orig_commit = None + for i in range(1, 100): + orig_commit = commits[idx + i] + if get_diff_revision(orig_commit) == revert_id: + break + if orig_commit is None: + print(f"Failed to find original commit for {commit.title}") + continue + print(f"{commit.commit_hash} is a revert of {orig_commit.commit_hash}: {orig_commit.title}") + revert_statuses = gh_get_ref_statuses("pytorch", "pytorch", commit.commit_hash) + orig_statuses = gh_get_ref_statuses("pytorch", "pytorch", orig_commit.commit_hash) + orig_sm = extract_statuses_map(orig_statuses) + revert_sm = extract_statuses_map(revert_statuses) + for k in revert_sm.keys(): + if k not in orig_sm: + continue + if orig_sm[k] != revert_sm[k]: + print(f"{k} {orig_sm[k]}->{revert_sm[k]}") + + +def print_contributor_stats(commits, delta: Optional[timedelta] = None) -> None: + authors: Dict[str, int] = {} + now = datetime.now() + # Default delta is one non-leap year + if delta is None: + delta = timedelta(days=365) + for commit in commits: + date, author = commit.commit_date, commit.author + if now - date > delta: + break + if author not in authors: + authors[author] = 0 + authors[author] += 1 + + print(f"{len(authors)} contributors made {sum(authors.values())} commits in last {delta.days} days") + for count, author in sorted(((commit, author) for author, commit in authors.items()), reverse=True): + print(f"{author}: {count}") + + +def commits_missing_in_branch(repo: GitRepo, branch: str, orig_branch: str, milestone_idx: int) -> None: + def get_commits_dict(x, y): + return build_commit_dict(repo.get_commit_list(x, y)) + main_commits = get_commits_dict(orig_branch, 'main') + release_commits = get_commits_dict(orig_branch, branch) + print(f"len(main_commits)={len(main_commits)}") + print(f"len(release_commits)={len(release_commits)}") + print("URL;Title;Status") + for issue in gh_get_milestone_issues('pytorch', 'pytorch', milestone_idx, IssueState.ALL): + issue_url, state = issue["html_url"], issue["state"] + # Skip closed states if they were landed before merge date + if state == "closed": + mentioned_after_cut = any(commit.is_issue_mentioned(issue_url) for commit in main_commits.values()) + # If issue is not mentioned after cut, that it must be present in release branch + if not mentioned_after_cut: + continue + mentioned_in_release = any(commit.is_issue_mentioned(issue_url) for commit in release_commits.values()) + # if Issue is mentioned is release branch, than it was picked already + if mentioned_in_release: + continue + print(f'{issue_url};{issue["title"]};{state}') + +def commits_missing_in_release(repo: GitRepo, branch: str, orig_branch: str, minor_release: str, milestone_idx: int, cut_off_date : datetime, issue_num : int) -> None: + def get_commits_dict(x, y): + return build_commit_dict(repo.get_commit_list(x, y)) + main_commits = get_commits_dict(minor_release, 'main') + prev_release_commits = get_commits_dict(orig_branch, branch) + current_issue_comments = get_issue_comments('pytorch', 'pytorch',issue_num) # issue comments for the release tracker as cherry picks + print(f"len(main_commits)={len(main_commits)}") + print(f"len(prev_release_commits)={len(prev_release_commits)}") + print(f"len(current_issue_comments)={len(current_issue_comments)}") + print(f"issue_num: {issue_num}, len(issue_comments)={len(current_issue_comments)}") + 
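+    # The semicolon-separated report below lists commits from the previous release
+    # branch whose PRs the script treats as not yet cherry-picked into the current
+    # release-tracker issue and whose matching commit on main landed after the
+    # cut-off date.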
print("URL;Title;Status") + + # Iterate over the previous release branch to find potentially missing cherry picks in the current issue. + for commit in prev_release_commits.values(): + not_cherry_picked_in_current_issue = any(commit.pr_url not in issue_comment['body'] for issue_comment in current_issue_comments) + for main_commit in main_commits.values(): + if main_commit.pr_url == commit.pr_url : + mentioned_after_cut_off_date = cut_off_date < main_commit.commit_date + if not_cherry_picked_in_current_issue and mentioned_after_cut_off_date: + # Commits that are release only, which exist in previous release branch and not in main. + print(f'{commit.pr_url};{commit.title};{commit.commit_date}') + break + +def analyze_stacks(repo: GitRepo) -> None: + from tqdm.contrib.concurrent import thread_map + branches = repo.get_ghstack_orig_branches() + stacks_by_author: Dict[str, List[int]] = {} + for branch,rv_commits in thread_map(lambda x: (x, repo.rev_list(x)), branches, max_workers=10): + author = branch.split("/")[2] + if author not in stacks_by_author: + stacks_by_author[author]=[] + stacks_by_author[author].append(len(rv_commits)) + for author, slen in sorted(stacks_by_author.items(), key=lambda x:len(x[1]), reverse=True): + if len(slen) == 1: + print(f"{author} has 1 stack of depth {slen[0]}") + continue + print(f"{author} has {len(slen)} stacks max depth is {max(slen)} avg depth is {sum(slen)/len(slen):.2f} mean is {slen[len(slen)//2]}") + + +def parse_arguments(): + from argparse import ArgumentParser + parser = ArgumentParser(description="Print GitHub repo stats") + parser.add_argument("--repo-path", + type=str, + help="Path to PyTorch git checkout", + default=os.path.expanduser("~/git/pytorch/pytorch")) + parser.add_argument("--milestone-id", type=str) + parser.add_argument("--branch", type=str) + parser.add_argument("--minor-release", type=str) + parser.add_argument("--remote", + type=str, + help="Remote to base off of", + default="") + parser.add_argument("--analyze-reverts", action="store_true") + parser.add_argument("--print-reverts", action="store_true") + parser.add_argument("--contributor-stats", action="store_true") + parser.add_argument("--missing-in-branch", action="store_true") + parser.add_argument("--missing-in-release", action="store_true") + parser.add_argument("--analyze-stacks", action="store_true") + parser.add_argument('--date', type=lambda d: datetime.strptime(d, '%Y-%m-%d')) + parser.add_argument("--issue-num", type=int) + return parser.parse_args() + + +def main(): + import time + args = parse_arguments() + remote = args.remote + if not remote: + remotes = get_git_remotes(args.repo_path) + # Pick best remote + remote = next(iter(remotes.keys())) + for key in remotes: + if remotes[key].endswith('github.com/pytorch/pytorch'): + remote = key + + repo = GitRepo(args.repo_path, remote) + + if args.analyze_stacks: + analyze_stacks(repo) + return + + # Use milestone idx or search it along milestone titles + try: + milestone_idx = int(args.milestone_id) + except ValueError: + milestone_idx = -1 + milestones = gh_get_milestones() + for milestone in milestones: + if milestone.get('title', '') == args.milestone_id: + milestone_idx = int(milestone.get('number', '-2')) + if milestone_idx < 0: + print(f'Could not find milestone {args.milestone_id}') + return + + if args.missing_in_branch: + commits_missing_in_branch(repo, + args.branch, + f'orig/{args.branch}', + milestone_idx) + return + + if args.missing_in_release: + commits_missing_in_release(repo, + args.branch, + 
f'orig/{args.branch}', + args.minor_release, + milestone_idx, + args.date, + args.issue_num + ) + return + + print(f"Parsing git history with remote {remote}...", end='', flush=True) + start_time = time.time() + x = repo._run_git_log(f"{remote}/main") + print(f"done in {time.time()-start_time:.1f} sec") + if args.analyze_reverts: + analyze_reverts(x) + elif args.contributor_stats: + print_contributor_stats(x) + elif args.print_reverts: + print_reverts(x[:2**9]) + else: + print_monthly_stats(x) + + +if __name__ == "__main__": + main() diff --git a/tools/analytics/s3_test_stats_analyze.py b/tools/analytics/s3_test_stats_analyze.py new file mode 100644 index 0000000000..74b4f6de8d --- /dev/null +++ b/tools/analytics/s3_test_stats_analyze.py @@ -0,0 +1,147 @@ +import argparse +import boto3 +import bz2 +import json +import os +import re +import requests + +import pandas as pd + +from datetime import datetime, timedelta +from tqdm import tqdm +from typing import Any, Dict, Optional, List + +S3 = boto3.resource('s3') +CLIENT = boto3.client('s3') +BUCKET = S3.Bucket('ossci-metrics') + +GITHUB_API_BASE = "https://api.github.com/" +GITHUB_COMMITS_API = "repos/pytorch/pytorch/commits" +STRF_FORMAT = "%Y-%m-%dT%H:%M:%SZ" + +CACHE_PICKLE = "cache/test_time/dataframe.pickle" + +def _get_latests_git_commit_sha_list(lookback: int): + sha_since = (datetime.utcnow() - timedelta(hours = lookback)).strftime(STRF_FORMAT) + resp = requests.get(GITHUB_API_BASE + GITHUB_COMMITS_API + f"?since={sha_since}") + if resp.status_code == 200: + return [e.get('sha') for e in resp.json()] + else: + return [] + +def _json_to_df(data: Dict[str, Any], granularity: str) -> pd.DataFrame: + reformed_data = list() + for fname, fdata in data['files'].items(): + if granularity == 'file': + reformed_data.append({ + "job": data['job'], + "sha": data['sha'], + 'file': fname, + 'file_total_sec': fdata['total_seconds'], + }) + else: + for sname, sdata in fdata['suites'].items(): + if granularity == 'suite': + reformed_data.append({ + "job": data['job'], + "sha": data['sha'], + 'suite': sname, + 'suite_total_sec': sdata['total_seconds'], + }) + else: + for cname, cdata in sdata['cases'].items(): + reformed_data.append({ + "job": data['job'], + "sha": data['sha'], + 'case': cname, + 'case_status': cdata['status'], + 'case_sec': cdata['seconds'], + }) + df = pd.json_normalize(reformed_data) + return df + + +def download_stats(folder: str, lookback: int): + commit_sha_list = _get_latests_git_commit_sha_list(lookback) + for commit_sha in commit_sha_list: + for key in tqdm(BUCKET.objects.filter(Prefix=f'test_time/{commit_sha}')): + remote_fname = key.key + local_fname = os.path.join(folder, remote_fname) + # TODO: Do this in parallel + if not os.path.exists(local_fname): + dirname = os.path.dirname(local_fname) + if not os.path.exists(dirname): + os.makedirs(dirname) + # only download when there's a cache miss + if not os.path.exists(local_fname) or not os.path.isfile(local_fname): + print(f"\nDownloading {remote_fname}...") + CLIENT.download_file("ossci-metrics", remote_fname, local_fname) + + +def parse_and_export_stats(folder: str, granularity: str, commit_sha_lists: Optional[List[str]] = None): + dataframe = None + for (dirpath, _, filenames) in os.walk(folder): + for filename in tqdm(filenames): + splits = dirpath.split("/") + job_name = splits[-1] + sha = splits[-2] + if not commit_sha_lists or sha in commit_sha_lists: + with bz2.open(os.path.join(dirpath, filename), 'r') as zf: + string = zf.read().decode("utf-8") + data = 
json.loads(string) + # create a deep json with sha and job info + data['sha'] = sha + data['job'] = job_name + df = _json_to_df(data, granularity) + dataframe = df if dataframe is None else dataframe.append(df) + return dataframe + + +def main(): + parser = argparse.ArgumentParser( + __file__, + description="download and cache test stats locally, both raw and pandas format", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + '--lookback', + type=int, + help='lookback in # of hours', + default=24, + ) + parser.add_argument( + '--output', + help='output filename', + default='cache/df.pickle', + ) + parser.add_argument( + '--cache_folder', + help='cache folder', + default='cache', + ) + parser.add_argument( + '--granularity', + choices=['file', 'suite', 'case'], + help='granularity of stats summary', + default='file', + ) + args = parser.parse_args() + + lookback = args.lookback + cache_folder = args.cache_folder + output = args.output + granularity = args.granularity + + print("Downloading test stats") + download_stats(cache_folder, lookback) + print("Parsing test stats and write to pd dataframe") + if not os.path.exists(output): + dataframe = parse_and_export_stats(f'{cache_folder}/test_time/', granularity) + dataframe.to_pickle(output) + + + +if __name__ == "__main__": + main() + diff --git a/tools/analytics/validate_binaries.py b/tools/analytics/validate_binaries.py new file mode 100644 index 0000000000..65965c59ad --- /dev/null +++ b/tools/analytics/validate_binaries.py @@ -0,0 +1,77 @@ +from conda.cli.python_api import Commands, run_command +from tabulate import tabulate +from datetime import datetime +import json + +PLATFORMS = ["osx-64", "linux-64", "win-64"] +PYTHON_VERSIONS = ["3.10", "3.9", "3.8", "3.7"] +CUDA_CUDNN_VERSION = [ + ("11.7", "8.5.0"), ("cpu", None) +] +CHANNEL = "pytorch-test" +VERSION = "1.13.*" + + +def generate_expected_builds(platform: str) -> set: + builds = set() + for py_version in PYTHON_VERSIONS: + if platform == "osx-64": + # macos builds support cpu only. + builds.add(f"py{py_version}_0") + continue + + for cuda_version, cudnn_version in CUDA_CUDNN_VERSION: + if platform == "win-64": + cudnn_version = "8" + + if cuda_version == "cpu": + builds.add(f"py{py_version}_{cuda_version}_0") + else: + builds.add(f"py{py_version}_cuda{cuda_version}_cudnn{cudnn_version}_0") + return builds + + +def size_format(size_num) -> str: + for unit in ["", "K", "M", "G"]: + if abs(size_num) < 1024.0: + return f"{size_num:3.1f}{unit}B" + + size_num /= 1024.0 + return f"{size_num:3.1f}TB" + + +def main() -> None: + # Iterate over platform to gather build information of available conda version. + for platform in PLATFORMS: + expected_builds = generate_expected_builds(platform) + + # Actual builds available in Conda + stdout, stderr, return_code = run_command( + Commands.SEARCH, f"{CHANNEL}::*[name=pytorch version={VERSION} subdir={platform}]", "--json") + + if return_code != 0: + raise Exception(stderr) + + available_versions = json.loads(stdout) + output_data = [] + headers = ["File Name", "Date", "Size"] + actual_builds = set() + for version in available_versions["pytorch"]: + actual_builds.add(version["build"]) + output_data.append(( + version["fn"], + datetime.fromtimestamp(version["timestamp"] / 1000), + size_format(version["size"]) + )) + + assert len(expected_builds) > 0, "expected builds set should not be empty." 
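+        # Strict set equality: this fails both when an expected build string is
+        # missing from the channel and when conda reports a build that was not
+        # expected for this platform.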
+ assert expected_builds == actual_builds, ( + f"Missing following builds in conda: {expected_builds.difference(actual_builds)} for platform {platform}" + ) + + print(f"\nSuccessfully verified following binaries are available in Conda for {platform}...") + print(tabulate(output_data, headers=headers, tablefmt="grid")) + + +if __name__ == "__main__": + main() diff --git a/tools/analytics/validate_pypi_staging.py b/tools/analytics/validate_pypi_staging.py new file mode 100644 index 0000000000..5321313dfc --- /dev/null +++ b/tools/analytics/validate_pypi_staging.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 + +import os.path +import shutil +import subprocess +import tempfile +import zipfile + +import boto3 +import botocore + +PLATFORMS = [ + "manylinux1_x86_64", + "manylinux2014_aarch64", + "win_amd64", + "macosx_11_0_arm64", +] +PYTHON_VERSIONS = [ + "cp38", + "cp39", + "cp310", + "cp311", + "cp312" + ] +S3_PYPI_STAGING = "pytorch-backup" +PACKAGE_RELEASES = { + "torch": "2.3.1", + "torchvision": "0.18.1", + "torchaudio": "2.3.1", + "torchtext": "0.18.0", + "executorch": "0.2.1" +} + +PATTERN_V = "Version:" +PATTERN_RD = "Requires-Dist:" + +s3 = boto3.client("s3") + + +def get_size(path): + size = os.path.getsize(path) + if size < 1024: + return f"{size} bytes" + elif size < pow(1024, 2): + return f"{round(size/1024, 2)} KB" + elif size < pow(1024, 3): + return f"{round(size/(pow(1024,2)), 2)} MB" + elif size < pow(1024, 4): + return f"{round(size/(pow(1024,3)), 2)} GB" + + +def generate_expected_builds(platform: str, package: str, release: str) -> list: + builds = [] + for py_version in PYTHON_VERSIONS: + py_spec = f"{py_version}-{py_version}" + platform_spec = platform + + if package == "torchtext" and ( + platform == "manylinux2014_aarch64" or py_version == "cp312" + ): + continue + + # strange macos file nameing + if "macos" in platform: + if package == "torch": + py_spec = f"{py_version}-none" + elif "macosx_10_9_x86_64" in platform: + platform_spec = "macosx_10_13_x86_64" + + builds.append( + f"{package}-{release}-pypi-staging/{package}-{release}-{py_spec}-{platform_spec}.whl" + ) + + return builds + + +def validate_file_metadata(build: str, package: str, version: str): + temp_dir = tempfile.mkdtemp() + tmp_file = f"{temp_dir}/{os.path.basename(build)}" + s3.download_file(Bucket=S3_PYPI_STAGING, Key=build, Filename=tmp_file) + print(f"Downloaded: {tmp_file} {get_size(tmp_file)}") + + try: + check_wheels = subprocess.run( + ["check-wheel-contents", tmp_file, "--ignore", "W002,W009,W004"], + capture_output=True, + text=True, + check=True, + encoding="utf-8", + ) + print(check_wheels.stdout) + print(check_wheels.stderr) + except subprocess.CalledProcessError as e: + exit_code = e.returncode + stderror = e.stderr + print(exit_code, stderror) + + with zipfile.ZipFile(f"{temp_dir}/{os.path.basename(build)}", "r") as zip_ref: + zip_ref.extractall(f"{temp_dir}") + + for i, line in enumerate( + open(f"{temp_dir}/{package}-{version}.dist-info/METADATA") + ): + if line.startswith(PATTERN_V): + print(f"{line}", end="") + exttracted_version = line.removeprefix(PATTERN_V).strip() + if version != exttracted_version: + print( + f"FAILURE VERSION DOES NOT MATCH expected {version} got {exttracted_version}" + ) + + elif line.startswith(PATTERN_RD): + print(f"{line}", end="") + + shutil.rmtree(temp_dir) + + +def main(): + expected_builds = dict.fromkeys(PACKAGE_RELEASES, []) + + # Iterate over platform to gather build information of available conda version. 
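+    # expected_builds maps each package to the list of S3 keys (under
+    # "<package>-<release>-pypi-staging/") that should exist in the staging bucket.
+    # dict.fromkeys(..., []) shares one list object across all keys; the loop below
+    # stays correct only because it rebinds each entry with "+" rather than
+    # mutating the shared list in place.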
+ for package in PACKAGE_RELEASES: + for platform in PLATFORMS: + expected_builds[package] = expected_builds[ + package + ] + generate_expected_builds(platform, package, PACKAGE_RELEASES[package]) + + for package in PACKAGE_RELEASES: + count = 0 + for build in expected_builds[package]: + try: + s3.head_object(Bucket=S3_PYPI_STAGING, Key=build) + print(f"Validating filename {os.path.basename(build)}") + validate_file_metadata(build, package, PACKAGE_RELEASES[package]) + count += 1 + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + print(f"FAILED 404 Error on {build}") + elif e.response["Error"]["Code"] == "403": + print(f"FAILED Unauthorized Error on {build}") + else: + print(f"Error on {build}") + print(f"Package Validated {count} for {package}") + + +if __name__ == "__main__": + main()
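As a usage reference for the size-analysis helpers added above in tools/analytics/cubinsizes.py, here is a minimal sketch. It assumes pyelftools is installed, that the script's directory is on the import path, and that a locally built libtorch_cuda.so is present; the module and library paths are illustrative, not part of the diff.

#!/usr/bin/env python3
# Hypothetical usage sketch (not part of the diff): summarize per-architecture
# cubin sizes embedded in a CUDA ELF binary using the helpers defined above.
from cubinsizes import compute_cubin_sizes, sizeof_fmt

per_arch = compute_cubin_sizes('libtorch_cuda.so', section_name='.nv_fatbin')
print(f"total {sizeof_fmt(sum(per_arch.values()))}")
for sm_ver, nbytes in sorted(per_arch.items()):
    print(f"  {sm_ver}: {sizeof_fmt(nbytes)}")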