-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathworkflows.py
195 lines (162 loc) · 7.49 KB
/
workflows.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""
The exact format of a workflows dict evolves over several stages of filtering, so its
shape depends on the stage that produced it. See the examples and custom type aliases
below.
After stage 3 filter (checking for workflow files, retrieving workflow filenames):
{
"123": [
{ "name": "release.yml" },
{ "name": "test.yml" },
...
],
...
}
After stage 4 filter (checking for actual CI usage, retrieving workflow YAML content):
{
"123": {
"0": { "name": "release.yml", "text": "These are release YAML contents" },
"1": { "name": "test.yml", "text": "These are test YAML contents" },
...
},
...
}
"""
from typing import Any, Dict, List, Union
from run_commands import match_any_build_cmd_regex
from data_io import (
read_dict_from_json_file,
read_dict_from_yaml_str,
write_dict_to_json_file
)
# Stage 3 shape: repo ID -> list of workflow entries, each holding the workflow filename
# under "name" (see the module docstring example).
WorkflowFilenameDict = Dict[str, List[Dict[str, str]]]
# Stage 4 shape: repo ID -> workflow index (as a string) -> entry holding the workflow
# filename under "name" and the YAML content under "text".
WorkflowInfoDict = Dict[str, Dict[str, Dict[str, str]]]
# Either of the two shapes above, for helpers that do not care which stage produced it.
AnyWorkflowDict = Union[WorkflowFilenameDict, WorkflowInfoDict]
# Workflow runs for a single project / workflow: a list of dictionaries.
WorkflowRuns = List[Dict[str, Any]]
def encode_workflow_runs_path(workflow_runs_prefix: str, repo_id: str,
                              workflow_idx_str: str) -> str:
    """
    Build the name of the JSON file that stores every run of one workflow in one repo.

    The result has the form `<prefix>_repo<repo_id>workflow<workflow_idx>.json`; for example,
    `workflow_runs_repo123workflow456.json` holds the runs of workflow 456 in repo 123.
    """
    filename_template = "{}_repo{}workflow{}.json"
    return filename_template.format(workflow_runs_prefix, repo_id, workflow_idx_str)
def load_workflows(input_project_workflows_path: str) -> AnyWorkflowDict:
    """
    Load project workflow information from a JSON file and return it as a dict.

    The shape of the returned dict depends on the stage that wrote the file (i.e. stage 3
    stores workflow filenames, stage 4 stores YAML content). Progress is reported on stdout
    before and after the read.
    """
    print(f"Loading workflows from {input_project_workflows_path}...")
    loaded = read_dict_from_json_file(input_project_workflows_path)
    project_count = len(loaded.keys())
    print(f"Loaded workflows for {project_count} projects")
    return loaded
def save_workflows(project_workflows_dict: Dict, output_workflows_path: str) -> None:
    """
    Persist a dict of project workflows to a JSON file, then report how many projects
    were written.

    The shape of the dict depends on the stage that built it (i.e. stage 3 stores workflow
    filenames, stage 4 stores YAML content); it is written as-is.
    """
    write_dict_to_json_file(project_workflows_dict, output_workflows_path)
    project_count = len(project_workflows_dict.keys())
    print(f"Wrote workflows for {project_count} projects to {output_workflows_path}")
def load_workflow_runs(workflow_runs_path: str) -> WorkflowRuns:
    """
    Load the workflow runs of a single project / workflow from a JSON file and return
    them (annotated as a list of dictionaries).
    """
    return read_dict_from_json_file(workflow_runs_path)
def check_workflow_jobs_for_cmd(workflow: Union[Dict[str, Any], List[Any]]) -> bool:
    """
    Depth-first search a portion of a parsed workflow file for a 'run' command that
    matches a CI command regex.

    Mappings are searched key by key: a string stored under the key 'run' is tested with
    `match_any_build_cmd_regex`; every other value is searched recursively. Sequences are
    searched item by item. Any other value (scalars, `None`) cannot contain a run command.

    Returns `True` as soon as one run command matches (the match bubbles up through the
    recursion), and `False` if nothing matches.
    """
    # isinstance (rather than an exact type comparison) so that dict / list / str
    # subclasses, such as OrderedDict, are traversed instead of being silently skipped.
    if isinstance(workflow, dict):
        for key, val in workflow.items():
            if key == 'run' and isinstance(val, str):
                # This is a run command: test it against the CI command regexes
                if match_any_build_cmd_regex(val):
                    return True
            elif check_workflow_jobs_for_cmd(val):
                return True
        return False
    if isinstance(workflow, list):
        return any(check_workflow_jobs_for_cmd(item) for item in workflow)
    return False
def check_workflow_for_cmd(workflow: Dict[str, Any]) -> bool:
    """
    Search the 'jobs' section of a parsed workflow file for a 'run' command that matches a
    CI command regex.

    Returns `True` if at least one run command matches, and `False` otherwise. `False` is
    also returned when the workflow has no 'jobs' key, when 'jobs' is `None`, or when the
    parsed document is not a mapping at all (e.g. the YAML was a bare string or a list).
    """
    # A YAML document is not guaranteed to be a mapping. Without this guard, a string such
    # as "my jobs" passes the `'jobs' in workflow` test and then fails on indexing.
    if not isinstance(workflow, dict):
        return False
    jobs = workflow.get('jobs')
    if jobs is None:
        return False
    return check_workflow_jobs_for_cmd(jobs)
def get_workflows_using_ci(workflows_filename: str) -> WorkflowInfoDict:
    """
    Read a JSON file of project YAML workflows and return only the workflows that actually
    use CI, as decided by `does_workflow_use_ci`.

    What counts as 'CI' is somewhat arbitrary and specific to this study: false negatives
    (a CI workflow that is dropped) are preferred over false positives (a non-CI workflow
    that is kept). Repos with no CI workflow at all are omitted from the result.

    Example input workflows file (the returned dict has the same shape):
    ```
    {
        '123': {
            '0': { "name": "build.yml", "text": "These are my YAML contents" },
            ...
        },
        ...
    }
    ```
    """
    # All workflows for all projects, keyed by repo ID and then by workflow ID
    workflows_dict = read_dict_from_json_file(workflows_filename)
    ci_workflows_dict = {}

    for repo_idx, (repo_id, workflows) in enumerate(workflows_dict.items()):
        # Report progress once every 100 repos
        if repo_idx % 100 == 0:
            print(
                f"Checking repo workflows for CI usage ({repo_idx}/{len(workflows_dict.keys())})...")

        # Keep only this repo's workflows that use CI
        repo_ci_workflows = {
            workflow_id: workflow_obj
            for workflow_id, workflow_obj in workflows.items()
            if does_workflow_use_ci(workflow_obj)
        }
        # Repos without a single CI workflow get no entry in the result
        if repo_ci_workflows:
            ci_workflows_dict[repo_id] = repo_ci_workflows

    print(
        f"Only {len(ci_workflows_dict.keys())}/{len(workflows_dict.keys())} projects actually use CI")
    return ci_workflows_dict
def does_workflow_use_ci(workflow_obj: Dict[str, str]) -> bool:
    """
    Decide whether a GitHub Actions workflow defines at least one CI action.

    The workflow must have a `.yml` / `.yaml` filename, its text must parse to something
    other than `None`, and `check_workflow_for_cmd` must find a matching run command in it.
    What counts as 'CI' is somewhat arbitrary and specific to this study: false negatives
    (returning `False` for a CI workflow) are preferred over false positives (returning
    `True` for a non-CI workflow).
    https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions.

    Example workflow_obj:
    ```
    { "name": "build.yml", "text": "These are my YAML contents" }
    ```
    """

    def uses_on_push(workflow_yaml):
        # NOTE: Deliberately unused, since the 'on push' filter is applied later, when
        # fetching build data. Kept for reference.
        # NOTE(review): some YAML 1.1 loaders parse a bare `on` key as a boolean; confirm
        # how read_dict_from_yaml_str behaves before enabling this check.
        if 'on' not in workflow_yaml or workflow_yaml['on'] is None:
            return None
        triggers = workflow_yaml['on']
        # 'on' may be a mapping of events or a plain list of event names
        if (type(triggers) is dict or type(triggers) is list) and 'push' in triggers:
            return True
        return None

    # Only files with a YAML extension are considered
    if not workflow_obj['name'].endswith(('.yml', '.yaml')):
        return False

    # Replace tab characters before handing the text to the YAML parser
    yaml_text = workflow_obj['text'].replace('\t', ' ')
    parsed_yaml = read_dict_from_yaml_str(yaml_text)
    if parsed_yaml is None:
        return False
    return check_workflow_for_cmd(parsed_yaml)