-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdvc.yaml
208 lines (195 loc) · 7.86 KB
/
dvc.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# Pipeline-wide values; referenced elsewhere in this file as ${...}.
vars:
  # DevGPT dataset (https://github.com/NAIST-SE/DevGPT): the release label
  # shown in the download stage description, and the Zenodo record number
  # used to build the download URL.
  - DevGPT:
      version: 'v9'
      zenodo_id: 10086809

  # Directory that receives the cloned Git repositories.
  # NOTE(review): absolute, machine-specific path - adjust for your host.
  - repositories_dir: /mnt/data/MSR_Challenge_2024/repositories/
# Pipeline stages.
#
# Every multi-line `cmd` uses a folded block scalar (`>-`) whose continuation
# lines sit at the same indentation as the first line, so that YAML folds
# them into ONE shell command line.  Do not indent continuation lines
# further: more-indented lines keep their newlines in a folded scalar.
stages:
  # ---- Data acquisition --------------------------------------------------

  # Fetches DevGPT.zip from the Zenodo record, unpacks it into
  # data/external/DevGPT and removes the archive afterwards.
  download_DevGPT:
    desc: 'Download DevGPT dataset ${DevGPT.version} from Zenodo'
    cmd:
      # The URL is quoted because `?` is a shell glob character, and the
      # output name is given explicitly because the next commands rely on
      # the archive being called exactly DevGPT.zip.
      - >-
        dvc get-url --force
        "https://zenodo.org/records/${DevGPT.zenodo_id}/files/DevGPT.zip?download=1"
        DevGPT.zip
      - mkdir -p data/external
      - unzip DevGPT.zip -d data/external/DevGPT
      - rm DevGPT.zip
    outs:
      - data/external/DevGPT/

  # Runs the repository download script against the dataset, targeting
  # ${repositories_dir}, and records the result in a JSON status file.
  clone_repos:
    desc: 'Clone all repositories included in DevGPT dataset'
    cmd: >-
      python scripts/data/download_repositories.py
      data/external/DevGPT/ ${repositories_dir}
      data/repositories_download_status.json
    deps:
      - scripts/data/download_repositories.py
      - data/external/DevGPT/
    outs:
      # cache: false - the status file is not stored in the DVC cache.
      - data/repositories_download_status.json:
          cache: false

  # ---- Sharings -> CSV + per-repository aggregates ------------------------
  # Each *_agg stage runs its own script on the dataset and the download
  # status file, and writes its CSV files under data/interim/.

  commit_agg:
    desc: 'Latest commit sharings to CSV + per-project aggregates'
    cmd: >-
      python scripts/data/commit_sharings_to_agg.py
      data/external/DevGPT/ data/repositories_download_status.json
      data/interim/
    deps:
      - scripts/data/commit_sharings_to_agg.py
      - data/external/DevGPT/
      - data/repositories_download_status.json
    outs:
      - data/interim/commit_sharings_df.csv
      - data/interim/commit_sharings_groupby_repo_df.csv

  # Besides the two usual CSV files this stage also emits the
  # "split commit" CSV consumed by pr_split_survival.
  pr_agg:
    desc: 'Latest pr (pull request) sharings to CSV + per-project aggregates'
    cmd: >-
      python scripts/data/pr_sharings_to_agg.py
      data/external/DevGPT/ data/repositories_download_status.json
      data/interim/
    deps:
      - scripts/data/pr_sharings_to_agg.py
      - data/external/DevGPT/
      - data/repositories_download_status.json
    outs:
      - data/interim/pr_sharings_df.csv
      - data/interim/pr_sharings_groupby_repo_df.csv
      - data/interim/pr_sharings_split_commit_df.csv

  issue_agg:
    desc: 'Latest issue sharings to CSV + per-project aggregates'
    cmd: >-
      python scripts/data/issue_sharings_to_agg.py
      data/external/DevGPT/ data/repositories_download_status.json
      data/interim/
    deps:
      - scripts/data/issue_sharings_to_agg.py
      - data/external/DevGPT/
      - data/repositories_download_status.json
    outs:
      - data/interim/issue_sharings_df.csv
      - data/interim/issue_sharings_groupby_repo_df.csv

  # ---- Survival of changes and lines --------------------------------------
  # All *_survival stages run the same script; they differ only in the input
  # CSV (produced by an *_agg stage) and in the names of the two output CSVs
  # (changes survival, lines survival).

  commit_survival:
    desc: 'Changes and lines survival (via blame) for latest commit sharings'
    cmd: >-
      python scripts/data/compute_changes_survival.py
      data/interim/commit_sharings_df.csv data/repositories_download_status.json
      data/interim/commit_sharings_changes_survival_df.csv
      data/interim/commit_sharings_lines_survival_df.csv
    deps:
      - scripts/data/compute_changes_survival.py
      - data/repositories_download_status.json
      - data/interim/commit_sharings_df.csv
    outs:
      - data/interim/commit_sharings_changes_survival_df.csv
      - data/interim/commit_sharings_lines_survival_df.csv

  pr_survival:
    desc: 'Changes and lines survival (via blame) for latest pr sharings'
    cmd: >-
      python scripts/data/compute_changes_survival.py
      data/interim/pr_sharings_df.csv data/repositories_download_status.json
      data/interim/pr_sharings_changes_survival_df.csv
      data/interim/pr_sharings_lines_survival_df.csv
    deps:
      - scripts/data/compute_changes_survival.py
      - data/repositories_download_status.json
      - data/interim/pr_sharings_df.csv
    outs:
      - data/interim/pr_sharings_changes_survival_df.csv
      - data/interim/pr_sharings_lines_survival_df.csv

  pr_split_survival:
    desc: 'Changes and lines survival (via blame) for pr sharings, all commits'
    cmd: >-
      python scripts/data/compute_changes_survival.py
      data/interim/pr_sharings_split_commit_df.csv data/repositories_download_status.json
      data/interim/pr_sharings_split_commit_changes_survival_df.csv
      data/interim/pr_sharings_split_commit_lines_survival_df.csv
    deps:
      - scripts/data/compute_changes_survival.py
      - data/repositories_download_status.json
      - data/interim/pr_sharings_split_commit_df.csv
    outs:
      - data/interim/pr_sharings_split_commit_changes_survival_df.csv
      - data/interim/pr_sharings_split_commit_lines_survival_df.csv

  issue_survival:
    desc: 'Changes and lines survival (via blame) for latest issue sharings'
    cmd: >-
      python scripts/data/compute_changes_survival.py
      data/interim/issue_sharings_df.csv data/repositories_download_status.json
      data/interim/issue_sharings_changes_survival_df.csv
      data/interim/issue_sharings_lines_survival_df.csv
    deps:
      - scripts/data/compute_changes_survival.py
      - data/repositories_download_status.json
      - data/interim/issue_sharings_df.csv
    outs:
      - data/interim/issue_sharings_changes_survival_df.csv
      - data/interim/issue_sharings_lines_survival_df.csv

  # ---- Repository statistics ----------------------------------------------
  # Both stages take the dataset and the download status file and write a
  # single JSON file under data/interim/.

  repo_stats_git:
    desc: 'Repository stats from git for all cloned project repos'
    cmd: >-
      python scripts/data/compute_repository_statistics_git.py
      data/external/DevGPT/ data/repositories_download_status.json
      data/interim/repository_statistics_git.json
    deps:
      - scripts/data/compute_repository_statistics_git.py
      - data/external/DevGPT/
      - data/repositories_download_status.json
    outs:
      - data/interim/repository_statistics_git.json

  repo_stats_github:
    desc: 'Repository info from GitHub for all cloned project repos'
    cmd: >-
      python scripts/data/compute_repository_statistics_github.py
      data/external/DevGPT/ data/repositories_download_status.json
      data/interim/repository_statistics_github.json
    deps:
      - scripts/data/compute_repository_statistics_github.py
      - data/external/DevGPT/
      - data/repositories_download_status.json
    outs:
      - data/interim/repository_statistics_github.json

  # ---- ChatGPT <-> diff similarities ---------------------------------------
  # All *_similarities stages run the same script on the dataset, one
  # sharings CSV and the download status file.  Next to the result CSV each
  # stage declares a *.checkpoint_data.json output with `persist: true`, so
  # DVC does not delete that file before the stage is reproduced
  # (presumably so that an interrupted run can resume - confirm in script).

  commit_similarities:
    desc: 'ChatGPT <-> commit diff similarities for commit sharings'
    cmd: >-
      python scripts/data/find_chatgpt_changes_similarities.py
      data/external/DevGPT/ data/interim/commit_sharings_df.csv
      data/repositories_download_status.json
      data/interim/commit_sharings_similarities_df.csv
    deps:
      - scripts/data/find_chatgpt_changes_similarities.py
      - data/external/DevGPT/
      - data/interim/commit_sharings_df.csv
      - data/repositories_download_status.json
    outs:
      - data/interim/commit_sharings_similarities_df.csv
      - data/interim/commit_sharings_similarities_df.checkpoint_data.json:
          persist: true

  # NOTE: the output files of this stage carry a `pr_mergesha_` prefix,
  # unlike the `<kind>_sharings_` naming used by the sibling stages.
  pr_similarities:
    desc: 'ChatGPT <-> commit diff similarities for PR sharings'
    cmd: >-
      python scripts/data/find_chatgpt_changes_similarities.py
      data/external/DevGPT/ data/interim/pr_sharings_df.csv
      data/repositories_download_status.json
      data/interim/pr_mergesha_sharings_similarities_df.csv
    deps:
      - scripts/data/find_chatgpt_changes_similarities.py
      - data/external/DevGPT/
      - data/interim/pr_sharings_df.csv
      - data/repositories_download_status.json
    outs:
      - data/interim/pr_mergesha_sharings_similarities_df.csv
      - data/interim/pr_mergesha_sharings_similarities_df.checkpoint_data.json:
          persist: true

  issue_similarities:
    desc: 'ChatGPT <-> commit diff similarities for issue sharings'
    cmd: >-
      python scripts/data/find_chatgpt_changes_similarities.py
      data/external/DevGPT/ data/interim/issue_sharings_df.csv
      data/repositories_download_status.json
      data/interim/issue_sharings_similarities_df.csv
    deps:
      - scripts/data/find_chatgpt_changes_similarities.py
      - data/external/DevGPT/
      - data/interim/issue_sharings_df.csv
      - data/repositories_download_status.json
    outs:
      - data/interim/issue_sharings_similarities_df.csv
      - data/interim/issue_sharings_similarities_df.checkpoint_data.json:
          persist: true