get_measures_for_wanted_repos.py
import gzip
import json
import os
import pickle
import re

import paramiko

# Load the pickle that has the ids of the wanted repos, so we can filter out
# the events for repos we aren't including.
with open('final_pickles/wanted_repos.data', 'rb') as filehandle:
    wanted_repos = pickle.load(filehandle)
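
# wanted_repos is assumed to hold repo ids as strings or ints (inferred from
# the int() mapping applied to it in the original filter); converting once to
# a set of ints makes the per-event membership test O(1) instead of
# re-mapping the whole list for every event.
wanted_repo_ids = set(map(int, wanted_repos))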
# Connect to the server
hostname = "vader.psych.wisc.edu"
username = "alyssa"
password = "Alyssa"
port = 1202
client = paramiko.SSHClient()
client.load_system_host_keys()
client.set_missing_host_key_policy(paramiko.WarningPolicy)
client.connect(hostname, port=port, username=username, password=password)
# Get a list of all the files in the remote 2015 archive directory.
stdin, stdout, stderr = client.exec_command("cd 2015; ls")
filelist = [name.decode("utf-8") for name in stdout.read().splitlines()]
sftp = client.open_sftp()
remote_images_path = '2015/'
local_path = '/Temp/'
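# Note: local_path must already exist; sftp.get() writes the local file
# directly and will not create missing directories.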
# Temporarily download one file at a time (deleting it afterwards), then load
# in the data.
for file in filelist:
    file_remote = remote_images_path + file
    file_local = local_path + file
    print(file_remote + ' >>> ' + file_local)
    sftp.get(file_remote, file_local)
    with gzip.open(file_local, 'rb') as gfile:
        data = gfile.read().decode("utf-8")
    # Each line of the archive is one JSON event; skip any malformed rows.
    jrows = []
    for row in data.split('\n'):
        try:
            jrows.append(json.loads(row))
        except json.JSONDecodeError:
            continue
# "measures of success"
jsonblobs = list(filter(lambda x: re.search('Fork', x['type']) or re.search('Watch', x['type']) or re.search('Issue', x['type']), jrows))
# Get events from all actions in this list
# ['PushEvent','DeleteEvent','CreateEvent','PullRequestEvent','IssuesEvent','ForkEvent',
# 'IssueCommentEvent','WatchEvent','CommitCommentEvent','PullRequestReviewCommentEvent']
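    # For reference, an archive event row is assumed to look roughly like
    # {"type": "WatchEvent", "repo": {"id": 12345, "name": "owner/repo"}, ...};
    # only the "type" and "repo"/"id" fields are used in this script.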
    # Keep only the events whose repo id is in the wanted set.
    wanted_json = [x for x in jsonblobs if x['repo']['id'] in wanted_repo_ids]
    os.remove(file_local)
    # One pickle per file, so a different script can load the pickles one at
    # a time without running out of memory (see the loader sketch below).
    if not os.path.exists('wanted_measures'):
        os.makedirs('wanted_measures')
    with open('wanted_measures/wanted_measures' + file + '.data', 'wb') as filehandle:
        pickle.dump(wanted_json, filehandle)
sftp.close()
client.close()
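
The script's comments note that the per-file pickles are loaded back in a
different script. A minimal sketch of such a loader, assuming the naming
scheme above (the glob pattern and the simple list aggregation are
illustrative, not part of the original code):

import glob
import pickle

# Collect the filtered events from every per-file pickle written above.
all_events = []
for path in glob.glob('wanted_measures/wanted_measures*.data'):
    with open(path, 'rb') as filehandle:
        # Each pickle holds the list of wanted events from one archive file.
        all_events.extend(pickle.load(filehandle))

print(len(all_events), 'events loaded')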