This repository was archived by the owner on Nov 11, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathextract_videos.py
160 lines (127 loc) · 4.76 KB
/
extract_videos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Copyright (C) 2015, 2016 Sheila Miguez, Will Kahn-Greene
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Extracts videos using the pyvideo API and writes them as JSON files,
grouped by category, under a ``data`` directory in the current working
directory.

Usage::

    python bin/extract_videos.py

"""
import json
import os
import sys
from collections import OrderedDict
try:
from steve import restapi
from steve import richardapi
except ImportError:
print('Requires: steve')
print('Run: pip install steve')
raise
# Base URL handed to every richardapi call in this script.
API_URL = 'http://pyvideo.org/api/v2/'
def get_video_id(richard_url):
    """Return the integer video id embedded in a richard video URL.

    The id is the path segment that immediately follows ``/video/``.

    Raises ``IndexError`` when the URL has no ``/video/`` part and
    ``ValueError`` when that segment is not an integer.
    """
    after_marker = richard_url.split('/video/')[1]
    id_text = after_marker.split('/')[0]
    return int(id_text)
def reorder_dict(data):
    """Return an ``OrderedDict`` holding a fixed subset of ``data``.

    Only the keys listed below are copied, in exactly this order; any
    other key in ``data`` is left out. A ``KeyError`` is raised if one of
    the listed keys is missing from ``data``.
    """
    ordered_keys = (
        'id', 'category', 'slug', 'title', 'summary', 'description',
        'quality_notes', 'language', 'copyright_text', 'thumbnail_url',
        'duration', 'videos', 'source_url', 'tags', 'speakers',
        'recorded',
    )
    return OrderedDict((name, data[name]) for name in ordered_keys)
def _ensure_dir(path):
    """Create the directory ``path`` unless it already exists.

    Only the "already exists" error is ignored. Any other ``OSError``
    (permissions, missing parent, ...) is re-raised instead of being
    hidden until a later file write fails.
    """
    import errno

    try:
        os.mkdir(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise


def _extract_video_files(video):
    """Collapse the flat ``video_<fmt>_*`` keys of ``video`` into a list.

    For each known format whose ``video_<fmt>_url`` is set, one entry with
    ``length``, ``url`` and ``type`` is produced and every key starting
    with ``video_<fmt>`` is removed from ``video`` (mutated in place).
    When ``source_url`` contains ``'youtu'`` an extra entry of type
    ``'youtube'`` is appended.
    """
    files = []
    for fmt in ('mp4', 'webm', 'flv', 'ogv'):
        prefix = 'video_%s' % fmt
        url = video.get(prefix + '_url')
        if not url:
            continue

        files.append({
            'length': video.get(prefix + '_length', 0),
            'url': url,
            'type': fmt
        })

        # Snapshot the matching keys first: the dict is modified while
        # they are being removed.
        for key in [key for key in video if key.startswith(prefix)]:
            del video[key]

    source_url = video['source_url']
    if source_url and 'youtu' in source_url:
        files.append({
            'length': 0,
            'url': source_url,
            'type': 'youtube'
        })
    return files


def main(args):
    """Fetch all categories and their videos and write them as JSON.

    The output goes under ``data/`` in the current working directory:
    ``data/<category slug>/category.json`` for each category and
    ``data/<category slug>/videos/<video slug>.json`` for each video.
    Videos that cannot be fetched (4xx response) or whose ``state`` is 2
    (treated as "draft" here) are skipped.

    :arg args: command line arguments; currently unused.

    :returns: None
    """
    categories = richardapi.get_all_categories(API_URL)

    data_path = os.path.join(os.getcwd(), 'data')
    _ensure_dir(data_path)

    for cat in categories:
        print('Working on %s...' % cat['title'])

        # make the category directory
        path = os.path.join(data_path, cat['slug'])
        _ensure_dir(path)

        # Build the category data before opening the file so a missing
        # key does not leave a truncated category.json behind.
        cat_data = OrderedDict((
            ('title', cat['title']),
            ('description', cat['description']),
            ('url', cat['url']),
            ('slug', cat['slug']),
            ('start_date', cat['start_date']),
        ))
        with open(os.path.join(path, 'category.json'), 'w') as fp:
            json.dump(cat_data, fp, sort_keys=False, indent=2)

        videos_path = os.path.join(path, 'videos')
        _ensure_dir(videos_path)

        # pull down the video data
        for video_url in cat['videos']:
            print(' %s' % video_url)
            video_id = get_video_id(video_url)

            try:
                video = richardapi.get_video(
                    api_url=API_URL,
                    auth_token=None,
                    video_id=video_id
                )
            except restapi.Http4xxException:
                # Any 4xx here is treated as "video is in draft": skip it.
                print(' 404')
                continue

            # if this video is a "draft", then skip it
            if video['state'] == 2:
                print(' ... skipping: draft')
                continue

            # Drop fields that are not wanted in the exported data.
            for unwanted in ('embed', 'added', 'updated', 'state'):
                del video[unwanted]

            # if the language is None, then set it to English which is
            # probably right.
            if not video['language']:
                video['language'] = 'English'

            video['videos'] = _extract_video_files(video)

            video_fn = os.path.join(videos_path, video['slug']) + '.json'
            with open(video_fn, 'w') as fp:
                json.dump(reorder_dict(video), fp, sort_keys=False, indent=2)
# Script entry point: run main() with the command line arguments (minus
# the program name) and use its return value as the exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)