-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcldfbench_ccmc.py
171 lines (159 loc) · 7.54 KB
/
cldfbench_ccmc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import pathlib
import zipfile
import itertools
from pydplace.dataset import DatasetWithSocieties, data_schema
from clldutils.path import git_describe
TYPES = [ # Label, Inclusion criteria, Exclusion criteria
('DANCE',
'Sung with the goal of a person or persons dancing along to it.',
'Songs that happen to be accompanied by dancing but are used for other goals.',
),
('HEALING',
'Sung in a healing ceremony with the goal of curing sickness.',
'Songs describing sick people or a past epidemic.',
),
('LOVE',
'Sung to express love directly to another person or to describe currently felt love.',
'Songs about unrequited love, deceased loved ones, or love for animals or property.',
),
('LULLABY',
'Sung to an infant or child with the goal of soothing, calming, or putting to sleep.',
'Songs designed to excite the listener (e.g., "play songs"); singing games.',
),
('PLAY',
'A song that excites a child or infant and engages them in play. '
'This can include singing games.',
"Children's songs for soothing, calming, or putting to sleep.",
),
('PROCESSION',
'Sung to accompany a formalized march, entrance, or parade, such as during a wedding, '
'funeral or the introduction of a leader.',
'Processions of dancing.',
),
('MOURNING',
'Sung to express grief or sadness about the death of a person, in the present or past.',
'Songs for sick or dying people (these are more likely to be healing songs), or laments '
'about events other than the death of a person.',
),
('WORK',
'Sung to accompany work activities, including planting, grinding, harvesting, processing, '
'tool-making.',
'Hunting songs (esp. songs sung to celebrate successful hunts, songs sung to prepare for hunts).',
),
('STORY',
'Sung to recount historical or mythological events, narrate a sequence of activities '
'by one or more persons, etc. (e.g., a ballad).',
'Lullabies that include stories.',
),
('PRAISE',
'Sung to express admiration for the traits or accomplishments of a person, animal, '
'location, or item of property.',
'A song expressing love for another person or explicitly religious songs (like devotionals).',
),
]
class Dataset(DatasetWithSocieties):
dir = pathlib.Path(__file__).parent
id = "dplace-dataset-ccmc"
def cmd_download(self, args):
self.raw_dir.download(
'https://zenodo.org/records/8378337/files/NHS2-metadata.csv?download=1',
'NHS2-metadata.csv')
self.raw_dir.download(
'https://zenodo.org/records/8378337/files/NHS2-songs.zip?download=1',
'NHS2-songs.zip')
def cmd_makecldf(self, args):
assert git_describe(self.raw_dir / 'glottolog-cldf') == args.glottolog_version
data_schema(args.writer.cldf) # Add schema for data.
self.schema(args.writer.cldf) # Add schema for societies.
# Add custom schema:
args.writer.cldf.add_columns(
'LanguageTable',
{
'name': 'HRAF_region',
'dc:description':
'indicates an approximate geographical location where the song was recorded, '
'using Human Relations Area Files categories '
'(see https://ehrafworldcultures.yale.edu)',
}
)
args.writer.cldf['CodeTable'].common_props['dc:description'] = \
("The codes for the single parameter 'CCMC1' are the 10 categories, describing song "
"type.")
t = args.writer.cldf.add_component('MediaTable')
t.common_props['dc:description'] = \
"10-second excerpts of the source audio, selected at random from only portions of " \
"the recording that contain an audible singer."
args.writer.cldf.add_columns(
'ValueTable',
{
'name': 'Song_ID',
'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#mediaReference'},
)
args.writer.cldf['LanguageTable'].common_props['dc:description'] = '{}\n{}'.format(
args.writer.cldf['LanguageTable'].common_props['dc:description'],
"Note that a society's name and location in this dataset is taken from the "
'corresponding language or dialect in Glottolog.'
)
glangs = {
l['ID']: l for l in
self.raw_dir.joinpath('glottolog-cldf', 'cldf').read_csv('languages.csv', dicts=True)}
etclangs = {r['glottocode']: r for r in self.etc_dir.read_csv('languages.csv', dicts=True)}
args.writer.objects['ParameterTable'].append(dict(
ID=self.with_prefix(1),
Name='NHS2 song',
Description="A Value for this parameter represents a song in the NHS discography, "
"coded for type using one of 10 categories.",
))
seen_types = set()
known_types = {label.capitalize(): (incl, excl) for label, incl, excl in TYPES}
song_dir = self.cldf_dir / 'songs'
if not song_dir.exists():
song_dir.mkdir()
with zipfile.ZipFile(self.raw_dir / 'NHS2-songs.zip') as songs:
# song, type, region, glottocode
for glottocode, rows_ in itertools.groupby(
sorted(
self.raw_dir.read_csv('NHS2-metadata.csv', dicts=True),
key=lambda r: (r['glottocode'], r['type'], r['song'])),
lambda r: r['glottocode'],
):
rows_ = list(rows_)
glottocode = glottocode.strip()
glang = glangs[glottocode]
loc = etclangs.get(glottocode, glang)
self.add_society(
args.writer,
ID=self.with_prefix(glottocode),
Name=glang['Name'],
Glottocode=glottocode,
Latitude=loc['Latitude'],
Longitude=loc['Longitude'],
HRAF_region=rows_[0]['region'],
)
for type, rows in itertools.groupby(rows_, lambda r: r['type']):
if type not in seen_types:
args.writer.objects['CodeTable'].append(dict(
ID=self.with_prefix('1-{}'.format(type)),
Var_ID=self.with_prefix(1),
Name=type,
Description='Inclusion criteria: {}\nExclusion criteria: {}'.format(
*known_types[type])
))
seen_types.add(type)
for row in rows:
args.writer.objects['ValueTable'].append(dict(
ID=row['song'],
Var_ID=self.with_prefix(1),
Soc_ID=self.with_prefix(glottocode),
Code_ID=self.with_prefix('1-{}'.format(type)),
Value=type,
Song_ID=row['song'],
))
args.writer.objects['MediaTable'].append(dict(
ID=row['song'],
Name=row['song'],
Media_Type='audio/mpeg',
Download_URL='songs/{}.mp3'.format(row['song']),
))
song_dir.joinpath('{}.mp3'.format(row['song'])).write_bytes(
songs.read('corpus_processed/{}.mp3'.format(row['song'])))