-
Notifications
You must be signed in to change notification settings - Fork 2
/
collect_cpc_titles
executable file
·47 lines (39 loc) · 1.55 KB
/
collect_cpc_titles
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env bash
# SPDX-FileCopyrightText: 2022 Robin Vobruba <[email protected]>
#
# SPDX-License-Identifier: CC0-1.0

# Downloads the "CPC Title List" archive (unless a non-empty copy is already
# present in the current directory), extracts it, and writes a two-column CSV
# ("<code>","<title>") to OUT_FILE.
#
# Only lines whose code is a single upper-case letter, optionally followed by
# two digits (e.g. "A" or "A01"), are kept.
#
# Usage: run from the directory that should contain the 'output/' folder.
# Requires: curl, unzip, and GNU sed/grep ('sed -i', '\r', '\?', '\+').

# Exit immediately on each error and unset variable;
# see: https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
set -Eeuo pipefail
#set -Eeu

# For human readability, we want to display the class and sub-class titles.
# We get them from the "CPC Title List" download found on this site:
# https://www.cooperativepatentclassification.org/cpcSchemeAndDefinitions/bulk
# Which at the time of our last download was:
readonly ARCH_URL="https://www.cooperativepatentclassification.org/sites/default/files/cpc/bulk/CPCTitleList202205.zip"
readonly EX_DIR="output/CPCTitleListLatest"
ARCH_FILE="$(basename "$ARCH_URL")"
readonly ARCH_FILE
readonly OUT_FILE="output/cpc-titles.csv"

# Path of the in-progress download; empty when there is nothing to clean up.
tmp_file=""

# Removes a partially downloaded archive, if one is left over.
cleanup() {
  if [[ -n "$tmp_file" ]]; then
    rm -f -- "$tmp_file"
  fi
}
trap cleanup EXIT

mkdir -p "$EX_DIR"
mkdir -p "$(dirname "$OUT_FILE")"

# Download the archive
# '-s' (instead of '-e') also re-downloads if an earlier run left an empty file.
if ! [[ -s "$ARCH_FILE" ]]; then
  echo "Downloading archive ..."
  # Download to a temporary file next to the target and rename it only on
  # success, so an interrupted or failed transfer is never mistaken
  # for a valid cached archive on the next run.
  tmp_file="$(mktemp "${ARCH_FILE}.XXXXXX")"
  # --fail: treat HTTP errors (4xx/5xx) as failures instead of saving
  #         the error page as the archive.
  # --location: follow redirects.
  # --show-error: still print the reason on failure, despite --silent.
  curl --fail --location --silent --show-error \
    --output "$tmp_file" "$ARCH_URL"
  mv -- "$tmp_file" "$ARCH_FILE"
  tmp_file=""
fi

# Extract the archive
echo "Extracting archive ..."
# ':?' aborts instead of expanding to '/*' should EX_DIR ever be empty.
rm -Rf "${EX_DIR:?}/"*
unzip -q -d "$EX_DIR" "$ARCH_FILE"

echo "Converting data ..."
# Convert Windows to UNIX EOL (end-of-line) characters
sed -i 's/\r$//' "$EX_DIR"/cpc-section-*.txt

# Filter out only main and sub titles,
# and convert to CSV format.
# The first sed expression doubles any double-quote in the line,
# so that titles containing quotes still yield well-formed CSV fields;
# the second wraps the code and the title in quotes.
grep -r --no-filename '^[A-Z]\([0-9][0-9]\)\?[[:space:]]' "$EX_DIR/" \
  | sed \
    -e 's|"|""|g' \
    -e 's|^\([A-Z]\([0-9][0-9]\)\?\)[[:space:]]\+\(.*\)$|"\1","\3"|' \
  > "$OUT_FILE"

echo "CPC Titles successfully extracted to '$OUT_FILE'."
echo "done."