-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature : add LOV Migration Script #24
Changes from 14 commits
7358191
d379d06
dd23c7f
255971c
9cefc65
30ea364
43de6b5
c12cf02
276389c
585d8a2
fc5114c
2a4f654
3aaeb97
c89db6c
1108bfd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,328 @@ | ||
#!/usr/bin/env ruby

# Exit cleanly from an early interrupt
Signal.trap("INT") { exit 1 }

require 'optparse'

# Command-line options collected by the parser below.
# options[:vocabs] ends up as either [:all] (from -a/--all) or the list of
# vocabulary prefixes given to -v/--vocabularies.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: lov_migrator [options]"
  opts.on('-a', '--all') do
    options[:vocabs] = [:all]
  end
  opts.on('-v', '--vocabularies PREFIX1, PREFIX2', 'Comma-separated list of vocabularies to update or import') do |acronym|
    # Store the selection in `options`; assigning to a block-local variable
    # would silently discard it and make -v indistinguishable from "no option".
    options[:vocabs] = acronym.split(',').map(&:strip).reject(&:empty?)
  end

  # Display the help screen, all programs are assumed to have this option.
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end.parse!

# Refuse to run without an explicit selection (nothing given, or an empty -v list).
if options[:vocabs].nil? || options[:vocabs].empty?
  raise OptionParser::MissingArgument, 'either -a/--all or -v/--vocabularies PREFIX1,PREFIX2 is required'
end
|
||
require 'bundler/setup'
require 'rdf'
require 'rdf/n3'
require 'csv'
require 'open-uri'
require 'digest'
require 'benchmark'
require 'parallel'
require 'set'
require 'zlib'
|
||
# Vocabulary attributes exported to the CSV; together with CSV_ADDED_ATTRS
# they form the header row and the column order.
CSV_MAIN_ATTRS = %i[prefix title description keyword creator uri lastModifiedInLOVAt].freeze
# Extra columns whose values are copied from the dispatch CSV when it exists.
CSV_ADDED_ATTRS = %i[destination who comment].freeze
# Dispatch CSV that is looked up by prefix to fill CSV_ADDED_ATTRS.
CSV_DISPATCH_FILENAME = 'LOV_vocabularies_dispatch.csv'.freeze
# CSV file written by the script.
CSV_FILENAME = "vocabs.csv".freeze
|
||
module LOVMigrator | ||
|
||
|
||
module Models | ||
# Base class with helpers that turn RDF terms into plain Ruby values.
class Model
  # Converts a single RDF term into a plain value.
  #
  # @param node [Object] an RDF term (URI, literal, or anything else)
  # @return [Object] for a literal with a language tag, a hash
  #   `{ value:, language: }`; for any other literal, its `object`;
  #   for URIs and every other kind of node, `node.to_s`
  def extract_uri_or_value(node)
    case node
    when RDF::Literal
      node.has_language? ? { value: node.object, language: node.language.to_s } : node.object
    else
      # URIs and any other term are rendered through to_s.
      node.to_s
    end
  end

  # Reads the value(s) stored under a predicate in a property hash.
  #
  # @param vocab [Hash] properties of one subject, keyed by RDF::URI predicate
  # @param predicate [String, RDF::URI] predicate to look up
  # @param list [Boolean] when true return every value, otherwise only the first
  # @return [Array, Object, nil] converted values (see #extract_uri_or_value);
  #   nil when `list` is false and nothing is stored
  def get_value(vocab, predicate, list: false)
    key = RDF::URI.new(predicate)
    converted = Array(vocab[key]).map { |term| extract_uri_or_value(term) }
    return converted if list

    converted.first
  end
end
# Builds a plain-Ruby description of one vocabulary from its RDF properties.
class Vocabulary < Model
  REVIEW_PREDICATE = "http://purl.org/stuff/rev#hasReview"

  # Extracts the vocabulary attributes from a property hash.
  #
  # @param graph [Hash] subject term => property hash, as iterated by the
  #   caller (`graph.each do |vocab_uri, vocab_data|`)
  # @param vocab_uri [Object] subject of the vocabulary, stored as-is under :uri
  # @param vocab [Hash, nil] properties of the vocabulary keyed by predicate
  # @return [Hash, nil] the extracted attributes, or nil when `vocab` is nil
  def extract_vocabulary_info(graph, vocab_uri, vocab)
    return unless vocab

    info = {
      uri: vocab_uri,
      namespace: get_value(vocab, "http://purl.org/vocab/vann/preferredNamespaceUri"),
      prefix: get_value(vocab, "http://purl.org/vocab/vann/preferredNamespacePrefix"),
      title: get_value(vocab, "http://purl.org/dc/terms/title", list: true),
      description: get_value(vocab, "http://purl.org/dc/terms/description", list: true),
      keyword: get_value(vocab, "http://www.w3.org/ns/dcat#keyword", list: true),
      issued: get_value(vocab, "http://purl.org/dc/terms/issued"),
      modified: get_value(vocab, "http://purl.org/dc/terms/modified"),

      isDefinedBy: get_value(vocab, "http://www.w3.org/2000/01/rdf-schema#isDefinedBy"),
      homepage: get_value(vocab, "http://xmlns.com/foaf/0.1/homepage"),
      creator: get_value(vocab, "http://purl.org/dc/terms/creator", list: true),
      contributor: get_value(vocab, "http://purl.org/dc/terms/contributor", list: true),
      publisher: get_value(vocab, "http://purl.org/dc/terms/publisher"),
      language: get_value(vocab, "http://purl.org/dc/terms/language", list: true),
      type: get_value(vocab, RDF.type),
      occurrences: get_value(vocab, "http://purl.org/vocommons/voaf#occurrencesInDatasets"),
      reused_by_datasets: get_value(vocab, "http://purl.org/vocommons/voaf#reusedByDatasets"),
      reused_by_vocabs: get_value(vocab, "http://purl.org/vocommons/voaf#reusedByVocabularies"),
      distribution: get_value(vocab, "http://www.w3.org/ns/dcat#distribution"),
    }

    review = review_properties(graph, vocab)
    if review
      info[:review] = {
        creator: get_value(review, "http://purl.org/dc/terms/creator"),
        date: get_value(review, "http://purl.org/dc/terms/date"),
        text: get_value(review, "http://purl.org/stuff/rev#text")
      }
    end

    # Same dcterms:modified value as :modified, exposed under the CSV column name.
    info[:lastModifiedInLOVAt] = info[:modified]

    info
  end

  private

  # Returns the property hash of the vocabulary's first review, or nil.
  #
  # The graph is looked up with the raw RDF term: get_value returns the
  # stringified URI, which can never equal a hash key that is an RDF term.
  # NOTE(review): the review is only found if its subject is present in
  # `graph` - confirm that the parser keeps review subjects.
  def review_properties(graph, vocab)
    review_term = Array(vocab[RDF::URI.new(REVIEW_PREDICATE)]).first
    review_term && graph[review_term]
  end
end
|
||
# Extracts the attributes of an agent from a queryable RDF graph.
#
# Inherits from Model because #extract_agent_info relies on
# Model#extract_uri_or_value.
class Agent < Model
  # TODO(review): update to work with the graph as a hash.
  #
  # @param graph [#query] graph answering `query([subject, predicate, nil])`
  #   with statements that respond to `object`
  # @param agent_uri [Object] subject identifying the agent
  # @return [Hash] `{ name:, sameAs: }` - the object of the first foaf:name
  #   statement (nil if none) and the converted owl:sameAs objects
  def extract_agent_info(graph, agent_uri)
    {
      name: graph.query([agent_uri, RDF::URI("http://xmlns.com/foaf/0.1/name"), nil]).first&.object,
      sameAs: graph.query([agent_uri, RDF::URI("http://www.w3.org/2002/07/owl#sameAs"), nil]).map { |statement| extract_uri_or_value(statement.object) },
    }
  end
end
|
||
# Extracts the attributes of a vocabulary distribution from a queryable RDF graph.
#
# Inherits from Model because the extraction relies on
# Model#extract_uri_or_value.
class Distribution < Model
  # TODO(review): update to work with the graph as a hash.
  #
  # @param graph [#query] graph answering `query([subject, predicate, nil])`
  #   with statements that respond to `object`
  # @param distribution_uri [Object] subject identifying the distribution
  # @return [Hash] the distribution attributes; :issued is the raw object of
  #   the first matching statement (nil if none), every other attribute except
  #   :uri is an array of converted objects
  def extract_distribution_info(graph, distribution_uri)
    {
      uri: distribution_uri,
      issued: graph.query([distribution_uri, RDF::URI("http://purl.org/dc/terms/issued"), nil]).first&.object,
      language: values_for(graph, distribution_uri, "http://purl.org/dc/terms/language"),
      title: values_for(graph, distribution_uri, "http://purl.org/dc/terms/title"),
      # Relations
      extends: values_for(graph, distribution_uri, "http://purl.org/vocommons/voaf#extends"),
      specializes: values_for(graph, distribution_uri, "http://purl.org/vocommons/voaf#specializes"),
      generalizes: values_for(graph, distribution_uri, "http://purl.org/vocommons/voaf#generalizes"),
      hasEquivalencesWith: values_for(graph, distribution_uri, "http://purl.org/vocommons/voaf#hasEquivalencesWith"),
      hasDisjunctionsWith: values_for(graph, distribution_uri, "http://purl.org/vocommons/voaf#hasDisjunctionsWith"),
      metadataVoc: values_for(graph, distribution_uri, "http://purl.org/vocommons/voaf#metadataVoc"),
      imports: values_for(graph, distribution_uri, "http://www.w3.org/2002/07/owl#imports")
    }
  end

  private

  # Returns the converted objects of every statement matching subject + predicate.
  def values_for(graph, subject, predicate)
    graph.query([subject, RDF::URI(predicate), nil]).map { |statement| extract_uri_or_value(statement.object) }
  end
end
end | ||
|
||
# Downloads the LOV dump, keeps the local copy up to date and parses it.
class DumpParser
  VOCABULARY_TYPE_URI = "http://purl.org/vocommons/voaf#Vocabulary"

  # Downloads `url` into `destination`, streaming instead of loading the
  # whole body into memory.
  #
  # @param url [String] location to fetch
  # @param destination [String] path of the file to write
  def download_file(url, destination)
    URI.open(url) do |remote|
      File.open(destination, 'wb') do |output|
        IO.copy_stream(remote, output)
      end
    end
    puts "File downloaded successfully as #{destination}"
  end

  # @param file_path [String]
  # @return [String] hex SHA-256 digest of the file content
  def file_hash(file_path)
    Digest::SHA256.file(file_path).hexdigest
  end

  # @return [Boolean] true when both files exist and have the same SHA-256 digest
  def files_identical?(file1, file2)
    File.exist?(file1) && File.exist?(file2) && file_hash(file1) == file_hash(file2)
  end

  # Updates the local file if the downloaded file is different.
  #
  # @param url [String] location of the gzip dump
  # @param local_file_path [String] path of the local gzip copy
  # @param tmp_path [String] where the download is stored before comparison
  # @param extracted_path [String] where the decompressed dump is written
  # @return [Boolean] true when the local copy was replaced, false when the
  #   download was identical to it
  def update_local_file(url, local_file_path, tmp_path: 'lov.n3.gz.tmp', extracted_path: 'lov.n3')
    download_file(url, tmp_path)

    if files_identical?(tmp_path, local_file_path)
      File.delete(tmp_path) # Clean up the temporary file
      # The archive is unchanged, but the decompressed dump may have been
      # removed since the last run; recreate it so parsing does not fail.
      decompress(local_file_path, extracted_path) unless File.exist?(extracted_path)
      return false
    end

    File.rename(tmp_path, local_file_path) # Replace the old file with the new one
    decompress(local_file_path, extracted_path)
    true
  end

  # Reads the dump and returns the properties of every voaf:Vocabulary subject.
  #
  # Triples are grouped by subject for the whole file and filtered afterwards,
  # so properties that appear before the subject's rdf:type statement are kept.
  # This holds every subject's triples in memory until the pass is finished.
  #
  # NOTE(review): this uses RDF::Turtle::Reader while the file requires
  # 'rdf/n3' - confirm the Turtle reader is actually loaded.
  #
  # @param file_path [String] path of the decompressed dump
  # @return [Hash] subject => { predicate => object or Array of objects },
  #   restricted to subjects typed voaf:Vocabulary
  def parse_n3_file(file_path)
    vocabulary_type = RDF::URI(VOCABULARY_TYPE_URI)

    properties_by_subject = {}
    vocabulary_subjects = Set.new

    RDF::Turtle::Reader.open(file_path) do |reader|
      reader.each_statement do |statement|
        subject = statement.subject

        if statement.predicate == RDF.type && statement.object == vocabulary_type
          vocabulary_subjects.add(subject)
        end

        properties = (properties_by_subject[subject] ||= {})
        previous = properties[statement.predicate]
        # A single value is stored bare; further values turn it into an array.
        properties[statement.predicate] =
          previous.nil? ? statement.object : Array(previous) + [statement.object]
      end
    end

    # Keep only vocabulary subjects, in the order they were identified.
    vocabulary_subjects.each_with_object({}) do |subject, vocabularies|
      vocabularies[subject] = properties_by_subject[subject]
    end
  end

  private

  # Decompresses the gzip file at `gzip_path` into `destination`, streaming.
  def decompress(gzip_path, destination)
    Zlib::GzipReader.open(gzip_path) do |gzip|
      File.open(destination, 'wb') do |output|
        IO.copy_stream(gzip, output)
      end
    end
  end
end
|
||
# Writes vocabulary rows to the output CSV.
class CSVGenerator
  # Creates (or truncates) `filename` and writes the header row.
  def initialize_csv(filename)
    header = CSV_MAIN_ATTRS + CSV_ADDED_ATTRS
    CSV.open(filename, "w", force_quotes: true) { |csv| csv << header }
  end

  # Copies the CSV_ADDED_ATTRS values of the dispatch CSV row that has the
  # same prefix into `vocab` (nil for each attribute when no row matches).
  #
  # @param vocab [Hash] vocabulary attributes; modified in place
  # @return [Hash] the same `vocab`
  def copy_added_values_to_csv(vocab)
    dispatch_row = find_csv_row_by_prefix(CSV_DISPATCH_FILENAME, vocab[:prefix]).to_h
    CSV_ADDED_ATTRS.each { |attr| vocab[attr] = dispatch_row[attr] }
    vocab
  end

  # Appends one formatted row for `vocab` to `filename`, first merging the
  # dispatch values when the dispatch CSV exists.
  def append_to_csv(vocab, filename)
    vocab = copy_added_values_to_csv(vocab) if File.exist?(CSV_DISPATCH_FILENAME)
    columns = CSV_MAIN_ATTRS + CSV_ADDED_ATTRS
    CSV.open(filename, "a", force_quotes: true) do |csv|
      csv << columns.map { |attr| format_value(vocab[attr]) }
    end
  end

  # @return [CSV::Row, nil] first row of `file_path` whose :prefix column
  #   equals `prefix_value`, or nil when there is none
  def find_csv_row_by_prefix(file_path, prefix_value)
    CSV.foreach(file_path, headers: true, header_converters: :symbol)
       .find { |row| row[:prefix] == prefix_value }
  end

  # Renders a value as CSV cell text.
  #
  # Arrays are formatted element by element and joined with newlines; hashes
  # with a :language entry become `"value"@language`; other hashes use to_s;
  # anything whose to_s is empty becomes "None".
  def format_value(value)
    return value.map { |item| format_value(item) }.join("\n") if value.is_a?(Array)

    if value.is_a?(Hash)
      return value[:language] ? "\"#{value[:value]}\"@#{value[:language]}" : value.to_s
    end

    text = value.to_s
    text.empty? ? "None" : text
  end
end
end | ||
|
||
# Prints every attribute of a vocabulary as "Key: value" lines, preceded by a
# separator line and followed by a blank line.
#
# NOTE(review): to be removed; not needed in the final version.
#
# Values are rendered with LOVMigrator::CSVGenerator#format_value, because no
# top-level `format_value` method exists in this script.
#
# @param info [Hash] attribute name => value
def print_vocabulary_info(info)
  formatter = LOVMigrator::CSVGenerator.new
  puts "-------------------------------------------------------------------"
  info.each do |key, value|
    # "reused_by_vocabs" => "Reused By Vocabs"
    formatted_key = key.to_s.split('_').map(&:capitalize).join(' ')
    puts "#{formatted_key}: #{formatter.format_value(value)}"
  end
  puts "\n"
end
|
||
# Runs the given block between a "starting" and a "finished" message, the
# latter reporting the elapsed time measured by Benchmark.realtime.
#
# @param text [String] label used in both messages
# @yield the work to time
def logger(text, &block)
  puts ">> #{text} starting..."
  elapsed = Benchmark.realtime { block.call }
  puts "#{text} finished in #{elapsed} seconds"
end
|
||
# Start of the script | ||
# Start of the script.
#
# Refreshes the local LOV dump, recreates the output CSV with its header,
# parses the dump and appends one CSV row per vocabulary found.
def main
  dump_parser = LOVMigrator::DumpParser.new
  updated = false
  logger("Download dump file from LOV") do
    updated = dump_parser.update_local_file("https://lov.linkeddata.es/lov.n3.gz", "lov.n3.gz")
  end
  puts(updated ? "The local file was updated." : "The local file remains unchanged.")

  csv_generator = LOVMigrator::CSVGenerator.new
  logger("Initialize CSV #{CSV_FILENAME}") { csv_generator.initialize_csv(CSV_FILENAME) }

  graph = []
  logger('Parsing the n3 in memory') { graph = dump_parser.parse_n3_file('lov.n3') }
  puts "Found #{graph.size} vocabularies"

  if graph.empty?
    puts "No vocabularies found in the file."
    return
  end

  logger("Start creating CSV #{CSV_FILENAME}") do
    graph.each do |vocab_uri, vocab_data|
      extractor = LOVMigrator::Models::Vocabulary.new
      row = extractor.extract_vocabulary_info(graph, vocab_uri, vocab_data)
      csv_generator.append_to_csv(row, CSV_FILENAME)
    end
  end
end

main
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This needs to be an argument of the script.