Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: add LOV Migration Script #24

Closed
Closed
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
328 changes: 328 additions & 0 deletions bin/lov_migrator
Original file line number Diff line number Diff line change
@@ -0,0 +1,328 @@
#!/usr/bin/env ruby

# Exit cleanly from an early interrupt
Signal.trap("INT") { exit 1 }

require 'optparse'

options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: lov_migrator [options]"

  # Process every vocabulary found in the LOV dump.
  opts.on('-a', '--all') do
    options[:vocabs] = [:all]
  end

  # FIX: the original handler stored the value in an unused local variable
  # (`ontologies_acronyms`), so -v/--vocabularies silently did nothing and the
  # MissingArgument guard below always fired unless -a was also given.
  opts.on('-v', '--vocabularies PREFIX1, PREFIX2', 'Comma-separated list of vocabularies to update or import') do |acronym|
    options[:vocabs] = acronym.split(',').map(&:strip)
  end

  # Display the help screen, all programs are assumed to have this option.
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end.parse!

raise OptionParser::MissingArgument if options[:vocabs].nil?

require 'bundler/setup'
require 'rdf'
require 'rdf/n3'
require 'csv'
require 'open-uri'
require 'digest'
require 'benchmark'
require 'parallel'
require 'set'  # FIX: used by DumpParser#parse_n3_file (not autoloaded before Ruby 3.2)
require 'zlib' # FIX: Zlib::GzipReader is used by DumpParser#update_local_file

# Columns copied straight from the parsed LOV dump.
CSV_MAIN_ATTRS = [ :prefix, :title, :description, :keyword, :creator, :uri, :lastModifiedInLOVAt ]
# Columns maintained by hand in the dispatch CSV and carried over on regeneration.
CSV_ADDED_ATTRS = [ :destination, :who, :comment ]
CSV_DISPATCH_FILENAME = 'LOV_vocabularies_dispatch.csv'

# TODO(review): this should be an argument of the script.
CSV_FILENAME = "vocabs.csv"

module LOVMigrator


module Models
# Base helper shared by the LOV model extractors: converts RDF terms into
# plain Ruby values and reads predicate values out of a subject hash.
class Model
  # Normalizes a single RDF term:
  # * RDF::URI                 -> its String form
  # * language-tagged literal  -> { value:, language: }
  # * plain literal            -> the underlying Ruby object
  # * anything else            -> String form
  def extract_uri_or_value(node)
    if node.is_a?(RDF::URI)
      node.to_s
    elsif node.is_a?(RDF::Literal)
      if node.has_language?
        { value: node.object, language: node.language.to_s }
      else
        node.object
      end
    else
      node.to_s
    end
  end

  # Reads the object(s) stored under +predicate+ (a String URI) in +vocab+,
  # a Hash keyed by RDF::URI. Returns an Array when list: true, otherwise
  # the first value (nil when absent).
  def get_value(vocab, predicate, list: false)
    raw = Array(vocab[RDF::URI.new(predicate)])
    extracted = raw.map { |term| extract_uri_or_value(term) }
    list ? extracted : extracted.first
  end
end
class Vocabulary < Model

  # Builds a flat info Hash for one vocabulary.
  #
  # graph     - Hash mapping RDF::URI subjects to { predicate => object(s) },
  #             as produced by DumpParser#parse_n3_file.
  # vocab_uri - RDF::URI of the vocabulary.
  # vocab     - the predicate hash for vocab_uri; nil returns nil.
  def extract_vocabulary_info(graph, vocab_uri, vocab)
    return unless vocab

    info = {
      uri: vocab_uri,
      namespace: get_value(vocab, "http://purl.org/vocab/vann/preferredNamespaceUri"),
      prefix: get_value(vocab, "http://purl.org/vocab/vann/preferredNamespacePrefix"),
      title: get_value(vocab, "http://purl.org/dc/terms/title", list: true),
      description: get_value(vocab, "http://purl.org/dc/terms/description", list: true),
      keyword: get_value(vocab, "http://www.w3.org/ns/dcat#keyword", list: true),
      issued: get_value(vocab, "http://purl.org/dc/terms/issued"),
      modified: get_value(vocab, "http://purl.org/dc/terms/modified"),

      isDefinedBy: get_value(vocab, "http://www.w3.org/2000/01/rdf-schema#isDefinedBy"),
      homepage: get_value(vocab, "http://xmlns.com/foaf/0.1/homepage"),
      creator: get_value(vocab, "http://purl.org/dc/terms/creator", list: true),
      contributor: get_value(vocab, "http://purl.org/dc/terms/contributor", list: true),
      publisher: get_value(vocab, "http://purl.org/dc/terms/publisher"),
      language: get_value(vocab, "http://purl.org/dc/terms/language", list: true),
      type: get_value(vocab, RDF.type),
      occurrences: get_value(vocab, "http://purl.org/vocommons/voaf#occurrencesInDatasets"),
      reused_by_datasets: get_value(vocab, "http://purl.org/vocommons/voaf#reusedByDatasets"),
      reused_by_vocabs: get_value(vocab, "http://purl.org/vocommons/voaf#reusedByVocabularies"),
      distribution: get_value(vocab, "http://www.w3.org/ns/dcat#distribution"),
    }

    # FIX: get_value converts the review node to a String, while +graph+ is
    # keyed by RDF::URI, so the original `graph.dig(review)` could never find
    # anything. Re-wrap the value before the lookup.
    # NOTE(review): parse_n3_file currently only collects voaf:Vocabulary
    # subjects, so review subjects may still be absent from the hash — confirm.
    review_ref = get_value(vocab, "http://purl.org/stuff/rev#hasReview")
    review = review_ref && graph[RDF::URI.new(review_ref)]
    if review
      info[:review] = {
        creator: get_value(review, "http://purl.org/dc/terms/creator"),
        date: get_value(review, "http://purl.org/dc/terms/date"),
        text: get_value(review, "http://purl.org/stuff/rev#text")
      }
    end

    # Duplicates :modified above; kept because the CSV column is named
    # lastModifiedInLOVAt (see CSV_MAIN_ATTRS).
    info[:lastModifiedInLOVAt] = get_value(vocab, "http://purl.org/dc/terms/modified")

    info
  end

end

# FIX: reworked to read from the Hash produced by DumpParser#parse_n3_file.
# The original called graph.query (an RDF::Graph API, unavailable on a Hash)
# and used extract_uri_or_value without inheriting it from Model.
class Agent < Model

  # Builds an info Hash for one agent (person/organisation).
  #
  # graph     - Hash mapping RDF::URI subjects to { predicate => object(s) }.
  # agent_uri - RDF::URI of the agent; returns nil when unknown to the graph.
  def extract_agent_info(graph, agent_uri)
    agent = graph[agent_uri]
    return unless agent

    {
      name: get_value(agent, "http://xmlns.com/foaf/0.1/name"),
      sameAs: get_value(agent, "http://www.w3.org/2002/07/owl#sameAs", list: true),
    }
  end

end

# FIX: reworked to read from the Hash produced by DumpParser#parse_n3_file.
# The original called graph.query (an RDF::Graph API, unavailable on a Hash)
# and used extract_uri_or_value without inheriting it from Model.
class Distribution < Model

  # Builds an info Hash for one distribution (a versioned vocabulary file).
  #
  # graph            - Hash mapping RDF::URI subjects to { predicate => object(s) }.
  # distribution_uri - RDF::URI of the distribution; returns nil when unknown.
  def extract_distribution_info(graph, distribution_uri)
    distribution = graph[distribution_uri]
    return unless distribution

    {
      uri: distribution_uri,
      issued: get_value(distribution, "http://purl.org/dc/terms/issued"),
      language: get_value(distribution, "http://purl.org/dc/terms/language", list: true),
      title: get_value(distribution, "http://purl.org/dc/terms/title", list: true),
      # Relations
      extends: get_value(distribution, "http://purl.org/vocommons/voaf#extends", list: true),
      specializes: get_value(distribution, "http://purl.org/vocommons/voaf#specializes", list: true),
      generalizes: get_value(distribution, "http://purl.org/vocommons/voaf#generalizes", list: true),
      hasEquivalencesWith: get_value(distribution, "http://purl.org/vocommons/voaf#hasEquivalencesWith", list: true),
      hasDisjunctionsWith: get_value(distribution, "http://purl.org/vocommons/voaf#hasDisjunctionsWith", list: true),
      metadataVoc: get_value(distribution, "http://purl.org/vocommons/voaf#metadataVoc", list: true),
      imports: get_value(distribution, "http://www.w3.org/2002/07/owl#imports", list: true)
    }
  end

end
end

# Downloads the LOV dump, keeps the local copy up to date, and parses the
# decompressed N3 file into a plain Ruby Hash keyed by vocabulary subject.
class DumpParser
  # Name of the decompressed dump written next to the downloaded .gz archive.
  DECOMPRESSED_FILENAME = 'lov.n3'

  # Streams +url+ into +destination+ (binary mode, so the .gz is not mangled).
  def download_file(url, destination)
    URI.open(url) do |file|
      File.open(destination, 'wb') do |output|
        output.write(file.read)
      end
    end
    puts "File downloaded successfully as #{destination}"
  end

  # SHA-256 hex digest of a file's contents.
  def file_hash(file_path)
    Digest::SHA256.file(file_path).hexdigest
  end

  # True when +file2+ exists and both files have identical contents.
  def files_identical?(file1, file2)
    File.exist?(file2) && file_hash(file1) == file_hash(file2)
  end

  # Re-downloads the dump and replaces +local_file_path+ only when its content
  # changed. Returns true when the local file was updated (and re-decompressed
  # to lov.n3), false when it was already up to date.
  def update_local_file(url, local_file_path)
    require 'zlib' # FIX: Zlib was used without ever being required

    downloaded_file_name = 'lov.n3.gz.tmp'
    download_file(url, downloaded_file_name)

    if files_identical?(downloaded_file_name, local_file_path)
      File.delete(downloaded_file_name) # Clean up the temporary file
      # FIX: the original only decompressed on change, so a fresh checkout
      # with an up-to-date .gz never produced lov.n3 and parsing failed.
      decompress(local_file_path) unless File.exist?(DECOMPRESSED_FILENAME)
      return false
    else
      File.rename(downloaded_file_name, local_file_path) # Replace the old file with the new one
      # decompress the downloaded gzip archive
      decompress(local_file_path)
      return true
    end
  end

  # Parses the N3/Turtle dump, keeping only voaf:Vocabulary subjects.
  # Returns { RDF::URI subject => { RDF::URI predicate => object or [objects] } }.
  #
  # NOTE(review): this relies on the dump listing the rdf:type triple before
  # (or at the point of) a vocabulary's other triples — triples seen before a
  # subject is identified as a vocabulary are dropped. Confirm the dump's
  # serialization order guarantees this.
  def parse_n3_file(file_path)
    require 'set' # FIX: Set is not autoloaded before Ruby 3.2

    # The RDF type we filter for
    vocabulary_type = RDF::URI("http://purl.org/vocommons/voaf#Vocabulary")

    vocabularies_hash = {}
    vocabulary_subjects = Set.new

    # Single streaming pass over the file.
    # NOTE(review): RDF::Turtle::Reader is used but only rdf/n3 is required at
    # the top of the script — confirm rdf/n3 pulls in the Turtle reader.
    RDF::Turtle::Reader.open(file_path) do |reader|
      reader.each_statement do |statement|
        subject = statement.subject

        # If the statement is of type `voaf:Vocabulary`, track the subject
        if statement.predicate == RDF.type && statement.object == vocabulary_type
          vocabulary_subjects.add(subject)
          vocabularies_hash[subject] ||= {}
        end

        # Collect all triples of subjects identified as vocabularies,
        # promoting repeated predicates to Arrays.
        if vocabulary_subjects.include?(subject)
          vocabularies_hash[subject] ||= {}
          old_value = vocabularies_hash[subject][statement.predicate]
          if old_value.nil?
            vocabularies_hash[subject][statement.predicate] = statement.object
          else
            vocabularies_hash[subject][statement.predicate] = Array(old_value) + [statement.object]
          end
        end
      end
    end
    vocabularies_hash
  end

  private

  # Decompresses +gzip_path+ into DECOMPRESSED_FILENAME.
  def decompress(gzip_path)
    Zlib::GzipReader.open(gzip_path) do |gzip|
      File.open(DECOMPRESSED_FILENAME, 'w') do |file|
        file.write(gzip.read)
      end
    end
  end
end

# Writes the vocabulary CSV, carrying over hand-maintained columns from the
# dispatch CSV when it is present.
class CSVGenerator

  # Creates (or truncates) +filename+ and writes the header row.
  def initialize_csv(filename)
    CSV.open(filename, "w", force_quotes: true) do |csv|
      csv << CSV_MAIN_ATTRS + CSV_ADDED_ATTRS
    end
  end

  # Copies the hand-maintained columns for this vocabulary's prefix from the
  # dispatch CSV into +vocab+ (nil when the prefix is not listed there).
  def copy_added_values_to_csv(vocab)
    dispatch_row = find_csv_row_by_prefix(CSV_DISPATCH_FILENAME, vocab[:prefix]).to_h
    CSV_ADDED_ATTRS.each { |attr| vocab[attr] = dispatch_row[attr] }
    vocab
  end

  # Appends one vocabulary as a formatted CSV row to +filename+.
  def append_to_csv(vocab, filename)
    vocab = copy_added_values_to_csv(vocab) if File.exist?(CSV_DISPATCH_FILENAME)
    columns = CSV_MAIN_ATTRS + CSV_ADDED_ATTRS
    CSV.open(filename, "a", force_quotes: true) do |csv|
      csv << columns.map { |attr| format_value(vocab[attr]) }
    end
  end

  # Returns the first row of +file_path+ whose :prefix column equals
  # +prefix_value+, or nil when none matches.
  def find_csv_row_by_prefix(file_path, prefix_value)
    CSV.foreach(file_path, headers: true, header_converters: :symbol) do |row|
      return row if row[:prefix] == prefix_value
    end
    nil
  end

  # Renders a value for a CSV cell: Arrays join on newlines, language-tagged
  # hashes become "value"@lang, and blanks become "None".
  def format_value(value)
    if value.is_a?(Array)
      value.map { |item| format_value(item) }.join("\n")
    elsif value.is_a?(Hash)
      value[:language] ? "\"#{value[:value]}\"@#{value[:language]}" : value.to_s
    else
      text = value.to_s
      text.empty? ? "None" : text
    end
  end

end
end

# Debug helper: pretty-prints one vocabulary info Hash to stdout.
# TODO(review): remove — not needed in the final version.
def print_vocabulary_info(info)
  # FIX: format_value is an instance method of LOVMigrator::CSVGenerator, not
  # a top-level method, so the original call raised NoMethodError.
  formatter = LOVMigrator::CSVGenerator.new
  puts "-------------------------------------------------------------------"
  info.each do |key, value|
    formatted_key = key.to_s.split('_').map(&:capitalize).join(' ')
    puts "#{formatted_key}: #{formatter.format_value(value)}"
  end
  puts "\n"
end

# Runs the given block, printing start/finish markers and the wall-clock
# duration in seconds around it.
def logger(text)
  puts ">> #{text} starting..."
  elapsed = Benchmark.realtime { yield }
  puts "#{text} finished in #{elapsed} seconds"
end

# Start of the script: refresh the dump from LOV, parse it, and regenerate
# the vocabulary CSV.
def main
  parser = LOVMigrator::DumpParser.new
  updated = false
  logger("Download dump file from LOV") do
    updated = parser.update_local_file("https://lov.linkeddata.es/lov.n3.gz", "lov.n3.gz")
  end
  if updated
    puts "The local file was updated."
  else
    puts "The local file remains unchanged."
  end

  csv_generator = LOVMigrator::CSVGenerator.new
  logger("Initialize CSV #{CSV_FILENAME}") do
    csv_generator.initialize_csv(CSV_FILENAME)
  end

  # FIX: parse_n3_file returns a Hash keyed by subject, so the fallback is a
  # Hash too (the original initialized it as an Array).
  graph = {}
  logger('Parsing the n3 in memory') do
    graph = parser.parse_n3_file('lov.n3')
  end
  puts "Found #{graph.size} vocabularies"

  if graph.empty?
    puts "No vocabularies found in the file."
  else
    # Hoisted out of the loop: the extractor carries no per-vocabulary state.
    vocabulary_model = LOVMigrator::Models::Vocabulary.new
    logger("Start creating CSV #{CSV_FILENAME}") do
      graph.each do |vocab_uri, vocab_data|
        vocab_info = vocabulary_model.extract_vocabulary_info(graph, vocab_uri, vocab_data)
        csv_generator.append_to_csv(vocab_info, CSV_FILENAME)
      end
    end
  end
end

main
Loading