#!/usr/bin/env ruby
#
# bin/lov_migrator (new file, mode 100755)
#
# Downloads the LOV (Linked Open Vocabularies) N3 dump, extracts every
# voaf:Vocabulary it contains and writes one CSV row per vocabulary.
#
# The patch that introduces this script also adds the following lines to the
# Gemfile (after `gem 'json-ld'`):
#   gem 'rdf'
#   gem 'rdf-n3'

# Exit cleanly from an early interrupt
Signal.trap("INT") { exit 1 }

require 'optparse'

options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: lov_migrator [options]"
  opts.on('-a', '--all', 'Process every vocabulary found in the dump') do
    options[:vocabs] = [:all]
  end
  opts.on('-v', '--vocabularies PREFIX1, PREFIX2', 'Comma-separated list of vocabularies to update or import') do |acronym|
    # Store the selection in options; a throw-away local here would leave
    # options[:vocabs] nil and make the script raise MissingArgument below.
    options[:vocabs] = acronym.split(',').map(&:strip).reject(&:empty?)
  end

  # Display the help screen, all programs are assumed to have this option.
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end.parse!

if options[:vocabs].nil? || options[:vocabs].empty?
  raise OptionParser::MissingArgument, 'use --all or --vocabularies PREFIX1,PREFIX2'
end

# Heavy requires happen after option parsing so that --help stays fast.
require 'bundler/setup'
require 'rdf'
require 'rdf/n3'
require 'csv'
require 'open-uri'
require 'digest'
require 'benchmark'
require 'parallel'
require 'set'
require 'zlib'

CSV_MAIN_ATTRS = [:prefix, :title, :description, :keyword, :creator, :uri, :lastModifiedInLOVAt].freeze
CSV_ADDED_ATTRS = [:destination, :who, :comment].freeze
CSV_DISPATCH_FILENAME = 'LOV_vocabularies_dispatch.csv'.freeze
CSV_FILENAME = "vocabs.csv".freeze

module LOVMigrator
  module Models
    # Shared helpers for turning RDF terms into plain Ruby values.
    class Model
      # Converts an RDF term into a plain value.
      #
      # @param node [Object] an RDF::Literal, RDF::URI or any other object
      # @return [Object] for a language-tagged literal, a hash
      #   `{ value:, language: }`; for any other literal, `node.object`;
      #   for everything else (URIs included), `node.to_s`
      def extract_uri_or_value(node)
        case node
        when RDF::Literal
          if node.has_language?
            { value: node.object, language: node.language.to_s }
          else
            node.object
          end
        else
          node.to_s
        end
      end

      # Reads the value(s) stored under +predicate+ in a predicate => object(s) hash.
      #
      # @param vocab [Hash] predicate (RDF::URI) => a single term or an Array of terms
      # @param predicate [String, RDF::URI] predicate to look up
      # @param list [Boolean] when true return every value, otherwise only the first
      # @return [Object, Array, nil] converted value(s); nil when absent and list is false
      def get_value(vocab, predicate, list: false)
        values = Array(vocab[RDF::URI.new(predicate)]).map { |value| extract_uri_or_value(value) }
        list ? values : values.first
      end

      private

      # Every object of (subject, predicate, *) in +graph+, converted with
      # #extract_uri_or_value.
      def query_values(graph, subject, predicate)
        graph.query([subject, RDF::URI(predicate), nil])
             .map { |statement| extract_uri_or_value(statement.object) }
      end

      # Raw (unconverted) object of the first (subject, predicate, *) statement, or nil.
      def first_object(graph, subject, predicate)
        graph.query([subject, RDF::URI(predicate), nil]).first&.object
      end
    end

    class Vocabulary < Model
      REVIEW_PREDICATE = "http://purl.org/stuff/rev#hasReview".freeze

      # Builds the attribute hash of one vocabulary.
      #
      # @param graph [Hash] subject => { predicate => object(s) }, as returned by
      #   DumpParser#parse_n3_file
      # @param vocab_uri [Object] subject of the vocabulary
      # @param vocab [Hash, nil] predicate => object(s) for that subject
      # @return [Hash, nil] extracted attributes, or nil when +vocab+ is nil/false
      def extract_vocabulary_info(graph, vocab_uri, vocab)
        return unless vocab

        info = {
          uri: vocab_uri,
          namespace: get_value(vocab, "http://purl.org/vocab/vann/preferredNamespaceUri"),
          prefix: get_value(vocab, "http://purl.org/vocab/vann/preferredNamespacePrefix"),
          title: get_value(vocab, "http://purl.org/dc/terms/title", list: true),
          description: get_value(vocab, "http://purl.org/dc/terms/description", list: true),
          keyword: get_value(vocab, "http://www.w3.org/ns/dcat#keyword", list: true),
          issued: get_value(vocab, "http://purl.org/dc/terms/issued"),
          modified: get_value(vocab, "http://purl.org/dc/terms/modified"),

          isDefinedBy: get_value(vocab, "http://www.w3.org/2000/01/rdf-schema#isDefinedBy"),
          homepage: get_value(vocab, "http://xmlns.com/foaf/0.1/homepage"),
          creator: get_value(vocab, "http://purl.org/dc/terms/creator", list: true),
          contributor: get_value(vocab, "http://purl.org/dc/terms/contributor", list: true),
          publisher: get_value(vocab, "http://purl.org/dc/terms/publisher"),
          language: get_value(vocab, "http://purl.org/dc/terms/language", list: true),
          type: get_value(vocab, RDF.type),
          occurrences: get_value(vocab, "http://purl.org/vocommons/voaf#occurrencesInDatasets"),
          reused_by_datasets: get_value(vocab, "http://purl.org/vocommons/voaf#reusedByDatasets"),
          reused_by_vocabs: get_value(vocab, "http://purl.org/vocommons/voaf#reusedByVocabularies"),
          distribution: get_value(vocab, "http://www.w3.org/ns/dcat#distribution"),
        }

        # Look the review up with the raw RDF term: the graph is keyed by
        # statement subjects, so the stringified URI returned by #get_value
        # would never match a key.
        # NOTE(review): DumpParser#parse_n3_file only stores subjects typed
        # voaf:Vocabulary, so a review is found only if its node is in +graph+.
        review_ref = Array(vocab[RDF::URI.new(REVIEW_PREDICATE)]).first
        review = review_ref && graph.dig(review_ref)
        if review
          info[:review] = {
            creator: get_value(review, "http://purl.org/dc/terms/creator"),
            date: get_value(review, "http://purl.org/dc/terms/date"),
            text: get_value(review, "http://purl.org/stuff/rev#text")
          }
        end

        # Same source value as :modified (dcterms:modified).
        info[:lastModifiedInLOVAt] = info[:modified]

        info
      end
    end

    # Inherits from Model because it relies on #extract_uri_or_value.
    class Agent < Model
      # @param graph [#query] object answering `query([subject, predicate, object])`
      # @param agent_uri [Object] subject of the agent
      # @return [Hash] `:name` (raw object of the first foaf:name statement, or nil)
      #   and `:sameAs` (converted owl:sameAs objects)
      def extract_agent_info(graph, agent_uri)
        {
          name: first_object(graph, agent_uri, "http://xmlns.com/foaf/0.1/name"),
          sameAs: query_values(graph, agent_uri, "http://www.w3.org/2002/07/owl#sameAs")
        }
      end
    end

    # Inherits from Model because it relies on #extract_uri_or_value.
    class Distribution < Model
      # Multi-valued attributes, in the order they appear in the result hash.
      LIST_PREDICATES = {
        language: "http://purl.org/dc/terms/language",
        title: "http://purl.org/dc/terms/title",
        # Relations
        extends: "http://purl.org/vocommons/voaf#extends",
        specializes: "http://purl.org/vocommons/voaf#specializes",
        generalizes: "http://purl.org/vocommons/voaf#generalizes",
        hasEquivalencesWith: "http://purl.org/vocommons/voaf#hasEquivalencesWith",
        hasDisjunctionsWith: "http://purl.org/vocommons/voaf#hasDisjunctionsWith",
        metadataVoc: "http://purl.org/vocommons/voaf#metadataVoc",
        imports: "http://www.w3.org/2002/07/owl#imports"
      }.freeze

      # @param graph [#query] object answering `query([subject, predicate, object])`
      # @param distribution_uri [Object] subject of the distribution
      # @return [Hash] `:uri`, `:issued` (raw object of the first dcterms:issued
      #   statement, or nil) and one array of converted values per LIST_PREDICATES key
      def extract_distribution_info(graph, distribution_uri)
        info_distribution = {
          uri: distribution_uri,
          issued: first_object(graph, distribution_uri, "http://purl.org/dc/terms/issued")
        }
        LIST_PREDICATES.each do |attribute, predicate|
          info_distribution[attribute] = query_values(graph, distribution_uri, predicate)
        end
        info_distribution
      end
    end
  end

  class DumpParser
    # Downloads +url+ into +destination+ and prints a confirmation line.
    def download_file(url, destination)
      URI.open(url) do |remote|
        # Stream to disk instead of loading the whole payload into memory.
        File.open(destination, 'wb') { |output| IO.copy_stream(remote, output) }
      end
      puts "File downloaded successfully as #{destination}"
    end

    # @return [String] hexadecimal SHA-256 digest of the file content
    def file_hash(file_path)
      Digest::SHA256.file(file_path).hexdigest
    end

    # @return [Boolean] true when +file2+ exists and has the same digest as +file1+
    def files_identical?(file1, file2)
      File.exist?(file2) && file_hash(file1) == file_hash(file2)
    end

    # Updates the local file if the downloaded file is different.
    #
    # @param url [String] location of the gzip-compressed dump
    # @param local_file_path [String] where the compressed dump is kept
    # @param decompressed_path [String] where the decompressed dump is written
    # @return [Boolean] true when the local archive was replaced, false otherwise
    def update_local_file(url, local_file_path, decompressed_path: 'lov.n3')
      downloaded_file_name = "#{local_file_path}.tmp"

      download_file(url, downloaded_file_name)

      if files_identical?(downloaded_file_name, local_file_path)
        File.delete(downloaded_file_name) # Clean up the temporary file
        # The archive is unchanged, but the decompressed dump may have been
        # removed; recreate it so that parsing does not fail on a missing file.
        decompress(local_file_path, decompressed_path) unless File.exist?(decompressed_path)
        return false
      end

      File.rename(downloaded_file_name, local_file_path) # Replace the old file with the new one
      decompress(local_file_path, decompressed_path)
      true
    end

    # Streams the N3/Turtle file and collects the triples of every subject
    # typed voaf:Vocabulary.
    #
    # NOTE(review): triples of a subject that are read before its
    # `rdf:type voaf:Vocabulary` statement are not collected; this relies on
    # the type statement coming first for each subject in the dump.
    # NOTE(review): RDF::Turtle is not required explicitly by this script;
    # confirm it is loaded by the 'rdf/n3' require.
    #
    # @param file_path [String] path of the decompressed dump
    # @return [Hash] subject => { predicate => object, or Array of objects when repeated }
    def parse_n3_file(file_path)
      # The RDF type you want to filter for
      vocabulary_type = RDF::URI("http://purl.org/vocommons/voaf#Vocabulary")

      vocabularies_hash = {}
      vocabulary_subjects = Set.new

      RDF::Turtle::Reader.open(file_path) do |reader|
        reader.each_statement do |statement|
          subject = statement.subject

          # If the statement is of type `voaf:Vocabulary`, track the subject
          if statement.predicate == RDF.type && statement.object == vocabulary_type
            vocabulary_subjects.add(subject)
          end

          # Collect all triples related to the subject if it's identified as a vocabulary
          next unless vocabulary_subjects.include?(subject)

          properties = (vocabularies_hash[subject] ||= {})
          old_value = properties[statement.predicate]
          properties[statement.predicate] =
            if old_value.nil?
              statement.object
            else
              Array(old_value) + [statement.object]
            end
        end
      end

      vocabularies_hash
    end

    private

    # Decompresses the gzip archive +archive_path+ into +target_path+.
    def decompress(archive_path, target_path)
      Zlib::GzipReader.open(archive_path) do |gzip|
        File.open(target_path, 'wb') { |file| IO.copy_stream(gzip, file) }
      end
    end
  end

  class CSVGenerator
    # Creates (or truncates) +filename+ and writes the header row.
    def initialize_csv(filename)
      CSV.open(filename, "w", force_quotes: true) do |csv|
        csv << CSV_MAIN_ATTRS + CSV_ADDED_ATTRS # Write header row
      end
    end

    # We copy the added attributes from dispatch csv.
    # Mutates and returns +vocab+.
    def copy_added_values_to_csv(vocab)
      csv_vocab = dispatch_rows[vocab[:prefix]].to_h
      CSV_ADDED_ATTRS.each do |attr|
        vocab[attr] = csv_vocab[attr]
      end
      vocab
    end

    # Appends one row for +vocab+ to +filename+, enriched with the dispatch
    # CSV values when that file exists.
    def append_to_csv(vocab, filename)
      vocab = copy_added_values_to_csv(vocab) if File.exist?(CSV_DISPATCH_FILENAME)
      CSV.open(filename, "a", force_quotes: true) do |csv|
        filtered_row = (CSV_MAIN_ATTRS + CSV_ADDED_ATTRS).map { |attr| format_value(vocab[attr]) }
        csv << filtered_row
      end
    end

    # @return [CSV::Row, nil] first row of +file_path+ whose :prefix equals +prefix_value+
    def find_csv_row_by_prefix(file_path, prefix_value)
      CSV.foreach(file_path, headers: true, header_converters: :symbol) do |row|
        return row if row[:prefix] == prefix_value
      end
      nil
    end

    # Renders a value for a CSV cell: arrays are joined with newlines,
    # language-tagged hashes become `"value"@lang`, empty values become "None".
    def format_value(value)
      case value
      when Array
        value.map { |v| format_value(v) }.join("\n")
      when Hash
        if value[:language]
          "\"#{value[:value]}\"@#{value[:language]}"
        else
          value.to_s
        end
      else
        value.to_s.empty? ? "None" : value.to_s
      end
    end

    private

    # Dispatch CSV rows indexed by prefix (first row wins, matching
    # #find_csv_row_by_prefix). Read once instead of once per vocabulary.
    def dispatch_rows
      @dispatch_rows ||= begin
        index = {}
        CSV.foreach(CSV_DISPATCH_FILENAME, headers: true, header_converters: :symbol) do |row|
          index[row[:prefix]] = row unless index.key?(row[:prefix])
        end
        index
      end
    end
  end
end

# Prints every key/value pair of +info+ in a human-readable form.
def print_vocabulary_info(info)
  # format_value is an instance method of CSVGenerator, not a top-level method.
  formatter = LOVMigrator::CSVGenerator.new
  puts "-------------------------------------------------------------------"
  info.each do |key, value|
    formatted_key = key.to_s.split('_').map(&:capitalize).join(' ')
    puts "#{formatted_key}: #{formatter.format_value(value)}"
  end
  puts "\n"
end

# Runs the block, printing a start line and the elapsed wall-clock time.
def logger(text, &block)
  puts ">> #{text} starting..."
  time = Benchmark.realtime do
    block.call
  end
  puts "#{text} finished in #{time} seconds"
end

# Start of the script
#
# @param vocabs [Array] `[:all]` to export every vocabulary, or a list of
#   vocabulary prefixes (strings) to restrict the export to
def main(vocabs = [:all])
  parser = LOVMigrator::DumpParser.new
  updated = false
  logger("Download dump file from LOV") do
    updated = parser.update_local_file("https://lov.linkeddata.es/lov.n3.gz", "lov.n3.gz")
  end
  if updated
    puts "The local file was updated."
  else
    puts "The local file remains unchanged."
  end

  csv_generator = LOVMigrator::CSVGenerator.new
  logger("Initialize CSV #{CSV_FILENAME}") do
    csv_generator.initialize_csv(CSV_FILENAME)
  end

  graph = {}
  logger('Parsing the n3 in memory') do
    graph = parser.parse_n3_file('lov.n3')
  end
  puts "Found #{graph.size} vocabularies"
  if graph.empty?
    puts "No vocabularies found in the file."
  else
    export_all = vocabs.include?(:all)
    vocabulary_model = LOVMigrator::Models::Vocabulary.new
    logger("Start creating CSV #{CSV_FILENAME}") do
      graph.each do |vocab_uri, vocab_data|
        vocab_info = vocabulary_model.extract_vocabulary_info(graph, vocab_uri, vocab_data)
        # Honour --vocabularies: keep only the requested prefixes.
        next unless export_all || vocabs.include?(vocab_info[:prefix].to_s)

        csv_generator.append_to_csv(vocab_info, CSV_FILENAME)
      end
    end
  end
end

main(options[:vocabs])