From 7358191a1c2fed93ab18d2b11332f858a3ac4fce Mon Sep 17 00:00:00 2001 From: Muhammad Date: Fri, 18 Oct 2024 14:54:32 +0100 Subject: [PATCH 01/15] lov migrator script initialization --- bin/lov_migrator | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 bin/lov_migrator diff --git a/bin/lov_migrator b/bin/lov_migrator new file mode 100644 index 00000000..6889c582 --- /dev/null +++ b/bin/lov_migrator @@ -0,0 +1,28 @@ +#!/usr/bin/env ruby + +# Exit cleanly from an early interrupt +Signal.trap("INT") { exit 1 } + +require 'optparse' + + +options = {} +OptionParser.new do |opts| + opts.banner = "Usage: lov_migrator [options]" + opts.on( '-a', '--all') do + options[:vocabs] = [:all] + end + opts.on('-v', '--vocabularies PREFIX1, PREFIX2', 'Comma-separated list of vocabularies to update or import') do |acronym| + ontologies_acronyms = acronym + end + + # Display the help screen, all programs are assumed to have this option. + opts.on( '-h', '--help', 'Display this screen' ) do + puts opts + exit + end +end.parse! + + +raise OptionParser::MissingArgument if options[:vocabs].nil? + From d379d06a507773b7cc2101fa8aea9a259a8dd25e Mon Sep 17 00:00:00 2001 From: Muhammad Date: Fri, 18 Oct 2024 15:17:31 +0100 Subject: [PATCH 02/15] adding notation3 file parsing in lov migrator --- bin/lov_migrator | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) mode change 100644 => 100755 bin/lov_migrator diff --git a/bin/lov_migrator b/bin/lov_migrator old mode 100644 new mode 100755 index 6889c582..c189d733 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -26,3 +26,40 @@ end.parse! raise OptionParser::MissingArgument if options[:vocabs].nil? +require 'rdf' +require 'rdf/n3' + + +def parse_n3_file(file_path) + graph = RDF::Graph.new + RDF::N3::Reader.open(file_path) do |reader| + reader.each_statement do |statement| + graph << statement + end + end + graph +end + + +def find_vocabularies(graph) + vocab_type = RDF::URI("http://purl.org/vocommons/voaf#Vocabulary") + graph.query([nil, RDF.type, vocab_type]).map(&:subject) +end + + +def main + graph = parse_n3_file('lov.n3') + + vocab_uris = find_vocabularies(graph) + + if vocab_uris.empty? + puts "No vocabularies found in the file." + else + puts "Found #{vocab_uris.size} vocabularies:\n\n" + vocab_uris.each do |vocab_uri| + puts vocab_uri + end + end +end + +main \ No newline at end of file From dd23c7f067263fc3ff9728a245bc56b5dbab945b Mon Sep 17 00:00:00 2001 From: Muhammad Date: Fri, 18 Oct 2024 15:31:35 +0100 Subject: [PATCH 03/15] Add RDF vocabulary metadata extraction and mapping to ruby object --- bin/lov_migrator | 99 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/bin/lov_migrator b/bin/lov_migrator index c189d733..448d0781 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -41,11 +41,107 @@ def parse_n3_file(file_path) end +def extract_uri_or_value(node) + case node + when RDF::URI + node.to_s + when RDF::Literal + if node.has_language? + { value: node.object, language: node.language.to_s } + else + node.object + end + else + node.to_s + end +end + +def extract_distribution_info(graph, distribution_uri) + info_distribution = { + uri: distribution_uri, + issued: graph.query([distribution_uri, RDF::URI("http://purl.org/dc/terms/issued"), nil]).first&.object, + language: graph.query([distribution_uri, RDF::URI("http://purl.org/dc/terms/language"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + title: graph.query([distribution_uri, RDF::URI("http://purl.org/dc/terms/title"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + # Relations + extends: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#extends"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + specializes: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#specializes"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + generalizes: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#generalizes"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + hasEquivalencesWith: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#hasEquivalencesWith"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + hasDisjunctionsWith: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#hasDisjunctionsWith"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + metadataVoc: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#metadataVoc"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + imports: graph.query([distribution_uri, RDF::URI("http://www.w3.org/2002/07/owl#imports"), nil]).map { |statement| extract_uri_or_value(statement.object) } + } + info_distribution +end + + +def extract_vocabulary_info(graph, vocab_uri) + info = { + uri: vocab_uri, + namespace: graph.query([vocab_uri, RDF::URI("http://purl.org/vocab/vann/preferredNamespaceUri"), nil]).first&.object, + prefix: graph.query([vocab_uri, RDF::URI("http://purl.org/vocab/vann/preferredNamespacePrefix"), nil]).first&.object, + title: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/title"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + description: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/description"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + keyword: graph.query([vocab_uri, RDF::URI("http://www.w3.org/ns/dcat#keyword"), nil]).map(&:object), + issued: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/issued"), nil]).first&.object, + modified: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/modified"), nil]).first&.object, + isDefinedBy: graph.query([vocab_uri, RDF::URI("http://www.w3.org/2000/01/rdf-schema#isDefinedBy"), nil]).first&.object, + homepage: graph.query([vocab_uri, RDF::URI("http://xmlns.com/foaf/0.1/homepage"), nil]).first&.object, + creator: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/creator"), nil]).first&.object, + contributor: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/contributor"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + publisher: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/publisher"), nil]).first&.object, + language: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/language"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + + type: graph.query([vocab_uri, RDF.type, nil]).first&.object, + + occurrences: graph.query([vocab_uri, RDF::URI("http://purl.org/vocommons/voaf#occurrencesInDatasets"), nil]).first&.object, + reused_by_datasets: graph.query([vocab_uri, RDF::URI("http://purl.org/vocommons/voaf#reusedByDatasets"), nil]).first&.object, + reused_by_vocabs: graph.query([vocab_uri, RDF::URI("http://purl.org/vocommons/voaf#reusedByVocabularies"), nil]).first&.object, + distribution: graph.query([vocab_uri, RDF::URI("http://www.w3.org/ns/dcat#distribution"), nil]).first&.object, + } + + review = graph.query([vocab_uri, RDF::URI("http://purl.org/stuff/rev#hasReview"), nil]).first&.object + if review + info[:review] = { + creator: graph.query([review, RDF::URI("http://purl.org/dc/terms/creator"), nil]).first&.object, + date: graph.query([review, RDF::URI("http://purl.org/dc/terms/date"), nil]).first&.object, + text: graph.query([review, RDF::URI("http://purl.org/stuff/rev#text"), nil]).first&.object + } + end + + info +end + + def find_vocabularies(graph) vocab_type = RDF::URI("http://purl.org/vocommons/voaf#Vocabulary") graph.query([nil, RDF.type, vocab_type]).map(&:subject) end +def format_value(value) + case value + when Array + value.map { |v| format_value(v) }.join("\n") + when Hash + if value[:language] + "\"#{value[:value]}\"@#{value[:language]}" + else + value.to_s + end + else + value.to_s.empty? ? "None" : value.to_s + end +end + + +def print_vocabulary_info(info) + puts "-------------------------------------------------------------------" + info.each do |key, value| + formatted_key = key.to_s.split('_').map(&:capitalize).join(' ') + puts "#{formatted_key}: #{format_value(value)}" + end + puts "\n" +end def main graph = parse_n3_file('lov.n3') @@ -57,7 +153,8 @@ def main else puts "Found #{vocab_uris.size} vocabularies:\n\n" vocab_uris.each do |vocab_uri| - puts vocab_uri + vocab_info = extract_vocabulary_info(graph, vocab_uri) + print_vocabulary_info(vocab_info) end end end From 255971c0e1ba8e40123ed189c46740d64b16b059 Mon Sep 17 00:00:00 2001 From: Muhammad Date: Fri, 18 Oct 2024 18:34:01 +0100 Subject: [PATCH 04/15] add csv export for vocabularies metadata --- bin/lov_migrator | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/bin/lov_migrator b/bin/lov_migrator index 448d0781..5d4e50be 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -28,6 +28,11 @@ raise OptionParser::MissingArgument if options[:vocabs].nil? require 'rdf' require 'rdf/n3' +require 'csv' + +CSV_MAIN_ATTRS = [:prefix, :title, :description, :keyword, :uri, :modified] +CSV_FILENAME = "vocabs.csv" + def parse_n3_file(file_path) @@ -41,6 +46,20 @@ def parse_n3_file(file_path) end +def initialize_csv(filename) + CSV.open(filename, "w", force_quotes: true) do |csv| + csv << CSV_MAIN_ATTRS # Write header row + end +end + +def append_to_csv(obj, filename) + CSV.open(filename, "a", force_quotes: true) do |csv| + filtered_row = CSV_MAIN_ATTRS.map { |attr| format_value(obj[attr]) } + csv << filtered_row + end +end + + def extract_uri_or_value(node) case node when RDF::URI @@ -144,6 +163,10 @@ def print_vocabulary_info(info) end def main + + + initialize_csv(CSV_FILENAME) + graph = parse_n3_file('lov.n3') vocab_uris = find_vocabularies(graph) @@ -154,6 +177,8 @@ def main puts "Found #{vocab_uris.size} vocabularies:\n\n" vocab_uris.each do |vocab_uri| vocab_info = extract_vocabulary_info(graph, vocab_uri) + append_to_csv(vocab_info, CSV_FILENAME) + print_vocabulary_info(vocab_info) end end From 9cefc653b04807b0d73ede731f1c5d9f40628ed7 Mon Sep 17 00:00:00 2001 From: Muhammad Date: Fri, 18 Oct 2024 19:03:12 +0100 Subject: [PATCH 05/15] add vocabularies dump file download + check for changes in vocabs before executing --- bin/lov_migrator | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/bin/lov_migrator b/bin/lov_migrator index 5d4e50be..8884f909 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -30,6 +30,10 @@ require 'rdf' require 'rdf/n3' require 'csv' +require 'open-uri' +require 'digest' + + CSV_MAIN_ATTRS = [:prefix, :title, :description, :keyword, :uri, :modified] CSV_FILENAME = "vocabs.csv" @@ -45,6 +49,44 @@ def parse_n3_file(file_path) graph end +def download_file(url, destination) + URI.open(url) do |file| + File.open(destination, 'wb') do |output| + output.write(file.read) + end + end + puts "File downloaded successfully as #{destination}" +end + +def file_hash(file_path) + Digest::SHA256.file(file_path).hexdigest +end + +def files_identical?(file1, file2) + File.exist?(file2) && file_hash(file1) == file_hash(file2) +end + +# Updates the local file if the downloaded file is different +def update_local_file(url, local_file_path) + downloaded_file_name = 'lov.n3.gz.tmp' + + download_file(url, downloaded_file_name) + + if files_identical?(downloaded_file_name, local_file_path) + File.delete(downloaded_file_name) # Clean up the temporary file + return false + else + File.rename(downloaded_file_name, local_file_path) # Replace the old file with the new one + # decompress the doanloaded zip + Zlib::GzipReader.open(local_file_path) do |gzip| + File.open('lov.n3', 'w') do |file| + file.write(gzip.read) + end + end + + return true + end +end def initialize_csv(filename) CSV.open(filename, "w", force_quotes: true) do |csv| @@ -164,6 +206,12 @@ end def main + if update_local_file("https://lov.linkeddata.es/lov.n3.gz","lov.n3.gz") + puts "The local file was updated." + else + puts "The local file remains unchanged." + exit + end initialize_csv(CSV_FILENAME) From 30ea364db804116a685cbc5706ff4f0ab17424d8 Mon Sep 17 00:00:00 2001 From: Muhammad Date: Sat, 19 Oct 2024 06:35:07 +0100 Subject: [PATCH 06/15] add creator metadata to the csv main attributes --- bin/lov_migrator | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/lov_migrator b/bin/lov_migrator index 8884f909..11bef932 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -34,7 +34,8 @@ require 'open-uri' require 'digest' -CSV_MAIN_ATTRS = [:prefix, :title, :description, :keyword, :uri, :modified] + +CSV_MAIN_ATTRS = [ :prefix, :title, :description, :keyword, :creator, :uri, :modified ] CSV_FILENAME = "vocabs.csv" From 43de6b55e12b8e6d73ab6bbe3bedb52edf48e4fe Mon Sep 17 00:00:00 2001 From: Muhammad Date: Sat, 19 Oct 2024 06:45:43 +0100 Subject: [PATCH 07/15] fix creator metadata to accept multiple values --- bin/lov_migrator | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/lov_migrator b/bin/lov_migrator index 11bef932..30a4034a 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -149,7 +149,7 @@ def extract_vocabulary_info(graph, vocab_uri) modified: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/modified"), nil]).first&.object, isDefinedBy: graph.query([vocab_uri, RDF::URI("http://www.w3.org/2000/01/rdf-schema#isDefinedBy"), nil]).first&.object, homepage: graph.query([vocab_uri, RDF::URI("http://xmlns.com/foaf/0.1/homepage"), nil]).first&.object, - creator: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/creator"), nil]).first&.object, + creator: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/creator"), nil]).map { |statement| extract_uri_or_value(statement.object) }, contributor: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/contributor"), nil]).map { |statement| extract_uri_or_value(statement.object) }, publisher: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/publisher"), nil]).first&.object, language: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/language"), nil]).map { |statement| extract_uri_or_value(statement.object) }, From c12cf023f1323d8c4531b9e28b0e90e68230fe92 Mon Sep 17 00:00:00 2001 From: Muhammad Date: Sat, 19 Oct 2024 06:57:28 +0100 Subject: [PATCH 08/15] remove main function --- bin/lov_migrator | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/bin/lov_migrator b/bin/lov_migrator index 30a4034a..76f2a9ea 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -50,6 +50,7 @@ def parse_n3_file(file_path) graph end + def download_file(url, destination) URI.open(url) do |file| File.open(destination, 'wb') do |output| @@ -205,32 +206,31 @@ def print_vocabulary_info(info) puts "\n" end -def main +# Start of the script - if update_local_file("https://lov.linkeddata.es/lov.n3.gz","lov.n3.gz") - puts "The local file was updated." - else - puts "The local file remains unchanged." - exit - end +if update_local_file("https://lov.linkeddata.es/lov.n3.gz","lov.n3.gz") + puts "The local file was updated." +else + puts "The local file remains unchanged." + exit +end - initialize_csv(CSV_FILENAME) +initialize_csv(CSV_FILENAME) - graph = parse_n3_file('lov.n3') - - vocab_uris = find_vocabularies(graph) - - if vocab_uris.empty? - puts "No vocabularies found in the file." - else +graph = parse_n3_file('lov.n3') + +vocab_uris = find_vocabularies(graph) + +if vocab_uris.empty? + puts "No vocabularies found in the file." +else + if options[:vocabs].first == :all puts "Found #{vocab_uris.size} vocabularies:\n\n" vocab_uris.each do |vocab_uri| vocab_info = extract_vocabulary_info(graph, vocab_uri) append_to_csv(vocab_info, CSV_FILENAME) - - print_vocabulary_info(vocab_info) + #print_vocabulary_info(vocab_info) end - end -end + end +end -main \ No newline at end of file From 276389c70264dbeae0adbc97080780102e1436b4 Mon Sep 17 00:00:00 2001 From: Muhammad Date: Sat, 19 Oct 2024 09:17:47 +0100 Subject: [PATCH 09/15] extract agents metadata --- bin/lov_migrator | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/bin/lov_migrator b/bin/lov_migrator index 76f2a9ea..48fb91e0 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -175,12 +175,25 @@ def extract_vocabulary_info(graph, vocab_uri) info end +def extract_agent_info(graph, agent_uri) + info = { + name: graph.query([agent_uri, RDF::URI("http://xmlns.com/foaf/0.1/name"), nil]).first&.object, + sameAs: graph.query([agent_uri, RDF::URI("http://www.w3.org/2002/07/owl#sameAs"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + } +end def find_vocabularies(graph) vocab_type = RDF::URI("http://purl.org/vocommons/voaf#Vocabulary") graph.query([nil, RDF.type, vocab_type]).map(&:subject) end +def find_agents(graph) + agent_type = RDF::URI("http://xmlns.com/foaf/0.1/Person") + graph.query([nil, RDF.type, agent_type]).map(&:subject) +end + + + def format_value(value) case value when Array @@ -212,14 +225,14 @@ if update_local_file("https://lov.linkeddata.es/lov.n3.gz","lov.n3.gz") puts "The local file was updated." else puts "The local file remains unchanged." - exit -end + end initialize_csv(CSV_FILENAME) -graph = parse_n3_file('lov.n3') +graph = parse_n3_file('test.n3') vocab_uris = find_vocabularies(graph) +agent_uris = find_agents(graph) if vocab_uris.empty? puts "No vocabularies found in the file." @@ -229,7 +242,12 @@ else vocab_uris.each do |vocab_uri| vocab_info = extract_vocabulary_info(graph, vocab_uri) append_to_csv(vocab_info, CSV_FILENAME) - #print_vocabulary_info(vocab_info) + print_vocabulary_info(vocab_info) + end + agent_uris.each do |agent_uri| + + puts extract_agent_info(graph, agent_uri) + end end end From 585d8a2d5188525e0a085a74122e9be82049e61d Mon Sep 17 00:00:00 2001 From: Muhammad Date: Sat, 19 Oct 2024 09:47:00 +0100 Subject: [PATCH 10/15] add support for manual dispatch attributes in csv --- bin/lov_migrator | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/bin/lov_migrator b/bin/lov_migrator index 48fb91e0..04400b87 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -36,6 +36,8 @@ require 'digest' CSV_MAIN_ATTRS = [ :prefix, :title, :description, :keyword, :creator, :uri, :modified ] +CSV_ADDED_ATTRS = [ :destination, :who, :comment ] +CSV_DISPATCH_FILENAME = 'LOV_vocabularies_dispatch.csv' CSV_FILENAME = "vocabs.csv" @@ -50,6 +52,14 @@ def parse_n3_file(file_path) graph end +def find_csv_row_by_prefix(file_path, prefix_value) + CSV.foreach(file_path, headers: true, header_converters: :symbol) do |row| + if row[:prefix] == prefix_value + return row + end + end + nil +end def download_file(url, destination) URI.open(url) do |file| @@ -92,18 +102,29 @@ end def initialize_csv(filename) CSV.open(filename, "w", force_quotes: true) do |csv| - csv << CSV_MAIN_ATTRS # Write header row + csv << CSV_MAIN_ATTRS + CSV_ADDED_ATTRS # Write header row end end -def append_to_csv(obj, filename) +# We copy the added attributes from dispatch csv +def copy_added_values_to_csv(vocab) + csv_vocab = find_csv_row_by_prefix(CSV_DISPATCH_FILENAME, vocab[:prefix]).to_h + CSV_ADDED_ATTRS.each do |attr| + vocab[attr] = csv_vocab[attr] + end + vocab +end + +def append_to_csv(vocab, filename) + vocab = copy_added_values_to_csv(vocab) CSV.open(filename, "a", force_quotes: true) do |csv| - filtered_row = CSV_MAIN_ATTRS.map { |attr| format_value(obj[attr]) } + filtered_row = (CSV_MAIN_ATTRS + CSV_ADDED_ATTRS).map { |attr| format_value(vocab[attr]) } csv << filtered_row end end + def extract_uri_or_value(node) case node when RDF::URI @@ -225,7 +246,7 @@ if update_local_file("https://lov.linkeddata.es/lov.n3.gz","lov.n3.gz") puts "The local file was updated." else puts "The local file remains unchanged." - end +end initialize_csv(CSV_FILENAME) @@ -241,7 +262,7 @@ else puts "Found #{vocab_uris.size} vocabularies:\n\n" vocab_uris.each do |vocab_uri| vocab_info = extract_vocabulary_info(graph, vocab_uri) - append_to_csv(vocab_info, CSV_FILENAME) + append_to_csv(vocab_info, CSV_FILENAME) print_vocabulary_info(vocab_info) end agent_uris.each do |agent_uri| From fc5114cde2b281a4d6612dd87abf18ffb2de6299 Mon Sep 17 00:00:00 2001 From: Muhammad Date: Mon, 21 Oct 2024 09:19:15 +0100 Subject: [PATCH 11/15] add the 'date of latest modification in LOV' attribute to csv --- bin/lov_migrator | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/lov_migrator b/bin/lov_migrator index 04400b87..7ec42c97 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -35,7 +35,7 @@ require 'digest' -CSV_MAIN_ATTRS = [ :prefix, :title, :description, :keyword, :creator, :uri, :modified ] +CSV_MAIN_ATTRS = [ :prefix, :title, :description, :keyword, :creator, :uri, :lastModifiedInLOVAt ] CSV_ADDED_ATTRS = [ :destination, :who, :comment ] CSV_DISPATCH_FILENAME = 'LOV_vocabularies_dispatch.csv' CSV_FILENAME = "vocabs.csv" @@ -193,6 +193,8 @@ def extract_vocabulary_info(graph, vocab_uri) } end + info[:lastModifiedInLOVAt] = graph.query([RDF::URI("https://lov.linkeddata.es/dataset/lov/vocabs/#{info[:prefix]}"), RDF::URI("http://purl.org/dc/terms/modified") , nil]).first&.object + info end @@ -241,12 +243,12 @@ def print_vocabulary_info(info) end # Start of the script - +''' if update_local_file("https://lov.linkeddata.es/lov.n3.gz","lov.n3.gz") puts "The local file was updated." else puts "The local file remains unchanged." -end +end''' initialize_csv(CSV_FILENAME) From 2a4f654b076e20d1b2cff9b1da9ced05f9fee7ac Mon Sep 17 00:00:00 2001 From: Syphax Date: Wed, 23 Oct 2024 08:36:17 +0200 Subject: [PATCH 12/15] update the parse_n3_file function to be faster using a ruby hash --- bin/lov_migrator | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/bin/lov_migrator b/bin/lov_migrator index 7ec42c97..c4be6427 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -56,8 +56,8 @@ def find_csv_row_by_prefix(file_path, prefix_value) CSV.foreach(file_path, headers: true, header_converters: :symbol) do |row| if row[:prefix] == prefix_value return row - end - end + end + end nil end @@ -100,6 +100,43 @@ def update_local_file(url, local_file_path) end end + def parse_n3_file(file_path) + # The RDF type you want to filter for + vocabulary_type = RDF::URI("http://purl.org/vocommons/voaf#Vocabulary") + + vocabularies_hash = {} + vocabulary_subjects = Set.new + + # Single-pass to read the Turtle file without breaking syntax + RDF::Turtle::Reader.open(file_path) do |reader| + # Collect all statements in an array (could be large, but streaming the file) + reader.each_statement do |statement| + subject = statement.subject + + # If the statement is of type `voaf:Vocabulary`, track the subject + if statement.predicate == RDF.type && statement.object == vocabulary_type + vocabulary_subjects.add(subject) + vocabularies_hash[subject] ||= {} + end + + # Collect all triples related to the subject if it's identified as a vocabulary + if vocabulary_subjects.include?(subject) + vocabularies_hash[subject] ||= {} + old_value = vocabularies_hash[subject][statement.predicate] + if old_value.nil? + vocabularies_hash[subject][statement.predicate] = statement.object + else + vocabularies_hash[subject][statement.predicate] = Array(old_value) + [statement.object] + end + end + end + + end + vocabularies_hash + + end + end + def initialize_csv(filename) CSV.open(filename, "w", force_quotes: true) do |csv| csv << CSV_MAIN_ATTRS + CSV_ADDED_ATTRS # Write header row From 3aaeb9743229cd43b5a24ddda5911acbac11d541 Mon Sep 17 00:00:00 2001 From: Syphax Date: Wed, 23 Oct 2024 08:37:19 +0200 Subject: [PATCH 13/15] create module to for the models to migrate Vocabulary, Agent and distribution --- bin/lov_migrator | 111 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 97 insertions(+), 14 deletions(-) diff --git a/bin/lov_migrator b/bin/lov_migrator index c4be6427..6b32384d 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -40,27 +40,110 @@ CSV_ADDED_ATTRS = [ :destination, :who, :comment ] CSV_DISPATCH_FILENAME = 'LOV_vocabularies_dispatch.csv' CSV_FILENAME = "vocabs.csv" +module LOVMigrator + + + module Models + class Model + def extract_uri_or_value(node) + case node + when RDF::URI + node.to_s + when RDF::Literal + if node.has_language? + { value: node.object, language: node.language.to_s } + else + node.object + end + else + node.to_s + end + end + def get_value(vocab, predicate, list: false) + values = Array(vocab[RDF::URI.new(predicate)]) + .map { |value| extract_uri_or_value(value) } + list ? values : values.first + end + end + class Vocabulary < Model + + def extract_vocabulary_info(graph, vocab_uri, vocab) + return unless vocab + + info = { + uri: vocab_uri, + namespace: get_value(vocab, "http://purl.org/vocab/vann/preferredNamespaceUri"), + prefix: get_value(vocab, "http://purl.org/vocab/vann/preferredNamespacePrefix"), + title: get_value(vocab, "http://purl.org/dc/terms/title", list: true), + description: get_value(vocab, "http://purl.org/dc/terms/description", list: true), + keyword: get_value(vocab, "http://www.w3.org/ns/dcat#keyword", list: true), + issued: get_value(vocab, "http://purl.org/dc/terms/issued"), + modified: get_value(vocab, "http://purl.org/dc/terms/modified"), + + isDefinedBy: get_value(vocab, "http://www.w3.org/2000/01/rdf-schema#isDefinedBy"), + homepage: get_value(vocab, "http://xmlns.com/foaf/0.1/homepage"), + creator: get_value(vocab, "http://purl.org/dc/terms/creator", list: true), + contributor: get_value(vocab, "http://purl.org/dc/terms/contributor", list: true), + publisher: get_value(vocab, "http://purl.org/dc/terms/publisher"), + language: get_value(vocab, "http://purl.org/dc/terms/language", list: true), + type: get_value(vocab, RDF.type), + occurrences: get_value(vocab, "http://purl.org/vocommons/voaf#occurrencesInDatasets"), + reused_by_datasets: get_value(vocab, "http://purl.org/vocommons/voaf#reusedByDatasets"), + reused_by_vocabs: get_value(vocab, "http://purl.org/vocommons/voaf#reusedByVocabularies"), + distribution: get_value(vocab, "http://www.w3.org/ns/dcat#distribution"), + } + + review = get_value(vocab, "http://purl.org/stuff/rev#hasReview") + review = graph.dig(review) + if review + info[:review] = { + creator: get_value(review, "http://purl.org/dc/terms/creator"), + date: get_value(review, "http://purl.org/dc/terms/date"), + text: get_value(review, "http://purl.org/stuff/rev#text") + } + end + + info[:lastModifiedInLOVAt] = get_value(vocab, "http://purl.org/dc/terms/modified") + + info + end -def parse_n3_file(file_path) - graph = RDF::Graph.new - RDF::N3::Reader.open(file_path) do |reader| - reader.each_statement do |statement| - graph << statement end - end - graph -end -def find_csv_row_by_prefix(file_path, prefix_value) - CSV.foreach(file_path, headers: true, header_converters: :symbol) do |row| - if row[:prefix] == prefix_value - return row + class Agent + + def extract_agent_info(graph, agent_uri) + info = { + name: graph.query([agent_uri, RDF::URI("http://xmlns.com/foaf/0.1/name"), nil]).first&.object, + sameAs: graph.query([agent_uri, RDF::URI("http://www.w3.org/2002/07/owl#sameAs"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + } end + + end + + class Distribution + + def extract_distribution_info(graph, distribution_uri) + info_distribution = { + uri: distribution_uri, + issued: graph.query([distribution_uri, RDF::URI("http://purl.org/dc/terms/issued"), nil]).first&.object, + language: graph.query([distribution_uri, RDF::URI("http://purl.org/dc/terms/language"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + title: graph.query([distribution_uri, RDF::URI("http://purl.org/dc/terms/title"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + # Relations + extends: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#extends"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + specializes: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#specializes"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + generalizes: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#generalizes"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + hasEquivalencesWith: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#hasEquivalencesWith"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + hasDisjunctionsWith: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#hasDisjunctionsWith"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + metadataVoc: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#metadataVoc"), nil]).map { |statement| extract_uri_or_value(statement.object) }, + imports: graph.query([distribution_uri, RDF::URI("http://www.w3.org/2002/07/owl#imports"), nil]).map { |statement| extract_uri_or_value(statement.object) } + } + info_distribution end - nil -end + end + end def download_file(url, destination) URI.open(url) do |file| File.open(destination, 'wb') do |output| From c89db6ca062f169cae2e7a1c9d6612b732585ab6 Mon Sep 17 00:00:00 2001 From: Syphax Date: Wed, 23 Oct 2024 08:38:04 +0200 Subject: [PATCH 14/15] dispatch the code into two modules one for the dump parsing and another for the csv generator --- bin/lov_migrator | 305 ++++++++++++++++++----------------------------- 1 file changed, 118 insertions(+), 187 deletions(-) diff --git a/bin/lov_migrator b/bin/lov_migrator index 6b32384d..6918dfeb 100755 --- a/bin/lov_migrator +++ b/bin/lov_migrator @@ -5,7 +5,6 @@ Signal.trap("INT") { exit 1 } require 'optparse' - options = {} OptionParser.new do |opts| opts.banner = "Usage: lov_migrator [options]" @@ -23,17 +22,16 @@ OptionParser.new do |opts| end end.parse! - raise OptionParser::MissingArgument if options[:vocabs].nil? +require 'bundler/setup' require 'rdf' require 'rdf/n3' require 'csv' - require 'open-uri' require 'digest' - - +require 'benchmark' +require 'parallel' CSV_MAIN_ATTRS = [ :prefix, :title, :description, :keyword, :creator, :uri, :lastModifiedInLOVAt ] CSV_ADDED_ATTRS = [ :destination, :who, :comment ] @@ -144,45 +142,47 @@ module LOVMigrator end end -def download_file(url, destination) - URI.open(url) do |file| - File.open(destination, 'wb') do |output| - output.write(file.read) + + class DumpParser + def download_file(url, destination) + URI.open(url) do |file| + File.open(destination, 'wb') do |output| + output.write(file.read) + end + end + puts "File downloaded successfully as #{destination}" end - end - puts "File downloaded successfully as #{destination}" -end -def file_hash(file_path) - Digest::SHA256.file(file_path).hexdigest -end + def file_hash(file_path) + Digest::SHA256.file(file_path).hexdigest + end -def files_identical?(file1, file2) - File.exist?(file2) && file_hash(file1) == file_hash(file2) -end + def files_identical?(file1, file2) + File.exist?(file2) && file_hash(file1) == file_hash(file2) + end -# Updates the local file if the downloaded file is different -def update_local_file(url, local_file_path) - downloaded_file_name = 'lov.n3.gz.tmp' - - download_file(url, downloaded_file_name) + # Updates the local file if the downloaded file is different + def update_local_file(url, local_file_path) + downloaded_file_name = 'lov.n3.gz.tmp' + + download_file(url, downloaded_file_name) + + if files_identical?(downloaded_file_name, local_file_path) + File.delete(downloaded_file_name) # Clean up the temporary file + return false + else + File.rename(downloaded_file_name, local_file_path) # Replace the old file with the new one + # decompress the doanloaded zip + Zlib::GzipReader.open(local_file_path) do |gzip| + File.open('lov.n3', 'w') do |file| + file.write(gzip.read) + end + end - if files_identical?(downloaded_file_name, local_file_path) - File.delete(downloaded_file_name) # Clean up the temporary file - return false - else - File.rename(downloaded_file_name, local_file_path) # Replace the old file with the new one - # decompress the doanloaded zip - Zlib::GzipReader.open(local_file_path) do |gzip| - File.open('lov.n3', 'w') do |file| - file.write(gzip.read) + return true end end - return true - end -end - def parse_n3_file(file_path) # The RDF type you want to filter for vocabulary_type = RDF::URI("http://purl.org/vocommons/voaf#Vocabulary") @@ -220,139 +220,59 @@ end end end -def initialize_csv(filename) - CSV.open(filename, "w", force_quotes: true) do |csv| - csv << CSV_MAIN_ATTRS + CSV_ADDED_ATTRS # Write header row - end -end - -# We copy the added attributes from dispatch csv -def copy_added_values_to_csv(vocab) - csv_vocab = find_csv_row_by_prefix(CSV_DISPATCH_FILENAME, vocab[:prefix]).to_h - CSV_ADDED_ATTRS.each do |attr| - vocab[attr] = csv_vocab[attr] - end - vocab -end - -def append_to_csv(vocab, filename) - vocab = copy_added_values_to_csv(vocab) - CSV.open(filename, "a", force_quotes: true) do |csv| - filtered_row = (CSV_MAIN_ATTRS + CSV_ADDED_ATTRS).map { |attr| format_value(vocab[attr]) } - csv << filtered_row - end -end - - + class CSVGenerator -def extract_uri_or_value(node) - case node - when RDF::URI - node.to_s - when RDF::Literal - if node.has_language? - { value: node.object, language: node.language.to_s } - else - node.object + def initialize_csv(filename) + CSV.open(filename, "w", force_quotes: true) do |csv| + csv << CSV_MAIN_ATTRS + CSV_ADDED_ATTRS # Write header row + end end - else - node.to_s - end -end - -def extract_distribution_info(graph, distribution_uri) - info_distribution = { - uri: distribution_uri, - issued: graph.query([distribution_uri, RDF::URI("http://purl.org/dc/terms/issued"), nil]).first&.object, - language: graph.query([distribution_uri, RDF::URI("http://purl.org/dc/terms/language"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - title: graph.query([distribution_uri, RDF::URI("http://purl.org/dc/terms/title"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - # Relations - extends: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#extends"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - specializes: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#specializes"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - generalizes: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#generalizes"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - hasEquivalencesWith: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#hasEquivalencesWith"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - hasDisjunctionsWith: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#hasDisjunctionsWith"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - metadataVoc: graph.query([distribution_uri, RDF::URI("http://purl.org/vocommons/voaf#metadataVoc"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - imports: graph.query([distribution_uri, RDF::URI("http://www.w3.org/2002/07/owl#imports"), nil]).map { |statement| extract_uri_or_value(statement.object) } - } - info_distribution -end - -def extract_vocabulary_info(graph, vocab_uri) - info = { - uri: vocab_uri, - namespace: graph.query([vocab_uri, RDF::URI("http://purl.org/vocab/vann/preferredNamespaceUri"), nil]).first&.object, - prefix: graph.query([vocab_uri, RDF::URI("http://purl.org/vocab/vann/preferredNamespacePrefix"), nil]).first&.object, - title: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/title"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - description: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/description"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - keyword: graph.query([vocab_uri, RDF::URI("http://www.w3.org/ns/dcat#keyword"), nil]).map(&:object), - issued: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/issued"), nil]).first&.object, - modified: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/modified"), nil]).first&.object, - isDefinedBy: graph.query([vocab_uri, RDF::URI("http://www.w3.org/2000/01/rdf-schema#isDefinedBy"), nil]).first&.object, - homepage: graph.query([vocab_uri, RDF::URI("http://xmlns.com/foaf/0.1/homepage"), nil]).first&.object, - creator: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/creator"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - contributor: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/contributor"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - publisher: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/publisher"), nil]).first&.object, - language: graph.query([vocab_uri, RDF::URI("http://purl.org/dc/terms/language"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - - type: graph.query([vocab_uri, RDF.type, nil]).first&.object, - - occurrences: graph.query([vocab_uri, RDF::URI("http://purl.org/vocommons/voaf#occurrencesInDatasets"), nil]).first&.object, - reused_by_datasets: graph.query([vocab_uri, RDF::URI("http://purl.org/vocommons/voaf#reusedByDatasets"), nil]).first&.object, - reused_by_vocabs: graph.query([vocab_uri, RDF::URI("http://purl.org/vocommons/voaf#reusedByVocabularies"), nil]).first&.object, - distribution: graph.query([vocab_uri, RDF::URI("http://www.w3.org/ns/dcat#distribution"), nil]).first&.object, - } - - review = graph.query([vocab_uri, RDF::URI("http://purl.org/stuff/rev#hasReview"), nil]).first&.object - if review - info[:review] = { - creator: graph.query([review, RDF::URI("http://purl.org/dc/terms/creator"), nil]).first&.object, - date: graph.query([review, RDF::URI("http://purl.org/dc/terms/date"), nil]).first&.object, - text: graph.query([review, RDF::URI("http://purl.org/stuff/rev#text"), nil]).first&.object - } - end - - info[:lastModifiedInLOVAt] = graph.query([RDF::URI("https://lov.linkeddata.es/dataset/lov/vocabs/#{info[:prefix]}"), RDF::URI("http://purl.org/dc/terms/modified") , nil]).first&.object - - info -end - -def extract_agent_info(graph, agent_uri) - info = { - name: graph.query([agent_uri, RDF::URI("http://xmlns.com/foaf/0.1/name"), nil]).first&.object, - sameAs: graph.query([agent_uri, RDF::URI("http://www.w3.org/2002/07/owl#sameAs"), nil]).map { |statement| extract_uri_or_value(statement.object) }, - } -end - -def find_vocabularies(graph) - vocab_type = RDF::URI("http://purl.org/vocommons/voaf#Vocabulary") - graph.query([nil, RDF.type, vocab_type]).map(&:subject) -end + # We copy the added attributes from dispatch csv + def copy_added_values_to_csv(vocab) + csv_vocab = find_csv_row_by_prefix(CSV_DISPATCH_FILENAME, vocab[:prefix]).to_h + CSV_ADDED_ATTRS.each do |attr| + vocab[attr] = csv_vocab[attr] + end + vocab + end -def find_agents(graph) - agent_type = RDF::URI("http://xmlns.com/foaf/0.1/Person") - graph.query([nil, RDF.type, agent_type]).map(&:subject) -end + def append_to_csv(vocab, filename) + vocab = copy_added_values_to_csv(vocab) if File.exist?(CSV_DISPATCH_FILENAME) + CSV.open(filename, "a", force_quotes: true) do |csv| + filtered_row = (CSV_MAIN_ATTRS + CSV_ADDED_ATTRS).map { |attr| format_value(vocab[attr]) } + csv << filtered_row + end + end + def find_csv_row_by_prefix(file_path, prefix_value) + CSV.foreach(file_path, headers: true, header_converters: :symbol) do |row| + if row[:prefix] == prefix_value + return row + end + end + nil + end -def format_value(value) - case value - when Array - value.map { |v| format_value(v) }.join("\n") - when Hash - if value[:language] - "\"#{value[:value]}\"@#{value[:language]}" - else - value.to_s + def format_value(value) + case value + when Array + value.map { |v| format_value(v) }.join("\n") + when Hash + if value[:language] + "\"#{value[:value]}\"@#{value[:language]}" + else + value.to_s + end + else + value.to_s.empty? ? "None" : value.to_s + end end - else - value.to_s.empty? ? "None" : value.to_s + end end - def print_vocabulary_info(info) puts "-------------------------------------------------------------------" info.each do |key, value| @@ -362,36 +282,47 @@ def print_vocabulary_info(info) puts "\n" end +def logger(text, &block) + puts ">> #{text} starting..." + time = Benchmark.realtime do + block.call + end + puts "#{text} finished in #{time} seconds" +end + # Start of the script -''' -if update_local_file("https://lov.linkeddata.es/lov.n3.gz","lov.n3.gz") - puts "The local file was updated." -else - puts "The local file remains unchanged." -end''' - -initialize_csv(CSV_FILENAME) - -graph = parse_n3_file('test.n3') - -vocab_uris = find_vocabularies(graph) -agent_uris = find_agents(graph) - -if vocab_uris.empty? - puts "No vocabularies found in the file." -else - if options[:vocabs].first == :all - puts "Found #{vocab_uris.size} vocabularies:\n\n" - vocab_uris.each do |vocab_uri| - vocab_info = extract_vocabulary_info(graph, vocab_uri) - append_to_csv(vocab_info, CSV_FILENAME) - print_vocabulary_info(vocab_info) - end - agent_uris.each do |agent_uri| - - puts extract_agent_info(graph, agent_uri) - +def main + parser = LOVMigrator::DumpParser.new + updated = false + logger("Download dump file from LOV") do + updated = parser.update_local_file("https://lov.linkeddata.es/lov.n3.gz", "lov.n3.gz") + end + if updated + puts "The local file was updated." + else + puts "The local file remains unchanged." + end + + csv_generator = LOVMigrator::CSVGenerator.new + logger("Initialize CSV #{CSV_FILENAME}") do + csv_generator.initialize_csv(CSV_FILENAME) + end + + graph = [] + logger('Parsing the n3 in memory') do + graph = parser.parse_n3_file('lov.n3') + end + puts "Found #{graph.size} vocabularies" + if graph.empty? + puts "No vocabularies found in the file." + else + logger("Start creating CSV #{CSV_FILENAME}") do + graph.each do |vocab_uri, vocab_data| + vocab_info = LOVMigrator::Models::Vocabulary.new.extract_vocabulary_info(graph, vocab_uri, vocab_data) + csv_generator.append_to_csv(vocab_info, CSV_FILENAME) + end end end -end +end +main From 1108bfdee9b32e6401acfc54c299859d27968b3c Mon Sep 17 00:00:00 2001 From: Muhammad Date: Thu, 24 Oct 2024 21:44:06 +0100 Subject: [PATCH 15/15] add `rdf` and `rdf/n3` gems to the Gemfile --- Gemfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Gemfile b/Gemfile index 7d616193..51f14d40 100644 --- a/Gemfile +++ b/Gemfile @@ -23,6 +23,8 @@ gem 'sys-proctable' gem 'request_store' gem 'parallel' gem 'json-ld' +gem 'rdf' +gem 'rdf-n3' # Monitoring gem 'cube-ruby', require: 'cube'