Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: add LOV Migration Script #24

Closed
Closed
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
328 changes: 328 additions & 0 deletions bin/lov_migrator
Original file line number Diff line number Diff line change
@@ -0,0 +1,328 @@
#!/usr/bin/env ruby

# Exit cleanly from an early interrupt
Signal.trap("INT") { exit 1 }

require 'optparse'

options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: lov_migrator [options]"

  # Process every vocabulary found in the LOV dump.
  opts.on('-a', '--all') do
    options[:vocabs] = [:all]
  end

  # FIX: the original handler stored the value in an unused local variable
  # (`ontologies_acronyms`), so -v/--vocabularies silently did nothing and the
  # MissingArgument guard below always fired unless -a was also given.
  opts.on('-v', '--vocabularies PREFIX1, PREFIX2', 'Comma-separated list of vocabularies to update or import') do |acronym|
    options[:vocabs] = acronym.split(',').map(&:strip)
  end

  # Display the help screen, all programs are assumed to have this option.
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end.parse!

raise OptionParser::MissingArgument if options[:vocabs].nil?

require 'bundler/setup'
require 'rdf'
require 'rdf/n3'
require 'csv'
require 'open-uri'
require 'digest'
require 'benchmark'
require 'parallel'
require 'set'  # FIX: used by DumpParser#parse_n3_file (not autoloaded before Ruby 3.2)
require 'zlib' # FIX: Zlib::GzipReader is used by DumpParser#update_local_file

# Columns copied straight from the parsed LOV dump.
CSV_MAIN_ATTRS = [ :prefix, :title, :description, :keyword, :creator, :uri, :lastModifiedInLOVAt ]
# Columns maintained by hand in the dispatch CSV and carried over on regeneration.
CSV_ADDED_ATTRS = [ :destination, :who, :comment ]
CSV_DISPATCH_FILENAME = 'LOV_vocabularies_dispatch.csv'

# TODO(review): this should be an argument of the script.
CSV_FILENAME = "vocabs.csv"

module LOVMigrator


module Models
# Base helper shared by the LOV model extractors: converts RDF terms into
# plain Ruby values and reads predicate values out of a subject hash.
class Model
  # Normalizes a single RDF term:
  # * RDF::URI                 -> its String form
  # * language-tagged literal  -> { value:, language: }
  # * plain literal            -> the underlying Ruby object
  # * anything else            -> String form
  def extract_uri_or_value(node)
    if node.is_a?(RDF::URI)
      node.to_s
    elsif node.is_a?(RDF::Literal)
      if node.has_language?
        { value: node.object, language: node.language.to_s }
      else
        node.object
      end
    else
      node.to_s
    end
  end

  # Reads the object(s) stored under +predicate+ (a String URI) in +vocab+,
  # a Hash keyed by RDF::URI. Returns an Array when list: true, otherwise
  # the first value (nil when absent).
  def get_value(vocab, predicate, list: false)
    raw = Array(vocab[RDF::URI.new(predicate)])
    extracted = raw.map { |term| extract_uri_or_value(term) }
    list ? extracted : extracted.first
  end
end
class Vocabulary < Model

  # Builds a flat info Hash for one vocabulary.
  #
  # graph     - Hash mapping RDF::URI subjects to { predicate => object(s) },
  #             as produced by DumpParser#parse_n3_file.
  # vocab_uri - RDF::URI of the vocabulary.
  # vocab     - the predicate hash for vocab_uri; nil returns nil.
  def extract_vocabulary_info(graph, vocab_uri, vocab)
    return unless vocab

    info = {
      uri: vocab_uri,
      namespace: get_value(vocab, "http://purl.org/vocab/vann/preferredNamespaceUri"),
      prefix: get_value(vocab, "http://purl.org/vocab/vann/preferredNamespacePrefix"),
      title: get_value(vocab, "http://purl.org/dc/terms/title", list: true),
      description: get_value(vocab, "http://purl.org/dc/terms/description", list: true),
      keyword: get_value(vocab, "http://www.w3.org/ns/dcat#keyword", list: true),
      issued: get_value(vocab, "http://purl.org/dc/terms/issued"),
      modified: get_value(vocab, "http://purl.org/dc/terms/modified"),

      isDefinedBy: get_value(vocab, "http://www.w3.org/2000/01/rdf-schema#isDefinedBy"),
      homepage: get_value(vocab, "http://xmlns.com/foaf/0.1/homepage"),
      creator: get_value(vocab, "http://purl.org/dc/terms/creator", list: true),
      contributor: get_value(vocab, "http://purl.org/dc/terms/contributor", list: true),
      publisher: get_value(vocab, "http://purl.org/dc/terms/publisher"),
      language: get_value(vocab, "http://purl.org/dc/terms/language", list: true),
      type: get_value(vocab, RDF.type),
      occurrences: get_value(vocab, "http://purl.org/vocommons/voaf#occurrencesInDatasets"),
      reused_by_datasets: get_value(vocab, "http://purl.org/vocommons/voaf#reusedByDatasets"),
      reused_by_vocabs: get_value(vocab, "http://purl.org/vocommons/voaf#reusedByVocabularies"),
      distribution: get_value(vocab, "http://www.w3.org/ns/dcat#distribution"),
    }

    # FIX: get_value converts the review node to a String, while +graph+ is
    # keyed by RDF::URI, so the original `graph.dig(review)` could never find
    # anything. Re-wrap the value before the lookup.
    # NOTE(review): parse_n3_file currently only collects voaf:Vocabulary
    # subjects, so review subjects may still be absent from the hash — confirm.
    review_ref = get_value(vocab, "http://purl.org/stuff/rev#hasReview")
    review = review_ref && graph[RDF::URI.new(review_ref)]
    if review
      info[:review] = {
        creator: get_value(review, "http://purl.org/dc/terms/creator"),
        date: get_value(review, "http://purl.org/dc/terms/date"),
        text: get_value(review, "http://purl.org/stuff/rev#text")
      }
    end

    # Duplicates :modified above; kept because the CSV column is named
    # lastModifiedInLOVAt (see CSV_MAIN_ATTRS).
    info[:lastModifiedInLOVAt] = get_value(vocab, "http://purl.org/dc/terms/modified")

    info
  end

end

# FIX: reworked to read from the Hash produced by DumpParser#parse_n3_file.
# The original called graph.query (an RDF::Graph API, unavailable on a Hash)
# and used extract_uri_or_value without inheriting it from Model.
class Agent < Model

  # Builds an info Hash for one agent (person/organisation).
  #
  # graph     - Hash mapping RDF::URI subjects to { predicate => object(s) }.
  # agent_uri - RDF::URI of the agent; returns nil when unknown to the graph.
  def extract_agent_info(graph, agent_uri)
    agent = graph[agent_uri]
    return unless agent

    {
      name: get_value(agent, "http://xmlns.com/foaf/0.1/name"),
      sameAs: get_value(agent, "http://www.w3.org/2002/07/owl#sameAs", list: true),
    }
  end

end

# FIX: reworked to read from the Hash produced by DumpParser#parse_n3_file.
# The original called graph.query (an RDF::Graph API, unavailable on a Hash)
# and used extract_uri_or_value without inheriting it from Model.
class Distribution < Model

  # Builds an info Hash for one distribution (a versioned vocabulary file).
  #
  # graph            - Hash mapping RDF::URI subjects to { predicate => object(s) }.
  # distribution_uri - RDF::URI of the distribution; returns nil when unknown.
  def extract_distribution_info(graph, distribution_uri)
    distribution = graph[distribution_uri]
    return unless distribution

    {
      uri: distribution_uri,
      issued: get_value(distribution, "http://purl.org/dc/terms/issued"),
      language: get_value(distribution, "http://purl.org/dc/terms/language", list: true),
      title: get_value(distribution, "http://purl.org/dc/terms/title", list: true),
      # Relations
      extends: get_value(distribution, "http://purl.org/vocommons/voaf#extends", list: true),
      specializes: get_value(distribution, "http://purl.org/vocommons/voaf#specializes", list: true),
      generalizes: get_value(distribution, "http://purl.org/vocommons/voaf#generalizes", list: true),
      hasEquivalencesWith: get_value(distribution, "http://purl.org/vocommons/voaf#hasEquivalencesWith", list: true),
      hasDisjunctionsWith: get_value(distribution, "http://purl.org/vocommons/voaf#hasDisjunctionsWith", list: true),
      metadataVoc: get_value(distribution, "http://purl.org/vocommons/voaf#metadataVoc", list: true),
      imports: get_value(distribution, "http://www.w3.org/2002/07/owl#imports", list: true)
    }
  end

end
end

# Downloads the LOV dump, keeps the local copy up to date, and parses the
# decompressed N3 file into a plain Ruby Hash keyed by vocabulary subject.
class DumpParser
  # Name of the decompressed dump written next to the downloaded .gz archive.
  DECOMPRESSED_FILENAME = 'lov.n3'

  # Streams +url+ into +destination+ (binary mode, so the .gz is not mangled).
  def download_file(url, destination)
    URI.open(url) do |file|
      File.open(destination, 'wb') do |output|
        output.write(file.read)
      end
    end
    puts "File downloaded successfully as #{destination}"
  end

  # SHA-256 hex digest of a file's contents.
  def file_hash(file_path)
    Digest::SHA256.file(file_path).hexdigest
  end

  # True when +file2+ exists and both files have identical contents.
  def files_identical?(file1, file2)
    File.exist?(file2) && file_hash(file1) == file_hash(file2)
  end

  # Re-downloads the dump and replaces +local_file_path+ only when its content
  # changed. Returns true when the local file was updated (and re-decompressed
  # to lov.n3), false when it was already up to date.
  def update_local_file(url, local_file_path)
    require 'zlib' # FIX: Zlib was used without ever being required

    downloaded_file_name = 'lov.n3.gz.tmp'
    download_file(url, downloaded_file_name)

    if files_identical?(downloaded_file_name, local_file_path)
      File.delete(downloaded_file_name) # Clean up the temporary file
      # FIX: the original only decompressed on change, so a fresh checkout
      # with an up-to-date .gz never produced lov.n3 and parsing failed.
      decompress(local_file_path) unless File.exist?(DECOMPRESSED_FILENAME)
      return false
    else
      File.rename(downloaded_file_name, local_file_path) # Replace the old file with the new one
      # decompress the downloaded gzip archive
      decompress(local_file_path)
      return true
    end
  end

  # Parses the N3/Turtle dump, keeping only voaf:Vocabulary subjects.
  # Returns { RDF::URI subject => { RDF::URI predicate => object or [objects] } }.
  #
  # NOTE(review): this relies on the dump listing the rdf:type triple before
  # (or at the point of) a vocabulary's other triples — triples seen before a
  # subject is identified as a vocabulary are dropped. Confirm the dump's
  # serialization order guarantees this.
  def parse_n3_file(file_path)
    require 'set' # FIX: Set is not autoloaded before Ruby 3.2

    # The RDF type we filter for
    vocabulary_type = RDF::URI("http://purl.org/vocommons/voaf#Vocabulary")

    vocabularies_hash = {}
    vocabulary_subjects = Set.new

    # Single streaming pass over the file.
    # NOTE(review): RDF::Turtle::Reader is used but only rdf/n3 is required at
    # the top of the script — confirm rdf/n3 pulls in the Turtle reader.
    RDF::Turtle::Reader.open(file_path) do |reader|
      reader.each_statement do |statement|
        subject = statement.subject

        # If the statement is of type `voaf:Vocabulary`, track the subject
        if statement.predicate == RDF.type && statement.object == vocabulary_type
          vocabulary_subjects.add(subject)
          vocabularies_hash[subject] ||= {}
        end

        # Collect all triples of subjects identified as vocabularies,
        # promoting repeated predicates to Arrays.
        if vocabulary_subjects.include?(subject)
          vocabularies_hash[subject] ||= {}
          old_value = vocabularies_hash[subject][statement.predicate]
          if old_value.nil?
            vocabularies_hash[subject][statement.predicate] = statement.object
          else
            vocabularies_hash[subject][statement.predicate] = Array(old_value) + [statement.object]
          end
        end
      end
    end
    vocabularies_hash
  end

  private

  # Decompresses +gzip_path+ into DECOMPRESSED_FILENAME.
  def decompress(gzip_path)
    Zlib::GzipReader.open(gzip_path) do |gzip|
      File.open(DECOMPRESSED_FILENAME, 'w') do |file|
        file.write(gzip.read)
      end
    end
  end
end

# Writes the vocabulary CSV, carrying over hand-maintained columns from the
# dispatch CSV when it is present.
class CSVGenerator

  # Creates (or truncates) +filename+ and writes the header row.
  def initialize_csv(filename)
    CSV.open(filename, "w", force_quotes: true) do |csv|
      csv << CSV_MAIN_ATTRS + CSV_ADDED_ATTRS
    end
  end

  # Copies the hand-maintained columns for this vocabulary's prefix from the
  # dispatch CSV into +vocab+ (nil when the prefix is not listed there).
  def copy_added_values_to_csv(vocab)
    dispatch_row = find_csv_row_by_prefix(CSV_DISPATCH_FILENAME, vocab[:prefix]).to_h
    CSV_ADDED_ATTRS.each { |attr| vocab[attr] = dispatch_row[attr] }
    vocab
  end

  # Appends one vocabulary as a formatted CSV row to +filename+.
  def append_to_csv(vocab, filename)
    vocab = copy_added_values_to_csv(vocab) if File.exist?(CSV_DISPATCH_FILENAME)
    columns = CSV_MAIN_ATTRS + CSV_ADDED_ATTRS
    CSV.open(filename, "a", force_quotes: true) do |csv|
      csv << columns.map { |attr| format_value(vocab[attr]) }
    end
  end

  # Returns the first row of +file_path+ whose :prefix column equals
  # +prefix_value+, or nil when none matches.
  def find_csv_row_by_prefix(file_path, prefix_value)
    CSV.foreach(file_path, headers: true, header_converters: :symbol) do |row|
      return row if row[:prefix] == prefix_value
    end
    nil
  end

  # Renders a value for a CSV cell: Arrays join on newlines, language-tagged
  # hashes become "value"@lang, and blanks become "None".
  def format_value(value)
    if value.is_a?(Array)
      value.map { |item| format_value(item) }.join("\n")
    elsif value.is_a?(Hash)
      value[:language] ? "\"#{value[:value]}\"@#{value[:language]}" : value.to_s
    else
      text = value.to_s
      text.empty? ? "None" : text
    end
  end

end
end

# Debug helper: pretty-prints one vocabulary info Hash to stdout.
# TODO(review): remove — not needed in the final version.
def print_vocabulary_info(info)
  # FIX: format_value is an instance method of LOVMigrator::CSVGenerator, not
  # a top-level method, so the original call raised NoMethodError.
  formatter = LOVMigrator::CSVGenerator.new
  puts "-------------------------------------------------------------------"
  info.each do |key, value|
    formatted_key = key.to_s.split('_').map(&:capitalize).join(' ')
    puts "#{formatted_key}: #{formatter.format_value(value)}"
  end
  puts "\n"
end

# Runs the given block, printing start/finish markers and the wall-clock
# duration in seconds around it.
def logger(text)
  puts ">> #{text} starting..."
  elapsed = Benchmark.realtime { yield }
  puts "#{text} finished in #{elapsed} seconds"
end

# Start of the script: refresh the dump from LOV, parse it, and regenerate
# the vocabulary CSV.
def main
  parser = LOVMigrator::DumpParser.new
  updated = false
  logger("Download dump file from LOV") do
    updated = parser.update_local_file("https://lov.linkeddata.es/lov.n3.gz", "lov.n3.gz")
  end
  if updated
    puts "The local file was updated."
  else
    puts "The local file remains unchanged."
  end

  csv_generator = LOVMigrator::CSVGenerator.new
  logger("Initialize CSV #{CSV_FILENAME}") do
    csv_generator.initialize_csv(CSV_FILENAME)
  end

  # FIX: parse_n3_file returns a Hash keyed by subject, so the fallback is a
  # Hash too (the original initialized it as an Array).
  graph = {}
  logger('Parsing the n3 in memory') do
    graph = parser.parse_n3_file('lov.n3')
  end
  puts "Found #{graph.size} vocabularies"

  if graph.empty?
    puts "No vocabularies found in the file."
  else
    # Hoisted out of the loop: the extractor carries no per-vocabulary state.
    vocabulary_model = LOVMigrator::Models::Vocabulary.new
    logger("Start creating CSV #{CSV_FILENAME}") do
      graph.each do |vocab_uri, vocab_data|
        vocab_info = vocabulary_model.extract_vocabulary_info(graph, vocab_uri, vocab_data)
        csv_generator.append_to_csv(vocab_info, CSV_FILENAME)
      end
    end
  end
end

main
Loading