Data exports (#101)

Data Exports functionality is responsible for:
* Generating transcription data files and saving them to Azure Blob Storage upon transcription approval. This includes a raw data file (.json), a consensus text file (.txt), a metadata file (.csv), and a line metadata file (.csv).
* Removing transcription data files from storage if a transcription is unapproved.
* Downloading all transcription files pertaining to a requested project, workflow, group, or single transcription, zipping the files, and sending them to the user.
* Generating a single CSV file containing the metadata for all transcriptions included in the collection (handled by the `AggregateMetadataFileGenerator` class), which is included in the zip file.
zooniverse · Mar 4, 2020 · 2b991e4 · 2b991e4
1 parent c1d9c4f
commit 2b991e4
Show file tree

Hide file tree

Showing 32 changed files with 1,934 additions and 43 deletions.
diff --git a/Gemfile b/Gemfile
@@ -8,6 +8,12 @@ gem 'puma', '~> 4.3'
 gem 'panoptes-client'
 gem 'pundit'
 
+# Connect to Azure Storage with Rails Active Storage
+gem 'azure-storage'
+gem 'azure-storage-blob'
+
+gem 'rubyzip'
+
 # jsonapi.rb is a bundle that incorporates fast_jsonapi (serialization),
 # ransack (filtration), and some RSpec matchers along with some
 # boilerplate for pagination and error handling

diff --git a/Gemfile.lock b/Gemfile.lock
@@ -56,6 +56,22 @@ GEM
       minitest (~> 5.1)
       tzinfo (~> 1.1)
       zeitwerk (~> 2.2)
+    azure-core (0.1.15)
+      faraday (~> 0.9)
+      faraday_middleware (~> 0.10)
+      nokogiri (~> 1.6)
+    azure-storage (0.15.0.preview)
+      azure-core (~> 0.1)
+      faraday (~> 0.9)
+      faraday_middleware (~> 0.10)
+      nokogiri (~> 1.6, >= 1.6.8)
+    azure-storage-blob (1.1.0)
+      azure-core (~> 0.1.13)
+      azure-storage-common (~> 1.0)
+      nokogiri (~> 1.6, >= 1.6.8)
+    azure-storage-common (1.1.0)
+      azure-core (~> 0.1.13)
+      nokogiri (~> 1.6, >= 1.6.8)
     bootsnap (1.4.5)
       msgpack (~> 1.0)
     builder (3.2.3)
@@ -214,6 +230,7 @@ GEM
       rspec-mocks (~> 3.9.0)
       rspec-support (~> 3.9.0)
     rspec-support (3.9.0)
+    rubyzip (2.1.0)
     sentry-raven (2.13.0)
       faraday (>= 0.7.6, < 1.0)
     simplecov (0.17.1)
@@ -251,6 +268,8 @@ PLATFORMS
   ruby
 
 DEPENDENCIES
+  azure-storage
+  azure-storage-blob
   bootsnap (>= 1.4.2)
   coveralls
   factory_bot_rails
@@ -267,6 +286,7 @@ DEPENDENCIES
   rack-cors
   rails (~> 6.0.1)
   rspec-rails
+  rubyzip
   sentry-raven
   simplecov
   spring

diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
@@ -5,8 +5,8 @@ class ApplicationController < ActionController::Base
 
   attr_reader :current_user, :auth_token
   before_action :set_user
-  after_action :verify_authorized, except: :index
-  after_action :verify_policy_scoped, only: :index
+  after_action :verify_authorized, except: [:index]
+  after_action :verify_policy_scoped, only: [:index]
 
   include ErrorExtender
   include JSONAPI::Pagination
@@ -85,4 +85,10 @@ def jsonapi_meta(resources)
     pagination = jsonapi_pagination_meta(resources)
     { pagination: pagination } if pagination.present?
   end
+
+  def send_export_file(zip_file)
+    File.open(zip_file, 'r') do |f|
+      send_data f.read, filename: 'export.zip', type: 'application/zip'
+    end
+  end
 end
diff --git a/app/controllers/projects_controller.rb b/app/controllers/projects_controller.rb
@@ -10,6 +10,16 @@ def show
     render jsonapi: @project
   end
 
+  def export
+    @project = Project.find(params[:id])
+    authorize @project
+
+    data_storage = DataExports::DataStorage.new
+    data_storage.zip_project_files(@project) do |zip_file|
+      send_export_file zip_file
+    end
+  end
+
   private
 
   def allowed_filters

diff --git a/app/controllers/transcriptions_controller.rb b/app/controllers/transcriptions_controller.rb
@@ -1,6 +1,8 @@
 class TranscriptionsController < ApplicationController
   include JSONAPI::Deserialization
 
+  class NoExportableTranscriptionsError < StandardError; end
+
   before_action :status_filter_to_int, only: :index
 
   def index
@@ -19,17 +21,52 @@ def update
     raise ActionController::BadRequest if type_invalid?
     raise ActionController::BadRequest unless whitelisted_attributes?
 
-    if approve?
+    if approving?
       authorize @transcription, :approve?
     else
       authorize @transcription
     end
 
     update_attrs['updated_by'] = current_user.login
     @transcription.update!(update_attrs)
+
+    if @transcription.status_previously_changed?
+      if approving?
+        @transcription.upload_files_to_storage
+      else
+        @transcription.remove_files_from_storage
+      end
+    end
+
     render jsonapi: @transcription
   end
 
+  def export
+    @transcription = Transcription.find(params[:id])
+    authorize @transcription
+
+    data_storage = DataExports::DataStorage.new
+    data_storage.zip_transcription_files(@transcription) do |zip_file|
+      send_export_file zip_file
+    end
+  end
+
+  def export_group
+    workflow = Workflow.find(params[:workflow_id])
+    authorize workflow
+
+    @transcriptions = Transcription.where(group_id: params[:group_id], workflow_id: params[:workflow_id])
+
+    if @transcriptions.empty?
+      raise NoExportableTranscriptionsError.new("No exportable transcriptions found for group id '#{params[:group_id]}'")
+    end
+
+    data_storage = DataExports::DataStorage.new
+    data_storage.zip_group_files(@transcriptions) do |zip_file|
+      send_export_file zip_file
+    end
+  end
+
   private
 
   def update_attrs
@@ -73,7 +110,7 @@ def whitelisted_attributes?
     update_attrs.keys.all? { |key| update_attr_whitelist.include? key }
   end
 
-  def approve?
+  def approving?
     update_attrs["status"] == "approved"
   end
 

diff --git a/app/controllers/workflows_controller.rb b/app/controllers/workflows_controller.rb
@@ -10,6 +10,16 @@ def show
     render jsonapi: @workflow
   end
 
+  def export
+    @workflow = Workflow.find(params[:id])
+    authorize @workflow
+
+    data_storage = DataExports::DataStorage.new
+    data_storage.zip_workflow_files(@workflow) do |zip_file|
+      send_export_file zip_file
+    end
+  end
+
   private
 
   def allowed_filters

diff --git a/app/models/transcription.rb b/app/models/transcription.rb
@@ -1,5 +1,6 @@
 class Transcription < ApplicationRecord
   belongs_to :workflow
+  has_many_attached :export_files
 
   validates :status, presence: true
   validates :group_id, presence: true
@@ -10,9 +11,25 @@ class Transcription < ApplicationRecord
     in_progress: 1,
     ready: 2, # ready as in "ready for approval"
     unseen: 3
-
   }
 
+  def upload_files_to_storage
+    file_generator = DataExports::TranscriptionFileGenerator.new(self)
+    file_generator.generate_transcription_files.each do |temp_file|
+      # get filename without the temfile's randomly generated unique string
+      basename = File.basename(temp_file)
+      filename = basename.split('-').first + File.extname(basename)
+      export_files.attach(io: temp_file, filename: filename)
+
+      temp_file.close
+      temp_file.unlink
+    end
+  end
+
+  def remove_files_from_storage
+    export_files.map(&:purge)
+  end
+
   private
   def text_json_is_not_nil
     if text.nil?

diff --git a/app/policies/application_policy.rb b/app/policies/application_policy.rb
@@ -16,6 +16,14 @@ def show?
     admin? || (logged_in? && viewer?)
   end
 
+  def export?
+    admin? || (logged_in? && editor?)
+  end
+
+  def export_group?
+    admin? || (logged_in? && editor?)
+  end
+
   def admin?
     logged_in? && user.admin
   end

diff --git a/app/policies/transcription_policy.rb b/app/policies/transcription_policy.rb
@@ -35,4 +35,4 @@ def viewer_policy_scope
       end
     end
   end
-end
+end
diff --git a/app/services/data_exports/aggregate_metadata_file_generator.rb b/app/services/data_exports/aggregate_metadata_file_generator.rb
@@ -0,0 +1,78 @@
+require 'csv'
+
+module DataExports
+  # Helper class for aggregating metadata from individual transcriptions
+  # within a group/workflow/project into a single csv file
+  class AggregateMetadataFileGenerator
+    class << self
+      # Public: add metadata csv file to group folder
+      def generate_group_file(transcriptions, output_folder)
+        metadata_rows = compile_transcription_metadata(transcriptions)
+        generate_csv(output_folder, metadata_rows)
+      end
+
+      # Public: add metadata csv file to workflow folder
+      def generate_workflow_file(workflow, output_folder)
+        metadata_rows = compile_workflow_metadata(workflow)
+        generate_csv(output_folder, metadata_rows)
+      end
+
+      def generate_project_file(project, output_folder)
+        metadata_rows = []
+        project.workflows.each do |w|
+          metadata_rows += compile_workflow_metadata(w)
+        end
+
+        generate_csv(output_folder, metadata_rows)
+      end
+
+      private
+
+      # Private: for each transcription, extracts transcription metadata from metadata
+      # storage file, adds it to the metadata_rows array, which will be passed to a
+      # csv file generator.
+      # @param metadata_rows [Array]: collection of metadata rows for the current
+      # group/workflow/project being processed
+      # returns updated metadata_rows array
+      def compile_transcription_metadata(transcriptions)
+        metadata_rows = []
+        metadata_file_regex = /^transcription_metadata_.*\.csv$/
+
+        transcriptions.each do |transcription|
+          transcription.export_files.each do |storage_file|
+            is_transcription_metadata_file = metadata_file_regex.match storage_file.filename.to_s
+            if is_transcription_metadata_file
+              rows = CSV.parse(storage_file.download)
+
+              # add header if it's the first transcription being added
+              metadata_rows << rows[0] if metadata_rows.empty?
+              # add content regardless
+              metadata_rows << rows[1]
+            end
+          end
+        end
+
+        metadata_rows
+      end
+
+      def compile_workflow_metadata(workflow)
+        metadata_rows = []
+
+        workflow.transcription_group_data.each_key do |group_key|
+          transcriptions = Transcription.where(group_id: group_key)
+          metadata_rows += compile_transcription_metadata(transcriptions)
+        end
+
+        metadata_rows
+      end
+
+      def generate_csv(output_folder, metadata_rows)
+        metadata_file = File.join(output_folder, 'transcriptions_metadata.csv')
+
+        CSV.open(metadata_file, 'wb') do |csv|
+          metadata_rows.each { |row| csv << row }
+        end
+      end
+    end
+  end
+end
-Original file line number
+Diff line change
@@ Expand Up / @@ -35,4 +35,4 @@ def viewer_policy_scope @@
           end
         end
       end
-    end
+    end