Skip to content

Commit

Permalink
Data exports (#101)
Browse files Browse the repository at this point in the history
Data Exports functionality is responsible for:
* Generating transcription data files and saving them to Azure Blob Storage upon transcription approval. This includes a raw data file (.json), a consensus text file (.txt), a metadata file (.csv), and a line metadata file (.csv)
* Removing transcription data files from storage if a transcription is unapproved
* Downloading all transcription files pertaining to a requested project, workflow, group, or single transcription, zipping the files, and sending them to the user
* Generating a single CSV file containing the metadata for all transcriptions included in the collection (handled by the `AggregateMetadataFileGenerator` class), which is included in the zip file
  • Loading branch information
nciemniak authored Mar 4, 2020
1 parent c1d9c4f commit 2b991e4
Show file tree
Hide file tree
Showing 32 changed files with 1,934 additions and 43 deletions.
6 changes: 6 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ gem 'puma', '~> 4.3'
gem 'panoptes-client'
gem 'pundit'

# Connect to Azure Storage with Rails Active Storage
gem 'azure-storage'
gem 'azure-storage-blob'

# Zip archives for the data export downloads (transcription files are zipped
# before being sent to the user)
gem 'rubyzip'

# jsonapi.rb is a bundle that incorporates fast_jsonapi (serialization),
# ransack (filtration), and some RSpec matchers along with some
# boilerplate for pagination and error handling
Expand Down
20 changes: 20 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,22 @@ GEM
minitest (~> 5.1)
tzinfo (~> 1.1)
zeitwerk (~> 2.2)
azure-core (0.1.15)
faraday (~> 0.9)
faraday_middleware (~> 0.10)
nokogiri (~> 1.6)
azure-storage (0.15.0.preview)
azure-core (~> 0.1)
faraday (~> 0.9)
faraday_middleware (~> 0.10)
nokogiri (~> 1.6, >= 1.6.8)
azure-storage-blob (1.1.0)
azure-core (~> 0.1.13)
azure-storage-common (~> 1.0)
nokogiri (~> 1.6, >= 1.6.8)
azure-storage-common (1.1.0)
azure-core (~> 0.1.13)
nokogiri (~> 1.6, >= 1.6.8)
bootsnap (1.4.5)
msgpack (~> 1.0)
builder (3.2.3)
Expand Down Expand Up @@ -214,6 +230,7 @@ GEM
rspec-mocks (~> 3.9.0)
rspec-support (~> 3.9.0)
rspec-support (3.9.0)
rubyzip (2.1.0)
sentry-raven (2.13.0)
faraday (>= 0.7.6, < 1.0)
simplecov (0.17.1)
Expand Down Expand Up @@ -251,6 +268,8 @@ PLATFORMS
ruby

DEPENDENCIES
azure-storage
azure-storage-blob
bootsnap (>= 1.4.2)
coveralls
factory_bot_rails
Expand All @@ -267,6 +286,7 @@ DEPENDENCIES
rack-cors
rails (~> 6.0.1)
rspec-rails
rubyzip
sentry-raven
simplecov
spring
Expand Down
10 changes: 8 additions & 2 deletions app/controllers/application_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ class ApplicationController < ActionController::Base

attr_reader :current_user, :auth_token
before_action :set_user
after_action :verify_authorized, except: :index
after_action :verify_policy_scoped, only: :index
after_action :verify_authorized, except: [:index]
after_action :verify_policy_scoped, only: [:index]

include ErrorExtender
include JSONAPI::Pagination
Expand Down Expand Up @@ -85,4 +85,10 @@ def jsonapi_meta(resources)
pagination = jsonapi_pagination_meta(resources)
{ pagination: pagination } if pagination.present?
end

# Sends a generated export archive to the client as a download named
# 'export.zip' with the 'application/zip' content type.
#
# The archive is opened in binary mode so its bytes reach send_data
# unmodified and tagged as binary rather than as text in the default
# external encoding.
#
# NOTE(review): the whole file is read into memory before sending, presumably
# because the zip lives in a temporary location that may be gone once the
# caller's block returns — confirm against DataExports::DataStorage.
#
# @param zip_file [String] path of the zip archive to send
# @return whatever send_data returns
def send_export_file(zip_file)
  File.open(zip_file, 'rb') do |f|
    send_data f.read, filename: 'export.zip', type: 'application/zip'
  end
end
end
10 changes: 10 additions & 0 deletions app/controllers/projects_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@ def show
render jsonapi: @project
end

# Looks up the project named by params[:id], authorizes it, then has
# DataExports::DataStorage zip the project's files and passes the yielded
# zip file on to send_export_file.
def export
  @project = Project.find(params[:id])
  authorize @project

  DataExports::DataStorage.new.zip_project_files(@project) { |zip_file| send_export_file(zip_file) }
end

private

def allowed_filters
Expand Down
41 changes: 39 additions & 2 deletions app/controllers/transcriptions_controller.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
class TranscriptionsController < ApplicationController
include JSONAPI::Deserialization

class NoExportableTranscriptionsError < StandardError; end

before_action :status_filter_to_int, only: :index

def index
Expand All @@ -19,17 +21,52 @@ def update
raise ActionController::BadRequest if type_invalid?
raise ActionController::BadRequest unless whitelisted_attributes?

if approve?
if approving?
authorize @transcription, :approve?
else
authorize @transcription
end

update_attrs['updated_by'] = current_user.login
@transcription.update!(update_attrs)

if @transcription.status_previously_changed?
if approving?
@transcription.upload_files_to_storage
else
@transcription.remove_files_from_storage
end
end

render jsonapi: @transcription
end

# Looks up the transcription named by params[:id], authorizes it, then has
# DataExports::DataStorage zip that transcription's files and passes the
# yielded zip file on to send_export_file.
def export
  @transcription = Transcription.find(params[:id])
  authorize @transcription

  DataExports::DataStorage.new.zip_transcription_files(@transcription) do |zip_file|
    send_export_file(zip_file)
  end
end

# Authorizes the workflow named by params[:workflow_id], collects the
# transcriptions matching params[:group_id] within that workflow, and sends
# them zipped via send_export_file.
#
# Raises NoExportableTranscriptionsError when no transcription matches.
def export_group
  workflow = Workflow.find(params[:workflow_id])
  authorize workflow

  group_scope = { group_id: params[:group_id], workflow_id: params[:workflow_id] }
  @transcriptions = Transcription.where(group_scope)

  if @transcriptions.empty?
    raise NoExportableTranscriptionsError,
          "No exportable transcriptions found for group id '#{params[:group_id]}'"
  end

  DataExports::DataStorage.new.zip_group_files(@transcriptions) { |zip_file| send_export_file(zip_file) }
end

private

def update_attrs
Expand Down Expand Up @@ -73,7 +110,7 @@ def whitelisted_attributes?
update_attrs.keys.all? { |key| update_attr_whitelist.include? key }
end

def approve?
# True when the status requested in update_attrs is "approved".
def approving?
  requested_status = update_attrs["status"]
  requested_status == "approved"
end

Expand Down
10 changes: 10 additions & 0 deletions app/controllers/workflows_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@ def show
render jsonapi: @workflow
end

# Looks up the workflow named by params[:id], authorizes it, then has
# DataExports::DataStorage zip the workflow's files and passes the yielded
# zip file on to send_export_file.
def export
  @workflow = Workflow.find(params[:id])
  authorize @workflow

  DataExports::DataStorage.new.zip_workflow_files(@workflow) { |zip_file| send_export_file(zip_file) }
end

private

def allowed_filters
Expand Down
19 changes: 18 additions & 1 deletion app/models/transcription.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
class Transcription < ApplicationRecord
belongs_to :workflow
has_many_attached :export_files

validates :status, presence: true
validates :group_id, presence: true
Expand All @@ -10,9 +11,25 @@ class Transcription < ApplicationRecord
in_progress: 1,
ready: 2, # ready as in "ready for approval"
unseen: 3

}

# Generates this transcription's export files and attaches each of them to
# export_files under a filename stripped of the temp file's unique suffix.
#
# Every generated temp file is closed and unlinked before the method exits,
# including when an attach raises part-way through, so a failed upload does
# not leave temp files behind.
#
# @return the collection returned by the file generator
def upload_files_to_storage
  temp_files = DataExports::TranscriptionFileGenerator.new(self).generate_transcription_files

  temp_files.each do |temp_file|
    # get filename without the tempfile's randomly generated unique string
    # NOTE(review): this keeps everything before the first '-', so it assumes
    # the generator's own base names contain no '-' — confirm against
    # TranscriptionFileGenerator.
    basename = File.basename(temp_file)
    filename = basename.split('-').first + File.extname(basename)
    export_files.attach(io: temp_file, filename: filename)
  end
ensure
  # temp_files is nil when the generator itself raised; nothing to clean then.
  temp_files&.each do |temp_file|
    temp_file.close
    temp_file.unlink
  end
end

# Purges every file attached as export_files and returns the collected
# results of the individual purge calls.
def remove_files_from_storage
  export_files.map { |export_file| export_file.purge }
end

private
def text_json_is_not_nil
if text.nil?
Expand Down
8 changes: 8 additions & 0 deletions app/policies/application_policy.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ def show?
admin? || (logged_in? && viewer?)
end

# Export is allowed for admins, and otherwise for logged-in users for whom
# editor? holds.
def export?
  admin_access = admin?
  return admin_access if admin_access

  logged_in? && editor?
end

# Group export is allowed for admins, and otherwise for logged-in users for
# whom editor? holds.
def export_group?
  permitted = admin?
  permitted ||= logged_in? && editor?
  permitted
end

# Yields the user's admin flag, but only when logged_in? holds; otherwise the
# (falsy) result of logged_in? is returned and user is never consulted.
def admin?
  authenticated = logged_in?
  return authenticated unless authenticated

  user.admin
end
Expand Down
2 changes: 1 addition & 1 deletion app/policies/transcription_policy.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ def viewer_policy_scope
end
end
end
end
end
78 changes: 78 additions & 0 deletions app/services/data_exports/aggregate_metadata_file_generator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
require 'csv'

module DataExports
  # Helper class for aggregating metadata from individual transcriptions
  # within a group/workflow/project into a single csv file
  # ('transcriptions_metadata.csv' inside the given output folder).
  #
  # The aggregate file carries exactly one header row, taken from the first
  # transcription metadata file encountered, followed by one content row per
  # transcription metadata file.
  class AggregateMetadataFileGenerator
    # Name pattern of the per-transcription metadata file among a
    # transcription's export files. Anchored with \A and \z so the whole
    # filename must match, not just one line of it.
    METADATA_FILE_PATTERN = /\Atranscription_metadata_.*\.csv\z/.freeze

    class << self
      # Public: add metadata csv file to group folder
      # @param transcriptions [Enumerable]: transcriptions of the group
      # @param output_folder [String]: folder the csv file is written to
      def generate_group_file(transcriptions, output_folder)
        metadata_rows = compile_transcription_metadata(transcriptions)
        generate_csv(output_folder, metadata_rows)
      end

      # Public: add metadata csv file to workflow folder
      # @param workflow: workflow whose transcription groups are aggregated
      # @param output_folder [String]: folder the csv file is written to
      def generate_workflow_file(workflow, output_folder)
        metadata_rows = compile_workflow_metadata(workflow)
        generate_csv(output_folder, metadata_rows)
      end

      # Public: add metadata csv file to project folder, covering every
      # workflow of the project
      # @param project: project whose workflows are aggregated
      # @param output_folder [String]: folder the csv file is written to
      def generate_project_file(project, output_folder)
        metadata_rows = []
        # One shared accumulator across workflows keeps the header unique.
        project.workflows.each do |workflow|
          compile_workflow_metadata(workflow, metadata_rows)
        end

        generate_csv(output_folder, metadata_rows)
      end

      private

      # Private: for each transcription, extracts transcription metadata from metadata
      # storage file, adds it to the metadata_rows array, which will be passed to a
      # csv file generator.
      # @param transcriptions [Enumerable]: transcriptions to read metadata from
      # @param metadata_rows [Array]: collection of metadata rows for the current
      # group/workflow/project being processed; the header is only added while
      # this collection is still empty
      # returns updated metadata_rows array
      def compile_transcription_metadata(transcriptions, metadata_rows = [])
        transcriptions.each do |transcription|
          transcription.export_files.each do |storage_file|
            next unless METADATA_FILE_PATTERN.match?(storage_file.filename.to_s)

            header, content = CSV.parse(storage_file.download)

            # add header if it's the first transcription being added
            metadata_rows << header if header && metadata_rows.empty?
            # add content whenever the file has a content row; an empty or
            # header-only file contributes nothing instead of a nil row
            metadata_rows << content if content
          end
        end

        metadata_rows
      end

      # Private: compiles the metadata rows of every transcription group of the
      # workflow into metadata_rows.
      # @param workflow: workflow providing transcription_group_data and id
      # @param metadata_rows [Array]: accumulator shared across groups (and
      # across workflows when compiling a project) so the header is added once
      # returns updated metadata_rows array
      def compile_workflow_metadata(workflow, metadata_rows = [])
        workflow.transcription_group_data.each_key do |group_key|
          # Scope by workflow as well as group so that a group id reused by
          # another workflow does not pull that workflow's rows into this file.
          transcriptions = Transcription.where(group_id: group_key, workflow_id: workflow.id)
          compile_transcription_metadata(transcriptions, metadata_rows)
        end

        metadata_rows
      end

      # Private: writes metadata_rows to 'transcriptions_metadata.csv' inside
      # output_folder, replacing any existing file of that name.
      def generate_csv(output_folder, metadata_rows)
        metadata_file = File.join(output_folder, 'transcriptions_metadata.csv')

        CSV.open(metadata_file, 'wb') do |csv|
          metadata_rows.each { |row| csv << row }
        end
      end
    end
  end
end
Loading

0 comments on commit 2b991e4

Please sign in to comment.