diff --git a/Gemfile b/Gemfile index bda4eeba..66db214b 100644 --- a/Gemfile +++ b/Gemfile @@ -8,6 +8,12 @@ gem 'puma', '~> 4.3' gem 'panoptes-client' gem 'pundit' +# Connect to Azure Storage with Rails Active Storage +gem 'azure-storage' +gem 'azure-storage-blob' + +gem 'rubyzip' + # jsonapi.rb is a bundle that incorporates fast_jsonapi (serialization), # ransack (filtration), and some RSpec matchers along with some # boilerplate for pagination and error handling diff --git a/Gemfile.lock b/Gemfile.lock index 6606b65a..cda0d2f2 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -56,6 +56,22 @@ GEM minitest (~> 5.1) tzinfo (~> 1.1) zeitwerk (~> 2.2) + azure-core (0.1.15) + faraday (~> 0.9) + faraday_middleware (~> 0.10) + nokogiri (~> 1.6) + azure-storage (0.15.0.preview) + azure-core (~> 0.1) + faraday (~> 0.9) + faraday_middleware (~> 0.10) + nokogiri (~> 1.6, >= 1.6.8) + azure-storage-blob (1.1.0) + azure-core (~> 0.1.13) + azure-storage-common (~> 1.0) + nokogiri (~> 1.6, >= 1.6.8) + azure-storage-common (1.1.0) + azure-core (~> 0.1.13) + nokogiri (~> 1.6, >= 1.6.8) bootsnap (1.4.5) msgpack (~> 1.0) builder (3.2.3) @@ -214,6 +230,7 @@ GEM rspec-mocks (~> 3.9.0) rspec-support (~> 3.9.0) rspec-support (3.9.0) + rubyzip (2.1.0) sentry-raven (2.13.0) faraday (>= 0.7.6, < 1.0) simplecov (0.17.1) @@ -251,6 +268,8 @@ PLATFORMS ruby DEPENDENCIES + azure-storage + azure-storage-blob bootsnap (>= 1.4.2) coveralls factory_bot_rails @@ -267,6 +286,7 @@ DEPENDENCIES rack-cors rails (~> 6.0.1) rspec-rails + rubyzip sentry-raven simplecov spring diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index bba5e3d3..6d384ac2 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -5,8 +5,8 @@ class ApplicationController < ActionController::Base attr_reader :current_user, :auth_token before_action :set_user - after_action :verify_authorized, except: :index - after_action 
:verify_policy_scoped, only: :index + after_action :verify_authorized, except: [:index] + after_action :verify_policy_scoped, only: [:index] include ErrorExtender include JSONAPI::Pagination @@ -85,4 +85,10 @@ def jsonapi_meta(resources) pagination = jsonapi_pagination_meta(resources) { pagination: pagination } if pagination.present? end + + def send_export_file(zip_file) + File.open(zip_file, 'r') do |f| + send_data f.read, filename: 'export.zip', type: 'application/zip' + end + end end diff --git a/app/controllers/projects_controller.rb b/app/controllers/projects_controller.rb index 0cbc70f3..15913366 100644 --- a/app/controllers/projects_controller.rb +++ b/app/controllers/projects_controller.rb @@ -10,6 +10,16 @@ def show render jsonapi: @project end + def export + @project = Project.find(params[:id]) + authorize @project + + data_storage = DataExports::DataStorage.new + data_storage.zip_project_files(@project) do |zip_file| + send_export_file zip_file + end + end + private def allowed_filters diff --git a/app/controllers/transcriptions_controller.rb b/app/controllers/transcriptions_controller.rb index d83b84f7..a2a6799b 100644 --- a/app/controllers/transcriptions_controller.rb +++ b/app/controllers/transcriptions_controller.rb @@ -1,6 +1,8 @@ class TranscriptionsController < ApplicationController include JSONAPI::Deserialization + class NoExportableTranscriptionsError < StandardError; end + before_action :status_filter_to_int, only: :index def index @@ -19,7 +21,7 @@ def update raise ActionController::BadRequest if type_invalid? raise ActionController::BadRequest unless whitelisted_attributes? - if approve? + if approving? authorize @transcription, :approve? else authorize @transcription @@ -27,9 +29,44 @@ def update update_attrs['updated_by'] = current_user.login @transcription.update!(update_attrs) + + if @transcription.status_previously_changed? + if approving? 
+ @transcription.upload_files_to_storage + else + @transcription.remove_files_from_storage + end + end + render jsonapi: @transcription end + def export + @transcription = Transcription.find(params[:id]) + authorize @transcription + + data_storage = DataExports::DataStorage.new + data_storage.zip_transcription_files(@transcription) do |zip_file| + send_export_file zip_file + end + end + + def export_group + workflow = Workflow.find(params[:workflow_id]) + authorize workflow + + @transcriptions = Transcription.where(group_id: params[:group_id], workflow_id: params[:workflow_id]) + + if @transcriptions.empty? + raise NoExportableTranscriptionsError.new("No exportable transcriptions found for group id '#{params[:group_id]}'") + end + + data_storage = DataExports::DataStorage.new + data_storage.zip_group_files(@transcriptions) do |zip_file| + send_export_file zip_file + end + end + private def update_attrs @@ -73,7 +110,7 @@ def whitelisted_attributes? update_attrs.keys.all? { |key| update_attr_whitelist.include? key } end - def approve? + def approving? 
update_attrs["status"] == "approved" end diff --git a/app/controllers/workflows_controller.rb b/app/controllers/workflows_controller.rb index c9158d6a..666da645 100644 --- a/app/controllers/workflows_controller.rb +++ b/app/controllers/workflows_controller.rb @@ -10,6 +10,16 @@ def show render jsonapi: @workflow end + def export + @workflow = Workflow.find(params[:id]) + authorize @workflow + + data_storage = DataExports::DataStorage.new + data_storage.zip_workflow_files(@workflow) do |zip_file| + send_export_file zip_file + end + end + private def allowed_filters diff --git a/app/models/transcription.rb b/app/models/transcription.rb index 91d41ce6..e176f629 100644 --- a/app/models/transcription.rb +++ b/app/models/transcription.rb @@ -1,5 +1,6 @@ class Transcription < ApplicationRecord belongs_to :workflow + has_many_attached :export_files validates :status, presence: true validates :group_id, presence: true @@ -10,9 +11,25 @@ class Transcription < ApplicationRecord in_progress: 1, ready: 2, # ready as in "ready for approval" unseen: 3 - } + def upload_files_to_storage + file_generator = DataExports::TranscriptionFileGenerator.new(self) + file_generator.generate_transcription_files.each do |temp_file| + # get filename without the temfile's randomly generated unique string + basename = File.basename(temp_file) + filename = basename.split('-').first + File.extname(basename) + export_files.attach(io: temp_file, filename: filename) + + temp_file.close + temp_file.unlink + end + end + + def remove_files_from_storage + export_files.map(&:purge) + end + private def text_json_is_not_nil if text.nil? diff --git a/app/policies/application_policy.rb b/app/policies/application_policy.rb index 964952d2..bed65023 100644 --- a/app/policies/application_policy.rb +++ b/app/policies/application_policy.rb @@ -16,6 +16,14 @@ def show? admin? || (logged_in? && viewer?) end + def export? + admin? || (logged_in? && editor?) + end + + def export_group? + admin? || (logged_in? && editor?) 
+ end + def admin? logged_in? && user.admin end diff --git a/app/policies/transcription_policy.rb b/app/policies/transcription_policy.rb index 1fff2966..d5aa4693 100644 --- a/app/policies/transcription_policy.rb +++ b/app/policies/transcription_policy.rb @@ -35,4 +35,4 @@ def viewer_policy_scope end end end -end \ No newline at end of file +end diff --git a/app/services/data_exports/aggregate_metadata_file_generator.rb b/app/services/data_exports/aggregate_metadata_file_generator.rb new file mode 100644 index 00000000..e0ac08cf --- /dev/null +++ b/app/services/data_exports/aggregate_metadata_file_generator.rb @@ -0,0 +1,78 @@ +require 'csv' + +module DataExports + # Helper class for aggregating metadata from individual transcriptions + # within a group/workflow/project into a single csv file + class AggregateMetadataFileGenerator + class << self + # Public: add metadata csv file to group folder + def generate_group_file(transcriptions, output_folder) + metadata_rows = compile_transcription_metadata(transcriptions) + generate_csv(output_folder, metadata_rows) + end + + # Public: add metadata csv file to workflow folder + def generate_workflow_file(workflow, output_folder) + metadata_rows = compile_workflow_metadata(workflow) + generate_csv(output_folder, metadata_rows) + end + + def generate_project_file(project, output_folder) + metadata_rows = [] + project.workflows.each do |w| + metadata_rows += compile_workflow_metadata(w) + end + + generate_csv(output_folder, metadata_rows) + end + + private + + # Private: for each transcription, extracts transcription metadata from metadata + # storage file, adds it to the metadata_rows array, which will be passed to a + # csv file generator. 
+ # @param metadata_rows [Array]: collection of metadata rows for the current + # group/workflow/project being processed + # returns updated metadata_rows array + def compile_transcription_metadata(transcriptions) + metadata_rows = [] + metadata_file_regex = /^transcription_metadata_.*\.csv$/ + + transcriptions.each do |transcription| + transcription.export_files.each do |storage_file| + is_transcription_metadata_file = metadata_file_regex.match storage_file.filename.to_s + if is_transcription_metadata_file + rows = CSV.parse(storage_file.download) + + # add header if it's the first transcription being added + metadata_rows << rows[0] if metadata_rows.empty? + # add content regardless + metadata_rows << rows[1] + end + end + end + + metadata_rows + end + + def compile_workflow_metadata(workflow) + metadata_rows = [] + + workflow.transcription_group_data.each_key do |group_key| + transcriptions = Transcription.where(group_id: group_key) + metadata_rows += compile_transcription_metadata(transcriptions) + end + + metadata_rows + end + + def generate_csv(output_folder, metadata_rows) + metadata_file = File.join(output_folder, 'transcriptions_metadata.csv') + + CSV.open(metadata_file, 'wb') do |csv| + metadata_rows.each { |row| csv << row } + end + end + end + end +end diff --git a/app/services/data_exports/data_storage.rb b/app/services/data_exports/data_storage.rb new file mode 100644 index 00000000..7608f64d --- /dev/null +++ b/app/services/data_exports/data_storage.rb @@ -0,0 +1,123 @@ +require 'fileutils' +require 'securerandom' + +module DataExports + class NoStoredFilesFoundError < StandardError; end + + class DataStorage + # Public: downloads all transcription files for a given transcription + # returns path to zip file + def zip_transcription_files(transcription) + if transcription.export_files.attached? 
+ Dir.mktmpdir do |directory_path| + transcription_folder = download_transcription_files(transcription, directory_path) + yield zip_files(directory_path, transcription_folder) + end + else + raise NoStoredFilesFoundError.new("No stored files found for transcription id '#{transcription.id}'") + end + end + + # Public : downloads all transcription group files for a given group + # returns path to zip file + def zip_group_files(transcriptions) + Dir.mktmpdir do |directory_path| + group_folder = File.join(directory_path, "group_#{transcriptions.first.group_id}") + FileUtils.mkdir_p(group_folder) + + AggregateMetadataFileGenerator.generate_group_file(transcriptions, group_folder) + + transcriptions.each do |transcription| + download_transcription_files(transcription, group_folder) if transcription.export_files.attached? + end + + yield zip_files(directory_path, group_folder) + end + end + + # Public : downloads all files for a given workflow + # returns path to zip file + def zip_workflow_files(workflow) + Dir.mktmpdir do |directory_path| + workflow_folder = download_workflow_files(workflow, directory_path) + AggregateMetadataFileGenerator.generate_workflow_file(workflow, workflow_folder) + + yield zip_files(directory_path, workflow_folder) + end + end + + # Public : downloads all files for a given project + # returns path to zip file + def zip_project_files(project) + Dir.mktmpdir do |directory_path| + project_folder = File.join(directory_path, "project_#{project.id}") + FileUtils.mkdir_p(project_folder) + + AggregateMetadataFileGenerator.generate_project_file(project, project_folder) + + project.workflows.each do |w| + download_workflow_files(w, project_folder) + end + + yield zip_files(directory_path, project_folder) + end + end + + private + + # download transcription files for a given transcription from storage to disk + # @param transcription [Transcription]: the transcription we want to retrieve files for + # @param directory_path [String]: path within which we 
will create the transcription file folder + # returns location of generated transcription folder + def download_transcription_files(transcription, directory_path) + transcription_folder = File.join(directory_path, "transcription_#{transcription.id}") + FileUtils.mkdir_p(transcription_folder) + + metadata_file_regex = /^transcription_metadata_.*\.csv$/ + transcription.export_files.each do |storage_file| + is_transcription_metadata_file = metadata_file_regex.match storage_file.filename.to_s + unless is_transcription_metadata_file + download_path = File.join(transcription_folder, storage_file.filename.to_s) + file = File.open(download_path, 'w') + file.write(storage_file.download) + file.close + end + end + + transcription_folder + end + + # download workflow's transcription files from storage to disk + # @param directory_path [String]: path within which we will create the workflow file folder + # returns location of generated workflow folder + def download_workflow_files(workflow, directory_path) + workflow_folder = File.join(directory_path, "workflow_#{workflow.id}") + FileUtils.mkdir_p(workflow_folder) + + workflow.transcription_group_data.each_key do |group_key| + group_folder = File.join(workflow_folder, "group_#{group_key}") + FileUtils.mkdir_p(group_folder) + + transcriptions = Transcription.where(group_id: group_key) + transcriptions.each do |t| + download_transcription_files(t, group_folder) + end + end + + workflow_folder + end + + # @param output_directory [String]: directory into which generated zip file will be output + # @param input_directory [String]: directory to zip + # returns location of zip file + def zip_files(output_directory, input_directory) + zip_file_path = File.join(output_directory, "export.zip") + zip_generator = ZipFileGenerator.new(input_directory, zip_file_path) + zip_generator.write + + FileUtils.rm_rf(input_directory) + + zip_file_path + end + end +end diff --git a/app/services/data_exports/transcription_file_generator.rb 
b/app/services/data_exports/transcription_file_generator.rb new file mode 100644 index 00000000..f5b01d82 --- /dev/null +++ b/app/services/data_exports/transcription_file_generator.rb @@ -0,0 +1,184 @@ +require 'csv' + +module DataExports + class TranscriptionFileGenerator + def initialize(transcription) + @transcription = transcription + end + + def generate_transcription_files + [ + write_raw_data_to_file, + write_consensus_text_to_file, + write_metadata_to_file, + write_line_metadata_to_file + ] + end + + private + + # Private: creates raw data file as tempfile, + # returns tempfile + def write_raw_data_to_file + file = Tempfile.new(["raw_data_#{@transcription.id}-", '.json']) + file.write(@transcription.text) + file.rewind + + file + end + + # Private: creates consensus text file, + # returns location of the file + def write_consensus_text_to_file + file = Tempfile.new(["consensus_text_#{@transcription.id}-", '.txt']) + file.write(consensus_text) + file.rewind + + file + end + + # Private: retrieves and returns consensus text + def consensus_text + full_consensus_text = '' + frame_regex = /^frame/ + + # if we find a frame, iterate through the lines of the frame + frames = @transcription.text.filter { |key, _value| frame_regex.match(key) } + frames.each_value do |value| + value.each do |line| + line_text = if line['edited_consensus_text'].present? 
+ line['edited_consensus_text'] + else + line_text = line['consensus_text'] + end + + full_consensus_text.concat line_text + '\n' + end + # new line after every frame + full_consensus_text.concat '\n' + end + + full_consensus_text + end + + # Private: creates transcription metadata file, + # returns location of the file + def write_metadata_to_file + file = Tempfile.new(["transcription_metadata_#{@transcription.id}-", '.csv']) + + CSV.open(file.path, 'wb') do |csv| + transcription_metadata.each do |csv_line| + csv << csv_line + end + end + + file.rewind + file + end + + # Private: creates transcription line metadata file, + # returns location of the file + def write_line_metadata_to_file + file = Tempfile.new(["transcription_line_metadata_#{@transcription.id}-", '.csv']) + + CSV.open(file.path, 'wb') do |csv| + transcription_line_metadata.each do |csv_line| + csv << csv_line + end + end + + file.rewind + file + end + + # Private: retrieve and return transcription metadata formatted as + # array of csv lines + def transcription_metadata + csv_lines = [] + csv_lines << [ + 'transcription id', + 'internal id', + 'reducer', + 'caesar parameters', + 'date approved', + 'user who approved', + 'text edited (T/F)', + 'number of pages' + ] + csv_lines << [ + @transcription.id, + @transcription.internal_id, + @transcription.reducer, + @transcription.parameters, + @transcription.updated_at, + @transcription.updated_by, + is_text_edited?, + @transcription.total_pages + ] + end + + def is_text_edited? + # iterate through each 'frame' aka 'page' of transcription + frame_regex = /^frame/ + @transcription.text.any? do |key, lines| + frame_regex.match(key) && lines.any? { |line| line['edited_consensus_text'].present? 
} + end + end + + # Private: retrieve and return transcription line metadata formatted as + # array of csv lines + def transcription_line_metadata + csv_lines = [] + csv_lines << [ + 'consensus text', + 'line number', + 'line slope', + 'consensus score', + 'line edited (T/F)', + 'original transcriber username', + 'line editor username', + 'flagged for low consensus (T/F)', + 'page number', + 'column', + 'number of transcribers', + 'line coordinates' + ] + + frame_regex = /^frame/ + @transcription.text.filter { |key, _value| frame_regex.match(key) } + .each_with_index do |(key, value), page_index| + # if we find a frame, iterate through the lines of the frame + page = page_index + 1 + + value.each_with_index do |line, line_index| + line_number = line_index + 1 + column = line['gutter_label'] + 1 + num_transcribers = line['user_ids'].count + line_coordinates = { + 'clusters_x': line['clusters_x'], + 'clusters_y': line['clusters_y'] + } + line_edited = line['edited_consensus_text'].present? + consensus_text = line_edited ? 
line['edited_consensus_text'] : line['consensus_text'] + + csv_lines << [ + consensus_text, + line_number, + line['line_slope'], + line['consensus_score'], + line_edited, + line['original_transcriber'], + line['line_editor'], + line['low_consensus'], + page, + column, + num_transcribers, + line_coordinates + ] + end + end + + csv_lines + end + end +end \ No newline at end of file diff --git a/app/services/project_role_checker.rb b/app/services/project_role_checker.rb index 95a18293..70dbb947 100644 --- a/app/services/project_role_checker.rb +++ b/app/services/project_role_checker.rb @@ -1,5 +1,5 @@ class ProjectRoleChecker - attr_reader :user, :records, :viewer_project_ids + attr_reader :user, :records, :viewer_project_ids, :editor_project_ids EDITOR_ROLES = %w(owner collaborator expert scientist moderator) APPROVER_ROLES = %w(owner collaborator) @@ -9,6 +9,7 @@ def initialize(user, records) @user = user @records = records @viewer_project_ids = get_viewer_project_ids + @editor_project_ids = get_editor_project_ids end def can_edit? @@ -30,6 +31,10 @@ def get_viewer_project_ids user_project_ids(user.roles, VIEWER_ROLES) end + def get_editor_project_ids + user_project_ids(user.roles, EDITOR_ROLES) + end + private def user_project_ids(user_roles, allowed_roles) diff --git a/app/services/zip_file_generator.rb b/app/services/zip_file_generator.rb new file mode 100644 index 00000000..c7edb739 --- /dev/null +++ b/app/services/zip_file_generator.rb @@ -0,0 +1,46 @@ +require 'zip' + +class ZipFileGenerator + # Initialize with the directory to zip and the location of the output archive. + def initialize(input_dir, output_file) + @input_dir = File.expand_path(input_dir) + @output_file = File.expand_path(output_file) + end + + def write + # remove entries referencing curr folder and parent folder + entries = Dir.entries(@input_dir) - %w[. ..] 
+ + Zip::File.open(@output_file, ::Zip::File::CREATE) do |zipfile| + write_entries entries, '', zipfile + end + end + + private + + # A helper method to make the recursion work. + def write_entries(entries, path, zipfile) + entries.each do |e| + # relative path of file being added to the zip + relative_path = path == '' ? e : File.join(path, e) + # full path + full_path = File.join(@input_dir, relative_path) + + if File.directory? full_path + recursively_zip_directory(full_path, zipfile, relative_path) + else + put_into_archive(full_path, zipfile, relative_path) + end + end + end + + def recursively_zip_directory(full_path, zipfile, relative_path) + zipfile.mkdir relative_path + subdir = Dir.entries(full_path) - %w[. ..] + write_entries subdir, relative_path, zipfile + end + + def put_into_archive(full_path, zipfile, relative_path) + zipfile.add(relative_path, full_path) + end +end \ No newline at end of file diff --git a/config/credentials/staging.yml.enc b/config/credentials/staging.yml.enc index e7739ae8..7ae40d44 100644 --- a/config/credentials/staging.yml.enc +++ b/config/credentials/staging.yml.enc @@ -1 +1 @@ 
-sbnWBw7OlGo9mjzBnjfDjIuqlP1FbI7Ctwy6rM193RJSY9kCTOD7L7dT/k425K23d5szQGLzV3EF7JRemCpRRrBfWd8nx3XsVYak1Be80ujTxyFrhKQ+8Do3l/xcFOzfWmfBtBaZpx/DfYV2Vc4gZ03fJtzikaIUl7j/4JrDrZc1AOasPLU81gOSWDLAQ7RdB6CCLVUifA3tdnXAXCKgDW8kf7WjxN6549E0/sk6kZNyWLcO9tR+C4dwkbtEMvD6ADtz7xeYnGaKAodvvyPiIA7H2E0cIB/jwLj+VZNAWCzqSFsS6E+ZnG3tJHYD9LlORYYRE7NMYl3gyQ5h9jIpEH6SlUp96lh9fJZ3Sy4vUVaSVhuDWSZiV2U5HXNBIt7Hyw67Y5vJGjnKdjz/c/64DgkdgADmrCupNEngCJbVetxIW95mkKWYjCf87lOHOsf3+HH33a2pEgUVcNp3jrRpc7hOI8ZdYimKjQKlyprns44g8//tUsZ6nrlo0PH9sYP0rYBdmsmwdD4iGML/CLzAVb/jj1FrOWC43zNjwd70aJhBsjr9L465DWvwE84RvArhF8OZ1KWVBWs+wMeJ5jDqJHw8N1pBnRpj6Xkq3En8N6TNR92+gfrqNXGEwJKyjHXeZkWN/RNKoGhNH2M6VbkMnmm2lk6rScK/lzrXZeuHZZhWLbZcBBm/8RZvc7e9p2FQzqzgGs1hddctEEf5SaupPUnymS1/TX7osGshvOhSYhJcqJXTtOVF0fWg+NcybKDRBa68FilKFgdc8nn7lN6dEQpSTdC4Nf8fMexFNI4Bwr8Q2wXEuTimXuoumTll63TaAnuFSqHE5QSe0CgTCGFxb7Q8H7bjYnEtIXfYurocXNKO+yK3mVLfb1dBJRQlHJqlULJkC7Tly1dSQSenImBw+Ja4x1uI2Mi0Tn4+dV2PPdl5+F5O5kmtXp1IH2ID5ZcGSg2sYtIPcy9eB1FVJ3DfW86uW5pl1iclrtlHyU0axsUllenQ+5mUKxQkPEyqn25PnhEmY8Gqm2Ij0BIXOsHBrOlIVQ/Ojgg+D+YYE1DF3wgonW0J82YenruhRVJsxRjs4RUF2qzx4VCYAdQvlavjPsyVeM1O9+ykr3DUpS/9LU+r9KHFZ29iYgvRHAL3Nx1wo0UmGAnUDIWzKDwNJ1nEAcI3fhBXK6259ikx2Je+X0iVrEx3QS3rBlUXfH0FBlC+e+RkfADGoFL2ih0KEbWQVX2d4qY=--jeFwzzKQ6hew1p+I--F2Hm/WhheOY7uaxd7cHmCg== \ No newline at end of file 
+wOA+5wnS+OBUOG9OJeGjdVe8xW/NATZkwsge+naNp3be0FzkKrtlB8XB+DvkQ9RxH/oujksrlhftnmnun/y7iPcEKMd13srgS/oe85jf9Hai2JKiH36z2RddJkgsyttOfehxo34vp8OfJgEz6TonN3YA9Nz9hEfQv/gzHcvSJaJjYqq011jdolZ7O3+rKJrxCWsFvY3mT0462RxnxhppMZJIXPrfQ5RJKanLygZ+yZxiEJIRT2o/jJzKYjELoyWQok5HJ1E0KmaNF1OQIwRm0OIf8+1oH9qDSPFnQKH4rYrx+o1xyZo5KgOHV8MEHvrYZur9dkKGR/jvVEGM2JO9/1Y1dBUPdYK5oLGFN24JLTMwrQWMgreoIj4m/yAmx0elum15Gt0r+fH0v1j6OQx8ZFbxoHoMHRtSrQxmUnDLgCNpVrz5ylc0bHFzfcFLHV6FQRaTDA2kxHLbxHpeHv/jcCxfJL2daQ3ketl0ZJdpDCarm6dUKxoEhOsYO8jgKofMvkeeCIARpT9nNLa6pvpZHlbXDLq4Snsm5XejCHnsY7ObnLK5TF6kGziE1JMkdGsrVBvwKoZCThHADRtyrew0qiMrhPnjxzKB6yK2BRnTY0kmvaruh4GMeDkxCF213xWBCeY+I7S1g/dl2EpD0S5taPh+AFfLnpNkWsql1wsWSFbTVR+bTlO5Bq6eF+XNuZhcLcheHIu1QJz4owF93wY5yRCpiZKXc1QkhXBdxdHSN1ZexnDg8uWvuPlN0K02MaK9M3+rkWYgJc4bFfycfzRWslVNCVoTCsqZXjzhNipw7sCa61XkyyQxbUC/c/S7ReBkOFLoK0GDK89KpeyjF++kF0hCiQDyOgr8V4TNqfkFHqD093izExtJRlna58RZso2cSrJTR8HYO2kDfpTb5EymCxb3X80vW9QHc90XTpaXZ5ETFMWkvdZlOKBoovfPDogGiNt9Wi7IOZcp7c0Tg6xtGS0CQwxCPg+a8SGZR6CvjtJr9g3snC/bqkyclu86wuXCtvwVVR4M0U98YWl6gTghgbNIGLXiV4RSm0g4YxDfOWEVvuYBGUvT00tekRSQQNAPw/QBI7qH/Q7x8WXmW+1pTFC0lkDNWCndgJouNBewGAKHfDH4MTlSkSjaXe+QYwxyJJBMV0LS3Ky6XLOVcfPqbqiN0Wll1QGF3gz+l0/ly4gE+MRNWOvvnsu7BHKoFQNJBbfJovyA0I1sbdaevIZnZqFoeiaTYtMd8fJ21pM1mHY6jywV64gRoA8ALDvaioBFGNKBIDcYZHEOzFTA853CbP8P/P0NhoEXvpn9NtjXlYmF88uQelSH/imeadAVtZuQUB5iBwZs8tf37MQnwlt4byIcbuWduCzl537pc+RnPeEIdxvi34RwDRdnOzwGGE0pH4lYd7sT+v2OxNR905pnE9u/xXrsYgrA6CVkFGkb/WjrDghnfEIFboAdBsTo--OjsnloPnn0n5E+Vu--VbEL5CiOjTX33D1gClRZuA== \ No newline at end of file diff --git a/config/environments/production.rb b/config/environments/production.rb index 694aa243..bb267454 100644 --- a/config/environments/production.rb +++ b/config/environments/production.rb @@ -28,8 +28,8 @@ # config.action_dispatch.x_sendfile_header = 'X-Sendfile' # for Apache # config.action_dispatch.x_sendfile_header = 'X-Accel-Redirect' # for NGINX - # Store uploaded files on the local file system (see config/storage.yml for options). 
- config.active_storage.service = :local + # Store uploaded files on azure (see config/storage.yml for options). + config.active_storage.service = :azure # Mount Action Cable outside main process or domain. # config.action_cable.mount_path = nil diff --git a/config/environments/staging.rb b/config/environments/staging.rb index 694aa243..bb267454 100644 --- a/config/environments/staging.rb +++ b/config/environments/staging.rb @@ -28,8 +28,8 @@ # config.action_dispatch.x_sendfile_header = 'X-Sendfile' # for Apache # config.action_dispatch.x_sendfile_header = 'X-Accel-Redirect' # for NGINX - # Store uploaded files on the local file system (see config/storage.yml for options). - config.active_storage.service = :local + # Store uploaded files on azure (see config/storage.yml for options). + config.active_storage.service = :azure # Mount Action Cable outside main process or domain. # config.action_cable.mount_path = nil diff --git a/config/routes.rb b/config/routes.rb index f04c2c76..0e5f373d 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -3,7 +3,14 @@ post '/import', to: 'caesar#import' - resources :projects, only: [:index, :show] - resources :workflows, only: [:index, :show] - resources :transcriptions, only: [:index, :show, :update] + resources :projects, only: [:index, :show] do + get 'export', on: :member + end + resources :workflows, only: [:index, :show] do + get 'export', on: :member + end + resources :transcriptions, only: [:index, :show, :update] do + get 'export', on: :member + get 'export_group', on: :collection + end end diff --git a/config/storage.yml b/config/storage.yml index d32f76e8..5f9097b4 100644 --- a/config/storage.yml +++ b/config/storage.yml @@ -6,29 +6,8 @@ local: service: Disk root: <%= Rails.root.join("storage") %> -# Use rails credentials:edit to set the AWS secrets (as aws:access_key_id|secret_access_key) -# amazon: -# service: S3 -# access_key_id: <%= Rails.application.credentials.dig(:aws, :access_key_id) %> -# secret_access_key: 
<%= Rails.application.credentials.dig(:aws, :secret_access_key) %> -# region: us-east-1 -# bucket: your_own_bucket - -# Remember not to checkin your GCS keyfile to a repository -# google: -# service: GCS -# project: your_project -# credentials: <%= Rails.root.join("path/to/gcs.keyfile") %> -# bucket: your_own_bucket - -# Use rails credentials:edit to set the Azure Storage secret (as azure_storage:storage_access_key) -# microsoft: -# service: AzureStorage -# storage_account_name: your_account_name -# storage_access_key: <%= Rails.application.credentials.dig(:azure_storage, :storage_access_key) %> -# container: your_container_name - -# mirror: -# service: Mirror -# primary: local -# mirrors: [ amazon, google, microsoft ] +azure: + service: AzureStorage + storage_account_name: <%= Rails.application.credentials.dig(:azure, :storage_account_name) %> + storage_access_key: <%= Rails.application.credentials.dig(:azure, :storage_access_key) %> + container: 'data-exports' \ No newline at end of file diff --git a/db/migrate/20200127204118_create_active_storage_tables.active_storage.rb b/db/migrate/20200127204118_create_active_storage_tables.active_storage.rb new file mode 100644 index 00000000..0b2ce257 --- /dev/null +++ b/db/migrate/20200127204118_create_active_storage_tables.active_storage.rb @@ -0,0 +1,27 @@ +# This migration comes from active_storage (originally 20170806125915) +class CreateActiveStorageTables < ActiveRecord::Migration[5.2] + def change + create_table :active_storage_blobs do |t| + t.string :key, null: false + t.string :filename, null: false + t.string :content_type + t.text :metadata + t.bigint :byte_size, null: false + t.string :checksum, null: false + t.datetime :created_at, null: false + + t.index [ :key ], unique: true + end + + create_table :active_storage_attachments do |t| + t.string :name, null: false + t.references :record, null: false, polymorphic: true, index: false + t.references :blob, null: false + + t.datetime :created_at, null: false + + 
t.index [ :record_type, :record_id, :name, :blob_id ], name: "index_active_storage_attachments_uniqueness", unique: true + t.foreign_key :active_storage_blobs, column: :blob_id + end + end +end diff --git a/db/schema.rb b/db/schema.rb index 76f52b2c..db8039a5 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -15,6 +15,27 @@ # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" + create_table "active_storage_attachments", force: :cascade do |t| + t.string "name", null: false + t.string "record_type", null: false + t.bigint "record_id", null: false + t.bigint "blob_id", null: false + t.datetime "created_at", null: false + t.index ["blob_id"], name: "index_active_storage_attachments_on_blob_id" + t.index ["record_type", "record_id", "name", "blob_id"], name: "index_active_storage_attachments_uniqueness", unique: true + end + + create_table "active_storage_blobs", force: :cascade do |t| + t.string "key", null: false + t.string "filename", null: false + t.string "content_type" + t.text "metadata" + t.bigint "byte_size", null: false + t.string "checksum", null: false + t.datetime "created_at", null: false + t.index ["key"], name: "index_active_storage_blobs_on_key", unique: true + end + create_table "projects", force: :cascade do |t| t.string "slug", null: false t.datetime "created_at", precision: 6, null: false @@ -56,4 +77,5 @@ t.datetime "updated_at", precision: 6, null: false end + add_foreign_key "active_storage_attachments", "active_storage_blobs", column: "blob_id" end diff --git a/spec/controllers/projects_controller_spec.rb b/spec/controllers/projects_controller_spec.rb index 340dcea7..7120642d 100644 --- a/spec/controllers/projects_controller_spec.rb +++ b/spec/controllers/projects_controller_spec.rb @@ -98,4 +98,47 @@ end end end + + describe '#export' do + let (:project) { create(:project, slug: "lizard_king/underground_fortress") } + let (:workflow) { create(:workflow, project: project)} + let(:transcription) 
{ create(:transcription, :unedited_json_blob, workflow: workflow) } + let(:export_params) { { id: project.id } } + + before do + transcription.export_files.attach(blank_file_blob) + end + + it 'returns successfully' do + get :export, params: export_params + expect(response).to have_http_status(:ok) + end + + it 'should have a response with content-type of application/zip' do + get :export, params: export_params + expect(response.header["Content-Type"]).to eq("application/zip") + end + + describe 'roles' do + context 'as a viewer' do + let(:viewer) { create(:user, roles: { project.id => ['tester']}) } + before { allow(controller).to receive(:current_user).and_return viewer } + + it 'returns a 403 Forbidden when exporting a project' do + get :export, params: export_params + expect(response).to have_http_status(:forbidden) + end + end + + context 'as an editor' do + let(:editor) { create(:user, roles: { project.id => ['moderator']}) } + before { allow(controller).to receive(:current_user).and_return editor } + + it 'returns successfully when exporting a project' do + get :export, params: export_params + expect(response).to have_http_status(:ok) + end + end + end + end end diff --git a/spec/controllers/transcriptions_controller_spec.rb b/spec/controllers/transcriptions_controller_spec.rb index 334b47f1..e548a5f0 100644 --- a/spec/controllers/transcriptions_controller_spec.rb +++ b/spec/controllers/transcriptions_controller_spec.rb @@ -159,6 +159,24 @@ expect(Transcription.find(transcription.id).updated_by).to eq(admin_user.login) end + context 'when transcription status changes' do + context 'when a transcription is approved' do + it 'attaches 4 data files to the transcription' do + update_params[:data][:attributes][:status] = 'approved' + patch :update, params: update_params + expect(transcription.export_files.count).to eq(4) + end + end + + context 'when a transcription is unapproved' do + it 'removes attached data files from storage' do + 
update_params[:data][:attributes][:status] = 'ready' + patch :update, params: update_params + expect(transcription.export_files.attached?).to be_falsey + end + end + end + context 'validates the input' do it 'is not valid JSON:API' do busted_params = { id: transcription.id, "data": { "nothing": "garbage" } } @@ -259,4 +277,88 @@ end end end + + context 'exporting transcriptions' do + let(:transcription) { create(:transcription, group_id: 'FROG_LADS_777' ) } + let(:second_transcription) { create(:transcription, group_id: 'FROG_LADS_777' ) } + let(:export_single_params) { { id: transcription.id } } + let(:export_group_params) { { group_id: transcription.group_id, workflow_id: transcription.workflow_id } } + + before do + transcription.export_files.attach(blank_file_blob) + end + + describe '#export' do + context 'exporting a single transcription' do + it 'returns successfully' do + get :export, params: export_single_params + expect(response).to have_http_status(:ok) + end + + it 'should have a response with content-type of application/zip' do + get :export, params: export_single_params + expect(response.header["Content-Type"]).to eq("application/zip") + end + end + + describe 'roles' do + context 'as a viewer' do + let(:viewer) { create(:user, roles: { transcription.workflow.project.id => ['tester']}) } + before { allow(controller).to receive(:current_user).and_return viewer } + + it 'returns a 403 Forbidden when exporting one transcription' do + get :export, params: export_single_params + expect(response).to have_http_status(:forbidden) + end + end + + context 'as an editor' do + let(:editor) { create(:user, roles: { transcription.workflow.project.id => ['moderator']}) } + before { allow(controller).to receive(:current_user).and_return editor } + + it 'returns successfully for a single transcription export' do + get :export, params: export_single_params + expect(response).to have_http_status(:ok) + end + end + end + end + + describe '#export_group' do + # TO DO: 
create example for no trans in group + context 'when group contains at least one transcription' do + it 'returns successfully with content-type of application/zip' do + get :export_group, params: export_group_params + expect(response).to have_http_status(:ok) + expect(response.header['Content-Type']).to eq('application/zip') + end + + describe 'roles' do + context 'as a viewer' do + let(:viewer) { create(:user, roles: { transcription.workflow.project.id => ['tester']}) } + before { allow(controller).to receive(:current_user).and_return viewer } + + it 'returns a 403 error' do + get :export_group, params: export_group_params + expect(response).to have_http_status(:forbidden) + end + end + + context 'as an editor' do + it 'returns successfully' do + get :export_group, params: export_group_params + expect(response).to have_http_status(:ok) + end + end + end + end + + context 'when group contains no transcriptions' do + it 'returns an error' do + get :export_group, params: { group_id: 'MICE_IN_TANKS', workflow_id: transcription.workflow_id } + expect(response).to have_http_status(:error) + end + end + end + end end diff --git a/spec/controllers/workflows_controller_spec.rb b/spec/controllers/workflows_controller_spec.rb index 906cc08e..b0dac7bb 100644 --- a/spec/controllers/workflows_controller_spec.rb +++ b/spec/controllers/workflows_controller_spec.rb @@ -43,7 +43,7 @@ "updated_at" => '2019-12-16 00:00:00 UTC', "updated_by" => 'The Dark Master', "transcription_count" => 1 - }, + }, "SECOND" => { "updated_at" => '2019-12-18 00:00:00 UTC', "updated_by" => 'The Grey Tiger', @@ -131,4 +131,46 @@ end end end + + describe '#export' do + let (:workflow) { create(:workflow)} + let(:transcription) { create(:transcription, workflow: workflow) } + let(:export_params) { { id: workflow.id } } + + before do + transcription.export_files.attach(blank_file_blob) + end + + it 'returns successfully' do + get :export, params: export_params + expect(response).to have_http_status(:ok) + 
end + + it 'should have a response with content-type of application/zip' do + get :export, params: export_params + expect(response.header["Content-Type"]).to eq("application/zip") + end + + describe 'roles' do + context 'as a viewer' do + let(:viewer) { create(:user, roles: { workflow.project.id => ['tester']}) } + before { allow(controller).to receive(:current_user).and_return viewer } + + it 'returns a 403 Forbidden when exporting a workflow' do + get :export, params: export_params + expect(response).to have_http_status(:forbidden) + end + end + + context 'as an editor' do + let(:editor) { create(:user, roles: { workflow.project.id => ['moderator']}) } + before { allow(controller).to receive(:current_user).and_return editor } + + it 'returns successfully when exporting a workflow' do + get :export, params: export_params + expect(response).to have_http_status(:ok) + end + end + end + end end diff --git a/spec/factories/transcriptions.rb b/spec/factories/transcriptions.rb index cbc56951..c71648b9 100644 --- a/spec/factories/transcriptions.rb +++ b/spec/factories/transcriptions.rb @@ -2,7 +2,804 @@ factory :transcription do workflow group_id { "GROUP1A" } - text { { structure: "tbd" } } + text { { "checkout_this": "metadata" } } status { 1 } end + + trait :unedited_json_blob do + text { { + "frame0": [{ + "user_ids": [11], + "clusters_x": [1311.1291866028707, 666.5167464114833], + "clusters_y": [788.11004784689, 781.8516746411483], + "line_slope": 179.26252336428178, + "slope_label": 1, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["[deletion][/deletion]"] + ], + "extract_index": [1], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "[deletion][/deletion]", + "consensus_score": 1.0, + "edited_consensus_text": "" + }, { + "user_ids": [1325796.0, 1325796.0, 1325796.0, 1325796.0, 1325796.0, 1325361.0, 1325361.0], + "clusters_x": [913.9868111445219, 610.6879194904875], + "clusters_y": [266.98410480295087, 271.5937382180552], + 
"line_slope": 179.26252336428178, + "slope_label": 1, + "gutter_label": 0, + "number_views": 7, + "clusters_text": [ + ["Ms", "Ms", "", "", "", "Ms", ""], + ["", "Z", "", "", "", "", ""], + ["", "B", "", "", "", "", ""], + ["", "Oak", "leland", "oakes", "oakes", "", "oakes"] + ], + "extract_index": [0, 0, 0, 0, 0, 0, 0], + "gold_standard": [false, false, false, false, false, false, false], + "low_consensus": true, + "consensus_text": "Ms Z B oakes", + "consensus_score": 2.0 + }, { + "user_ids": [1325796.0, 22], + "clusters_x": [1181.3243243243244, 860.2162162162163], + "clusters_y": [222.93243243243245, 228.82432432432432], + "line_slope": 179.26252336428178, + "slope_label": 1, + "gutter_label": 0, + "number_views": 2, + "clusters_text": [ + ["test", "test"] + ], + "extract_index": [0, 0], + "gold_standard": [false, false], + "low_consensus": true, + "consensus_text": "test", + "consensus_score": 2.0 + }, { + "user_ids": [1325889.0, 1325803.0, 1325796.0, 1325796.0, 1325361.0, 1325361.0], + "clusters_x": [778.8178944102211, 1384.2271593758257], + "clusters_y": [138.78157983107482, 128.2726430438015], + "line_slope": -0.221176437611867, + "slope_label": 0, + "gutter_label": 0, + "number_views": 6, + "clusters_text": [ + ["John's", "John's", "John", "John", "John", "John's"], + ["Lelaud", "Lelaud", "leland", "leland", "leland", "Lelaud"], + ["Sept", "Sept", "", "", "", "Sept"], + ["18th", "18th", "", "", "", "18th"], + ["1856", "1856", "", "", "", "1856"] + ], + "extract_index": [1, 0, 0, 1, 0, 1], + "gold_standard": [false, false, false, false, false, false], + "low_consensus": false, + "consensus_text": "John's Lelaud Sept 18th 1856", + "consensus_score": 3.0 + }, { + "user_ids": [1325889.0, 1325796.0, 1325796.0, 1325796.0, 1325361.0, 1325361.0, 1325361.0, 33, 44, 1325361.0, 1325796.0, 1325361.0], + "clusters_x": [608.7537704148516, 1000.84140625], + "clusters_y": [260.2930537695804, 249.22500000000002], + "line_slope": -0.221176437611867, + "slope_label": 0, + 
"gutter_label": 0, + "number_views": 12, + "clusters_text": [ + ["Mr", "Z", "", "", "Mr", "Me", "Mr", "", "", "Mr", "", "Mr"], + ["Le.", "b", "b", "", "L", "L", "L", "", "", "LB", "", "L"], + ["B", "", "", "", "B", "B", "B", "", "", "", "", "B"], + ["Oakes", "oakes", "oakes", "oakes", "Oakes", "Oakes", "Oakes", "oakes", "oakes", "Oakes", "oakes", "oakes"] + ], + "extract_index": [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], + "gold_standard": [false, false, false, false, false, false, false, false, false, false, false, false], + "low_consensus": false, + "consensus_text": "Mr L B oakes", + "consensus_score": 5.25 + }, { + "user_ids": [1325889.0, 1325796.0, 1325361.0, 1325361.0, 1325361.0, 1325841.0], + "clusters_x": [667.4541769397135, 1390.6588578268288], + "clusters_y": [305.36150800329324, 304.03032820448317], + "line_slope": -0.221176437611867, + "slope_label": 0, + "gutter_label": 0, + "number_views": 6, + "clusters_text": [ + ["Dear", "Dear", "dear", "dear", "Dear", "Dear"], + ["Sir", "sir", "sir", "sir", "Sir", "Sir."], + ["I", "", "I", "I", "", "I"], + ["have", "", "have", "have", "", "have"], + ["just", "", "just", "just", "", "just"], + ["recieved", "", "received", "received", "", "received"] + ], + "extract_index": [2, 2, 0, 0, 1, 0], + "gold_standard": [false, false, false, false, false, false], + "low_consensus": false, + "consensus_text": "Dear sir I have just received", + "consensus_score": 3.6666666666666665 + }, { + "user_ids": [1325889.0, 1325361.0, 1325361.0], + "clusters_x": [599.8771015810208, 1389.1773296798501], + "clusters_y": [346.6687695098003, 349.7927641488854], + "line_slope": -0.221176437611867, + "slope_label": 0, + "gutter_label": 0, + "number_views": 3, + "clusters_text": [ + ["information", "information", "information"], + ["that", "that", "that"], + ["a", "", "a"], + ["fellow", "", "fellow"], + ["of", "", "of"], + ["mine", "", "mine"] + ], + "extract_index": [3, 1, 0], + "gold_standard": [false, false, false], + "low_consensus": true, + 
"consensus_text": "information that a fellow of mine", + "consensus_score": 2.3333333333333335 + }, { + "user_ids": [1325889.0, 1325800.0, 1325361.0], + "clusters_x": [610.1383391401181, 1396.512905737758], + "clusters_y": [391.8689431145336, 388.5538903107793], + "line_slope": -0.221176437611867, + "slope_label": 0, + "gutter_label": 0, + "number_views": 3, + "clusters_text": [ + ["Moses,", "Moses,", "Moses,"], + ["a", "a", "a"], + ["small", "small", "small"], + ["black", "black", "black"], + ["man", "man", "man"], + ["who", "who", "who"], + ["has", "he", "has"] + ], + "extract_index": [4, 0, 0], + "gold_standard": [false, false, false], + "low_consensus": true, + "consensus_text": "Moses, a small black man who has", + "consensus_score": 2.857142857142857 + }, { + "user_ids": [1325889.0], + "clusters_x": [588.4872830039322, 1389.1773296798501], + "clusters_y": [431.47057416528503, 433.9456593172971], + "line_slope": -0.221176437611867, + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["been"], + ["runaway"], + ["for"], + ["some"], + ["months"], + ["was"] + ], + "extract_index": [5], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "been runaway for some months was", + "consensus_score": 1.0 + }, { + "user_ids": [1325889.0], + "clusters_x": [595.9125384599686, 1391.6524148318622], + "clusters_y": [474.78456432549694, 477.2596494775091], + "line_slope": -0.221176437611867, + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["lodged"], + ["in"], + ["the"], + ["workhouse"], + ["or"], + ["at"], + ["least"] + ], + "extract_index": [6], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "lodged in the workhouse or at least", + "consensus_score": 1.0 + }, { + "user_ids": [1325361.0], + "clusters_x": [1114.2187434541593, 1339.3248982901018], + "clusters_y": [534.9570468158536, 528.6456592970889], + "line_slope": -0.221176437611867, + "slope_label": 0, + 
"gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["yesterday"] + ], + "extract_index": [0], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "yesterday", + "consensus_score": 1.0 + }, { + "user_ids": [1325361.0], + "clusters_x": [604.8309959169183, 1369.359666252441], + "clusters_y": [601.722350434937, 599.9808272678856], + "line_slope": -0.221176437611867, + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["having"], + ["made"], + ["up"], + ["my"], + ["mind"], + ["to"], + ["sell"], + ["him"] + ], + "extract_index": [0], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "having made up my mind to sell him", + "consensus_score": 1.0 + }, { + "user_ids": [1325361.0, 1325361.0], + "clusters_x": [998.5099722768056, 1164.7098436042772], + "clusters_y": [652.7696138327956, 648.562022153619], + "line_slope": -0.221176437611867, + "slope_label": 0, + "gutter_label": 0, + "number_views": 2, + "clusters_text": [ + ["same", "same"] + ], + "extract_index": [1, 0], + "gold_standard": [false, false], + "low_consensus": true, + "consensus_text": "same", + "consensus_score": 2.0 + }, { + "user_ids": [1325361.0], + "clusters_x": [853.6358695652174, 948.4184782608695], + "clusters_y": [684.1290760869565, 691.5339673913044], + "line_slope": -0.221176437611867, + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["delay"] + ], + "extract_index": [0], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "delay", + "consensus_score": 1.0 + }, { + "user_ids": [1325361.0, 1325361.0], + "clusters_x": [871.3144654088051, 1139.8155136268344], + "clusters_y": [736.6572327044025, 730.9444444444445], + "line_slope": -0.221176437611867, + "slope_label": 0, + "gutter_label": 0, + "number_views": 2, + "clusters_text": [ + ["not", "not"], + ["sell", "sell"], + ["him", "him"] + ], + "extract_index": [0, 1], + "gold_standard": [false, false], + 
"low_consensus": true, + "consensus_text": "not sell him", + "consensus_score": 2.0 + }, { + "user_ids": [55], + "clusters_x": [1373.712918660287, 1254.8038277511962], + "clusters_y": [653.555023923445, 641.0382775119617], + "line_slope": -173.9909940425054, + "slope_label": 2, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["What"], + ["what?"] + ], + "extract_index": [2], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "What what?", + "consensus_score": 1.0 + }], + "frame1": [{ + "user_ids": [1857.0, 1857.0], + "clusters_x": [874.9671737389912, 1206.417934347478], + "clusters_y": [55.59727782225781, 123.14411529223378], + "line_slope": 11.518659344879472, + "slope_label": 1, + "gutter_label": 0, + "number_views": 2, + "clusters_text": [], + "extract_index": [0, 0], + "gold_standard": [false, false], + "low_consensus": true, + "consensus_text": "", + "consensus_score": 0.0 + }, { + "user_ids": [1325796.0, 1325796.0], + "clusters_x": [921.6460722404647, 1315.224450618843], + "clusters_y": [265.9831649406416, 265.9831649406416], + "line_slope": -0.000000000000004174735093328781, + "slope_label": 0, + "gutter_label": 0, + "number_views": 2, + "clusters_text": [ + ["your", "yours"], + ["respectfully", "respectfully"] + ], + "extract_index": [0, 0], + "gold_standard": [false, false], + "low_consensus": true, + "consensus_text": "your respectfully", + "consensus_score": 1.5 + }], + "transcribed_lines": 18, + "aggregation_version": "3.3.0", + "low_consensus_lines": 15 + } } + end + + trait :edited_json_blob do + text { + { + "frame0": [{ + "seen": false, + "flagged": false, + "user_ids": [1325796, 1325796, 1325361], + "clusters_x": [904.8769647573996, 604.9414062892324], + "clusters_y": [268.05475082105517, 269.75898766659634], + "line_slope": 179.67665775917354, + "line_editor": "wgranger-test", + "slope_label": 1, + "gutter_label": 0, + "number_views": 3, + "clusters_text": [ + ["Ms", "z", ""], + ["Z", "b", ""], + ["B", "", 
""], + ["Oakes", "oakes", "oakes"] + ], + "extract_index": [0, 0, 0], + "gold_standard": [false, false, false], + "low_consensus": true, + "consensus_text": "Ms Z B oakes", + "consensus_score": 1.25, + "edited_consensus_text": "Ms. Z B Oakes" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325796, 27], + "clusters_x": [1069.5852842809365, 797.267558528428], + "clusters_y": [74.433110367893, 146.61371237458195], + "line_slope": 165.15454791791615, + "line_editor": "", + "slope_label": 2, + "gutter_label": 0, + "number_views": 2, + "clusters_text": [ + ["john", "john"], + ["leland", "leland"] + ], + "extract_index": [0, 0], + "gold_standard": [false, false], + "low_consensus": true, + "consensus_text": "john leland", + "consensus_score": 2, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325796, 1325796, 1325796, 11, 22], + "clusters_x": [788.6663088447522, 1396.2876008804108], + "clusters_y": [137.53115373432394, 136.10051357300074], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 5, + "clusters_text": [ + ["John", "John", "John", "John's", "John's"], + ["Leland", "Leland", "Leland", "leland", "leland"] + ], + "extract_index": [1, 1, 0, 0, 0], + "gold_standard": [false, false, false, false, false], + "low_consensus": false, + "consensus_text": "John Leland", + "consensus_score": 3, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325796, 1325796, 1325796, 1325796, 1325841, 1325361, 1325361, 1325361], + "clusters_x": [605.631423708881, 920.2989360994401], + "clusters_y": [259.49853208997695, 263.41288611118017], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 8, + "clusters_text": [ + ["Ms.", "Ms.", "Mrs.", "", "", "mr", "Mr", ""], + ["Le", "Z", "Z", "", "", "L", "L", "Hey"], + ["B", "B", "b", "", "", "B", "B", "B"], + ["Oakes", "Oakes", 
"oak", "oakes", "oakes", "Oakes", "Oakes", "Oakes"] + ], + "extract_index": [0, 0, 0, 0, 0, 0, 0, 0], + "gold_standard": [false, false, false, false, false, false, false, false], + "low_consensus": false, + "consensus_text": "Ms. Z B Oakes", + "consensus_score": 3.5, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325361, 1325361, 1325801, 1325361, 1325361, 1325361], + "clusters_x": [662.6512726948562, 1386.1038558200314], + "clusters_y": [301.13431416166986, 297.049147479391], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 6, + "clusters_text": [ + ["DEAR", "dear", "DEAR", "DEAR", "dear", "dear"], + ["SIR", "sir", "SIR", "SIR", "sir", "sir"], + ["I", "i", "I", "I", "I", ""], + ["HAVE", "have", "HAVE", "HAVE", "have", ""], + ["JUST", "receive", "JUST", "JUST", "just", ""], + ["RECEIVED", "(darke_shard)", "RECEIVED", "RECEIVED", "received", ""] + ], + "extract_index": [0, 0, 0, 0, 1, 1], + "gold_standard": [false, false, false, false, false, false], + "low_consensus": false, + "consensus_text": "DEAR SIR I HAVE JUST RECEIVED", + "consensus_score": 3.1666666666666665, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325800, 1325802, 1325801, 1325801, 1325361, 1325796], + "clusters_x": [588.9575803040453, 1355.340192373983], + "clusters_y": [345.36075208531184, 340.930257316186], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 6, + "clusters_text": [ + ["information", "information", "ingformation", "ingformation", "INFORMATION", "information"], + ["that", "that", "that", "that", "THAT", "that"], + ["a", "a", "a", "a", "A", "a"], + ["fellow", "fellow", "fellow", "fellow", "FELLOW", "fellow"], + ["of", "of", "of", "of", "OF", "of"], + ["mine", "miine", "minne", "mine", "MINE", "mine"] + ], + "extract_index": [0, 0, 0, 0, 1, 0], + "gold_standard": 
[false, false, false, false, false, false], + "low_consensus": false, + "consensus_text": "information that a fellow of mine", + "consensus_score": 4.333333333333333, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325796, 1325803, 1325361, 1325801], + "clusters_x": [595.8977254030042, 1392.7806128888283], + "clusters_y": [394.716147523171, 384.3310097842535], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 4, + "clusters_text": [ + ["Moses,", "Moses,", "Moses", "Moses"], + ["a", "a", "a", "a"], + ["small", "small", "small", "small"], + ["black", "black", "black", "black"], + ["man", "man", "man", "man"], + ["who", "who", "who", "who"], + ["has", "has", "has", "has"] + ], + "extract_index": [1, 1, 0, 0], + "gold_standard": [false, false, false, false], + "low_consensus": false, + "consensus_text": "Moses, a small black man who has", + "consensus_score": 3.7142857142857144, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325796, 1325796], + "clusters_x": [590.5723467369809, 1387.5944297956494], + "clusters_y": [433.31608437706, 431.699406723797], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 2, + "clusters_text": [ + ["been", "been"], + ["runaway", "runaway"], + ["for", "for"], + ["some", "some"], + ["months", "months"], + ["was", "was"] + ], + "extract_index": [2, 0], + "gold_standard": [false, false], + "low_consensus": true, + "consensus_text": "been runaway for some months was", + "consensus_score": 2, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [27], + "clusters_x": [800.7546230440967, 1054.2610241820769], + "clusters_y": [203.86130867709818, 201.53556187766713], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + 
"clusters_text": [ + ["???"], + ["LOLS"] + ], + "extract_index": [1], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "??? LOLS", + "consensus_score": 1, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325803], + "clusters_x": [599.5662211421628, 1368.3936816524908], + "clusters_y": [480.26063183475094, 481.7506075334143], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["lodged"], + ["in"], + ["the"], + ["workhouse"], + ["or"], + ["at"], + ["here"] + ], + "extract_index": [0], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "lodged in the workhouse or at here", + "consensus_score": 1, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325361], + "clusters_x": [835.2366863905326, 1125.473372781065], + "clusters_y": [509.32544378698225, 513.1952662721893], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["by"], + ["the"], + ["police"] + ], + "extract_index": [0], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "by the police", + "consensus_score": 1, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325361], + "clusters_x": [856.5978260869565, 1092.0733695652175], + "clusters_y": [556.7649456521739, 550.8410326086956], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["last"], + ["february"] + ], + "extract_index": [0], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "last february", + "consensus_score": 1, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325796], + "clusters_x": [603.8597464035054, 1390.4738066617551], + 
"clusters_y": [608.713986583427, 610.2778316336422], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["having"], + ["made"], + ["up"], + ["my"], + ["mind"], + ["to"], + ["sell"], + ["him"] + ], + "extract_index": [2], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "having made up my mind to sell him", + "consensus_score": 1, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325361], + "clusters_x": [858.0788043478261, 942.4945652173913], + "clusters_y": [679.6861413043479, 676.7241847826087], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["delay"] + ], + "extract_index": [0], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "delay", + "consensus_score": 1, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325361], + "clusters_x": [867.4852071005917, 1139.6627218934912], + "clusters_y": [727.3254437869823, 729.905325443787], + "line_slope": -0.1268525161618382, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["not"], + ["sell"], + ["him"] + ], + "extract_index": [1], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "not sell him", + "consensus_score": 1, + "edited_consensus_text": "" + }], + "frame1": [{ + "seen": false, + "flagged": false, + "user_ids": [1325361, 1325361], + "clusters_x": [609.9423313524683, 1350.370071881662], + "clusters_y": [76.34385268297919, 94.28916153641592], + "line_slope": 0.7241649502254157, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 2, + "clusters_text": [ + ["by", "my"], + ["mail", "mail"], + ["signed", "signed"], + ["smile", "while"], + ["and", "and"], + ["let", "let"], + ["me", "me"] + ], + 
"extract_index": [0, 0], + "gold_standard": [false, false], + "low_consensus": true, + "consensus_text": "by mail signed smile and let me", + "consensus_score": 1.7142857142857142, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325796], + "clusters_x": [619.7719626168225, 1166.1981308411214], + "clusters_y": [231.11775700934584, 223.78317757009347], + "line_slope": 0.7241649502254157, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["test"], + ["this"], + ["out"] + ], + "extract_index": [0], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "test this out", + "consensus_score": 1, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325796], + "clusters_x": [921.5738880918221, 1325.0459110473457], + "clusters_y": [263.3113342898135, 269.5667144906743], + "line_slope": 0.7241649502254157, + "line_editor": "", + "slope_label": 0, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["yours"], + ["respectfully"] + ], + "extract_index": [0], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "yours respectfully", + "consensus_score": 1, + "edited_consensus_text": "" + }, { + "seen": false, + "flagged": false, + "user_ids": [1325796], + "clusters_x": [1148.3314203730272, 599.4218077474893], + "clusters_y": [221.08751793400285, 219.52367288378764], + "line_slope": -179.83676460258107, + "line_editor": "", + "slope_label": 1, + "gutter_label": 0, + "number_views": 1, + "clusters_text": [ + ["to"], + ["the"], + ["workhouse"], + ["master"] + ], + "extract_index": [1], + "gold_standard": [false], + "low_consensus": true, + "consensus_text": "to the workhouse master", + "consensus_score": 1, + "edited_consensus_text": "" + }], + "transcribed_lines": 19, + "low_consensus_lines": 14 + } + } + end + end diff --git a/spec/fixtures/files/transcription_file.txt 
b/spec/fixtures/files/transcription_file.txt new file mode 100644 index 00000000..e69de29b diff --git a/spec/fixtures/files/transcription_metadata_777.csv b/spec/fixtures/files/transcription_metadata_777.csv new file mode 100644 index 00000000..62a8f60f --- /dev/null +++ b/spec/fixtures/files/transcription_metadata_777.csv @@ -0,0 +1,2 @@ +name,class,weapon +Spharghis,Turtle Whisperer,Teeth \ No newline at end of file diff --git a/spec/services/data_exports/aggregate_metadata_file_generator_spec.rb b/spec/services/data_exports/aggregate_metadata_file_generator_spec.rb new file mode 100644 index 00000000..d5c07fc5 --- /dev/null +++ b/spec/services/data_exports/aggregate_metadata_file_generator_spec.rb @@ -0,0 +1,81 @@ +require 'csv' + +RSpec.describe DataExports::AggregateMetadataFileGenerator do + let (:project) { create(:project, slug: "lizard_king/underground_fortress") } + let(:workflow) { create(:workflow, project: project)} + let(:transcription) { create(:transcription, :unedited_json_blob, workflow: workflow, group_id: "ROACH_WARRIORS") } + let(:another_transcription) { create(:transcription, :unedited_json_blob, workflow: workflow, group_id: "ROACH_WARRIORS") } + let(:transcription_group) { [transcription, another_transcription] } + + let(:parent_dir) { Dir.mktmpdir } + let(:csv_filepath) { File.join(parent_dir, 'transcriptions_metadata.csv')} + + before(:each) do + transcription.export_files.attach(transcription_metadata_blob) + another_transcription.export_files.attach(transcription_metadata_blob) + end + + describe '#generate_group_file' do + before(:each) do + described_class.generate_group_file(transcription_group, parent_dir) + end + + after(:each) do + FileUtils.rm_rf(parent_dir) + end + + it 'creates a csv file in the expected location' do + expect(File).to exist(csv_filepath) + end + + it 'creates a csv with the expected header' do + rows = CSV.read(csv_filepath) + expect(rows.first).to eq([ + 'name','class','weapon' + ]) + end + + it 'creates a csv 
with a header and one row per transcription with metadata' do + rows = CSV.read(csv_filepath) + expect(rows.length).to eq(3) + end + end + + describe '#generate_workflow_file' do + before(:each) do + described_class.generate_workflow_file(workflow, parent_dir) + end + + after(:each) do + FileUtils.rm_rf(parent_dir) + end + + it 'creates a csv file in the expected location' do + expect(File).to exist(csv_filepath) + end + + it 'creates a csv with a header and one row per transcription with metadata' do + rows = CSV.read(csv_filepath) + expect(rows.length).to eq(3) + end + end + + describe '#generate_project_file' do + before(:each) do + described_class.generate_project_file(project, parent_dir) + end + + after(:each) do + FileUtils.rm_rf(parent_dir) + end + + it 'creates a csv file in the expected location' do + expect(File).to exist(csv_filepath) + end + + it 'creates a csv with a header and one row per transcription with metadata' do + rows = CSV.read(csv_filepath) + expect(rows.length).to eq(3) + end + end +end \ No newline at end of file diff --git a/spec/services/data_exports/data_storage_spec.rb b/spec/services/data_exports/data_storage_spec.rb new file mode 100644 index 00000000..f22923e7 --- /dev/null +++ b/spec/services/data_exports/data_storage_spec.rb @@ -0,0 +1,71 @@ +RSpec.describe DataExports::DataStorage do + let(:project) { create(:project, slug: "lizard_king/underground_fortress") } + let(:workflow) { create(:workflow, project: project)} + let(:transcription) { create(:transcription, workflow: workflow, group_id: "ROACH_WARRIORS") } + let(:another_transcription) { create(:transcription, group_id: "ROACH_WARRIORS") } + let(:transcription_group) { [transcription, another_transcription] } + + let(:data_storage) { described_class.new } + + describe '#zip_transcription_files' do + it 'throws error when no stored files are found' do + expect { data_storage.zip_transcription_files(transcription) }.to raise_error(DataExports::NoStoredFilesFoundError) + end 
+
+    context 'when stored files are found' do
+      before(:each) do
+        transcription.export_files.attach(blank_file_blob)
+      end
+
+      it "produces a zip file named export.zip" do
+        data_storage.zip_transcription_files(transcription) do |zip_file|
+          expect(zip_file).to be_a(String)
+          expect(File.basename(zip_file)).to eq('export.zip')
+          expect(File).to exist(zip_file)
+        end
+      end
+    end
+  end
+
+  describe '#zip_group_files' do
+    before(:each) do
+      transcription.export_files.attach(blank_file_blob)
+    end
+
+    it "produces a zip file named export.zip" do
+      data_storage.zip_group_files(transcription_group) do |zip_file|
+        expect(zip_file).to be_a(String)
+        expect(File.basename(zip_file)).to eq('export.zip')
+        expect(File).to exist(zip_file)
+      end
+    end
+  end
+
+  describe '#zip_workflow_files' do
+    before(:each) do
+      transcription.export_files.attach(blank_file_blob)
+    end
+
+    it "produces a zip file named export.zip" do
+      data_storage.zip_workflow_files(workflow) do |zip_file|
+        expect(zip_file).to be_a(String)
+        expect(File.basename(zip_file)).to eq('export.zip')
+        expect(File).to exist(zip_file)
+      end
+    end
+  end
+
+  describe '#zip_project_files' do
+    before(:each) do
+      transcription.export_files.attach(blank_file_blob)
+    end
+
+    it "produces a zip file named export.zip" do
+      data_storage.zip_project_files(project) do |zip_file|
+        expect(zip_file).to be_a(String)
+        expect(File.basename(zip_file)).to eq('export.zip')
+        expect(File).to exist(zip_file)
+      end
+    end
+  end
+end
\ No newline at end of file
diff --git a/spec/services/data_exports/transcription_file_generator_spec.rb b/spec/services/data_exports/transcription_file_generator_spec.rb
new file mode 100644
index 00000000..fe1fa943
--- /dev/null
+++ b/spec/services/data_exports/transcription_file_generator_spec.rb
@@ -0,0 +1,137 @@
+require 'json'
+
+RSpec.describe DataExports::TranscriptionFileGenerator do
+  context 'when transcription contains no edited lines' do
+    describe '#generate_transcription_files' do
+      let(:transcription) { create(:transcription, :unedited_json_blob) }
+      let(:file_generator) { described_class.new transcription }
+      let(:files) { file_generator.generate_transcription_files }
+
+      # close out tempfiles that have been opened
+      after(:each) do
+        files.each do |file|
+          file.close
+          file.unlink
+        end
+      end
+
+      it 'generates a file containing raw data' do
+        raw_data_file = files.detect { |f|
+          basename = File.basename(f)
+          /^raw_data_.*\.json$/.match(basename)
+        }
+
+        expect(File).to exist(raw_data_file.path)
+        expect(eval(raw_data_file.read)).to eq(transcription.text)
+      end
+
+      it 'generates a file containing consensus text' do
+        consensus_text_file = files.detect do |f|
+          basename = File.basename(f)
+          /^consensus_text_.*\.txt$/.match(basename)
+        end
+
+        expect(File).to exist(consensus_text_file.path)
+
+        # confirm that first line of consensus text is present in file
+        first_consensus_line = transcription.text['frame0'][0]['consensus_text']
+        expect(consensus_text_file.read).to include(first_consensus_line)
+      end
+
+      it 'generates a file containing transcription metadata with 2 rows' do
+        metadata_file = files.detect do |f|
+          basename = File.basename(f)
+          /^transcription_metadata_.*\.csv$/.match(basename)
+        end
+
+        expect(File).to exist(metadata_file)
+      end
+
+      it 'generates transcription metadata file with expected header and number of rows' do
+        metadata_file = files.detect do |f|
+          basename = File.basename(f)
+          /^transcription_metadata_.*\.csv$/.match(basename)
+        end
+
+        rows = CSV.parse(metadata_file.read)
+        expect(rows[0]).to eq([
+          'transcription id',
+          'internal id',
+          'reducer',
+          'caesar parameters',
+          'date approved',
+          'user who approved',
+          'text edited (T/F)',
+          'number of pages'
+        ])
+        expect(rows.length).to eq(2)
+      end
+
+      it 'determines that no lines were edited' do
+        expect(file_generator.instance_eval{ is_text_edited? }).to eq(false)
+      end
+
+      it 'generates file containing line metadata' do
+        line_metadata_file = files.detect do |f|
+          basename = File.basename(f)
+          /^transcription_line_metadata_.*\.csv$/.match(basename)
+        end
+
+        expect(File).to exist(line_metadata_file)
+      end
+
+      it 'generates line metadata file with correct header' do
+        line_metadata_file = files.detect do |f|
+          basename = File.basename(f)
+          /^transcription_line_metadata_.*\.csv$/.match(basename)
+        end
+
+        rows = CSV.parse(line_metadata_file.read)
+        expect(rows[0]).to eq([
+          'consensus text',
+          'line number',
+          'line slope',
+          'consensus score',
+          'line edited (T/F)',
+          'original transcriber username',
+          'line editor username',
+          'flagged for low consensus (T/F)',
+          'page number',
+          'column',
+          'number of transcribers',
+          'line coordinates'
+        ])
+      end
+    end
+  end
+
+  context 'when transcription contains edited lines' do
+    describe '#generate_transcription_files' do
+      let(:transcription) { create(:transcription, :edited_json_blob) }
+      let(:file_generator) { described_class.new transcription }
+      let(:files) { file_generator.generate_transcription_files }
+
+      it 'determines that lines were edited' do
+        expect(file_generator.instance_eval { is_text_edited? }).to be_truthy
+      end
+
+      it 'generates a file containing consensus text' do
+        consensus_text_file = files.detect do |file|
+          basename = File.basename(file)
+          /^consensus_text_.*\.txt$/.match(basename)
+        end
+
+        expect(File).to exist(consensus_text_file.path)
+
+        # confirm that first line of edited consensus text is present in file
+        first_consensus_line = transcription.text['frame0'][0]['edited_consensus_text']
+        expect(consensus_text_file.read).to include(first_consensus_line)
+
+        files.each do |file|
+          file.close
+          file.unlink
+        end
+      end
+    end
+  end
+end
diff --git a/spec/services/zip_file_generator_spec.rb b/spec/services/zip_file_generator_spec.rb
new file mode 100644
index 00000000..d7d786df
--- /dev/null
+++ b/spec/services/zip_file_generator_spec.rb
@@ -0,0 +1,17 @@
+RSpec.describe ZipFileGenerator do
+  let(:zip_file_path) { Rails.root.join("spec/fixtures/files/test-zip.zip") }
+
+  describe '#write' do
+    context 'when given a multi-level directory' do
+      let(:zip_generator) {
+        described_class.new(Rails.root.join("spec/fixtures"), zip_file_path)
+      }
+
+      it 'generates a zip file' do
+        zip_generator.write
+        expect(File).to exist(zip_file_path)
+        File.delete(zip_file_path)
+      end
+    end
+  end
+end
diff --git a/spec/support/active_storage_helper.rb b/spec/support/active_storage_helper.rb
new file mode 100644
index 00000000..9576930c
--- /dev/null
+++ b/spec/support/active_storage_helper.rb
@@ -0,0 +1,14 @@
+module ActiveStorageHelper
+  # ported from https://github.com/rails/rails/blob/6-0-stable/activestorage/test/test_helper.rb
+  def blank_file_blob(filename: "transcription_file.txt", content_type: "text/plain", metadata: nil, record: nil)
+    ActiveStorage::Blob.create_and_upload! io: file_fixture(filename).open, filename: filename, content_type: content_type, metadata: metadata, record: record
+  end
+
+  def transcription_metadata_blob(filename: 'transcription_metadata_777.csv', content_type: 'text/csv', metadata: nil, record: nil)
+    ActiveStorage::Blob.create_and_upload! io: file_fixture(filename).open, filename: filename, content_type: content_type, metadata: metadata, record: record
+  end
+end
+
+RSpec.configure do |config|
+  config.include ActiveStorageHelper
+end
\ No newline at end of file