diff --git a/.vscode/launch.json b/.vscode/launch.json index 5968332..a0bcded 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -28,6 +28,20 @@ // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console "console": "internalConsole", "stopAtEntry": false + }, + { + "name": "Exporter (console)", + "type": "coreclr", + "request": "launch", + "preLaunchTask": "build", + "launchSettingsProfile": "Api", + // If you have changed target frameworks, make sure to update the program path. + "program": "${workspaceFolder}/aspnetcore/src/Exporter/bin/Debug/net6.0/CSC.PublicApi.Exporter.dll", + "args": [], + "cwd": "${workspaceFolder}/aspnetcore/src/Exporter", + // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console + "console": "internalConsole", + "stopAtEntry": false } ] } \ No newline at end of file diff --git a/aspnetcore/PublicApi.sln b/aspnetcore/PublicApi.sln index ecc8e9a..db8f740 100644 --- a/aspnetcore/PublicApi.sln +++ b/aspnetcore/PublicApi.sln @@ -48,6 +48,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ApiModels", "src\ApiModels\ EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Logging", "src\Logging\Logging.csproj", "{86D2E73C-B7BF-41F3-BC35-7E807C611CEA}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Exporter", "src\Exporter\Exporter.csproj", "{74125BEC-3509-4330-B11D-B1514C15E4FB}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -98,6 +100,10 @@ Global {86D2E73C-B7BF-41F3-BC35-7E807C611CEA}.Debug|Any CPU.Build.0 = Debug|Any CPU {86D2E73C-B7BF-41F3-BC35-7E807C611CEA}.Release|Any CPU.ActiveCfg = Release|Any CPU {86D2E73C-B7BF-41F3-BC35-7E807C611CEA}.Release|Any CPU.Build.0 = Release|Any CPU + {74125BEC-3509-4330-B11D-B1514C15E4FB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {74125BEC-3509-4330-B11D-B1514C15E4FB}.Debug|Any CPU.Build.0 = Debug|Any CPU + 
{74125BEC-3509-4330-B11D-B1514C15E4FB}.Release|Any CPU.ActiveCfg = Release|Any CPU + {74125BEC-3509-4330-B11D-B1514C15E4FB}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/aspnetcore/src/Exporter/Exporter.cs b/aspnetcore/src/Exporter/Exporter.cs new file mode 100644 index 0000000..73a2d44 --- /dev/null +++ b/aspnetcore/src/Exporter/Exporter.cs @@ -0,0 +1,324 @@
using System.Globalization;
using System.Text.Json;
using AutoMapper;
using CSC.PublicApi.ElasticService;
using CSC.PublicApi.Service.Models.FundingCall;
using CSC.PublicApi.Service.Models.FundingDecision;
using CSC.PublicApi.Service.Models.Publication;
using CSC.PublicApi.Service.Models.ResearchDataset;
using Microsoft.Extensions.Configuration;
using Nest;
using FundingCallApiModel = ResearchFi.FundingCall.FundingCall;
using FundingDecisionApiModel = ResearchFi.FundingDecision.FundingDecision;
using PublicationApiModel = ResearchFi.Publication.Publication;
using ResearchDatasetApiModel = ResearchFi.ResearchDataset.ResearchDataset;

namespace BulkExport;

/// <summary>
/// Exports documents from Elasticsearch into json files, one file per document.
/// </summary>
/// <remarks>
/// NOTE(review): the generic type arguments in this file (<c>Search&lt;T&gt;</c>, <c>Count&lt;T&gt;</c>,
/// <c>Map&lt;T&gt;</c>, <c>IHit&lt;T&gt;</c>, <c>ISearchResponse&lt;T&gt;</c>) were reconstructed from the using
/// directives and aliases above — confirm against the repository.
/// </remarks>
public class Exporter
{
    private const int SingleQueryResultLimit = 1000;

    // Full type names of the Elasticsearch index models this exporter knows how to export.
    private const string FundingCallModelName = "CSC.PublicApi.Service.Models.FundingCall.FundingCall";
    private const string FundingDecisionModelName = "CSC.PublicApi.Service.Models.FundingDecision.FundingDecision";
    private const string ResearchDatasetModelName = "CSC.PublicApi.Service.Models.ResearchDataset.ResearchDataset";
    private const string PublicationModelName = "CSC.PublicApi.Service.Models.Publication.Publication";

    private readonly IElasticClient _elasticClient;
    private readonly IMapper _mapper;
    private readonly IndexNameSettings _indexNameSettings;
    private readonly IConfiguration _configuration;
    private readonly string _exportBaseDirectory;

    /// <summary>
    /// Creates the exporter and reads the export target directory from the
    /// <c>EXPORTER:BASEDIRECTORY</c> configuration key.
    /// </summary>
    /// <param name="elasticClient">Client used for the count and search requests.</param>
    /// <param name="configuration">Configuration that must contain <c>EXPORTER:BASEDIRECTORY</c>.</param>
    /// <param name="mapper">Mapper used to convert index models to API models.</param>
    /// <param name="indexNameSettings">Source of the configured index names and their model types.</param>
    /// <exception cref="InvalidOperationException">
    /// The target directory is missing from configuration, or is empty / whitespace.
    /// </exception>
    public Exporter(IElasticClient elasticClient, IConfiguration configuration, IMapper mapper, IndexNameSettings indexNameSettings)
    {
        _elasticClient = elasticClient;
        _configuration = configuration;
        _mapper = mapper;
        _indexNameSettings = indexNameSettings;

        // Get export base directory from configuration. An empty value is rejected as well:
        // it would otherwise make every file land in the filesystem root.
        string? configuredDirectory = _configuration["EXPORTER:BASEDIRECTORY"];
        if (string.IsNullOrWhiteSpace(configuredDirectory))
        {
            const string errorMessage = "Export: Failed: could not set export target directory from configuration (EXPORTER:BASEDIRECTORY)";
            Console.WriteLine(errorMessage);
            throw new InvalidOperationException(errorMessage);
        }

        _exportBaseDirectory = configuredDirectory;
        Console.WriteLine($"Export: target directory set to '{_exportBaseDirectory}' from configuration (EXPORTER:BASEDIRECTORY)");
    }

    /// <summary>
    /// Constructs the export file name, including the full path.
    /// </summary>
    /// <param name="modelTypeFullName">Full type name of the Elasticsearch index model being exported.</param>
    /// <param name="exportFileNumber">Running number of the exported document; zero-padded to 10 digits.</param>
    /// <returns>Path of the form <c>{base directory}/{type prefix}-{number}.json</c>.</returns>
    /// <exception cref="ArgumentOutOfRangeException">No file name prefix exists for the model type.</exception>
    private string GetFilename(string modelTypeFullName, long exportFileNumber)
    {
        string fileTypeString = modelTypeFullName switch
        {
            FundingCallModelName => "fundingCall",
            FundingDecisionModelName => "fundingDecision",
            ResearchDatasetModelName => "researchDataset",
            PublicationModelName => "publication",
            _ => throw new ArgumentOutOfRangeException(
                nameof(modelTypeFullName),
                modelTypeFullName,
                "Export: no file name prefix is defined for this model type"),
        };

        string exportFileNumberPaddedString = exportFileNumber.ToString("D10", CultureInfo.InvariantCulture);

        // Path.Combine inserts the separator only when needed, so the configured directory
        // works with or without a trailing slash.
        return Path.Combine(_exportBaseDirectory, $"{fileTypeString}-{exportFileNumberPaddedString}.json");
    }

    /// <summary>
    /// Exports data from the configured Elasticsearch indexes into json text files.
    /// </summary>
    /// <remarks>
    /// For each configured index whose model type is FundingCall, FundingDecision, ResearchDataset
    /// or Publication, all documents are fetched, converted from the Elasticsearch model to the API
    /// model (so the json files contain the same fields as the Public API endpoint) and written to
    /// one json file each. Indexes of any other model type are skipped.
    /// </remarks>
    /// <param name="serializerOptions">Options passed to <see cref="JsonSerializer"/> for every document.</param>
    /// <exception cref="InvalidOperationException">A search request returned an invalid response.</exception>
    public void Export(JsonSerializerOptions serializerOptions)
    {
        // File.WriteAllText does not create missing directories; this is a no-op when it exists.
        Directory.CreateDirectory(_exportBaseDirectory);

        // Get Elasticsearch indexes and process them
        foreach (var (indexName, modelType) in _indexNameSettings.GetTypesAndIndexNames())
        {
            switch (modelType.FullName)
            {
                case FundingCallModelName:
                    ExportIndex<FundingCall, FundingCallApiModel>(indexName, FundingCallModelName, serializerOptions);
                    break;

                case FundingDecisionModelName:
                    ExportIndex<FundingDecision, FundingDecisionApiModel>(indexName, FundingDecisionModelName, serializerOptions);
                    break;

                case ResearchDatasetModelName:
                    ExportIndex<ResearchDataset, ResearchDatasetApiModel>(indexName, ResearchDatasetModelName, serializerOptions);
                    break;

                case PublicationModelName:
                    ExportIndex<Publication, PublicationApiModel>(indexName, PublicationModelName, serializerOptions);
                    break;

                default:
                    break;
            }
        }

        Console.WriteLine("Export: completed");
    }

    /// <summary>
    /// Exports every document of one index: maps each document to its API model and writes it to its own json file.
    /// </summary>
    /// <remarks>
    /// To bypass the Elasticsearch limitation of a 10000 document result set, the "search after" feature is used:
    /// https://www.elastic.co/guide/en/elasticsearch/reference/7.17/paginate-search-results.html#search-after
    /// Data is queried in chunks of <see cref="SingleQueryResultLimit"/> sorted by DocumentIndexOrder, and each
    /// follow-up query passes the sort values of the last hit of the previous chunk.
    /// NOTE(review): the sort has no tiebreaker field and no point-in-time is opened. The Elasticsearch
    /// documentation recommends both for search_after; on a multi-shard or concurrently refreshed index,
    /// documents could presumably be skipped or duplicated — compare the exported/total numbers in the log.
    /// </remarks>
    /// <typeparam name="TSource">Elasticsearch index model type.</typeparam>
    /// <typeparam name="TTarget">API model type written to the json files.</typeparam>
    /// <param name="indexName">Name of the index to read.</param>
    /// <param name="modelTypeFullName">Full name of <typeparamref name="TSource"/>; selects the file name prefix.</param>
    /// <param name="serializerOptions">Options passed to <see cref="JsonSerializer"/>.</param>
    /// <exception cref="InvalidOperationException">A search request returned an invalid response.</exception>
    private void ExportIndex<TSource, TTarget>(string indexName, string modelTypeFullName, JsonSerializerOptions serializerOptions)
        where TSource : class
    {
        string label = typeof(TSource).Name;
        long exportFileNumber = 0;

        // Number of documents in index. Only used for progress reporting.
        long numberOfDocumentsInIndex = _elasticClient.Count<TSource>(c => c.Index(indexName)).Count;
        Console.WriteLine($"Export: {label}: started from index {indexName} containing {numberOfDocumentsInIndex} documents");

        IHit<TSource>? lastHit = null;
        while (true)
        {
            // Get batch of documents; every batch after the first continues from the previous last hit.
            ISearchResponse<TSource> searchResponse = _elasticClient.Search<TSource>(s =>
            {
                s.Index(indexName)
                    .Size(SingleQueryResultLimit)
                    .Query(q => q.MatchAll())
                    .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder));
                if (lastHit is not null)
                {
                    s.SearchAfter(lastHit.Sorts);
                }
                return s;
            });

            // Without this check a failed request is indistinguishable from an empty index
            // and the export would report success with missing data.
            if (!searchResponse.IsValid)
            {
                throw new InvalidOperationException(
                    $"Export: {label}: Failed: search request to index {indexName} returned an invalid response: {searchResponse.DebugInformation}",
                    searchResponse.OriginalException);
            }

            int numberOfQueryResults = searchResponse.Documents.Count;
            if (numberOfQueryResults == 0)
            {
                break;
            }
            lastHit = searchResponse.Hits.Last();

            // Process documents: Map from Elastic index model to API model, write into text file
            foreach (TSource doc in searchResponse.Documents)
            {
                TTarget apiModel = _mapper.Map<TTarget>(doc);
                string jsonString = JsonSerializer.Serialize(apiModel, serializerOptions);
                File.WriteAllText(GetFilename(modelTypeFullName, ++exportFileNumber), jsonString);
            }
            Console.WriteLine($"Export: {label}: in progress {exportFileNumber}/{numberOfDocumentsInIndex}");

            // A short batch means the index is exhausted.
            if (numberOfQueryResults < SingleQueryResultLimit)
            {
                break;
            }
        }

        Console.WriteLine($"Export: {label}: complete, exported {exportFileNumber}/{numberOfDocumentsInIndex}");
    }
}
diff --git a/aspnetcore/src/Exporter/Exporter.csproj b/aspnetcore/src/Exporter/Exporter.csproj new file mode 100644 index 0000000..9c96407 --- /dev/null +++ b/aspnetcore/src/Exporter/Exporter.csproj @@ -0,0 +1,18 @@ + + + + + + + + + Exe + net6.0 + enable + enable +
b73bde99-06bf-406c-8e60-eb475cb00acc + CSC.PublicApi.$(MSBuildProjectName.Replace(" ", "_")) + CSC.PublicApi.$(MSBuildProjectName) + + + diff --git a/aspnetcore/src/Exporter/Program.cs b/aspnetcore/src/Exporter/Program.cs new file mode 100644 index 0000000..32bc45e --- /dev/null +++ b/aspnetcore/src/Exporter/Program.cs @@ -0,0 +1,67 @@
using System.Text.Encodings.Web;
using System.Text.Json;
using System.Text.Json.Serialization;
using BulkExport;
using CSC.PublicApi.ElasticService;
using CSC.PublicApi.Interface;
using CSC.PublicApi.Interface.Configuration;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;

/*
 * This application exports all documents from the Elasticsearch indexes into json files.
 * Data is converted from the Elasticsearch model to the API model.
 * The purpose of this application is to enable bulk export of all data. It is not intended to be
 * executed automatically in production. Instead, it should be considered a tool which a developer
 * can use when a full data dump is needed in json file format.
 *
 * The application uses the same configuration as the Indexer and Interface applications, except
 * that it requires one added parameter:
 *   EXPORTER:BASEDIRECTORY - sets the base directory where the json files are written.
 *                            It must be defined without trailing slash, for example, "/tmp"
 */

public class Program
{
    /// <summary>
    /// Entry point: builds the host, resolves the <see cref="Exporter"/> and runs the export synchronously.
    /// </summary>
    /// <param name="args">Command line arguments, forwarded to the host builder.</param>
    /// <returns>A completed task; the export itself has finished when this method returns.</returns>
    public static Task Main(string[] args)
    {
        // Dispose the host (and with it the service provider and its disposable services) on exit.
        using var consoleHost = CreateHostBuilder(args).Build();

        // Define json serializer options
        // https://learn.microsoft.com/en-us/dotnet/api/system.text.json.jsonserializeroptions?view=net-6.0
        // https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/character-encoding
        // https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/customize-properties?pivots=dotnet-6-0
        // https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/ignore-properties#ignore-all-null-value-properties
        var jsonSerializerOptions = new JsonSerializerOptions
        {
            Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
            PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
            DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
            WriteIndented = true,
        };

        // Start export
        var exporter = consoleHost.Services.GetRequiredService<Exporter>();
        exporter.Export(jsonSerializerOptions);

        return Task.CompletedTask;
    }

    /// <summary>
    /// Creates the host builder: registers the services the exporter needs and adds environment
    /// variables and (optional) user secrets to the host configuration.
    /// </summary>
    /// <param name="args">Command line arguments, passed to <c>Host.CreateDefaultBuilder</c>.</param>
    private static IHostBuilder CreateHostBuilder(string[] args) => Host
        .CreateDefaultBuilder(args)
        .ConfigureServices((hostContext, services) =>
        {
            // NOTE(review): generic type argument reconstructed — it was missing from the source text.
            // Exporter has to be registered for GetRequiredService<Exporter>() in Main to succeed.
            services.AddTransient<Exporter>();
            services.AddSettings(hostContext.Configuration);
            services.AddElasticSearch(hostContext.Configuration);
            services.AddAutoMapper(typeof(ApiPolicies).Assembly);
            // NOTE(review): the generic type arguments of this registration were lost in the source
            // text and cannot be recovered from it — restore them from the repository.
            services.AddScoped();
        })
        .ConfigureHostConfiguration(configurationBuilder => configurationBuilder
            .AddEnvironmentVariables()
            .AddUserSecrets(typeof(Program).Assembly, true));
}
\ No newline at end of file