From ef50a71a01f0170f21ff1b0f2b76321326f797e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Sa=CC=88rkikoski?= Date: Fri, 14 Jun 2024 13:03:01 +0300 Subject: [PATCH 1/6] CSCTTV-3882-exporter-console-app --- .vscode/launch.json | 14 +++++++ aspnetcore/PublicApi.sln | 6 +++ aspnetcore/src/Exporter/Exporter.csproj | 18 +++++++++ .../Exporters/FundingDecisionExporter.cs | 40 +++++++++++++++++++ aspnetcore/src/Exporter/Program.cs | 25 ++++++++++++ 5 files changed, 103 insertions(+) create mode 100644 aspnetcore/src/Exporter/Exporter.csproj create mode 100644 aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs create mode 100644 aspnetcore/src/Exporter/Program.cs diff --git a/.vscode/launch.json b/.vscode/launch.json index 5968332..a0bcded 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -28,6 +28,20 @@ // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console "console": "internalConsole", "stopAtEntry": false + }, + { + "name": "Exporter (console)", + "type": "coreclr", + "request": "launch", + "preLaunchTask": "build", + "launchSettingsProfile": "Api", + // If you have changed target frameworks, make sure to update the program path. 
+ "program": "${workspaceFolder}/aspnetcore/src/Exporter/bin/Debug/net6.0/CSC.PublicApi.Exporter.dll", + "args": [], + "cwd": "${workspaceFolder}/aspnetcore/src/Exporter", + // For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console + "console": "internalConsole", + "stopAtEntry": false } ] } \ No newline at end of file diff --git a/aspnetcore/PublicApi.sln b/aspnetcore/PublicApi.sln index ecc8e9a..db8f740 100644 --- a/aspnetcore/PublicApi.sln +++ b/aspnetcore/PublicApi.sln @@ -48,6 +48,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ApiModels", "src\ApiModels\ EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Logging", "src\Logging\Logging.csproj", "{86D2E73C-B7BF-41F3-BC35-7E807C611CEA}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Exporter", "src\Exporter\Exporter.csproj", "{74125BEC-3509-4330-B11D-B1514C15E4FB}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -98,6 +100,10 @@ Global {86D2E73C-B7BF-41F3-BC35-7E807C611CEA}.Debug|Any CPU.Build.0 = Debug|Any CPU {86D2E73C-B7BF-41F3-BC35-7E807C611CEA}.Release|Any CPU.ActiveCfg = Release|Any CPU {86D2E73C-B7BF-41F3-BC35-7E807C611CEA}.Release|Any CPU.Build.0 = Release|Any CPU + {74125BEC-3509-4330-B11D-B1514C15E4FB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {74125BEC-3509-4330-B11D-B1514C15E4FB}.Debug|Any CPU.Build.0 = Debug|Any CPU + {74125BEC-3509-4330-B11D-B1514C15E4FB}.Release|Any CPU.ActiveCfg = Release|Any CPU + {74125BEC-3509-4330-B11D-B1514C15E4FB}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/aspnetcore/src/Exporter/Exporter.csproj b/aspnetcore/src/Exporter/Exporter.csproj new file mode 100644 index 0000000..9c96407 --- /dev/null +++ b/aspnetcore/src/Exporter/Exporter.csproj @@ -0,0 +1,18 @@ + + + + + + + + + Exe + net6.0 + enable + enable + 
b73bde99-06bf-406c-8e60-eb475cb00acc + CSC.PublicApi.$(MSBuildProjectName.Replace(" ", "_")) + CSC.PublicApi.$(MSBuildProjectName) + + + diff --git a/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs b/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs new file mode 100644 index 0000000..2be8b6d --- /dev/null +++ b/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs @@ -0,0 +1,40 @@ +using AutoMapper; +using Nest; +using CSC.PublicApi.ElasticService; +using CSC.PublicApi.Service.Models.FundingDecision; +using FundingDecisionApiModel = ResearchFi.FundingDecision.FundingDecision; + +namespace BulkExport; + +public class FundingDecisionExporter +{ + private IElasticClient _elasticClient; + private readonly IMapper _mapper; + private readonly IndexNameSettings _indexNameSettings; + + + public FundingDecisionExporter(IElasticClient elasticClient, IMapper mapper, IndexNameSettings indexNameSettings) + { + _elasticClient = elasticClient; + _mapper = mapper; + _indexNameSettings = indexNameSettings; + + } + + public void Export() + { + Console.WriteLine("Start exporting funding decisions"); + //string type = "CSC.PublicApi.Service.Models.FundingDecision.FundingDecision"; + //var configuredTypesAndIndexNames = _indexNameSettings.GetTypesAndIndexNames(); + var searchResponse = _elasticClient.Search (s => s.MatchAll().Index("api-dev-funding-decision")); + var docs = searchResponse.Documents; + + foreach (var doc in docs) + { + FundingDecisionApiModel fdMapped = _mapper.Map(doc); + Console.WriteLine($"data: {fdMapped.NameEn}"); + } + + Console.WriteLine($"Done exporting funding calls: {searchResponse.Documents.Count}"); + } +} diff --git a/aspnetcore/src/Exporter/Program.cs b/aspnetcore/src/Exporter/Program.cs new file mode 100644 index 0000000..9592bd3 --- /dev/null +++ b/aspnetcore/src/Exporter/Program.cs @@ -0,0 +1,25 @@ +using BulkExport; +using CSC.PublicApi.ElasticService; +using CSC.PublicApi.Interface; +using 
CSC.PublicApi.Interface.Configuration; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Hosting; + +var host = Host.CreateDefaultBuilder(args) + .ConfigureServices((context, services) => { + IConfiguration configuration = new ConfigurationBuilder() + .AddUserSecrets() + .AddEnvironmentVariables() + .Build(); + + services.AddSettings(configuration); + services.AddElasticSearch(configuration); + services.AddAutoMapper(typeof(ApiPolicies).Assembly); + services.AddTransient(); + services.AddTransient(); + }) + .Build(); + +var fundingDecisionExporter = host.Services.GetRequiredService(); +fundingDecisionExporter.Export(); From e0f6580ec0c49bc3405670e93123271642323c3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Sa=CC=88rkikoski?= Date: Thu, 20 Jun 2024 15:39:41 +0300 Subject: [PATCH 2/6] Working example of how an Elasticsearch item is converted into API model and written to json file --- .../Exporters/FundingDecisionExporter.cs | 13 ++++++++++--- aspnetcore/src/Exporter/Program.cs | 19 +++++++++++++++++-- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs b/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs index 2be8b6d..97faf5f 100644 --- a/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs +++ b/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs @@ -3,6 +3,7 @@ using CSC.PublicApi.ElasticService; using CSC.PublicApi.Service.Models.FundingDecision; using FundingDecisionApiModel = ResearchFi.FundingDecision.FundingDecision; +using System.Text.Json; namespace BulkExport; @@ -21,18 +22,24 @@ public FundingDecisionExporter(IElasticClient elasticClient, IMapper mapper, Ind } - public void Export() + public void Export(JsonSerializerOptions serializerOptions) { Console.WriteLine("Start exporting funding decisions"); //string type = "CSC.PublicApi.Service.Models.FundingDecision.FundingDecision"; 
//var configuredTypesAndIndexNames = _indexNameSettings.GetTypesAndIndexNames(); + + // First test using "select all", this needs to be converted to "search after" type search var searchResponse = _elasticClient.Search (s => s.MatchAll().Index("api-dev-funding-decision")); var docs = searchResponse.Documents; foreach (var doc in docs) { - FundingDecisionApiModel fdMapped = _mapper.Map(doc); - Console.WriteLine($"data: {fdMapped.NameEn}"); + FundingDecisionApiModel fundingDecision = _mapper.Map(doc); + + string jsonString = JsonSerializer.Serialize(fundingDecision, serializerOptions); + File.WriteAllText("/tmp/funding-decision-test-export.json", jsonString); + + Console.WriteLine(jsonString); } Console.WriteLine($"Done exporting funding calls: {searchResponse.Documents.Count}"); diff --git a/aspnetcore/src/Exporter/Program.cs b/aspnetcore/src/Exporter/Program.cs index 9592bd3..1ecdabf 100644 --- a/aspnetcore/src/Exporter/Program.cs +++ b/aspnetcore/src/Exporter/Program.cs @@ -1,4 +1,7 @@ -using BulkExport; +using System.Text.Encodings.Web; +using System.Text.Json; +using System.Text.Json.Serialization; +using BulkExport; using CSC.PublicApi.ElasticService; using CSC.PublicApi.Interface; using CSC.PublicApi.Interface.Configuration; @@ -6,6 +9,18 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Hosting; +// Define json serializer options +// https://learn.microsoft.com/en-us/dotnet/api/system.text.json.jsonserializeroptions?view=net-6.0 +// https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/character-encoding +// https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/customize-properties?pivots=dotnet-6-0 +// https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/ignore-properties#ignore-all-null-value-properties +var jsonSerializerOptions = new JsonSerializerOptions { + Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping, + PropertyNamingPolicy = 
JsonNamingPolicy.CamelCase, + DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull, + WriteIndented = true, +}; + var host = Host.CreateDefaultBuilder(args) .ConfigureServices((context, services) => { IConfiguration configuration = new ConfigurationBuilder() @@ -22,4 +37,4 @@ .Build(); var fundingDecisionExporter = host.Services.GetRequiredService(); -fundingDecisionExporter.Export(); +fundingDecisionExporter.Export(jsonSerializerOptions); From 9a64e817e8a44f9f7770e0d1e4e29485105d43de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Sa=CC=88rkikoski?= Date: Thu, 1 Aug 2024 14:37:31 +0300 Subject: [PATCH 3/6] Modify Program.cs structure. Get Elasticsearch index name from configuration. --- .../Exporters/FundingDecisionExporter.cs | 41 +++++++--- aspnetcore/src/Exporter/Program.cs | 79 +++++++++++++------ 2 files changed, 83 insertions(+), 37 deletions(-) diff --git a/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs b/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs index 97faf5f..0315b15 100644 --- a/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs +++ b/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs @@ -24,24 +24,39 @@ public FundingDecisionExporter(IElasticClient elasticClient, IMapper mapper, Ind public void Export(JsonSerializerOptions serializerOptions) { - Console.WriteLine("Start exporting funding decisions"); - //string type = "CSC.PublicApi.Service.Models.FundingDecision.FundingDecision"; - //var configuredTypesAndIndexNames = _indexNameSettings.GetTypesAndIndexNames(); + Console.WriteLine("Funding decision export: started"); + string fundingDecisionIndexName = ""; - // First test using "select all", this needs to be converted to "search after" type search - var searchResponse = _elasticClient.Search (s => s.MatchAll().Index("api-dev-funding-decision")); - var docs = searchResponse.Documents; + var configuredTypesAndIndexNames = _indexNameSettings.GetTypesAndIndexNames(); + foreach (var (indexName, 
modelType) in configuredTypesAndIndexNames) + { + if (modelType.FullName == "CSC.PublicApi.Service.Models.FundingDecision.FundingDecision") + { + fundingDecisionIndexName = indexName; + break; + } + } - foreach (var doc in docs) + if (fundingDecisionIndexName != "") { - FundingDecisionApiModel fundingDecision = _mapper.Map(doc); + // First test using "select all", this needs to be converted to "search after" type search + var searchResponse = _elasticClient.Search (s => s.MatchAll().Index(fundingDecisionIndexName)); + var docs = searchResponse.Documents; + + foreach (var doc in docs) + { + FundingDecisionApiModel fundingDecision = _mapper.Map(doc); - string jsonString = JsonSerializer.Serialize(fundingDecision, serializerOptions); - File.WriteAllText("/tmp/funding-decision-test-export.json", jsonString); + string jsonString = JsonSerializer.Serialize(fundingDecision, serializerOptions); + File.WriteAllText("/tmp/funding-decision-test-export.json", jsonString); - Console.WriteLine(jsonString); + Console.WriteLine(jsonString); + } + + Console.WriteLine($"Funding decision export: complete, export count {searchResponse.Documents.Count}"); + } + else { + Console.WriteLine($"Funding decision export: failed, index name not found from configuration"); } - - Console.WriteLine($"Done exporting funding calls: {searchResponse.Documents.Count}"); } } diff --git a/aspnetcore/src/Exporter/Program.cs b/aspnetcore/src/Exporter/Program.cs index 1ecdabf..06dd6e9 100644 --- a/aspnetcore/src/Exporter/Program.cs +++ b/aspnetcore/src/Exporter/Program.cs @@ -9,32 +9,63 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Hosting; -// Define json serializer options -// https://learn.microsoft.com/en-us/dotnet/api/system.text.json.jsonserializeroptions?view=net-6.0 -// https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/character-encoding -// 
https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/customize-properties?pivots=dotnet-6-0 -// https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/ignore-properties#ignore-all-null-value-properties -var jsonSerializerOptions = new JsonSerializerOptions { - Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping, - PropertyNamingPolicy = JsonNamingPolicy.CamelCase, - DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull, - WriteIndented = true, -}; - -var host = Host.CreateDefaultBuilder(args) - .ConfigureServices((context, services) => { - IConfiguration configuration = new ConfigurationBuilder() +public class Program +{ + private const int DefaultQueryTimeout = 300; + public static async Task Main(string[] args) + { + var environment = Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT"); + var configuration = new ConfigurationBuilder() + .SetBasePath(Directory.GetCurrentDirectory()) + //.AddJsonFile("appsettings.json") + //.AddJsonFile($"appsettings.{environment}.json", true) .AddUserSecrets() .AddEnvironmentVariables() .Build(); - services.AddSettings(configuration); - services.AddElasticSearch(configuration); - services.AddAutoMapper(typeof(ApiPolicies).Assembly); - services.AddTransient(); - services.AddTransient(); - }) - .Build(); + // Create and configure the host to support dependency injection, configuration, etc. 
+ var consoleHost = CreateHostBuilder(args).Build(); -var fundingDecisionExporter = host.Services.GetRequiredService(); -fundingDecisionExporter.Export(jsonSerializerOptions); + // Define json serializer options + // https://learn.microsoft.com/en-us/dotnet/api/system.text.json.jsonserializeroptions?view=net-6.0 + // https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/character-encoding + // https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/customize-properties?pivots=dotnet-6-0 + // https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/ignore-properties#ignore-all-null-value-properties + var jsonSerializerOptions = new JsonSerializerOptions { + Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping, + PropertyNamingPolicy = JsonNamingPolicy.CamelCase, + DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull, + WriteIndented = true, + }; + + // Export funding decisions + var fundingDecisionExporter = consoleHost.Services.GetRequiredService(); + fundingDecisionExporter.Export(jsonSerializerOptions); + } + + private static IHostBuilder CreateHostBuilder(string[] args) => Host + .CreateDefaultBuilder(args) + .ConfigureServices((hostContext, services) => + { + services.AddTransient(); + services.AddSettings(hostContext.Configuration); + services.AddElasticSearch(hostContext.Configuration); + services.AddAutoMapper(typeof(ApiPolicies).Assembly); + + // Add ElasticSearchIndexingService. + services.AddScoped(); + + services.AddMemoryCache(); + + if (!int.TryParse(hostContext.Configuration["QueryTimeout"], out var queryTimeout)) + { + queryTimeout = DefaultQueryTimeout; + } + }) + .ConfigureHostConfiguration(configurationBuilder => configurationBuilder + // Most of the configuration comes from environment variables. + .AddEnvironmentVariables() + // For local dev we get configuration from user secrets. 
+ .AddUserSecrets(typeof(Program).Assembly, true) + .Build()); +} \ No newline at end of file From 0b8e31da3fe4d67a016ef1493575d978ad500556 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Sa=CC=88rkikoski?= Date: Tue, 6 Aug 2024 14:51:13 +0300 Subject: [PATCH 4/6] Use Elasticsearch search-after functionality to get all documents from index --- .../Exporters/FundingDecisionExporter.cs | 98 ++++++++++++++----- 1 file changed, 73 insertions(+), 25 deletions(-) diff --git a/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs b/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs index 0315b15..ec2c63c 100644 --- a/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs +++ b/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs @@ -12,7 +12,7 @@ public class FundingDecisionExporter private IElasticClient _elasticClient; private readonly IMapper _mapper; private readonly IndexNameSettings _indexNameSettings; - + private const int SingleQueryResultLimit = 250; public FundingDecisionExporter(IElasticClient elasticClient, IMapper mapper, IndexNameSettings indexNameSettings) { @@ -24,39 +24,87 @@ public FundingDecisionExporter(IElasticClient elasticClient, IMapper mapper, Ind public void Export(JsonSerializerOptions serializerOptions) { - Console.WriteLine("Funding decision export: started"); - string fundingDecisionIndexName = ""; - var configuredTypesAndIndexNames = _indexNameSettings.GetTypesAndIndexNames(); foreach (var (indexName, modelType) in configuredTypesAndIndexNames) { - if (modelType.FullName == "CSC.PublicApi.Service.Models.FundingDecision.FundingDecision") + switch (modelType.FullName) { - fundingDecisionIndexName = indexName; - break; - } - } + case "CSC.PublicApi.Service.Models.FundingDecision.FundingDecision": + long numberOfDocumentsInIndex = 0; + long numberOfQueryResults = -1; + long exportFileNumber = 0; + IHit? lastHit = null; + ISearchResponse? 
fundingDecisionSearchResponse = null; - if (fundingDecisionIndexName != "") - { - // First test using "select all", this needs to be converted to "search after" type search - var searchResponse = _elasticClient.Search (s => s.MatchAll().Index(fundingDecisionIndexName)); - var docs = searchResponse.Documents; + // Number of documents in index + var countResponse = _elasticClient.Count(c => c.Index(indexName)); + numberOfDocumentsInIndex = countResponse.Count; + Console.WriteLine($"Export: FundingDecision: started from index {indexName} containing {numberOfDocumentsInIndex} documents"); - foreach (var doc in docs) - { - FundingDecisionApiModel fundingDecision = _mapper.Map(doc); + while (numberOfQueryResults == -1 || numberOfQueryResults >= SingleQueryResultLimit) { + // Get batch of documents + if (lastHit != null) { + fundingDecisionSearchResponse = _elasticClient.Search (s => s + .Index(indexName) + .Size(SingleQueryResultLimit) + .Query(q => q.MatchAll()) + .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) + .SearchAfter(lastHit.Sorts) + ); + } else { + fundingDecisionSearchResponse = _elasticClient.Search (s => s + .Index(indexName) + .Size(SingleQueryResultLimit) + .Query(q => q.MatchAll()) + .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) + ); + } + + numberOfQueryResults = fundingDecisionSearchResponse.Documents.Count; + if (numberOfQueryResults == 0) + { + break; + } + lastHit = fundingDecisionSearchResponse.Hits.Last(); + + // Process documents: Map from Elastic index model to API model, write into text file + foreach (var doc in fundingDecisionSearchResponse.Documents) + { + ++exportFileNumber; + FundingDecisionApiModel fundingDecision = _mapper.Map(doc); + string jsonString = JsonSerializer.Serialize(fundingDecision, serializerOptions); + //File.WriteAllText("/tmp/funding-call-test-export.json", jsonString); + } + Console.WriteLine($"Export: FundingDecision: in progress {exportFileNumber}/{numberOfDocumentsInIndex}"); + 
} - string jsonString = JsonSerializer.Serialize(fundingDecision, serializerOptions); - File.WriteAllText("/tmp/funding-decision-test-export.json", jsonString); + Console.WriteLine($"Export: FundingDecision: complete, exported {exportFileNumber}/{numberOfDocumentsInIndex}"); + break; - Console.WriteLine(jsonString); + case "CSC.PublicApi.Service.Models.FundingCall.FundingCall": + break; + + case "CSC.PublicApi.Service.Models.Infrastructure.Infrastructure": + Console.WriteLine($"Export: Infrastucture: TODO"); + break; + + case "CSC.PublicApi.Service.Models.Organization.Organization": + Console.WriteLine($"Export: Organization: TODO"); + break; + + case "CSC.PublicApi.Service.Models.ResearchDataset.ResearchDataset": + Console.WriteLine($"Export: Research dataset: TODO"); + break; + + case "CSC.PublicApi.Service.Models.Publication.Publication": + Console.WriteLine($"Export: Publication: TODO"); + break; + + default: + break; } - - Console.WriteLine($"Funding decision export: complete, export count {searchResponse.Documents.Count}"); - } - else { - Console.WriteLine($"Funding decision export: failed, index name not found from configuration"); } + + Console.WriteLine("Export: completed"); } } From 39c5eeda64a1fe18ff62ffe4e408b4f4a701e08a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Sa=CC=88rkikoski?= Date: Thu, 8 Aug 2024 14:40:38 +0300 Subject: [PATCH 5/6] First working version of Exporter --- aspnetcore/src/Exporter/Exporter.cs | 296 ++++++++++++++++++ .../Exporters/FundingDecisionExporter.cs | 110 ------- aspnetcore/src/Exporter/Program.cs | 21 +- 3 files changed, 302 insertions(+), 125 deletions(-) create mode 100644 aspnetcore/src/Exporter/Exporter.cs delete mode 100644 aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs diff --git a/aspnetcore/src/Exporter/Exporter.cs b/aspnetcore/src/Exporter/Exporter.cs new file mode 100644 index 0000000..a5e7b78 --- /dev/null +++ b/aspnetcore/src/Exporter/Exporter.cs @@ -0,0 +1,296 @@ +using AutoMapper; +using Nest; 
+using CSC.PublicApi.ElasticService; +using CSC.PublicApi.Service.Models.FundingCall; +using CSC.PublicApi.Service.Models.FundingDecision; +using CSC.PublicApi.Service.Models.ResearchDataset; +using CSC.PublicApi.Service.Models.Publication; +using FundingCallApiModel = ResearchFi.FundingCall.FundingCall; +using FundingDecisionApiModel = ResearchFi.FundingDecision.FundingDecision; +using ResearchDatasetApiModel = ResearchFi.ResearchDataset.ResearchDataset; +using PublicationApiModel = ResearchFi.Publication.Publication; +using System.Text.Json; + +namespace BulkExport; + +// Exports documents from Elasticsearch into json files. +public class Exporter +{ + private IElasticClient _elasticClient; + private readonly IMapper _mapper; + private readonly IndexNameSettings _indexNameSettings; + private const int SingleQueryResultLimit = 1000; + private const string ExportBaseDirectory = "/tmp"; + + public Exporter(IElasticClient elasticClient, IMapper mapper, IndexNameSettings indexNameSettings) + { + _elasticClient = elasticClient; + _mapper = mapper; + _indexNameSettings = indexNameSettings; + + } + + + + private string GetFilename(string modelTypeFullName, long exportFileNumber) + { + string exportFileNumberPaddedString = exportFileNumber.ToString("D10"); + string fileTypeString = ""; + switch (modelTypeFullName) + { + case "CSC.PublicApi.Service.Models.FundingCall.FundingCall": + fileTypeString = "fundingCall"; + break; + case "CSC.PublicApi.Service.Models.FundingDecision.FundingDecision": + fileTypeString = "fundingDecision"; + break; + case "CSC.PublicApi.Service.Models.ResearchDataset.ResearchDataset": + fileTypeString = "researchDataset"; + break; + case "CSC.PublicApi.Service.Models.Publication.Publication": + fileTypeString = "publication"; + break; + } + return $"{ExportBaseDirectory}/{fileTypeString}-{exportFileNumberPaddedString}.json"; + } + + + public void Export(JsonSerializerOptions serializerOptions) + { + var configuredTypesAndIndexNames = 
_indexNameSettings.GetTypesAndIndexNames(); + + foreach (var (indexName, modelType) in configuredTypesAndIndexNames) + { + long numberOfDocumentsInIndex = 0; + long numberOfQueryResults = -1; + long exportFileNumber = 0; + CountResponse? countResponse = null; + + switch (modelType.FullName) + { + /* + * FundingCall + */ + case "CSC.PublicApi.Service.Models.FundingCall.FundingCall": + IHit? lastHitFundingCall = null; + ISearchResponse? fundingCallSearchResponse = null; + + // Number of documents in index + countResponse = _elasticClient.Count(c => c.Index(indexName)); + numberOfDocumentsInIndex = countResponse.Count; + Console.WriteLine($"Export: FundingCall: started from index {indexName} containing {numberOfDocumentsInIndex} documents"); + + while (numberOfQueryResults == -1 || numberOfQueryResults >= SingleQueryResultLimit) { + // Get batch of documents + if (lastHitFundingCall != null) { + fundingCallSearchResponse = _elasticClient.Search (s => s + .Index(indexName) + .Size(SingleQueryResultLimit) + .Query(q => q.MatchAll()) + .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) + .SearchAfter(lastHitFundingCall.Sorts) + ); + } else { + fundingCallSearchResponse = _elasticClient.Search (s => s + .Index(indexName) + .Size(SingleQueryResultLimit) + .Query(q => q.MatchAll()) + .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) + ); + } + + numberOfQueryResults = fundingCallSearchResponse.Documents.Count; + if (numberOfQueryResults == 0) + { + break; + } + lastHitFundingCall = fundingCallSearchResponse.Hits.Last(); + + // Process documents: Map from Elastic index model to API model, write into text file + foreach (var doc in fundingCallSearchResponse.Documents) + { + FundingCallApiModel fundingCall = _mapper.Map(doc); + string jsonString = JsonSerializer.Serialize(fundingCall, serializerOptions); + File.WriteAllText( + GetFilename(modelType.FullName, ++exportFileNumber), + jsonString + ); + } + Console.WriteLine($"Export: FundingCall: 
in progress {exportFileNumber}/{numberOfDocumentsInIndex}"); + } + + Console.WriteLine($"Export: FundingCall: complete, exported {exportFileNumber}/{numberOfDocumentsInIndex}"); + break; + + + + /* + * FundingDecision + */ + case "CSC.PublicApi.Service.Models.FundingDecision.FundingDecision": + IHit? lastHitFundingDecision = null; + ISearchResponse? fundingDecisionSearchResponse = null; + + // Number of documents in index + countResponse = _elasticClient.Count(c => c.Index(indexName)); + numberOfDocumentsInIndex = countResponse.Count; + Console.WriteLine($"Export: FundingDecision: started from index {indexName} containing {numberOfDocumentsInIndex} documents"); + + while (numberOfQueryResults == -1 || numberOfQueryResults >= SingleQueryResultLimit) { + // Get batch of documents + if (lastHitFundingDecision != null) { + fundingDecisionSearchResponse = _elasticClient.Search (s => s + .Index(indexName) + .Size(SingleQueryResultLimit) + .Query(q => q.MatchAll()) + .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) + .SearchAfter(lastHitFundingDecision.Sorts) + ); + } else { + fundingDecisionSearchResponse = _elasticClient.Search (s => s + .Index(indexName) + .Size(SingleQueryResultLimit) + .Query(q => q.MatchAll()) + .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) + ); + } + + numberOfQueryResults = fundingDecisionSearchResponse.Documents.Count; + if (numberOfQueryResults == 0) + { + break; + } + lastHitFundingDecision = fundingDecisionSearchResponse.Hits.Last(); + + // Process documents: Map from Elastic index model to API model, write into text file + foreach (var doc in fundingDecisionSearchResponse.Documents) + { + FundingDecisionApiModel fundingDecision = _mapper.Map(doc); + string jsonString = JsonSerializer.Serialize(fundingDecision, serializerOptions); + File.WriteAllText( + GetFilename(modelType.FullName, ++exportFileNumber), + jsonString + ); + } + Console.WriteLine($"Export: FundingDecision: in progress 
{exportFileNumber}/{numberOfDocumentsInIndex}"); + } + + Console.WriteLine($"Export: FundingDecision: complete, exported {exportFileNumber}/{numberOfDocumentsInIndex}"); + break; + + + + /* + * ResearchDataset + */ + case "CSC.PublicApi.Service.Models.ResearchDataset.ResearchDataset": + IHit? lastHitResearchDataset = null; + ISearchResponse? researchDatasetSearchResponse = null; + + // Number of documents in index + countResponse = _elasticClient.Count(c => c.Index(indexName)); + numberOfDocumentsInIndex = countResponse.Count; + Console.WriteLine($"Export: ResearchDataset: started from index {indexName} containing {numberOfDocumentsInIndex} documents"); + + while (numberOfQueryResults == -1 || numberOfQueryResults >= SingleQueryResultLimit) { + // Get batch of documents + if (lastHitResearchDataset != null) { + researchDatasetSearchResponse = _elasticClient.Search (s => s + .Index(indexName) + .Size(SingleQueryResultLimit) + .Query(q => q.MatchAll()) + .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) + .SearchAfter(lastHitResearchDataset.Sorts) + ); + } else { + researchDatasetSearchResponse = _elasticClient.Search (s => s + .Index(indexName) + .Size(SingleQueryResultLimit) + .Query(q => q.MatchAll()) + .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) + ); + } + + numberOfQueryResults = researchDatasetSearchResponse.Documents.Count; + if (numberOfQueryResults == 0) + { + break; + } + lastHitResearchDataset= researchDatasetSearchResponse.Hits.Last(); + + // Process documents: Map from Elastic index model to API model, write into text file + foreach (var doc in researchDatasetSearchResponse.Documents) + { + ResearchDatasetApiModel researchDataset = _mapper.Map(doc); + string jsonString = JsonSerializer.Serialize(researchDataset, serializerOptions); + File.WriteAllText( + GetFilename(modelType.FullName, ++exportFileNumber), + jsonString + ); + } + Console.WriteLine($"Export: ResearchDataset: in progress 
{exportFileNumber}/{numberOfDocumentsInIndex}"); + } + break; + + + + /* + * Publication + */ + case "CSC.PublicApi.Service.Models.Publication.Publication": + IHit? lastHitPublication = null; + ISearchResponse? publicationSearchResponse = null; + + // Number of documents in index + countResponse = _elasticClient.Count(c => c.Index(indexName)); + numberOfDocumentsInIndex = countResponse.Count; + Console.WriteLine($"Export: Publication: started from index {indexName} containing {numberOfDocumentsInIndex} documents"); + + while (numberOfQueryResults == -1 || numberOfQueryResults >= SingleQueryResultLimit) { + // Get batch of documents + if (lastHitPublication != null) { + publicationSearchResponse = _elasticClient.Search (s => s + .Index(indexName) + .Size(SingleQueryResultLimit) + .Query(q => q.MatchAll()) + .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) + .SearchAfter(lastHitPublication.Sorts) + ); + } else { + publicationSearchResponse = _elasticClient.Search (s => s + .Index(indexName) + .Size(SingleQueryResultLimit) + .Query(q => q.MatchAll()) + .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) + ); + } + + numberOfQueryResults = publicationSearchResponse.Documents.Count; + if (numberOfQueryResults == 0) + { + break; + } + lastHitPublication= publicationSearchResponse.Hits.Last(); + + // Process documents: Map from Elastic index model to API model, write into text file + foreach (var doc in publicationSearchResponse.Documents) + { + PublicationApiModel publication = _mapper.Map(doc); + string jsonString = JsonSerializer.Serialize(publication, serializerOptions); + File.WriteAllText( + GetFilename(modelType.FullName, ++exportFileNumber), + jsonString + ); + } + Console.WriteLine($"Export: Publication: in progress {exportFileNumber}/{numberOfDocumentsInIndex}"); + } + break; + + default: + break; + } + } + + Console.WriteLine("Export: completed"); + } +} diff --git a/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs 
b/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs deleted file mode 100644 index ec2c63c..0000000 --- a/aspnetcore/src/Exporter/Exporters/FundingDecisionExporter.cs +++ /dev/null @@ -1,110 +0,0 @@ -using AutoMapper; -using Nest; -using CSC.PublicApi.ElasticService; -using CSC.PublicApi.Service.Models.FundingDecision; -using FundingDecisionApiModel = ResearchFi.FundingDecision.FundingDecision; -using System.Text.Json; - -namespace BulkExport; - -public class FundingDecisionExporter -{ - private IElasticClient _elasticClient; - private readonly IMapper _mapper; - private readonly IndexNameSettings _indexNameSettings; - private const int SingleQueryResultLimit = 250; - - public FundingDecisionExporter(IElasticClient elasticClient, IMapper mapper, IndexNameSettings indexNameSettings) - { - _elasticClient = elasticClient; - _mapper = mapper; - _indexNameSettings = indexNameSettings; - - } - - public void Export(JsonSerializerOptions serializerOptions) - { - var configuredTypesAndIndexNames = _indexNameSettings.GetTypesAndIndexNames(); - foreach (var (indexName, modelType) in configuredTypesAndIndexNames) - { - switch (modelType.FullName) - { - case "CSC.PublicApi.Service.Models.FundingDecision.FundingDecision": - long numberOfDocumentsInIndex = 0; - long numberOfQueryResults = -1; - long exportFileNumber = 0; - IHit? lastHit = null; - ISearchResponse? 
fundingDecisionSearchResponse = null; - - // Number of documents in index - var countResponse = _elasticClient.Count(c => c.Index(indexName)); - numberOfDocumentsInIndex = countResponse.Count; - Console.WriteLine($"Export: FundingDecision: started from index {indexName} containing {numberOfDocumentsInIndex} documents"); - - while (numberOfQueryResults == -1 || numberOfQueryResults >= SingleQueryResultLimit) { - // Get batch of documents - if (lastHit != null) { - fundingDecisionSearchResponse = _elasticClient.Search (s => s - .Index(indexName) - .Size(SingleQueryResultLimit) - .Query(q => q.MatchAll()) - .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) - .SearchAfter(lastHit.Sorts) - ); - } else { - fundingDecisionSearchResponse = _elasticClient.Search (s => s - .Index(indexName) - .Size(SingleQueryResultLimit) - .Query(q => q.MatchAll()) - .Sort(sort => sort.Ascending(SortSpecialField.DocumentIndexOrder)) - ); - } - - numberOfQueryResults = fundingDecisionSearchResponse.Documents.Count; - if (numberOfQueryResults == 0) - { - break; - } - lastHit = fundingDecisionSearchResponse.Hits.Last(); - - // Process documents: Map from Elastic index model to API model, write into text file - foreach (var doc in fundingDecisionSearchResponse.Documents) - { - ++exportFileNumber; - FundingDecisionApiModel fundingDecision = _mapper.Map(doc); - string jsonString = JsonSerializer.Serialize(fundingDecision, serializerOptions); - //File.WriteAllText("/tmp/funding-call-test-export.json", jsonString); - } - Console.WriteLine($"Export: FundingDecision: in progress {exportFileNumber}/{numberOfDocumentsInIndex}"); - } - - Console.WriteLine($"Export: FundingDecision: complete, exported {exportFileNumber}/{numberOfDocumentsInIndex}"); - break; - - case "CSC.PublicApi.Service.Models.FundingCall.FundingCall": - break; - - case "CSC.PublicApi.Service.Models.Infrastructure.Infrastructure": - Console.WriteLine($"Export: Infrastucture: TODO"); - break; - - case 
"CSC.PublicApi.Service.Models.Organization.Organization": - Console.WriteLine($"Export: Organization: TODO"); - break; - - case "CSC.PublicApi.Service.Models.ResearchDataset.ResearchDataset": - Console.WriteLine($"Export: Research dataset: TODO"); - break; - - case "CSC.PublicApi.Service.Models.Publication.Publication": - Console.WriteLine($"Export: Publication: TODO"); - break; - - default: - break; - } - } - - Console.WriteLine("Export: completed"); - } -} diff --git a/aspnetcore/src/Exporter/Program.cs b/aspnetcore/src/Exporter/Program.cs index 06dd6e9..42e204c 100644 --- a/aspnetcore/src/Exporter/Program.cs +++ b/aspnetcore/src/Exporter/Program.cs @@ -17,8 +17,8 @@ public static async Task Main(string[] args) var environment = Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT"); var configuration = new ConfigurationBuilder() .SetBasePath(Directory.GetCurrentDirectory()) - //.AddJsonFile("appsettings.json") - //.AddJsonFile($"appsettings.{environment}.json", true) + .AddJsonFile("appsettings.json", true) + .AddJsonFile($"appsettings.{environment}.json", true) .AddUserSecrets() .AddEnvironmentVariables() .Build(); @@ -38,29 +38,20 @@ public static async Task Main(string[] args) WriteIndented = true, }; - // Export funding decisions - var fundingDecisionExporter = consoleHost.Services.GetRequiredService(); - fundingDecisionExporter.Export(jsonSerializerOptions); + // Start export + var exporter = consoleHost.Services.GetRequiredService(); + exporter.Export(jsonSerializerOptions); } private static IHostBuilder CreateHostBuilder(string[] args) => Host .CreateDefaultBuilder(args) .ConfigureServices((hostContext, services) => { - services.AddTransient(); + services.AddTransient(); services.AddSettings(hostContext.Configuration); services.AddElasticSearch(hostContext.Configuration); services.AddAutoMapper(typeof(ApiPolicies).Assembly); - - // Add ElasticSearchIndexingService. 
services.AddScoped(); - - services.AddMemoryCache(); - - if (!int.TryParse(hostContext.Configuration["QueryTimeout"], out var queryTimeout)) - { - queryTimeout = DefaultQueryTimeout; - } }) .ConfigureHostConfiguration(configurationBuilder => configurationBuilder // Most of the configuration comes from environment variables. From c0661b7770f3b53aa84c0f884500a4c1c98d1e9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Sa=CC=88rkikoski?= Date: Fri, 9 Aug 2024 14:02:40 +0300 Subject: [PATCH 6/6] Add comments. Get export base directory from configuration. --- aspnetcore/src/Exporter/Exporter.cs | 40 ++++++++++++++++++++++++----- aspnetcore/src/Exporter/Program.cs | 13 +++++++--- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/aspnetcore/src/Exporter/Exporter.cs b/aspnetcore/src/Exporter/Exporter.cs index a5e7b78..73a2d44 100644 --- a/aspnetcore/src/Exporter/Exporter.cs +++ b/aspnetcore/src/Exporter/Exporter.cs @@ -1,6 +1,7 @@ using AutoMapper; using Nest; using CSC.PublicApi.ElasticService; +using Microsoft.Extensions.Configuration; using CSC.PublicApi.Service.Models.FundingCall; using CSC.PublicApi.Service.Models.FundingDecision; using CSC.PublicApi.Service.Models.ResearchDataset; @@ -19,19 +20,32 @@ public class Exporter private IElasticClient _elasticClient; private readonly IMapper _mapper; private readonly IndexNameSettings _indexNameSettings; + private readonly IConfiguration _configuration; private const int SingleQueryResultLimit = 1000; - private const string ExportBaseDirectory = "/tmp"; + private string? 
ExportBaseDirectory = ""; - public Exporter(IElasticClient elasticClient, IMapper mapper, IndexNameSettings indexNameSettings) + public Exporter(IElasticClient elasticClient, IConfiguration configuration, IMapper mapper, IndexNameSettings indexNameSettings) { _elasticClient = elasticClient; + _configuration = configuration; _mapper = mapper; _indexNameSettings = indexNameSettings; - - } + // Get export base directory from configuration + ExportBaseDirectory = _configuration["EXPORTER:BASEDIRECTORY"]; + if (ExportBaseDirectory == null) + { + string errorMessage = $"Export: Failed: could not set export target directory from configuration (EXPORTER:BASEDIRECTORY)"; + Console.WriteLine(errorMessage); + throw new InvalidOperationException(errorMessage); + } + else { + Console.WriteLine($"Export: target directory set to '{ExportBaseDirectory}' from configuration (EXPORTER:BASEDIRECTORY)"); + } + } + // Construct export file name including full path private string GetFilename(string modelTypeFullName, long exportFileNumber) { string exportFileNumberPaddedString = exportFileNumber.ToString("D10"); @@ -51,14 +65,28 @@ private string GetFilename(string modelTypeFullName, long exportFileNumber) fileTypeString = "publication"; break; } - return $"{ExportBaseDirectory}/{fileTypeString}-{exportFileNumberPaddedString}.json"; + return $"{ExportBaseDirectory}{Path.DirectorySeparatorChar}{fileTypeString}-{exportFileNumberPaddedString}.json"; } + /* + * Export data from Elasticsearch index into json text files + * - Get list of configured Elasticsearch indexes + * - For each index, get all documents and + * - Convert them from Elasticsearch model to API model, which ensures the json files will contain the same fields as the Public API endpoint + * - Construct export file name and path + * - Write data to json file + * - To bypass Elasticsearch limitation of 10000 result set, the "search after" feature is utilized + * - 
https://www.elastic.co/guide/en/elasticsearch/reference/7.17/paginate-search-results.html#search-after + * - Data is queried in smaller chunks, sorted by DocumentIndexOrder + * - This is the most efficient way to sort documents + * - The last hit of the previous query is stored + * - Every query after the first contains a "search after" section holding the sort values of the last hit from the previous query + */ public void Export(JsonSerializerOptions serializerOptions) { + // Get Elasticsearch indexes and process them var configuredTypesAndIndexNames = _indexNameSettings.GetTypesAndIndexNames(); - foreach (var (indexName, modelType) in configuredTypesAndIndexNames) { long numberOfDocumentsInIndex = 0; diff --git a/aspnetcore/src/Exporter/Program.cs b/aspnetcore/src/Exporter/Program.cs index 42e204c..32bc45e 100644 --- a/aspnetcore/src/Exporter/Program.cs +++ b/aspnetcore/src/Exporter/Program.cs @@ -9,6 +9,15 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Hosting; +/* + * This application exports all documents from the configured Elasticsearch indexes into JSON files. Data is converted from the Elasticsearch model to the API model. + * The purpose of this application is to enable bulk export of all data. The application is not intended to be executed automatically in production. + * Instead, it should be considered a tool that a developer can use when a full data dump in JSON file format is needed. + * + * The application uses the same configuration as the Indexer and Interface applications, except that it requires one additional parameter: + * EXPORTER:BASEDIRECTORY - sets the base directory where the JSON files are written. It must be defined without a trailing slash, for example, "/tmp" + */ + public class Program { private const int DefaultQueryTimeout = 300; @@ -22,8 +31,6 @@ public static async Task Main(string[] args) .AddUserSecrets() .AddEnvironmentVariables() .Build(); - - // Create and configure the host to support dependency injection, configuration, etc.
var consoleHost = CreateHostBuilder(args).Build(); // Define json serializer options @@ -54,9 +61,7 @@ private static IHostBuilder CreateHostBuilder(string[] args) => Host services.AddScoped(); }) .ConfigureHostConfiguration(configurationBuilder => configurationBuilder - // Most of the configuration comes from environment variables. .AddEnvironmentVariables() - // For local dev we get configuration from user secrets. .AddUserSecrets(typeof(Program).Assembly, true) .Build()); } \ No newline at end of file