diff --git a/src/AntSK.Domain/AntSK.Domain.xml b/src/AntSK.Domain/AntSK.Domain.xml index afea2f90..5dc7c7b1 100644 --- a/src/AntSK.Domain/AntSK.Domain.xml +++ b/src/AntSK.Domain/AntSK.Domain.xml @@ -193,6 +193,12 @@ 避免模型重复加载,本地缓存 + + + + + + 发送消息 diff --git a/src/AntSK.Domain/Domain/Interface/IKernelService.cs b/src/AntSK.Domain/Domain/Interface/IKernelService.cs index 2dc61ca9..251b11bb 100644 --- a/src/AntSK.Domain/Domain/Interface/IKernelService.cs +++ b/src/AntSK.Domain/Domain/Interface/IKernelService.cs @@ -6,6 +6,8 @@ namespace AntSK.Domain.Domain.Interface public interface IKernelService { Kernel GetKernelByApp(Apps app); + + Kernel GetKernelByAIModelID(string modelid); void ImportFunctionsByApp(Apps app, Kernel _kernel); Task HistorySummarize(Kernel _kernel, string questions, string history); } diff --git a/src/AntSK.Domain/Domain/Model/ImportKMSTaskReq.cs b/src/AntSK.Domain/Domain/Model/ImportKMSTaskReq.cs index 58986260..d7301109 100644 --- a/src/AntSK.Domain/Domain/Model/ImportKMSTaskReq.cs +++ b/src/AntSK.Domain/Domain/Model/ImportKMSTaskReq.cs @@ -17,11 +17,14 @@ public class ImportKMSTaskDTO public string FilePath { get; set; } = ""; public string FileName { get; set; } = ""; + + public bool IsQA { get; set; } = false; } public class ImportKMSTaskReq : ImportKMSTaskDTO { + public bool IsQA { get; set; }=false; public KmsDetails KmsDetail { get; set; } = new KmsDetails(); } @@ -32,4 +35,10 @@ public enum ImportType Text = 3, Excel=4 } + + public class QAModel + { + public string ChatModelId { get; set; } + public string Context { get; set; } + } } diff --git a/src/AntSK.Domain/Domain/Other/QAHandler.cs b/src/AntSK.Domain/Domain/Other/QAHandler.cs new file mode 100644 index 00000000..5a387791 --- /dev/null +++ b/src/AntSK.Domain/Domain/Other/QAHandler.cs @@ -0,0 +1,154 @@ +using AntSK.Domain.Domain.Model; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.Logging; +using Microsoft.KernelMemory.AI.OpenAI; +using Microsoft.KernelMemory.Configuration; +using Microsoft.KernelMemory.DataFormats.Text; +using Microsoft.KernelMemory.Diagnostics; +using Microsoft.KernelMemory.Extensions; +using Microsoft.KernelMemory.Pipeline; +using Newtonsoft.Json; +using RestSharp; +using System.Security.Policy; +using System.Text; + +namespace AntSK.Domain.Domain.Other +{ + public class QAHandler : IPipelineStepHandler + { + private readonly TextPartitioningOptions _options; + private readonly IPipelineOrchestrator _orchestrator; + private readonly ILogger _log; + private readonly TextChunker.TokenCounter _tokenCounter; + public QAHandler( + string stepName, + IPipelineOrchestrator orchestrator, + TextPartitioningOptions? options = null, + ILogger? log = null + ) + { + this.StepName = stepName; + this._orchestrator = orchestrator; + this._options = options ?? new TextPartitioningOptions(); + this._options.Validate(); + + this._log = log ?? DefaultLogger.Instance; + this._tokenCounter = DefaultGPTTokenizer.StaticCountTokens; + } + + /// + public string StepName { get; } + + /// + public async Task<(bool success, DataPipeline updatedPipeline)> InvokeAsync( + DataPipeline pipeline, CancellationToken cancellationToken = default) + { + this._log.LogDebug("Partitioning text, pipeline '{0}/{1}'", pipeline.Index, pipeline.DocumentId); + + if (pipeline.Files.Count == 0) + { + this._log.LogWarning("Pipeline '{0}/{1}': there are no files to process, moving to next pipeline step.", pipeline.Index, pipeline.DocumentId); + return (true, pipeline); + } + + foreach (DataPipeline.FileDetails uploadedFile in pipeline.Files) + { + // Track new files being generated (cannot edit originalFile.GeneratedFiles while looping it) + Dictionary newFiles = new(); + + foreach (KeyValuePair generatedFile in uploadedFile.GeneratedFiles) + { + var file = generatedFile.Value; + if (file.AlreadyProcessedBy(this)) + { + this._log.LogTrace("File {0} already processed by this handler", file.Name); + continue; + } + + // Partition only the original text + if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText) + { + this._log.LogTrace("Skipping file {0} (not original text)", file.Name); + continue; + } + + // Use a different partitioning strategy depending on the file type + List partitions; + List sentences; + BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false); + + // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes. + if (partitionContent.ToArray().Length == 0) { continue; } + + switch (file.MimeType) + { + case MimeTypes.PlainText: + case MimeTypes.MarkDown: + { + this._log.LogDebug("Partitioning text file {0}", file.Name); + string content = partitionContent.ToString(); + + using (HttpClient httpclient = new HttpClient()) + { + httpclient.Timeout = TimeSpan.FromMinutes(10); + StringContent scontent = new StringContent(JsonConvert.SerializeObject(new QAModel() { ChatModelId = StepName, Context = content }), Encoding.UTF8, "application/json"); + HttpResponseMessage response = await httpclient.PostAsync("http://localhost:5000/api/KMS/QA", scontent); + List qaList = JsonConvert.DeserializeObject>( await response.Content.ReadAsStringAsync()); + sentences = qaList; + partitions = qaList; + } + break; + } + default: + this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType); + // Don't partition other files + continue; + } + + if (partitions.Count == 0) { continue; } + + this._log.LogDebug("Saving {0} file partitions", partitions.Count); + for (int partitionNumber = 0; partitionNumber < partitions.Count; partitionNumber++) + { + // TODO: turn partitions in objects with more details, e.g. page number + string text = partitions[partitionNumber]; + int sectionNumber = 0; // TODO: use this to store the page number (if any) + BinaryData textData = new(text); + + int tokenCount = this._tokenCounter(text); + this._log.LogDebug("Partition size: {0} tokens", tokenCount); + + var destFile = uploadedFile.GetPartitionFileName(partitionNumber); + await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false); + + var destFileDetails = new DataPipeline.GeneratedFileDetails + { + Id = Guid.NewGuid().ToString("N"), + ParentId = uploadedFile.Id, + Name = destFile, + Size = text.Length, + MimeType = MimeTypes.PlainText, + ArtifactType = DataPipeline.ArtifactTypes.TextPartition, + PartitionNumber = partitionNumber, + SectionNumber = sectionNumber, + Tags = pipeline.Tags, + ContentSHA256 = textData.CalculateSHA256(), + }; + newFiles.Add(destFile, destFileDetails); + destFileDetails.MarkProcessedBy(this); + } + + file.MarkProcessedBy(this); + } + + // Add new files to pipeline status + foreach (var file in newFiles) + { + uploadedFile.GeneratedFiles.Add(file.Key, file.Value); + } + } + + return (true, pipeline); + } + } +} diff --git a/src/AntSK.Domain/Domain/Service/ImportKMSService.cs b/src/AntSK.Domain/Domain/Service/ImportKMSService.cs index 623fa2b7..b03d7c6a 100644 --- a/src/AntSK.Domain/Domain/Service/ImportKMSService.cs +++ b/src/AntSK.Domain/Domain/Service/ImportKMSService.cs @@ -24,18 +24,40 @@ public void ImportKMSTask(ImportKMSTaskReq req) try { var km = _kmss_Repositories.GetFirst(p => p.Id == req.KmsId); - var _memory = _kMService.GetMemoryByKMS(km.Id); string fileid = req.KmsDetail.Id; + List step = new List(); + if (req.IsQA) + { + _memory.Orchestrator.AddHandler("extract_text"); + _memory.Orchestrator.AddHandler(km.ChatModelID); + _memory.Orchestrator.AddHandler("generate_embeddings"); + _memory.Orchestrator.AddHandler("save_memory_records"); + step.Add("extract_text"); + step.Add(km.ChatModelID); + step.Add("generate_embeddings"); + step.Add("save_memory_records"); + } + switch (req.ImportType) { case ImportType.File: - //导入文件 { - var importResult = _memory.ImportDocumentAsync(new Document(fileid) - .AddFile(req.FilePath) - .AddTag(KmsConstantcs.KmsIdTag, req.KmsId) - , index: KmsConstantcs.KmsIndex).Result; + //导入文件 + if (req.IsQA) + { + var importResult = _memory.ImportDocumentAsync(new Document(fileid) + .AddFile(req.FilePath) + .AddTag(KmsConstantcs.KmsIdTag, req.KmsId) + ,index: KmsConstantcs.KmsIndex ,steps: step.ToArray()).Result; + } + else + { + var importResult = _memory.ImportDocumentAsync(new Document(fileid) + .AddFile(req.FilePath) + .AddTag(KmsConstantcs.KmsIdTag, req.KmsId) + , index: KmsConstantcs.KmsIndex).Result; + } //查询文档数量 var docTextList = _kMService.GetDocumentByFileID(km.Id, fileid).Result; string fileGuidName = Path.GetFileName(req.FilePath); @@ -48,8 +70,16 @@ public void ImportKMSTask(ImportKMSTaskReq req) case ImportType.Url: { //导入url - var importResult = _memory.ImportWebPageAsync(req.Url, fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } } - , index: KmsConstantcs.KmsIndex).Result; + if (req.IsQA) + { + var importResult = _memory.ImportWebPageAsync(req.Url, fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } } + , index: KmsConstantcs.KmsIndex, steps: step.ToArray()).Result; + } + else + { + var importResult = _memory.ImportWebPageAsync(req.Url, fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } } + , index: KmsConstantcs.KmsIndex).Result; + } //查询文档数量 var docTextList = _kMService.GetDocumentByFileID(km.Id, fileid).Result; req.KmsDetail.Url = req.Url; @@ -59,8 +89,16 @@ public void ImportKMSTask(ImportKMSTaskReq req) case ImportType.Text: //导入文本 { - var importResult = _memory.ImportTextAsync(req.Text, fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } } - , index: KmsConstantcs.KmsIndex).Result; + if (req.IsQA) + { + var importResult = _memory.ImportTextAsync(req.Text, fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } } + , index: KmsConstantcs.KmsIndex, steps: step.ToArray()).Result; + } + else + { + var importResult = _memory.ImportTextAsync(req.Text, fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } } + , index: KmsConstantcs.KmsIndex).Result; + } //查询文档数量 var docTextList = _kMService.GetDocumentByFileID(km.Id, fileid).Result; req.KmsDetail.Url = req.Url; @@ -71,8 +109,7 @@ public void ImportKMSTask(ImportKMSTaskReq req) case ImportType.Excel: using (var fs = File.OpenRead(req.FilePath)) { - var excelList= ExeclHelper.ExcelToList(fs); - + var excelList= ExeclHelper.ExcelToList(fs); _memory.Orchestrator.AddHandler("extract_text"); _memory.Orchestrator.AddHandler("antsk_excel_split"); _memory.Orchestrator.AddHandler("generate_embeddings"); diff --git a/src/AntSK.Domain/Domain/Service/KernelService.cs b/src/AntSK.Domain/Domain/Service/KernelService.cs index 0dd6a488..9130293f 100644 --- a/src/AntSK.Domain/Domain/Service/KernelService.cs +++ b/src/AntSK.Domain/Domain/Service/KernelService.cs @@ -18,6 +18,8 @@ using AntSK.LLM.LLamaFactory; using System.Reflection; using DocumentFormat.OpenXml.Drawing; +using Microsoft.KernelMemory; +using OpenCvSharp.ML; namespace AntSK.Domain.Domain.Service { @@ -57,7 +59,7 @@ public Kernel GetKernelByApp(Apps app) var chatHttpClient = OpenAIHttpClientHandlerUtil.GetHttpClient(chatModel.EndPoint); var builder = Kernel.CreateBuilder(); - WithTextGenerationByAIType(builder, app, chatModel, chatHttpClient); + WithTextGenerationByAIType(builder, chatModel, chatHttpClient); _kernel = builder.Build(); RegisterPluginsWithKernel(_kernel); @@ -69,7 +71,18 @@ public Kernel GetKernelByApp(Apps app) //} } - private void WithTextGenerationByAIType(IKernelBuilder builder, Apps app, AIModels chatModel, HttpClient chatHttpClient) + public Kernel GetKernelByAIModelID(string modelid) + { + var chatModel = _aIModels_Repositories.GetById(modelid); + var chatHttpClient = OpenAIHttpClientHandlerUtil.GetHttpClient(chatModel.EndPoint); + var builder = Kernel.CreateBuilder(); + WithTextGenerationByAIType(builder, chatModel, chatHttpClient); + _kernel = builder.Build(); + RegisterPluginsWithKernel(_kernel); + return _kernel; + } + + private void WithTextGenerationByAIType(IKernelBuilder builder,AIModels chatModel, HttpClient chatHttpClient) { switch (chatModel.AIType) { @@ -96,7 +109,7 @@ private void WithTextGenerationByAIType(IKernelBuilder builder, Apps app, AIMode case Model.Enum.AIType.SparkDesk: var options = new SparkDeskOptions { AppId = chatModel.EndPoint, ApiSecret = chatModel.ModelKey, ApiKey = chatModel.ModelName, ModelVersion = Sdcb.SparkDesk.ModelVersion.V3_5 }; - builder.Services.AddKeyedSingleton("spark-desk", new SparkDeskTextCompletion(options, app.Id)); + builder.Services.AddKeyedSingleton("spark-desk", new SparkDeskTextCompletion(options, chatModel.Id)); break; case Model.Enum.AIType.DashScope: diff --git a/src/AntSK/Controllers/KMSController.cs b/src/AntSK/Controllers/KMSController.cs index f1a4e89c..2acebb0e 100644 --- a/src/AntSK/Controllers/KMSController.cs +++ b/src/AntSK/Controllers/KMSController.cs @@ -1,10 +1,17 @@ -using AntSK.BackgroundTask; +using AntDesign; +using AntSK.BackgroundTask; using AntSK.Domain.Common.Map; using AntSK.Domain.Domain.Interface; using AntSK.Domain.Domain.Model; using AntSK.Domain.Domain.Model.Enum; +using AntSK.Domain.Domain.Service; using AntSK.Domain.Repositories; +using AntSK.Domain.Utils; +using Microsoft.AspNetCore.Components.Forms; using Microsoft.AspNetCore.Mvc; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.Text; +using System.Text.RegularExpressions; namespace AntSK.Controllers { @@ -17,14 +24,17 @@ public class KMSController : ControllerBase { private readonly IKmsDetails_Repositories _kmsDetailsRepositories; private readonly BackgroundTaskBroker _taskBroker; + private readonly IKernelService _kernelService; public KMSController( IKmsDetails_Repositories kmsDetailsRepositories, - BackgroundTaskBroker taskBroker + BackgroundTaskBroker taskBroker, + IKernelService kernelService ) { _kmsDetailsRepositories = kmsDetailsRepositories; _taskBroker = taskBroker; + _kernelService = kernelService; } /// @@ -47,8 +57,36 @@ public async Task ImportKMSTask(ImportKMSTaskDTO model) await _kmsDetailsRepositories.InsertAsync(detail); req.KmsDetail = detail; + req.IsQA=model.IsQA; _taskBroker.QueueWorkItem(req); return Ok(); } + + [HttpPost] + public async Task QA(QAModel model) + { + var kernel = _kernelService.GetKernelByAIModelID(model.ChatModelId); + var lines = TextChunker.SplitPlainTextLines(model.Context, 299); + var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, 4000); + KernelFunction jsonFun = kernel.Plugins.GetFunction("KMSPlugin", "QA"); + + List qaList = new List(); + foreach (var para in paragraphs) + { + var qaresult = await kernel.InvokeAsync(function: jsonFun, new KernelArguments() { ["input"] = para }); + var qaListStr = qaresult.GetValue().ConvertToString(); + + string pattern = @"Q\d+:.*?A\d+:.*?(?=(Q\d+:|$))"; + RegexOptions options = RegexOptions.Singleline; + + foreach (Match match in Regex.Matches(qaListStr, pattern, options)) + { + qaList.Add(match.Value.Trim()); // Trim用于删除可能的首尾空格 + } + + } + + return Ok(qaList); + } } } \ No newline at end of file diff --git a/src/AntSK/Pages/KmsPage/KmsDetail.razor b/src/AntSK/Pages/KmsPage/KmsDetail.razor index 0646249b..375992ab 100644 --- a/src/AntSK/Pages/KmsPage/KmsDetail.razor +++ b/src/AntSK/Pages/KmsPage/KmsDetail.razor @@ -15,8 +15,7 @@
+ Title="知识库文档"> @@ -106,7 +105,7 @@ 搜索测试 - + @@ -125,6 +124,12 @@ + + + 直接切分 + QA切分 + + @@ -136,10 +141,16 @@
+ @ref="@_textForm">