Skip to content

Commit

Permalink
add document workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
michalwarda committed Mar 1, 2024
1 parent 63ecd5e commit 28d3e43
Showing 1 changed file with 52 additions and 0 deletions.
52 changes: 52 additions & 0 deletions apps/api/lib/buildel/document_workflow.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
defmodule Buildel.DocumentWorkflow do
def process(document) do
# file ie. document.pdf
document
# get the content of the file (chunked) [Header{level: 0, metadata: {page: 0}}, Paragraph{level: 1}, Paragraph{level: 1}, ListItem{level: 2, metadata: {page: 1}}]
|> read
# build nodes with relations to other nodes (parent?, next?, previous?)
|> build_relations
# 1500 characters per chunk keeping metadata "#Zał 3. ##Pasywa, ###coś tam, ####dobra trwałe\n maszyny coś tam..."
|> build_node_chunks
# { "zał 3" => [id_chunku, id_chunku_2], "pasywa" => [chunk_3], "coś tam" => [], "dobra trwałe" =>[], "maszyny coś tam" => [] }
|> generate_keyword_nodes
# generate embeddings for each chunk
|> generate_embeddings_for_chunks
# save the chunk with embeddings in database
|> put_in_database

# document
# |> read ("abc cde\n\n fgh\n ijk\n listitem\n")
# |> build_node_chunks (["abc cde\n\n", "fgh\n ijk\n listitem\n"])
# |> generate_embeddings_for_chunks ([{embeddings: [...], chunk: "abc cde\n\n"}, {embeddings: [...], chunk: "fgh\n ijk\n listitem\n"}])
# |> put_in_database ([{embeddings: [...], chunk: "abc cde\n\n"}, {embeddings: [...], chunk: "fgh\n ijk\n listitem\n"}])
end

defp read(_document) do
[]
end

defp build_relations(_document) do
[]
end

defp build_node_chunks(_document) do
[]
end

defp generate_keyword_nodes(_document) do
[]
end

defp generate_embeddings_for_chunks(_document) do
[]
end

defp put_in_database(_document) do
[]
end

defp generate_embeddings_for_chunks(_document) do
[]
end
end

0 comments on commit 28d3e43

Please sign in to comment.