diff --git a/knowledge/pkg/datastore/embeddings/embeddings.go b/knowledge/pkg/datastore/embeddings/embeddings.go index 9c219064..5a7796e7 100644 --- a/knowledge/pkg/datastore/embeddings/embeddings.go +++ b/knowledge/pkg/datastore/embeddings/embeddings.go @@ -3,6 +3,9 @@ package embeddings import ( "errors" "fmt" + "reflect" + "strings" + "github.com/gptscript-ai/knowledge/pkg/config" "github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/cohere" "github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/jina" @@ -14,8 +17,6 @@ import ( "github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/types" "github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/vertex" "github.com/mitchellh/mapstructure" - "reflect" - "strings" ) func GetSelectedEmbeddingsModelProvider(selected string, embeddingsConfig config.EmbeddingsConfig) (types.EmbeddingModelProvider, error) { diff --git a/knowledge/pkg/vectorstore/pgvector/pgvector.go b/knowledge/pkg/vectorstore/pgvector/pgvector.go index c2b8eb7d..1d907437 100644 --- a/knowledge/pkg/vectorstore/pgvector/pgvector.go +++ b/knowledge/pkg/vectorstore/pgvector/pgvector.go @@ -70,6 +70,7 @@ type VectorStore struct { collectionTableName string vectorDimensions int hnswIndex *HNSWIndex + reuseEmbeddings bool } // HNSWIndex lets you specify the HNSW index parameters. @@ -99,6 +100,7 @@ func New(ctx context.Context, dsn string, embeddingFunc cg.EmbeddingFunc) (*Vect embeddingFunc: embeddingFunc, embeddingConcurrency: env.GetIntFromEnvOrDefault(VsPgvectorEmbeddingConcurrency, 100), hnswIndex: nil, + reuseEmbeddings: true, } var err error @@ -302,6 +304,9 @@ func (v VectorStore) AddDocuments(ctx context.Context, docs []vs.Document, colle } } + // Check if doc with same content exists + reuseEmbeddingsSQL := fmt.Sprintf(`SELECT embedding FROM %s WHERE document = $1`, v.embeddingTableName) + sql := fmt.Sprintf(`INSERT INTO %s (uuid, document, embedding, cmetadata, collection_id) VALUES($1, $2, $3, $4, $5)`, v.embeddingTableName) @@ -325,6 +330,17 @@ func (v VectorStore) AddDocuments(ctx context.Context, docs []vs.Document, colle semaphore <- struct{}{} defer func() { <-semaphore }() + // Check if we can re-use embeddings + if v.reuseEmbeddings { + var embedding pgvector.Vector + err := v.conn.QueryRow(ctx, reuseEmbeddingsSQL, []byte(doc.Content)).Scan(&embedding) + if err == nil && len(embedding.Slice()) > 0 { + b.Queue(sql, doc.ID, []byte(doc.Content), embedding, doc.Metadata, cid) + slog.Debug("Reusing embedding", "docID", doc.ID) + return + } + } + vec, err := v.embeddingFunc(ctx, doc.Content) if err != nil { setSharedErr(fmt.Errorf("failed to embed document %s: %w", doc.ID, err)) diff --git a/knowledge/pkg/vectorstore/vectorstores.go b/knowledge/pkg/vectorstore/vectorstores.go index a5c806aa..a4608fba 100644 --- a/knowledge/pkg/vectorstore/vectorstores.go +++ b/knowledge/pkg/vectorstore/vectorstores.go @@ -10,7 +10,7 @@ import ( dbtypes "github.com/gptscript-ai/knowledge/pkg/index/types" "github.com/gptscript-ai/knowledge/pkg/vectorstore/chromem" "github.com/gptscript-ai/knowledge/pkg/vectorstore/pgvector" - sqlite_vec "github.com/gptscript-ai/knowledge/pkg/vectorstore/sqlite-vec" + sqlitevec "github.com/gptscript-ai/knowledge/pkg/vectorstore/sqlite-vec" "github.com/gptscript-ai/knowledge/pkg/vectorstore/types" cg "github.com/philippgille/chromem-go" ) @@ -46,7 +46,7 @@ func New(ctx context.Context, dsn string, embeddingProvider etypes.EmbeddingMode return pgvector.New(ctx, dsn, embeddingFunc) case "sqlite-vec": - return sqlite_vec.New(ctx, dsn, embeddingFunc) + return sqlitevec.New(ctx, dsn, embeddingFunc) default: return nil, fmt.Errorf("unsupported dialect: %q", dialect) }