Skip to content

Commit

Permalink
feat: re-use embeddings on document layer (copy embeddings from docs with same content)
Browse files Browse the repository at this point in the history
  • Loading branch information
iwilltry42 committed Feb 14, 2025
1 parent c8216ea commit bcf9990
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 4 deletions.
5 changes: 3 additions & 2 deletions knowledge/pkg/datastore/embeddings/embeddings.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ package embeddings
import (
"errors"
"fmt"
"reflect"
"strings"

"github.com/gptscript-ai/knowledge/pkg/config"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/cohere"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/jina"
Expand All @@ -14,8 +17,6 @@ import (
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/types"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/vertex"
"github.com/mitchellh/mapstructure"
"reflect"
"strings"
)

func GetSelectedEmbeddingsModelProvider(selected string, embeddingsConfig config.EmbeddingsConfig) (types.EmbeddingModelProvider, error) {
Expand Down
16 changes: 16 additions & 0 deletions knowledge/pkg/vectorstore/pgvector/pgvector.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ type VectorStore struct {
collectionTableName string
vectorDimensions int
hnswIndex *HNSWIndex
reuseEmbeddings bool
}

// HNSWIndex lets you specify the HNSW index parameters.
Expand Down Expand Up @@ -99,6 +100,7 @@ func New(ctx context.Context, dsn string, embeddingFunc cg.EmbeddingFunc) (*Vect
embeddingFunc: embeddingFunc,
embeddingConcurrency: env.GetIntFromEnvOrDefault(VsPgvectorEmbeddingConcurrency, 100),
hnswIndex: nil,
reuseEmbeddings: true,
}

var err error
Expand Down Expand Up @@ -302,6 +304,9 @@ func (v VectorStore) AddDocuments(ctx context.Context, docs []vs.Document, colle
}
}

// Query used to look up the stored embedding of a document with identical content, so it can be re-used
reuseEmbeddingsSQL := fmt.Sprintf(`SELECT embedding FROM %s WHERE document = $1`, v.embeddingTableName)

sql := fmt.Sprintf(`INSERT INTO %s (uuid, document, embedding, cmetadata, collection_id)
VALUES($1, $2, $3, $4, $5)`, v.embeddingTableName)

Expand All @@ -325,6 +330,17 @@ func (v VectorStore) AddDocuments(ctx context.Context, docs []vs.Document, colle
semaphore <- struct{}{}
defer func() { <-semaphore }()

// Check if we can re-use embeddings
if v.reuseEmbeddings {
var embedding pgvector.Vector
err := v.conn.QueryRow(ctx, reuseEmbeddingsSQL, []byte(doc.Content)).Scan(&embedding)
if err == nil && len(embedding.Slice()) > 0 {
b.Queue(sql, doc.ID, []byte(doc.Content), embedding, doc.Metadata, cid)
slog.Debug("Reusing embedding", "docID", doc.ID)
return
}
}

vec, err := v.embeddingFunc(ctx, doc.Content)
if err != nil {
setSharedErr(fmt.Errorf("failed to embed document %s: %w", doc.ID, err))
Expand Down
4 changes: 2 additions & 2 deletions knowledge/pkg/vectorstore/vectorstores.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import (
dbtypes "github.com/gptscript-ai/knowledge/pkg/index/types"
"github.com/gptscript-ai/knowledge/pkg/vectorstore/chromem"
"github.com/gptscript-ai/knowledge/pkg/vectorstore/pgvector"
sqlite_vec "github.com/gptscript-ai/knowledge/pkg/vectorstore/sqlite-vec"
sqlitevec "github.com/gptscript-ai/knowledge/pkg/vectorstore/sqlite-vec"
"github.com/gptscript-ai/knowledge/pkg/vectorstore/types"
cg "github.com/philippgille/chromem-go"
)
Expand Down Expand Up @@ -46,7 +46,7 @@ func New(ctx context.Context, dsn string, embeddingProvider etypes.EmbeddingMode

return pgvector.New(ctx, dsn, embeddingFunc)
case "sqlite-vec":
return sqlite_vec.New(ctx, dsn, embeddingFunc)
return sqlitevec.New(ctx, dsn, embeddingFunc)
default:
return nil, fmt.Errorf("unsupported dialect: %q", dialect)
}
Expand Down

0 comments on commit bcf9990

Please sign in to comment.