Skip to content

Commit

Permalink
Embeddings: fix unmarshaling old version of index (sourcegraph#51070)
Browse files Browse the repository at this point in the history
Unmarshalling indexes 2 versions back would fail because the struct we
were unmarshalling into changed shape. This was not caught by tests
because the struct that we were using to generate the encodings was the
same as the struct we were using to unmarshal (this is the kind of thing
Protobuf's back/forward compatibility guarantees were intended to solve).

This PR fixes the issue by reconstructing the previous shape of the
struct and providing a conversion method to the new version. This is a
band-aid fix, and we _really_ need to solve migrations for embeddings
indexes in the near term.
  • Loading branch information
camdencheek authored Apr 25, 2023
1 parent ebeeb11 commit 1d2a227
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 15 deletions.
9 changes: 4 additions & 5 deletions enterprise/internal/embeddings/index_storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,11 @@ func DownloadRepoEmbeddingIndex(ctx context.Context, uploadStore uploadstore.Sto
rei, err := decodeRepoEmbeddingIndex(dec)
// If decoding fails, assume it is an old index and decode with a generic decoder.
if err != nil {
originalErr := err
rei, err = DownloadIndex[RepoEmbeddingIndex](ctx, uploadStore, key)
if err != nil {
// Return both errors in case the first one is the one we care about
return nil, errors.Append(originalErr, err)
oldRei, err2 := DownloadIndex[OldRepoEmbeddingIndex](ctx, uploadStore, key)
if err2 != nil {
return nil, errors.Append(err, err2)
}
return oldRei.ToNewIndex(), nil
}

return rei, nil
Expand Down
20 changes: 10 additions & 10 deletions enterprise/internal/embeddings/index_storage_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,16 +129,16 @@ func TestRepoEmbeddingIndexStorage(t *testing.T) {
}

func TestRepoEmbeddingVersionMismatch(t *testing.T) {
index := &RepoEmbeddingIndex{
index := &OldRepoEmbeddingIndex{
RepoName: api.RepoName("repo"),
Revision: api.CommitID("commit"),
CodeIndex: EmbeddingIndex{
Embeddings: []int8{0, 1, 2},
CodeIndex: OldEmbeddingIndex{
Embeddings: []float32{0, 1, 2},
ColumnDimension: 3,
RowMetadata: []RepoEmbeddingRowMetadata{{FileName: "a.go", StartLine: 0, EndLine: 1}},
},
TextIndex: EmbeddingIndex{
Embeddings: []int8{10, 21, 32},
TextIndex: OldEmbeddingIndex{
Embeddings: []float32{10, 21, 32},
ColumnDimension: 3,
RowMetadata: []RepoEmbeddingRowMetadata{{FileName: "b.py", StartLine: 0, EndLine: 1}},
},
Expand All @@ -155,7 +155,7 @@ func TestRepoEmbeddingVersionMismatch(t *testing.T) {
downloadedIndex, err := DownloadRepoEmbeddingIndex(ctx, uploadStore, "index")
require.NoError(t, err)

require.Equal(t, index, downloadedIndex)
require.Equal(t, index.ToNewIndex(), downloadedIndex)
}

func getMockEmbeddingIndex(nRows int, columnDimension int) EmbeddingIndex {
Expand All @@ -165,10 +165,10 @@ func getMockEmbeddingIndex(nRows int, columnDimension int) EmbeddingIndex {
}

rowMetadata := make([]RepoEmbeddingRowMetadata, nRows)
for _, row := range rowMetadata {
row.StartLine = rand.Int()
row.EndLine = rand.Int()
row.FileName = fmt.Sprintf("path/to/file/%d_%d.go", row.StartLine, row.EndLine)
for i := range rowMetadata {
rowMetadata[i].StartLine = rand.Int()
rowMetadata[i].EndLine = rand.Int()
rowMetadata[i].FileName = fmt.Sprintf("path/to/file/%d_%d.go", rowMetadata[i].StartLine, rowMetadata[i].EndLine)
}

return EmbeddingIndex{
Expand Down
36 changes: 36 additions & 0 deletions enterprise/internal/embeddings/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,39 @@ type EmbeddingSearchResult struct {
// Experimental: Clients should not rely on any particular format of debug
Debug string `json:"debug,omitempty"`
}

// OldRepoEmbeddingIndex is a struct that old repo embedding indexes can be
// decoded into directly. It has the same shape as the old indexes and should
// not be changed without migrating all existing indexes to the new format.
// Use ToNewIndex to convert a decoded value into a RepoEmbeddingIndex.
//
// Deprecated: exists only to support decoding old indexes.
type OldRepoEmbeddingIndex struct {
RepoName api.RepoName
Revision api.CommitID
CodeIndex OldEmbeddingIndex
TextIndex OldEmbeddingIndex
}

// ToNewIndex builds a RepoEmbeddingIndex from an old-format index. RepoName
// and Revision are copied over as-is; CodeIndex and TextIndex are each
// converted through OldEmbeddingIndex.ToNewIndex.
func (o *OldRepoEmbeddingIndex) ToNewIndex() *RepoEmbeddingIndex {
	converted := new(RepoEmbeddingIndex)
	converted.RepoName = o.RepoName
	converted.Revision = o.Revision
	// Convert the code index first, then the text index, matching field order.
	converted.CodeIndex = o.CodeIndex.ToNewIndex()
	converted.TextIndex = o.TextIndex.ToNewIndex()
	return converted
}

// OldEmbeddingIndex is the index type used for the CodeIndex and TextIndex
// fields of OldRepoEmbeddingIndex. Its Embeddings are stored as float32
// values; ToNewIndex converts it to an EmbeddingIndex by passing them
// through Quantize.
//
// NOTE(review): presumably this shape must stay fixed for the same reason as
// OldRepoEmbeddingIndex (old indexes decode into it) — confirm before editing.
type OldEmbeddingIndex struct {
Embeddings []float32
ColumnDimension int
RowMetadata []RepoEmbeddingRowMetadata
Ranks []float32
}

// ToNewIndex builds an EmbeddingIndex from an old-format index. The float32
// Embeddings are run through Quantize; ColumnDimension is copied, and the
// RowMetadata and Ranks slices are assigned directly (not cloned), so the
// result shares them with the receiver.
func (o *OldEmbeddingIndex) ToNewIndex() EmbeddingIndex {
	var converted EmbeddingIndex
	converted.Embeddings = Quantize(o.Embeddings)
	converted.ColumnDimension = o.ColumnDimension
	converted.RowMetadata = o.RowMetadata
	converted.Ranks = o.Ranks
	return converted
}

0 comments on commit 1d2a227

Please sign in to comment.