From d30db6f958ac993f3e6c56487540836d933542de Mon Sep 17 00:00:00 2001 From: Chris Wendt Date: Wed, 10 Nov 2021 11:06:39 -0700 Subject: [PATCH] Load deps in batches (#216) --- README.md | 2 + cmd/lsif-go/args.go | 7 +- cmd/lsif-go/index.go | 4 +- cmd/lsif-go/main.go | 2 +- internal/indexer/implementation.go | 142 ++++++++++++++++++++--------- internal/indexer/indexer.go | 97 +++++++++----------- internal/indexer/indexer_test.go | 6 +- internal/indexer/moniker.go | 5 +- 8 files changed, 153 insertions(+), 112 deletions(-) diff --git a/README.md b/README.md index f763ebc8..f08fa7ec 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,8 @@ Stats: Packages traversed: 40 ``` +If lsif-go is using too much memory, try setting `--dep-batch-size=100` to only load 100 dependencies into memory at once (~1GB overhead). Lowering the batch size will decrease the overhead further, but increase the runtime a lot more because loading a batch has a fixed cost of ~500ms and each additional package loaded within a batch only adds ~10ms. + Use `lsif-go --help` for more information. ## Updating your index diff --git a/cmd/lsif-go/args.go b/cmd/lsif-go/args.go index 9ed96a2c..451dcb4f 100644 --- a/cmd/lsif-go/args.go +++ b/cmd/lsif-go/args.go @@ -26,7 +26,7 @@ var ( verbosity int noOutput bool noAnimation bool - skipDeps bool + depBatchSize int ) func init() { @@ -45,13 +45,12 @@ func init() { app.Flag("repository-remote", "Specifies the canonical name of the repository remote.").Default(defaultRepositoryRemote.Value()).StringVar(&repositoryRemote) app.Flag("module-version", "Specifies the version of the module defined by module-root.").Default(defaultModuleVersion.Value()).StringVar(&moduleVersion) - // Feature options - app.Flag("skip-deps", "Do not load depedencies - reduces memory usage but omits interface implementation data from deps.").Default("true").BoolVar(&skipDeps) - // Verbosity options app.Flag("quiet", "Do not output to stdout or stderr.").Short('q').Default("false").BoolVar(&noOutput) app.Flag("verbose", "Output debug logs.").Short('v').CounterVar(&verbosity) app.Flag("no-animation", "Do not animate output.").Default("false").BoolVar(&noAnimation) + + app.Flag("dep-batch-size", "How many dependencies to load at once to limit memory usage (e.g. 100). 0 means load all at once.").Default("0").IntVar(&depBatchSize) } func parseArgs(args []string) (err error) { diff --git a/cmd/lsif-go/index.go b/cmd/lsif-go/index.go index 65788132..1116fb71 100644 --- a/cmd/lsif-go/index.go +++ b/cmd/lsif-go/index.go @@ -12,7 +12,7 @@ import ( "github.com/sourcegraph/sourcegraph/lib/codeintel/lsif/protocol/writer" ) -func writeIndex(repositoryRoot, repositoryRemote, projectRoot, moduleName, moduleVersion string, dependencies map[string]gomod.GoModule, projectDependencies []string, outFile string, outputOptions output.Options, skipDeps bool) error { +func writeIndex(repositoryRoot, repositoryRemote, projectRoot, moduleName, moduleVersion string, dependencies map[string]gomod.GoModule, projectDependencies []string, outFile string, outputOptions output.Options, depBatchSize int) error { start := time.Now() out, err := os.Create(outFile) @@ -44,7 +44,7 @@ func writeIndex(repositoryRoot, repositoryRemote, projectRoot, moduleName, modul writer.NewJSONWriter(out), packageDataCache, outputOptions, - skipDeps, + depBatchSize, ) if err := indexer.Index(); err != nil { diff --git a/cmd/lsif-go/main.go b/cmd/lsif-go/main.go index 6875b642..6022ed9d 100644 --- a/cmd/lsif-go/main.go +++ b/cmd/lsif-go/main.go @@ -75,7 +75,7 @@ func mainErr() (err error) { projectDependencies, outFile, outputOptions, - skipDeps, + depBatchSize, ); err != nil { return fmt.Errorf("failed to index: %v", err) } diff --git a/internal/indexer/implementation.go b/internal/indexer/implementation.go index d245893d..902bfd13 100644 --- a/internal/indexer/implementation.go +++ b/internal/indexer/implementation.go @@ -1,8 +1,8 @@ package indexer import ( - "go/ast" "go/types" + "runtime" "strings" "github.com/sourcegraph/lsif-go/internal/output" @@ -11,16 +11,23 @@ import ( ) type implDef struct { - defInfo *DefinitionInfo - ident *ast.Ident - methods []*types.Selection - methodsByName map[string]*types.Selection - pkg *packages.Package - typeName *types.TypeName + defInfo *DefinitionInfo + identIsExported bool + methods []string + methodsByName map[string]methodInfo + monikerPackage string + monikerIdentifier string + typeNameIsExported bool + typeNameIsAlias bool +} + +type methodInfo struct { + definition *DefinitionInfo + monikerIdentifier string } func (def implDef) Exported() bool { - return def.typeName.Exported() || def.ident.IsExported() + return def.typeNameIsExported || def.identIsExported } type implEdge struct { @@ -79,7 +86,7 @@ func (rel implRelation) interfaceIxToNodeIx(idx int) int { return rel.ifaceOffset + idx } -func (rel *implRelation) linkInterfaceToReceivers(idx int, interfaceMethods []*types.Selection, methodToReceivers map[string]*intsets.Sparse) { +func (rel *implRelation) linkInterfaceToReceivers(idx int, interfaceMethods []string, methodToReceivers map[string]*intsets.Sparse) { // Empty interface - skip it. if len(interfaceMethods) == 0 { return @@ -100,7 +107,7 @@ func (rel *implRelation) linkInterfaceToReceivers(idx int, interfaceMethods []*t // If it doesn't match on the first method, then we can immediately quit. // Concrete types must _always_ implement all the methods - if initialReceivers, ok := methodToReceivers[canonicalize(interfaceMethods[0])]; !ok { + if initialReceivers, ok := methodToReceivers[interfaceMethods[0]]; !ok { return } else { candidateTypes.Copy(initialReceivers) @@ -109,7 +116,7 @@ func (rel *implRelation) linkInterfaceToReceivers(idx int, interfaceMethods []*t // Loop over the rest of the methods and find all the types that intersect // every method of the interface. for _, method := range interfaceMethods[1:] { - receivers, ok := methodToReceivers[canonicalize(method)] + receivers, ok := methodToReceivers[method] if !ok { return } @@ -130,7 +137,9 @@ func (rel *implRelation) linkInterfaceToReceivers(idx int, interfaceMethods []*t // // NOTE: if indexImplementations becomes multi-threaded then we would need to update // Indexer.ensureImplementationMoniker to ensure that it uses appropriate locking. -func (i *Indexer) indexImplementations() { +func (i *Indexer) indexImplementations() error { + var implErr error + output.WithProgress("Indexing implementations", func() { // When considering the connections we want to draw between the following four categories: // - LocalInterfaces: Interfaces created in the currently project @@ -157,7 +166,11 @@ func (i *Indexer) indexImplementations() { // ========================= // Local Implementations - localInterfaces, localConcreteTypes := i.extractInterfacesAndConcreteTypes(i.packages) + localInterfaces, localConcreteTypes, err := i.extractInterfacesAndConcreteTypes([]string{"./..."}) + if err != nil { + implErr = err + return + } // LocalConcreteTypes -> LocalInterfaces localRelation := buildImplementationRelation(localConcreteTypes, localInterfaces) @@ -169,7 +182,11 @@ func (i *Indexer) indexImplementations() { // ========================= // Remote Implementations - remoteInterfaces, remoteConcreteTypes := i.extractInterfacesAndConcreteTypes(i.depPackages) + remoteInterfaces, remoteConcreteTypes, err := i.extractInterfacesAndConcreteTypes(i.projectDependencies) + if err != nil { + implErr = err + return + } // LocalConcreteTypes -> RemoteInterfaces (exported only) localTypesToRemoteInterfaces := buildImplementationRelation(localConcreteTypes, filterToExported(remoteInterfaces)) @@ -180,6 +197,8 @@ func (i *Indexer) indexImplementations() { localInterfacesToRemoteTypes.forEachImplementation(i.emitRemoteImplementation) }, i.outputOptions) + + return implErr } // emitLocalImplementation correlates implementations for both structs/interfaces (refered to as typeDefs) and methods. @@ -209,18 +228,17 @@ func (i *Indexer) emitLocalImplementation(from implDef, tos []implDef) { fromMethodDef := i.forEachMethodImplementation(tos, fromName, fromMethod, func(to implDef, _ *DefinitionInfo) { toMethod := to.methodsByName[fromName] - toMethodDef := i.getDefinitionInfo(toMethod.Obj(), nil) // This method is from an embedded type defined in some dependency. - if toMethodDef == nil { + if toMethod.definition == nil { return } - toDocument := toMethodDef.DocumentID + toDocument := toMethod.definition.DocumentID if _, ok := methodDocToInvs[toDocument]; !ok { methodDocToInvs[toDocument] = []uint64{} } - methodDocToInvs[toDocument] = append(methodDocToInvs[toDocument], toMethodDef.RangeID) + methodDocToInvs[toDocument] = append(methodDocToInvs[toDocument], toMethod.definition.RangeID) }) if fromMethodDef == nil { @@ -248,13 +266,13 @@ func (i *Indexer) emitRemoteImplementation(from implDef, tos []implDef) { if from.defInfo == nil { continue } - i.emitImplementationMoniker(from.defInfo.ResultSetID, to.pkg, to.typeName) + i.emitImplementationMoniker(from.defInfo.ResultSetID, to.monikerPackage, to.monikerIdentifier) } for fromName, fromMethod := range from.methodsByName { i.forEachMethodImplementation(tos, fromName, fromMethod, func(to implDef, fromDef *DefinitionInfo) { toMethod := to.methodsByName[fromName] - i.emitImplementationMoniker(fromDef.ResultSetID, to.pkg, toMethod.Obj()) + i.emitImplementationMoniker(fromDef.ResultSetID, to.monikerPackage, toMethod.monikerIdentifier) }) } } @@ -267,13 +285,11 @@ func (i *Indexer) emitRemoteImplementation(from implDef, tos []implDef) { func (i *Indexer) forEachMethodImplementation( tos []implDef, fromName string, - fromMethod *types.Selection, + fromMethod methodInfo, callback func(to implDef, fromDef *DefinitionInfo), ) *DefinitionInfo { - fromMethodDef := i.getDefinitionInfo(fromMethod.Obj(), nil) - // This method is from an embedded type defined in some dependency. - if fromMethodDef == nil { + if fromMethod.definition == nil { return nil } @@ -282,27 +298,27 @@ func (i *Indexer) forEachMethodImplementation( // methods to be considered an implementation. for _, to := range tos { if _, ok := to.methodsByName[fromName]; !ok { - return fromMethodDef + return fromMethod.definition } } for _, to := range tos { // Skip aliases because their methods are redundant with // the underlying concrete type's methods. - if to.typeName.IsAlias() { + if to.typeNameIsAlias { continue } - callback(to, fromMethodDef) + callback(to, fromMethod.definition) } - return fromMethodDef + return fromMethod.definition } // extractInterfacesAndConcreteTypes constructs a list of interfaces and // concrete types from the list of given packages. -func (i *Indexer) extractInterfacesAndConcreteTypes(pkgs []*packages.Package) (interfaces []implDef, concreteTypes []implDef) { - for _, pkg := range pkgs { +func (i *Indexer) extractInterfacesAndConcreteTypes(pkgNames []string) (interfaces []implDef, concreteTypes []implDef, err error) { + visit := func(pkg *packages.Package) { for ident, obj := range pkg.TypesInfo.Defs { if obj == nil { continue @@ -321,24 +337,36 @@ func (i *Indexer) extractInterfacesAndConcreteTypes(pkgs []*packages.Package) (i methods := listMethods(obj.Type().(*types.Named)) + canonicalizedMethods := []string{} + for _, m := range methods { + canonicalizedMethods = append(canonicalizedMethods, canonicalize(m)) + } + // ignore interfaces that are empty. they are too // plentiful and don't provide useful intelligence. if len(methods) == 0 { continue } - methodsByName := map[string]*types.Selection{} + methodsByName := map[string]methodInfo{} for _, m := range methods { - methodsByName[m.Obj().Name()] = m + methodsByName[m.Obj().Name()] = methodInfo{ + definition: i.getDefinitionInfo(m.Obj(), nil), + monikerIdentifier: joinMonikerParts(makeMonikerPackage(m.Obj()), makeMonikerIdentifier(i.packageDataCache, pkg, m.Obj())), + } } + monikerPackage := makeMonikerPackage(obj) + d := implDef{ - pkg: pkg, - typeName: typeName, - ident: ident, - defInfo: i.getDefinitionInfo(typeName, ident), - methods: methods, - methodsByName: methodsByName, + monikerPackage: monikerPackage, + monikerIdentifier: joinMonikerParts(monikerPackage, makeMonikerIdentifier(i.packageDataCache, pkg, obj)), + typeNameIsExported: typeName.Exported(), + typeNameIsAlias: typeName.IsAlias(), + identIsExported: ident.IsExported(), + defInfo: i.getDefinitionInfo(typeName, ident), + methods: canonicalizedMethods, + methodsByName: methodsByName, } if types.IsInterface(obj.Type()) { interfaces = append(interfaces, d) @@ -348,7 +376,36 @@ func (i *Indexer) extractInterfacesAndConcreteTypes(pkgs []*packages.Package) (i } } - return interfaces, concreteTypes + batch := func(pkgBatch []string) error { + pkgs, err := i.loadPackage(true, pkgBatch...) + if err != nil { + return err + } + + for _, pkg := range pkgs { + visit(pkg) + } + return nil + } + + pkgBatch := []string{} + for ix, pkgName := range pkgNames { + pkgBatch = append(pkgBatch, pkgName) + + if i.depBatchSize != 0 && ix%i.depBatchSize == 0 { + err := batch(pkgBatch) + runtime.GC() // Prevent a garbage pile + if err != nil { + return nil, nil, err + } + pkgBatch = pkgBatch[:0] + } + } + if err := batch(pkgBatch); err != nil { + return nil, nil, err + } + + return interfaces, concreteTypes, nil } // buildImplementationRelation builds a map from concrete types to all the interfaces that they implement. @@ -363,11 +420,10 @@ func buildImplementationRelation(concreteTypes, interfaces []implDef) implRelati methodToReceivers := map[string]*intsets.Sparse{} for idx, t := range concreteTypes { for _, method := range t.methods { - key := canonicalize(method) - if _, ok := methodToReceivers[key]; !ok { - methodToReceivers[key] = &intsets.Sparse{} + if _, ok := methodToReceivers[method]; !ok { + methodToReceivers[method] = &intsets.Sparse{} } - methodToReceivers[key].Insert(idx) + methodToReceivers[method].Insert(idx) } } diff --git a/internal/indexer/indexer.go b/internal/indexer/indexer.go index d98352e8..537752a4 100644 --- a/internal/indexer/indexer.go +++ b/internal/indexer/indexer.go @@ -60,7 +60,6 @@ type Indexer struct { packageInformationIDs map[string]uint64 // name -> packageInformationID packageDataCache *PackageDataCache // hover text and moniker path cache packages []*packages.Package // index target packages - depPackages []*packages.Package // dependency packages projectID uint64 // project vertex identifier packagesByFile map[string][]*packages.Package emittedDocumentationResults map[ObjectLike]uint64 // type object -> documentationResult vertex ID @@ -79,7 +78,7 @@ type Indexer struct { importMonikerChannel chan importMonikerReference - skipDeps bool + depBatchSize int } func New( @@ -94,7 +93,7 @@ func New( jsonWriter writer.JSONWriter, packageDataCache *PackageDataCache, outputOptions output.Options, - skipDeps bool, + depBatchSize int, ) *Indexer { return &Indexer{ repositoryRoot: repositoryRoot, @@ -124,7 +123,7 @@ func New( packageDataCache: packageDataCache, stripedMutex: newStripedMutex(), importMonikerChannel: make(chan importMonikerReference, 512), - skipDeps: skipDeps, + depBatchSize: depBatchSize, } } @@ -153,7 +152,10 @@ func (i *Indexer) Index() error { // Implementations needs all references to be complete. i.stopImportMonikerReferenceTracker(wg) - i.indexImplementations() + err := i.indexImplementations() + if err != nil { + return errors.Wrap(err, "indexing implementations") + } // Link sets of items to corresponding ranges and results. i.linkReferenceResultsToRanges() @@ -213,9 +215,6 @@ var loadMode = packages.NeedDeps | packages.NeedFiles | packages.NeedImports | p // cachedPackages makes sure that we only load packages once per execution var cachedPackages map[string][]*packages.Package = map[string][]*packages.Package{} -// cachedDepPackages makes sure that we only load dependency packages once per execution -var cachedDepPackages map[string][]*packages.Package = map[string][]*packages.Package{} - // packages populates the packages field containing an AST for each package within the configured // project root. // @@ -229,53 +228,10 @@ func (i *Indexer) loadPackages(deduplicate bool) error { defer wg.Done() defer close(errs) - config := &packages.Config{ - Mode: loadMode, - Dir: i.projectRoot, - Tests: true, - Logf: i.packagesLoadLogger, - } - - load := func(cache map[string][]*packages.Package, patterns ...string) ([]*packages.Package, bool) { - // Make sure we only load packages once per execution. - pkgs, ok := cache[i.projectRoot] - if !ok { - var err error - pkgs, err = packages.Load(config, patterns...) - if err != nil { - errs <- errors.Wrap(err, "packages.Load") - return nil, true - } - - cache[i.projectRoot] = pkgs - } - - if !deduplicate { - return pkgs, false - } - - keep := make([]*packages.Package, 0, len(pkgs)) - for _, pkg := range pkgs { - if i.shouldVisitPackage(pkg, pkgs) { - keep = append(keep, pkg) - } - } - return keep, false - } - - var hasError bool - - i.packages, hasError = load(cachedPackages, "./...") - if hasError { - return - } - - i.depPackages = []*packages.Package{} - if !i.skipDeps { - i.depPackages, hasError = load(cachedDepPackages, i.projectDependencies...) - if hasError { - return - } + var err error + i.packages, err = i.loadPackage(deduplicate, "./...") + if err != nil { + errs <- err } i.packagesByFile = map[string][]*packages.Package{} @@ -291,6 +247,37 @@ func (i *Indexer) loadPackages(deduplicate bool) error { return <-errs } +func (i *Indexer) loadPackage(deduplicate bool, patterns ...string) ([]*packages.Package, error) { + if len(patterns) == 1 && patterns[0] == "./..." && i.packages != nil { + return i.packages, nil + } + + config := &packages.Config{ + Mode: loadMode, + Dir: i.projectRoot, + Tests: true, + Logf: i.packagesLoadLogger, + } + + // Make sure we only load packages once per execution. + pkgs, err := packages.Load(config, patterns...) + if err != nil { + return nil, errors.Wrap(err, "packages.Load") + } + + if !deduplicate { + return pkgs, nil + } + + keep := make([]*packages.Package, 0, len(pkgs)) + for _, pkg := range pkgs { + if i.shouldVisitPackage(pkg, pkgs) { + keep = append(keep, pkg) + } + } + return keep, nil +} + // shouldVisitPackage tells if the package p should be visited. // // According to the `Tests` field in https://pkg.go.dev/golang.org/x/tools/go/packages#Config diff --git a/internal/indexer/indexer_test.go b/internal/indexer/indexer_test.go index 81ccb71b..46b8b888 100644 --- a/internal/indexer/indexer_test.go +++ b/internal/indexer/indexer_test.go @@ -46,7 +46,7 @@ func TestIndexer(t *testing.T) { w, NewPackageDataCache(), output.Options{}, - false, + 0, ) if err := indexer.Index(); err != nil { @@ -651,7 +651,7 @@ func TestIndexer_documentation(t *testing.T) { writer.NewJSONWriter(&buf), NewPackageDataCache(), output.Options{}, - false, + 0, ) if err := indexer.Index(); err != nil { t.Fatalf("unexpected error indexing testdata: %s", err.Error()) @@ -691,7 +691,7 @@ func TestIndexer_shouldVisitPackage(t *testing.T) { w, NewPackageDataCache(), output.Options{}, - false, + 0, ) if err := indexer.loadPackages(false); err != nil { diff --git a/internal/indexer/moniker.go b/internal/indexer/moniker.go index b8fd782f..ed20c07c 100644 --- a/internal/indexer/moniker.go +++ b/internal/indexer/moniker.go @@ -79,10 +79,7 @@ func (i *Indexer) emitImportMoniker(rangeID uint64, p *packages.Package, obj Obj // identifier (either a range or a result set identifier). This will also emit links between // the moniker vertex and the package information vertex representing the dependency containing // the identifier. -func (i *Indexer) emitImplementationMoniker(resultSet uint64, p *packages.Package, obj ObjectLike) bool { - pkg := makeMonikerPackage(obj) - monikerIdentifier := joinMonikerParts(pkg, makeMonikerIdentifier(i.packageDataCache, p, obj)) - +func (i *Indexer) emitImplementationMoniker(resultSet uint64, pkg string, monikerIdentifier string) bool { for _, moduleName := range packagePrefixes(pkg) { if module, ok := i.dependencies[moduleName]; ok { // Lazily emit package information vertex