Skip to content

Commit

Permalink
feat: limit number of results for prefix searches
Browse files Browse the repository at this point in the history
The maximum number of records that is returned when performing a prefix search
is now configurable (warcserver API).

If an exact match is not found when searching the index, pywb retries the query
using prefix matching with no limit on the number of results. This often leads
to timeouts, as the number of results can be large if the index is large.
  • Loading branch information
maeb committed May 16, 2024
1 parent 1d6ac16 commit 696146c
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 0 deletions.
6 changes: 6 additions & 0 deletions cmd/serve/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ func NewCommand() *cobra.Command {
cmd.Flags().String("path-prefix", "", "path prefix for all server endpoints")
cmd.Flags().Bool("log-requests", false, "log incoming http requests")

// warcserver API options
cmd.Flags().Int("warcserver-prefix-max-records", 1000, "limit number of responses for prefix searches (warcserver)")

// index options
cmd.Flags().StringP("index-source", "s", "file", `index source: "file" or "kafka"`)
cmd.Flags().StringP("index-format", "o", "badger", `index format: "badger", "tikv"`)
Expand Down Expand Up @@ -240,6 +243,9 @@ func serveCmd(_ *cobra.Command, _ []string) error {
FileAPI: fileApi,
IdAPI: idApi,
WarcLoader: l,
Config: &warcserver.Config{
PrefixSearchLimit: viper.GetInt("warcserver-prefix-max-records"),
},
}, handler, mw, pathPrefix+"/warcserver")

// register core API
Expand Down
4 changes: 4 additions & 0 deletions server/api/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@ func (c *SearchRequest) Fields() []string {
return c.fields
}

// SetLimit replaces the limit stored on the search request with limit.
//
// The value is stored as given; no validation or clamping is applied here.
func (c *SearchRequest) SetLimit(limit int) {
	c.limit = limit
}

var schemeRegExp = regexp.MustCompile(`^[a-z][a-z0-9+\-.]+(:.*)`)

func Parse(values url.Values) (req *SearchRequest, err error) {
Expand Down
11 changes: 11 additions & 0 deletions server/warcserver/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,16 @@ import (
"github.com/rs/zerolog/log"
)

// Config holds the settings of the warcserver Handler.
type Config struct {
	// PrefixSearchLimit is the limit the index handler sets on a prefix
	// search whose request reports a limit of 0.
	PrefixSearchLimit int
}

// Handler groups the index APIs, the WARC loader and the configuration that
// the warcserver HTTP handler methods operate on.
//
// Config must be non-nil: the index handler reads Config.PrefixSearchLimit
// without a nil check when a prefix search arrives with a limit of 0.
type Handler struct {
	CdxAPI     index.CdxAPI
	FileAPI    index.FileAPI
	IdAPI      index.IdAPI
	WarcLoader loader.WarcLoader
	Config     *Config
}

func (h Handler) index(w http.ResponseWriter, r *http.Request) {
Expand All @@ -34,6 +39,12 @@ func (h Handler) index(w http.ResponseWriter, r *http.Request) {
return
}

// limit the number of results when prefix searching
if coreAPI.MatchType() == index.MatchTypePrefix &&
coreAPI.Limit() == 0 {
coreAPI.SetLimit(h.Config.PrefixSearchLimit)
}

start := time.Now()
count := 0
defer func() {
Expand Down

0 comments on commit 696146c

Please sign in to comment.