Skip to content

Commit

Permalink
use stats to determine how much to extract
Browse files Browse the repository at this point in the history
Killer-app command sequence for the moment:

dupi index <stuff>
dupi extract | awk '{print $1}' | xargs dupi unblot

added -all flag to unblot (also output texts with fewer than two matching docs)
  • Loading branch information
scott-cotton committed Sep 17, 2021
1 parent 3f06150 commit 48d0eda
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 18 deletions.
8 changes: 6 additions & 2 deletions blot.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,12 @@ func (b *Blot) Len() int {
return len(b.Docs)
}

func (b *Blot) Next() *Doc {
func (b *Blot) Next(lim bool) *Doc {
n := len(b.Docs)
b.Docs = b.Docs[:n+1]
if lim {
b.Docs = b.Docs[:n+1]
} else {
b.Docs = append(b.Docs, Doc{})
}
return &b.Docs[n]
}
19 changes: 15 additions & 4 deletions cmd/dupi/extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ import (
"flag"
"fmt"
"io"
"log"
"math"
"os"

"github.com/go-air/dupi"
Expand All @@ -28,6 +30,7 @@ type extractCmd struct {
subCmd
index *dupi.Index
json *bool
sigma *float64
}

func newExtractCmd() *extractCmd {
Expand All @@ -37,6 +40,7 @@ func newExtractCmd() *extractCmd {
flags: flag.NewFlagSet("extract", flag.ExitOnError)}}

extract.json = extract.flags.Bool("json", false, "output json")
extract.sigma = extract.flags.Float64("sigma", 2.0, "explore blots within σ of average (higher=most probable dups, lower=more volume)")
return extract
}

Expand All @@ -52,10 +56,14 @@ func (x *extractCmd) Run(args []string) error {
return err
}
defer x.index.Close()
st, err := x.index.Stats()
if err != nil {
log.Fatal(err)
}
σ := *x.sigma
N := int(math.Round(st.BlotMean + σ*st.BlotSigma))
query := x.index.StartQuery(dupi.QueryMaxBlot)
shape := []dupi.Blot{
{Blot: 0, Docs: make([]dupi.Doc, 0, 32)},
{Blot: 0, Docs: make([]dupi.Doc, 0, 32)}}
shape := []dupi.Blot{{Blot: 0}}
for {
n, err := query.Next(shape)
if err == io.EOF {
Expand All @@ -67,6 +75,9 @@ func (x *extractCmd) Run(args []string) error {
if n == 0 {
return fmt.Errorf("Query.Next gave 0 and no error")
}
if len(shape[0].Docs) < N {
return nil
}
if *x.json {
shp2 := shape
j := 0
Expand Down Expand Up @@ -98,7 +109,7 @@ func (x *extractCmd) Run(args []string) error {
}
}
for i := range shape {
shape[i].Docs = shape[i].Docs[:0]
shape[i].Docs = nil
}
}
}
1 change: 1 addition & 0 deletions cmd/dupi/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ func newIndexCmd() *indexCmd {
index.add = index.flags.Bool("a", false, "add to a given existing index")
index.verbose = index.flags.Bool("v", false, "verbose")
index.nshat = index.flags.Int("s", 4, "num shatterers")
index.shards = index.flags.Int("n", 4, "num shards")
return index
}

Expand Down
7 changes: 5 additions & 2 deletions cmd/dupi/unblot.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@ import (

// unblotCmd is the "unblot" subcommand. It embeds subCmd and adds the
// -all flag.
type unblotCmd struct {
subCmd
// all points at the value of the -all flag ("output all matches").
// When it is false, Run skips texts that have fewer than two docs.
all *bool
}

func newUnblotCmd() *unblotCmd {
return &unblotCmd{
cmd := &unblotCmd{
subCmd: subCmd{name: "unblot", flags: flag.NewFlagSet("unblot", flag.ExitOnError)}}
cmd.all = cmd.flags.Bool("all", false, "output all matches")
return cmd
}

func (ub *unblotCmd) Usage() string {
Expand Down Expand Up @@ -54,7 +57,7 @@ func (ub *unblotCmd) Run(args []string) error {
m[dat] = append(m[dat], doc)
}
for k, ds := range m {
if len(ds) < 2 {
if !*ub.all && len(ds) < 2 {
continue
}
fmt.Printf("text:\n'''\n%s'''\n", k)
Expand Down
21 changes: 11 additions & 10 deletions query.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,7 @@ func (q *Query) Get(blot *Blot) error {
if err != nil {
return err
}
if !lim && len(blot.Docs) == cap(blot.Docs) {
blot.Docs = append(blot.Docs, Doc{})
blot.Docs = blot.Docs[:len(blot.Docs)-1]
}
if err = q.index.docid2Doc(docid, blot.Next()); err != nil {
if err = q.index.docid2Doc(docid, blot.Next(lim)); err != nil {
return err
}
}
Expand All @@ -97,13 +93,18 @@ func (q *Query) Next(dst []Blot) (n int, err error) {
}
continue
}
lim := dstBlot.Docs != nil
_, err = q.fillBlot(dstBlot, shardState, state.i)
if err != nil {
return
}
if len(dstBlot.Docs) <= 1 {
q.advance(shardState, state.i)
dstBlot.Docs = dstBlot.Docs[:0]
if lim {
dstBlot.Docs = dstBlot.Docs[:0]
} else {
dstBlot.Docs = nil
}
continue
}
n++
Expand All @@ -116,20 +117,20 @@ func (q *Query) fillBlot(dst *Blot, src *shard.ReadState, srcPos uint32) (int, e
docid uint32
err error
n int
lim bool
)
dst.Blot = uint32(src.Blot)*q.state.n + q.state.i
for dst.Len() < dst.Cap() {
lim = dst.Docs != nil
for !lim || dst.Len() < dst.Cap() {
docid, err = src.Next()
if err == io.EOF {
q.advance(src, srcPos)
return n, nil
} else if err != nil {
return 0, err
} else if docid == 0 {
continue
}
n++
q.index.docid2Doc(docid, dst.Next())
q.index.docid2Doc(docid, dst.Next(lim))
}
return n, err
}
Expand Down

0 comments on commit 48d0eda

Please sign in to comment.