Skip to content

Commit

Permalink
Add re2 multi-mode matching (#171)
Browse files Browse the repository at this point in the history
Co-authored-by: 江 杨 <[email protected]>
Co-authored-by: Anuraag Agrawal <[email protected]>
  • Loading branch information
3 people authored Dec 25, 2024
1 parent c3ee9c2 commit d99d8a1
Show file tree
Hide file tree
Showing 6 changed files with 529 additions and 4 deletions.
8 changes: 8 additions & 0 deletions experimental/experimental.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,11 @@ func MustCompileLatin1(str string) *re2.Regexp {
}
return regexp
}

// Set is a compiled collection of regular expressions that can be searched for simultaneously.
//
// It is an alias for internal.Set, so values returned by CompileSet are the
// internal type itself and expose that type's methods directly.
type Set = internal.Set

// CompileSet compiles the given regular expressions into a single Set in
// preparation for matching. The expressions are compiled with the zero value
// of internal.CompileOptions. Any error reported by internal.CompileSet is
// returned unchanged.
func CompileSet(exprs []string) (*Set, error) {
	var opts internal.CompileOptions // zero-value options
	return internal.CompileSet(exprs, opts)
}
206 changes: 206 additions & 0 deletions experimental/experimental_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@ package experimental

import (
"fmt"
"reflect"
"sort"
"strings"
"testing"

"github.com/wasilibs/go-re2"
)

func TestCompileLatin1(t *testing.T) {
Expand Down Expand Up @@ -55,3 +60,204 @@ func TestCompileLatin1(t *testing.T) {
})
}
}

// goodRe lists expressions that are expected to compile together as one set:
// TestGoodSetCompile passes the whole slice to CompileSet and requires no
// error. The benchmarks in this file reuse it as their pattern set.
var goodRe = []string{
``,
`.`,
`^.$`,
`a`,
`a*`,
`a+`,
`a?`,
`a|b`,
`a*|b*`,
`(a*|b)(c*|d)`,
`[a-z]`,
`[a-abc-c\-\]\[]`,
`[a-z]+`,
`[abc]`,
`[^1234]`,
`[^\n]`,
`\!\\`,
}

// stringError pairs a regular expression with the error text its compilation
// is expected to produce.
type stringError struct {
// re is the expression handed to CompileSet.
re string
// err is the text the resulting error message must contain.
err string
}

// badSet lists invalid expressions together with the error text expected
// from compiling each one. TestBadCompileSet compiles every entry as a
// single-expression set and checks that the error contains the given text.
var badSet = []stringError{
{`*`, "error parsing regexp: no argument for repetition operator: *"},
{`+`, "error parsing regexp: no argument for repetition operator: +"},
{`?`, "error parsing regexp: no argument for repetition operator: ?"},
{`(abc`, "error parsing regexp: missing ): (abc"},
{`abc)`, "error parsing regexp: unexpected ): abc)"},
{`x[a-z`, "error parsing regexp: missing ]: [a-z"},
{`[z-a]`, "error parsing regexp: invalid character class range: z-a"},
{`abc\`, "error parsing regexp: trailing \\"},
{`a**`, "error parsing regexp: bad repetition operator: **"},
{`a*+`, "error parsing regexp: bad repetition operator: *+"},
{`\x`, "error parsing regexp: invalid escape sequence: \\x"},
// A very long invalid pattern (4 bytes repeated 27000 times); the expected
// message embeds the whole pattern.
{strings.Repeat(`)\pL`, 27000), "error parsing regexp: unexpected ): " + strings.Repeat(`)\pL`, 27000)},
}

// compileSetTest compiles exprs with CompileSet and checks the outcome
// against wantErr.
//
// An empty wantErr means compilation must succeed. A non-empty wantErr means
// compilation must fail with an error whose message contains wantErr.
// Mismatches are reported with t.Error, so the calling test keeps running.
//
// The *Set returned by CompileSet is passed back as-is; callers in this file
// nil-check it before use.
func compileSetTest(t *testing.T, exprs []string, wantErr string) *Set {
	// Attribute failures to the calling test rather than to this helper.
	t.Helper()

	set, err := CompileSet(exprs)
	switch {
	case wantErr == "" && err != nil:
		t.Error("compiling `", exprs, "`; unexpected error: ", err.Error())
	case wantErr != "" && err == nil:
		t.Error("compiling `", exprs, "`; missing error")
	case wantErr != "" && !strings.Contains(err.Error(), wantErr):
		// err is non-nil here: the previous case handled the nil error.
		t.Error("compiling `", exprs, "`; wrong error: ", err.Error(), "; want ", wantErr)
	}
	return set
}

// TestGoodSetCompile checks that all expressions in goodRe compile together
// as one set without error.
func TestGoodSetCompile(t *testing.T) {
	const noError = "" // an empty expectation means compilation must succeed
	compileSetTest(t, goodRe, noError)
}

// TestBadCompileSet compiles each invalid expression in badSet as a
// single-expression set and checks that the expected error text is reported.
func TestBadCompileSet(t *testing.T) {
	for _, tc := range badSet {
		compileSetTest(t, []string{tc.re}, tc.err)
	}
}

// SetTest describes one set-matching scenario used by TestSetFindAll and
// TestSetFindAllString.
type SetTest struct {
// exprs are the expressions compiled into the set.
exprs []string
// matches is the input text searched by the set.
matches string
// matched holds the expected sorted expression ids. Indices 0, 1 and 2 are
// the expectations for match limits 0, 1 and 2; index 3 is used for the
// limits 7 and 20.
matched [4][]int
}

// setTests is the table shared by TestSetFindAll and TestSetFindAllString.
// Each entry gives the expressions to compile, the text to search, and the
// expected ids per match limit (see SetTest.matched).
var setTests = []SetTest{
{
exprs: []string{`(d)(e){0}(f)`, `[a-c]+`, `abc`, `\d+`},
matches: "x",
matched: [4][]int{
nil, nil, nil, nil,
},
},
{
exprs: []string{`(d)(e){0}(f)`, `[a-c]+`, `abc`, `\d+`},
matches: "123",
matched: [4][]int{
nil, {3}, {3}, {3},
},
},
{
exprs: []string{`(d)(e){0}(f)`, `[a-c]+`, `abc`, `\d+`},
matches: "df123abc",
matched: [4][]int{
nil, {0}, {0, 3}, {0, 1, 2, 3},
},
},
{
exprs: []string{`(d)(e){0}(f)`, `[a-c]+`, `abc`, `\d+`, `d{4}-\d{2}-\d{2}$`, `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`, `1[3-9]\d{9}`, `\.[a-zA-Z0-9]+$`, `<!--[\s\S]*?-->`},
// NOTE(review): the expected ids include 5 and 6, which need an e-mail
// address and an 11-digit number in the input; the "[email protected]" text
// below contains neither and looks like a capture artifact. Confirm the
// literal against the repository.
matches: "abcdef123</html><!-- test -->[email protected]",
matched: [4][]int{
nil, {1}, {1, 2}, {1, 2, 3, 5, 6, 7, 8},
},
},
{
exprs: []string{`(d)(e){0}(f)`, `[a-c]+`, `abc`, `\d+`, `d{4}-\d{2}-\d{2}$`, `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`, `1[3-9]\d{9}`, `\.[a-zA-Z0-9]+$`, `<!--[\s\S]*?-->`},
// NOTE(review): the expected ids (0, 3, 5, 6, ...) cannot come from the
// text shown here, which has no "df" and no digits; presumably the same
// capture artifact as above. Confirm the literal against the repository.
matches: "[email protected]",
matched: [4][]int{
nil, {0}, {0, 3}, {0, 1, 3, 5, 6, 7},
},
},
}

// setFindAllTest runs set.FindAll on matchStr (as bytes) with the match limit
// matchNum, sorts the returned ids, and reports a test error when they differ
// from matchedIds. The comparison uses reflect.DeepEqual, so a nil slice and
// an empty non-nil slice are treated as different.
func setFindAllTest(t *testing.T, set *Set, matchStr string, matchNum int, matchedIds []int) {
	// Attribute failures to the calling test rather than to this helper.
	t.Helper()

	got := set.FindAll([]byte(matchStr), matchNum)
	sort.Ints(got) // expected ids are listed in ascending order
	if !reflect.DeepEqual(got, matchedIds) {
		t.Errorf("Match failure on %s: %v should be %v", matchStr, got, matchedIds)
	}
}

// setFindAllStringTest runs set.FindAllString on matchStr with the match
// limit matchNum, sorts the returned ids, and reports a test error when they
// differ from matchedIds. The comparison uses reflect.DeepEqual, so a nil
// slice and an empty non-nil slice are treated as different.
func setFindAllStringTest(t *testing.T, set *Set, matchStr string, matchNum int, matchedIds []int) {
	// Attribute failures to the calling test rather than to this helper.
	t.Helper()

	got := set.FindAllString(matchStr, matchNum)
	sort.Ints(got) // expected ids are listed in ascending order
	if !reflect.DeepEqual(got, matchedIds) {
		t.Errorf("Match failure on %s: %v should be %v", matchStr, got, matchedIds)
	}
}

// TestSetFindAll runs every scenario in setTests through Set.FindAll with the
// match limits 0, 1, 2, 7 and 20. The test stops at the first scenario whose
// set comes back nil from compileSetTest.
func TestSetFindAll(t *testing.T) {
	// Each step pairs a match limit with the index of its expectation in
	// SetTest.matched; the limits 7 and 20 share the last expectation.
	steps := [...]struct{ limit, want int }{
		{0, 0},
		{1, 1},
		{2, 2},
		{7, 3},
		{20, 3},
	}
	for _, tc := range setTests {
		set := compileSetTest(t, tc.exprs, "")
		if set == nil {
			return
		}
		for _, s := range steps {
			setFindAllTest(t, set, tc.matches, s.limit, tc.matched[s.want])
		}
	}
}

// TestSetFindAllString runs every scenario in setTests through
// Set.FindAllString with the match limits 0, 1, 2, 7 and 20. The test stops
// at the first scenario whose set comes back nil from compileSetTest.
func TestSetFindAllString(t *testing.T) {
	// Each step pairs a match limit with the index of its expectation in
	// SetTest.matched; the limits 7 and 20 share the last expectation.
	steps := [...]struct{ limit, want int }{
		{0, 0},
		{1, 1},
		{2, 2},
		{7, 3},
		{20, 3},
	}
	for _, tc := range setTests {
		set := compileSetTest(t, tc.exprs, "")
		if set == nil {
			return
		}
		for _, s := range steps {
			setFindAllStringTest(t, set, tc.matches, s.limit, tc.matched[s.want])
		}
	}
}

// BenchmarkSet measures Set.FindAll over the goodRe pattern set with a match
// limit of 20.
func BenchmarkSet(b *testing.B) {
	b.Run("findAll", func(b *testing.B) {
		set, err := CompileSet(goodRe)
		if err != nil {
			b.Fatal(err)
		}
		// Convert the input once so the loop measures matching only, not a
		// per-iteration string-to-bytes conversion.
		input := []byte("abcdef123</html><!-- test -->[email protected]")
		// Exclude set compilation and input setup from the measurement.
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			set.FindAll(input, 20)
		}
	})
}

// BenchmarkSetMatchWithFindSubmatch compares two ways of finding which of the
// goodRe expressions match an input: a compiled Set ("set match") and a
// single regexp that joins all expressions as alternated capture groups
// ("findSubmatch").
func BenchmarkSetMatchWithFindSubmatch(b *testing.B) {
	b.Run("set match", func(b *testing.B) {
		set, err := CompileSet(goodRe)
		if err != nil {
			b.Fatal(err)
		}
		// Convert the input once so the loop measures matching only.
		input := []byte("abcd123")
		// Exclude set compilation and input setup from the measurement.
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			set.FindAll(input, 20)
		}
	})
	b.Run("findSubmatch", func(b *testing.B) {
		// Wrap each expression in its own capture group and alternate them.
		re, err := re2.Compile("(" + strings.Join(goodRe, ")|(") + ")")
		if err != nil {
			b.Fatal(err)
		}
		// Exclude regexp compilation from the measurement.
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			re.FindAllStringSubmatchIndex("abcd123", 20)
		}
	})
}

// ExampleCompileSet compiles a two-expression set and prints the ids of the
// expressions that match each of several inputs.
func ExampleCompileSet() {
	exprs := []string{"abc", "\\d+"}
	set, err := CompileSet(exprs)
	if err != nil {
		panic(err)
	}
	// Allow as many matches as there are expressions.
	limit := len(exprs)
	for _, input := range []string{"abcd", "123", "abc123", "def"} {
		fmt.Println(set.FindAll([]byte(input), limit))
	}
	// Output:
	// [0]
	// [1]
	// [0 1]
	// []
}
30 changes: 29 additions & 1 deletion internal/cre2/cre2.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,20 @@ void cre2_opt_set_posix_syntax(void* opt, int flag);
void cre2_opt_set_case_sensitive(void* opt, int flag);
void cre2_opt_set_latin1_encoding(void* opt);
void cre2_opt_set_max_mem(void* opt, int64_t size);
void* cre2_set_new(void* opt, int anchor);
void* cre2_set_add(void* set, void* pattern, int pattern_len);
int cre2_set_compile(void* set);
int cre2_set_match(void* set, void* text, int text_len, void* match, int nmatch);
void cre2_set_delete(void* set);
void* malloc(size_t size);
void free(void* ptr);
*/
import "C"
import "unsafe"

import (
"unsafe"
)

func New(patternPtr unsafe.Pointer, patternLen int, opts unsafe.Pointer) unsafe.Pointer {
return C.cre2_new(patternPtr, C.int(patternLen), opts)
Expand Down Expand Up @@ -112,6 +120,26 @@ func OptSetMaxMem(opt unsafe.Pointer, size int) {
C.cre2_opt_set_max_mem(opt, C.int64_t(size))
}

// NewSet calls cre2_set_new with the given options pointer and anchor value
// and returns the pointer it yields. anchor is converted to a C int.
func NewSet(opt unsafe.Pointer, anchor int) unsafe.Pointer {
	cAnchor := C.int(anchor)
	return C.cre2_set_new(opt, cAnchor)
}

// SetAdd calls cre2_set_add for set with the pattern located at patternPtr
// and returns the pointer the C function yields. patternLen is converted to
// a C int.
func SetAdd(set unsafe.Pointer, patternPtr unsafe.Pointer, patternLen int) unsafe.Pointer {
	cLen := C.int(patternLen)
	return C.cre2_set_add(set, patternPtr, cLen)
}

// SetCompile calls cre2_set_compile on set and returns its result as a Go int.
func SetCompile(set unsafe.Pointer) int {
	rc := C.cre2_set_compile(set)
	return int(rc)
}

// SetMatch calls cre2_set_match on set with the text at textPtr and the
// match buffer at match, and returns the C result as a Go int. textLen and
// nMatch are each converted to a C int.
func SetMatch(set unsafe.Pointer, textPtr unsafe.Pointer, textLen int, match unsafe.Pointer, nMatch int) int {
	cTextLen := C.int(textLen)
	cMatchCap := C.int(nMatch)
	rc := C.cre2_set_match(set, textPtr, cTextLen, match, cMatchCap)
	return int(rc)
}

// SetDelete passes ptr to cre2_set_delete.
// NOTE(review): presumably ptr must be a value previously returned by NewSet
// and must not be used afterwards — confirm against the C shim.
func SetDelete(ptr unsafe.Pointer) {
C.cre2_set_delete(ptr)
}

// Malloc calls the C malloc function for size bytes and returns the resulting
// pointer. size is converted to size_t.
func Malloc(size int) unsafe.Pointer {
	n := C.size_t(size)
	return C.malloc(n)
}
Expand Down
50 changes: 50 additions & 0 deletions internal/re2_re2_cgo.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package internal

import (
"fmt"
"unsafe"

"github.com/wasilibs/go-re2/internal/cre2"
Expand Down Expand Up @@ -112,6 +113,10 @@ func (a *allocation) newCStringArray(n int) cStringArray {
return cStringArray{ptr: wasmPtr(ptr)}
}

// read returns a byte slice of length and capacity size that views the memory
// at ptr directly; nothing is copied. The receiver is not used. The view is
// taken through a [1 << 30]byte array type, so size must not exceed 1 << 30.
func (a *allocation) read(ptr wasmPtr, size int) []byte {
	view := (*[1 << 30]byte)(unsafe.Pointer(ptr))
	// The three-index slice caps capacity at size.
	return view[:size:size]
}

type cString struct {
ptr unsafe.Pointer
length int
Expand Down Expand Up @@ -164,3 +169,48 @@ func readMatches(alloc *allocation, cs cString, matchesPtr wasmPtr, n int, deliv
}
}
}

// newSet creates a new set through cre2 and returns its pointer. A temporary
// options object is built from opts and deleted before returning. The ABI
// argument is unused in this implementation.
func newSet(_ *libre2ABI, opts CompileOptions) wasmPtr {
	o := cre2.NewOpt()
	defer cre2.DeleteOpt(o)

	// Settings applied to every set, independent of opts.
	cre2.OptSetMaxMem(o, maxSize)
	cre2.OptSetLogErrors(o, false)

	// Settings applied only when the matching CompileOptions field is set.
	if opts.Longest {
		cre2.OptSetLongestMatch(o, true)
	}
	if opts.Posix {
		cre2.OptSetPosixSyntax(o, true)
	}
	if opts.CaseInsensitive {
		cre2.OptSetCaseSensitive(o, false)
	}
	if opts.Latin1 {
		cre2.OptSetLatin1Encoding(o)
	}

	// NOTE(review): anchor 0 presumably selects unanchored matching — confirm
	// against the RE2 anchor enum.
	setPtr := cre2.NewSet(o, 0)
	return wasmPtr(setPtr)
}

// setAdd adds the expression s to set and returns an error message, or the
// empty string on success.
//
// A nil result from cre2.SetAdd yields unknownCompileError. Any result other
// than the text "ok" is freed and returned with an "error parsing regexp: "
// prefix.
func setAdd(set *Set, s cString) string {
	res := cre2.SetAdd(unsafe.Pointer(set.ptr), s.ptr, s.length)
	if res == nil {
		return unknownCompileError
	}
	// NOTE(review): the "ok" result is not freed here; presumably it is not
	// heap-allocated by the C shim — confirm.
	if text := cre2.CopyCString(res); text != "ok" {
		// The text was copied into a Go string above, so the C buffer can
		// be released.
		cre2.Free(res)
		return fmt.Sprintf("error parsing regexp: %s", text)
	}
	return ""
}

// setCompile calls cre2.SetCompile on the set's pointer and returns the
// result converted to int32.
func setCompile(set *Set) int32 {
	rc := cre2.SetCompile(unsafe.Pointer(set.ptr))
	return int32(rc)
}

// setMatch runs cre2.SetMatch for set over the text held in cs, passing
// matchedPtr as the match buffer and nMatch as its size argument, and
// returns the value reported by cre2.SetMatch.
func setMatch(set *Set, cs cString, matchedPtr wasmPtr, nMatch int) int {
	return cre2.SetMatch(
		unsafe.Pointer(set.ptr),
		cs.ptr,
		cs.length,
		unsafe.Pointer(matchedPtr),
		nMatch,
	)
}

// deleteSet passes setPtr to cre2.SetDelete. The ABI argument is unused in
// this implementation.
func deleteSet(_ *libre2ABI, setPtr wasmPtr) {
	cre2.SetDelete(
		unsafe.Pointer(setPtr),
	)
}
Loading

0 comments on commit d99d8a1

Please sign in to comment.