diff --git a/README.md b/README.md index 6f0dfc7..b125dde 100644 --- a/README.md +++ b/README.md @@ -23,14 +23,17 @@ go get -u github.com/raviqqe/liche Link checker for Markdown and HTML Usage: - liche [-c ] [-d ] [-r] [-t ] [-x ] [-v] ... + liche [-c ] [-d ] [-r] [-t ] [-x ] [-p] [-h] [-l] [-v] ... Options: - -c, --concurrency Set max number of concurrent HTTP requests. [default: 512] + -c, --concurrency Set max number of concurrent HTTP requests. [default: 512] -d, --document-root Set document root directory for absolute paths. -r, --recursive Search Markdown and HTML files recursively -t, --timeout Set timeout for HTTP requests in seconds. Disabled by default. -x, --exclude Regex of links to exclude from checking. + -p, --exclude-private-hosts Exclude private domains and ip addresses. + -h, --exclude-localhost Exclude localhost addresses. + -l, --exclude-link-local Exclude link local addresses. -v, --verbose Be verbose. ``` diff --git a/arguments.go b/arguments.go index a97bb99..9907327 100644 --- a/arguments.go +++ b/arguments.go @@ -14,7 +14,7 @@ const defaultConcurrency = maxOpenFiles / 2 const usage = `Link checker for Markdown and HTML Usage: - liche [-c ] [-d ] [-r] [-t ] [-x ] [-v] ... + liche [-c ] [-d ] [-r] [-t ] [-x ] [-p] [-h] [-l] [-v] ... Options: -c, --concurrency Set max number of concurrent HTTP requests. [default: %v] @@ -22,16 +22,22 @@ Options: -r, --recursive Search Markdown and HTML files recursively -t, --timeout Set timeout for HTTP requests in seconds. Disabled by default. -x, --exclude Regex of links to exclude from checking. + -p, --exclude-private-hosts Exclude private domains and ip addresses. + -h, --exclude-localhost Exclude localhost addresses. + -l, --exclude-link-local Exclude link local addresses. 
-v, --verbose Be verbose.` type arguments struct { - filenames []string - documentRoot string - concurrency int - timeout time.Duration - excludedPattern *regexp.Regexp - recursive bool - verbose bool + filenames []string + documentRoot string + concurrency int + timeout time.Duration + excludedPattern *regexp.Regexp + excludePrivateHosts bool + excludeLocalhost bool + excludeLinkLocal bool + recursive bool + verbose bool } func getArguments(argv []string) (arguments, error) { @@ -77,6 +83,9 @@ func getArguments(argv []string) (arguments, error) { int(c), time.Duration(t) * time.Second, r, + args["--exclude-private-hosts"].(bool), + args["--exclude-localhost"].(bool), + args["--exclude-link-local"].(bool), args["--recursive"].(bool), args["--verbose"].(bool), }, nil diff --git a/arguments_test.go b/arguments_test.go index d56fcce..b0eee1d 100644 --- a/arguments_test.go +++ b/arguments_test.go @@ -15,55 +15,79 @@ func TestGetArguments(t *testing.T) { }{ { argv: []string{"file"}, - args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, false}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, false, false, false, false}, }, { argv: []string{"-c", "42", "file"}, - args: arguments{[]string{"file"}, "", 42, 0, nil, false, false}, + args: arguments{[]string{"file"}, "", 42, 0, nil, false, false, false, false, false}, }, { argv: []string{"--concurrency", "42", "file"}, - args: arguments{[]string{"file"}, "", 42, 0, nil, false, false}, + args: arguments{[]string{"file"}, "", 42, 0, nil, false, false, false, false, false}, }, { argv: []string{"-d", "directory", "file"}, - args: arguments{[]string{"file"}, "directory", defaultConcurrency, 0, nil, false, false}, + args: arguments{[]string{"file"}, "directory", defaultConcurrency, 0, nil, false, false, false, false, false}, }, { argv: []string{"--document-root", "directory", "file"}, - args: arguments{[]string{"file"}, "directory", defaultConcurrency, 0, nil, false, false}, + args: 
arguments{[]string{"file"}, "directory", defaultConcurrency, 0, nil, false, false, false, false, false}, }, { argv: []string{"-r", "file"}, - args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, true, false}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, false, false, true, false}, }, { argv: []string{"--recursive", "file"}, - args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, true, false}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, false, false, true, false}, }, { argv: []string{"-t", "42", "file"}, - args: arguments{[]string{"file"}, "", defaultConcurrency, 42 * time.Second, nil, false, false}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 42 * time.Second, nil, false, false, false, false, false}, }, { argv: []string{"--timeout", "42", "file"}, - args: arguments{[]string{"file"}, "", defaultConcurrency, 42 * time.Second, nil, false, false}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 42 * time.Second, nil, false, false, false, false, false}, }, { argv: []string{"-x", "^.*$", "file"}, - args: arguments{[]string{"file"}, "", defaultConcurrency, 0, regexp.MustCompile(`^.*$`), false, false}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, regexp.MustCompile(`^.*$`), false, false, false, false, false}, }, { argv: []string{"--exclude", "^.*$", "file"}, - args: arguments{[]string{"file"}, "", defaultConcurrency, 0, regexp.MustCompile(`^.*$`), false, false}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, regexp.MustCompile(`^.*$`), false, false, false, false, false}, + }, + { + argv: []string{"-p", "file"}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, true, false, false, false, false}, + }, + { + argv: []string{"--exclude-private-hosts", "file"}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, true, false, false, false, false}, + }, + { + argv: []string{"-h", "file"}, + args: 
arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, true, false, false, false}, + }, + { + argv: []string{"--exclude-localhost", "file"}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, true, false, false, false}, + }, + { + argv: []string{"-l", "file"}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, false, true, false, false}, + }, + { + argv: []string{"--exclude-link-local", "file"}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, false, true, false, false}, }, { argv: []string{"-v", "file"}, - args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, true}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, false, false, false, true}, }, { argv: []string{"--verbose", "file"}, - args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, true}, + args: arguments{[]string{"file"}, "", defaultConcurrency, 0, nil, false, false, false, false, true}, }, } { args, err := getArguments(c.argv) diff --git a/file_checker.go b/file_checker.go index b60db75..6f9971f 100644 --- a/file_checker.go +++ b/file_checker.go @@ -18,8 +18,8 @@ type fileChecker struct { semaphore semaphore } -func newFileChecker(timeout time.Duration, d string, r *regexp.Regexp, s semaphore) fileChecker { - return fileChecker{newURLChecker(timeout, d, r, s), s} +func newFileChecker(timeout time.Duration, d string, r *regexp.Regexp, excludePrivateHosts, excludeLocalhost, excludeLinkLocal bool, s semaphore) fileChecker { + return fileChecker{newURLChecker(timeout, d, r, excludePrivateHosts, excludeLocalhost, excludeLinkLocal, s), s} } func (c fileChecker) Check(f string) ([]urlResult, error) { diff --git a/file_checker_test.go b/file_checker_test.go index 06a70c5..8c59913 100644 --- a/file_checker_test.go +++ b/file_checker_test.go @@ -10,7 +10,7 @@ import ( ) func TestFileCheckerCheck(t *testing.T) { - c := newFileChecker(0, "", nil, newSemaphore(1024)) + c 
:= newFileChecker(0, "", nil, false, false, false, newSemaphore(1024)) for _, f := range []string{"README.md", "test/foo.md", "test/foo.html"} { rs, err := c.Check(f) @@ -48,7 +48,7 @@ func TestFileCheckerCheck(t *testing.T) { } func TestFileCheckerCheckMany(t *testing.T) { - c := newFileChecker(0, "", nil, newSemaphore(maxOpenFiles)) + c := newFileChecker(0, "", nil, false, false, false, newSemaphore(maxOpenFiles)) for _, fs := range [][]string{ {"README.md"}, @@ -77,7 +77,7 @@ func TestFileCheckerCheckMany(t *testing.T) { } func TestFileCheckerCheckManyWithInvalidFiles(t *testing.T) { - c := newFileChecker(0, "", nil, newSemaphore(maxOpenFiles)) + c := newFileChecker(0, "", nil, false, false, false, newSemaphore(maxOpenFiles)) for _, fs := range [][]string{ {"test/absolute_path.md"}, @@ -107,7 +107,7 @@ func TestFileCheckerCheckManyWithInvalidFiles(t *testing.T) { } func TestFileCheckerExtractURLs(t *testing.T) { - c := newFileChecker(0, "", nil, newSemaphore(42)) + c := newFileChecker(0, "", nil, false, false, false, newSemaphore(42)) for _, x := range []struct { html string diff --git a/go.sum b/go.sum index eaa27ff..03bf842 100644 --- a/go.sum +++ b/go.sum @@ -20,7 +20,6 @@ github.com/mattn/go-isatty v0.0.9 h1:d5US/mDsogSGW37IV293h//ZFaeajb69h+EHFsv2xGg github.com/mattn/go-isatty v0.0.9/go.mod h1:YNRxwqDuOph6SZLI9vUUz6OYw3QyUt7WiY2yME+cCiQ= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/russross/blackfriday v2.0.0+incompatible h1:cBXrhZNUf9C+La9/YpS+UHpUT8YD6Td9ZMSU9APFcsk= github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= @@ -44,6 +43,7 @@ golang.org/x/sys 
v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24 h1:R8bzl0244nw47n1xKs1MUMAaTNgjavKcN/aX2Ss3+Fo= golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/main.go b/main.go index 920f23f..32f2134 100644 --- a/main.go +++ b/main.go @@ -34,6 +34,9 @@ func main() { args.timeout, args.documentRoot, args.excludedPattern, + args.excludePrivateHosts, + args.excludeLocalhost, + args.excludeLinkLocal, newSemaphore(args.concurrency)) go c.CheckMany(m.Filenames(), rc) diff --git a/url_checker.go b/url_checker.go index f3895a0..2d70916 100644 --- a/url_checker.go +++ b/url_checker.go @@ -2,6 +2,7 @@ package main import ( "errors" + "net" "net/url" "os" "path" @@ -11,17 +12,21 @@ import ( "time" "github.com/valyala/fasthttp" + "golang.org/x/net/publicsuffix" ) type urlChecker struct { - timeout time.Duration - documentRoot string - excludedPattern *regexp.Regexp - semaphore semaphore + timeout time.Duration + documentRoot string + excludedPattern *regexp.Regexp + excludePrivateHosts bool + excludeLocalhost bool + excludeLinkLocal bool + semaphore semaphore } -func newURLChecker(t time.Duration, d string, r *regexp.Regexp, s semaphore) urlChecker { - return urlChecker{t, d, r, s} +func newURLChecker(t time.Duration, d string, r *regexp.Regexp, excludePrivateHosts, excludeLocalhost, excludeLinkLocal bool, s semaphore) urlChecker { + return urlChecker{t, d, r, excludePrivateHosts, excludeLocalhost, excludeLinkLocal, s} } 
func (c urlChecker) Check(u string, f string) error { @@ -30,6 +35,30 @@ func (c urlChecker) Check(u string, f string) error { return err } + if !local { + uu, _ := url.Parse(u) + host := uu.Hostname() + if ip := net.ParseIP(host); ip != nil { + if c.excludePrivateHosts && isPrivate(ip) { + return nil + } + if c.excludeLocalhost && ip.IsLoopback() { + return nil + } + if c.excludeLinkLocal && (ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast()) { + return nil + } + } else { + if host == "localhost" { + if c.excludeLocalhost { + return nil + } + } else if _, icann := publicsuffix.PublicSuffix(host); !icann && c.excludePrivateHosts { + return nil // private domain + } + } + } + if c.excludedPattern != nil && c.excludedPattern.MatchString(u) { return nil } @@ -88,3 +117,18 @@ func (c urlChecker) resolveURL(u string, f string) (string, bool, error) { return path.Join(c.documentRoot, uu.Path), true, nil } + +// isPrivate reports whether `ip' is a private address, according to +// RFC 1918 (IPv4 addresses) and RFC 4193 (IPv6 addresses). 
+// xref: https://go-review.googlesource.com/c/go/+/162998/ +// xref: https://github.com/golang/go/issues/29146 +func isPrivate(ip net.IP) bool { + if ip4 := ip.To4(); ip4 != nil { + // Local IPv4 addresses are defined in https://tools.ietf.org/html/rfc1918 + return ip4[0] == 10 || + (ip4[0] == 172 && ip4[1]&0xf0 == 16) || + (ip4[0] == 192 && ip4[1] == 168) + } + // Local IPv6 addresses are defined in https://tools.ietf.org/html/rfc4193 + return len(ip) == net.IPv6len && ip[0]&0xfe == 0xfc +} diff --git a/url_checker_test.go b/url_checker_test.go index 435976b..2283ede 100644 --- a/url_checker_test.go +++ b/url_checker_test.go @@ -9,7 +9,7 @@ import ( ) func TestURLCheckerCheck(t *testing.T) { - c := newURLChecker(0, "", nil, newSemaphore(1024)) + c := newURLChecker(0, "", nil, false, false, false, newSemaphore(1024)) for _, u := range []string{"https://google.com", "README.md"} { assert.Equal(t, nil, c.Check(u, "README.md")) @@ -21,7 +21,7 @@ func TestURLCheckerCheck(t *testing.T) { } func TestURLCheckerCheckWithExclude(t *testing.T) { - c := newURLChecker(0, "", regexp.MustCompile(`^http:\/\/localhost:[13]$`), newSemaphore(1024)) + c := newURLChecker(0, "", regexp.MustCompile(`^http:\/\/localhost:[13]$`), false, false, false, newSemaphore(1024)) for _, u := range []string{"http://localhost:1", "http://localhost:3", "README.md"} { assert.Equal(t, nil, c.Check(u, "README.md")) @@ -32,8 +32,42 @@ func TestURLCheckerCheckWithExclude(t *testing.T) { } } +func TestURLCheckerCheckWithExcludePrivateHosts(t *testing.T) { + c := newURLChecker(0, "", nil, true, false, false, newSemaphore(1024)) + + for _, u := range []string{ + "http://192.168.99.100", + "http://example.test", + "http://example.abcdxyz", + } { + assert.Equal(t, nil, c.Check(u, "README.md")) + } +} + +func TestURLCheckerCheckWithExcludeLocalhost(t *testing.T) { + c := newURLChecker(0, "", nil, false, true, false, newSemaphore(1024)) + + for _, u := range []string{ + "http://localhost:1", + 
"http://localhost:3", + "http://127.0.0.1:1", + } { + assert.Equal(t, nil, c.Check(u, "README.md")) + } +} + +func TestURLCheckerCheckWithExcludeLinkLocal(t *testing.T) { + c := newURLChecker(0, "", nil, false, false, true, newSemaphore(1024)) + + for _, u := range []string{ + "http://169.254.169.254:1", + } { + assert.Equal(t, nil, c.Check(u, "README.md")) + } +} + func TestURLCheckerCheckWithTimeout(t *testing.T) { - c := newURLChecker(30*time.Second, "", nil, newSemaphore(1024)) + c := newURLChecker(30*time.Second, "", nil, false, false, false, newSemaphore(1024)) for _, u := range []string{"https://google.com", "README.md"} { assert.Equal(t, nil, c.Check(u, "README.md")) @@ -45,7 +79,7 @@ func TestURLCheckerCheckWithTimeout(t *testing.T) { } func TestURLCheckerCheckMany(t *testing.T) { - c := newURLChecker(0, "", nil, newSemaphore(1024)) + c := newURLChecker(0, "", nil, false, false, false, newSemaphore(1024)) for _, us := range [][]string{{}, {"https://google.com", "README.md"}} { rc := make(chan urlResult, 1024) @@ -58,7 +92,7 @@ func TestURLCheckerCheckMany(t *testing.T) { } } func TestURLCheckerResolveURL(t *testing.T) { - f := newURLChecker(0, "", nil, newSemaphore(1024)) + f := newURLChecker(0, "", nil, false, false, false, newSemaphore(1024)) for _, c := range []struct { source, target string @@ -76,7 +110,7 @@ func TestURLCheckerResolveURL(t *testing.T) { } func TestURLCheckerResolveURLWithAbsolutePath(t *testing.T) { - f := newURLChecker(0, "", nil, newSemaphore(1024)) + f := newURLChecker(0, "", nil, false, false, false, newSemaphore(1024)) u, _, err := f.resolveURL("/foo", "foo.md") @@ -85,7 +119,7 @@ func TestURLCheckerResolveURLWithAbsolutePath(t *testing.T) { } func TestURLCheckerResolveURLWithDocumentRoot(t *testing.T) { - f := newURLChecker(0, "foo", nil, newSemaphore(1024)) + f := newURLChecker(0, "foo", nil, false, false, false, newSemaphore(1024)) for _, c := range []struct { source, target string