Skip to content

Commit

Permalink
enhancement: better sitemap handling + better domains-crawl
Browse files Browse the repository at this point in the history
  • Loading branch information
CorentinB committed Sep 16, 2024
1 parent 2cd9212 commit 07690ee
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 13 deletions.
17 changes: 9 additions & 8 deletions internal/pkg/crawl/capture.go
Original file line number Diff line number Diff line change
Expand Up @@ -355,14 +355,8 @@ func (c *Crawl) Capture(item *queue.Item) error {
c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dlp", rawJSON)
}

var headers = make(map[string]string)
headers["Accept"] = HTTPHeaders.Accept
headers["Accept-Language"] = HTTPHeaders.AcceptLanguage
headers["Sec-Fetch-Mode"] = HTTPHeaders.SecFetchMode
headers["User-Agent"] = HTTPHeaders.UserAgent

if len(URLs) > 0 {
c.captureAssets(item, URLs, resp.Cookies(), headers)
c.captureAssets(item, URLs, resp.Cookies(), HTTPHeaders)

Check failure on line 359 in internal/pkg/crawl/capture.go

View workflow job for this annotation

GitHub Actions / build

cannot use HTTPHeaders (variable of type ytdlp.HTTPHeaders) as map[string]string value in argument to c.captureAssets
}

return nil
Expand Down Expand Up @@ -390,9 +384,16 @@ func (c *Crawl) Capture(item *queue.Item) error {

// If the response is an XML document, we want to scrape it for links
if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
assets, err = extractor.XML(resp)
URLsFromXML, isSitemap, err := extractor.XML(resp)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
} else {
if isSitemap {
waitGroup.Add(1)
go c.queueOutlinks(URLsFromXML, item, &waitGroup)
} else {
assets = append(assets, URLsFromXML...)
}
}
} else if strings.Contains(resp.Header.Get("Content-Type"), "json") {
assets, err = extractor.JSON(resp)
Expand Down
16 changes: 12 additions & 4 deletions internal/pkg/crawl/extractor/xml.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,23 @@ import (
"github.com/clbanning/mxj/v2"
)

func XML(resp *http.Response) (URLs []*url.URL, err error) {
func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) {
xmlBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
return nil, sitemap, err
}

mv, err := mxj.NewMapXml(xmlBody)
if err != nil {
return nil, err
return nil, sitemap, err
}

// Try to find if it's a sitemap
for _, node := range mv.LeafNodes() {
if strings.Contains(node.Path, "sitemap") {
sitemap = true
break
}
}

for _, value := range mv.LeafValues() {
Expand All @@ -31,5 +39,5 @@ func XML(resp *http.Response) (URLs []*url.URL, err error) {
}
}

return URLs, nil
return URLs, sitemap, nil
}
27 changes: 26 additions & 1 deletion internal/pkg/crawl/outlinks.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func (c *Crawl) queueOutlinks(outlinks []*url.URL, item *queue.Item, wg *sync.Wa
}
}

if c.DomainsCrawl && strings.Contains(item.URL.Host, outlink.Host) && item.Hop == 0 {
if c.domainsCrawlPass(item.URL, outlink, item.Hop) {
newItem, err := queue.NewItem(outlink, item.URL, "seed", 0, "", false)
if err != nil {
c.Log.WithFields(c.genLogFields(err, outlink, nil)).Error("unable to create new item from outlink, discarding")
Expand Down Expand Up @@ -96,3 +96,28 @@ func (c *Crawl) queueOutlinks(outlinks []*url.URL, item *queue.Item, wg *sync.Wa
}
}
}

// domainsCrawlPass reports whether outlink belongs to the same base domain as
// origin, in which case the caller may promote it to a new seed.
//
// It returns false when either URL is nil, when c.DomainsCrawl is disabled, or
// when originHop is not 0 (only outlinks found on hop-0 items qualify).
//
// Hosts are compared without their port, case-insensitively, and ignoring a
// trailing root dot, so "WWW.Example.com:8080" and "cdn.example.com." match.
// Hosts with fewer than two labels (e.g. "localhost") never match.
//
// NOTE(review): the base domain is approximated as the last two labels, which
// is wrong for multi-label public suffixes such as "co.uk" (every "*.co.uk"
// host would match). Fixing that needs a public suffix list, e.g.
// golang.org/x/net/publicsuffix, which is not a dependency visible from here.
func (c *Crawl) domainsCrawlPass(origin, outlink *url.URL, originHop uint64) bool {
	if origin == nil || outlink == nil {
		return false
	}

	if !c.DomainsCrawl || originHop != 0 {
		return false
	}

	// Hostname() drops the port (and IPv6 brackets), unlike the raw Host field.
	originDomain, ok := crawlBaseDomain(origin.Hostname())
	if !ok {
		return false
	}

	outlinkDomain, ok := crawlBaseDomain(outlink.Hostname())
	if !ok {
		return false
	}

	return originDomain == outlinkDomain
}

// crawlBaseDomain normalizes host and returns the key used to decide whether
// two hosts belong to the same domain. The boolean is false when host has
// fewer than two labels or when one of its last two labels is empty.
func crawlBaseDomain(host string) (string, bool) {
	// DNS names are case-insensitive and may carry a trailing root dot.
	host = strings.ToLower(strings.TrimSuffix(host, "."))

	labels := strings.Split(host, ".")
	if len(labels) < 2 {
		return "", false
	}

	tld := labels[len(labels)-1]
	sld := labels[len(labels)-2]
	if tld == "" || sld == "" {
		return "", false
	}

	// An all-numeric last label means an IP literal rather than a domain name:
	// stripping "subdomains" would make unrelated addresses such as 10.0.1.1
	// and 192.168.1.1 compare equal, so the whole address is the key.
	if strings.Trim(tld, "0123456789") == "" {
		return host, true
	}

	return sld + "." + tld, true
}

0 comments on commit 07690ee

Please sign in to comment.