1 Star 0 Fork 0

bit212/emailscraper

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
scrape.go 1.26 KB
一键复制 编辑 原始数据 按行查看 历史
Law Zava 提交于 2021-06-27 00:34 +08:00 . :rotating_light:upgrade dependencies & fix linter errors
package emailscraper
import (
"github.com/gocolly/colly"
)
// Scrape crawls url and returns the e-mail addresses collected on the way.
//
// The address is first normalised to its https:// form with getWebsite and
// visited through the scraper's collector. When that pass leaves the result
// set empty, the http:// form of the same address is visited as a fallback.
//
// Unless cfg.FollowExternalLinks is set, the collector's AllowedDomains is
// replaced with the value prepareAllowedDomain returns for the target before
// any request is made.
//
// Visit failures are reported through s.log only and never returned; the sole
// error Scrape can return is the one produced by prepareAllowedDomain. The
// returned slice may be empty when nothing was found.
func (s *Scraper) Scrape(url string) ([]string, error) {
	secureURL := getWebsite(url, true)

	var e emails

	c := s.collector

	if !s.cfg.FollowExternalLinks {
		allowedDomains, err := prepareAllowedDomain(secureURL)
		if err != nil {
			return nil, err
		}

		c.AllowedDomains = allowedDomains
	}

	// NOTE(review): these callbacks are registered on s.collector on every
	// call. If the same collector is reused across Scrape calls, handlers from
	// earlier calls presumably stay attached — confirm against how the
	// collector is constructed.

	// Parse emails on each downloaded page.
	c.OnScraped(func(response *colly.Response) {
		e.parseEmails(response.Body)
	})

	// Cloudflare-encoded addresses are carried in the data-cfemail attribute.
	c.OnHTML("span[data-cfemail]", func(el *colly.HTMLElement) {
		e.parseCloudflareEmail(el.Attr("data-cfemail"))
	})

	// First pass: secure URL. A failure here is not fatal because the
	// insecure fallback below may still succeed.
	if err := c.Visit(secureURL); err != nil {
		s.log("error while visiting secure domain: ", secureURL, err.Error())
	}

	c.Wait() // Wait for concurrent scrapes to finish

	// len of a nil slice is 0, so no separate nil check is required.
	if len(e.emails) == 0 {
		// Second pass: retry over plain http, derived from the already
		// normalised secure URL.
		insecureURL := getWebsite(secureURL, false)

		if err := c.Visit(insecureURL); err != nil {
			s.log("error while visiting insecure domain: ", insecureURL, err.Error())
		}

		c.Wait() // Wait for concurrent scrapes to finish
	}

	return e.emails, nil
}
// getWebsite builds an absolute address from url: the value is passed through
// trimProtocol and then prefixed with "https://" when secure is true, or with
// "http://" otherwise.
func getWebsite(url string, secure bool) string {
	scheme := "http://"
	if secure {
		scheme = "https://"
	}

	return scheme + trimProtocol(url)
}
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/bit212/emailscraper.git
git@gitee.com:bit212/emailscraper.git
bit212
emailscraper
emailscraper
main

搜索帮助