// A CollectorOption sets an option on a Collector.
type CollectorOption func(*Collector)
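// Illustrative sketch, not part of the original source: a caller-defined
// CollectorOption. The WithUserAgent name is hypothetical; the option simply
// closes over a value and assigns it to the Collector that NewCollector
// passes in.
func WithUserAgent(ua string) CollectorOption {
	return func(c *Collector) {
		c.UserAgent = ua
	}
}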
// Collector provides the scraper instance for a scraping job
type Collector struct {
	// UserAgent is the User-Agent string used by HTTP requests
	UserAgent string
	// MaxDepth limits the recursion depth of visited URLs.
	// Set it to 0 for infinite recursion (default).
	MaxDepth int
	// AllowedDomains is a domain whitelist.
	// Leave it blank to allow any domains to be visited
	AllowedDomains []string
	// DisallowedDomains is a domain blacklist.
	DisallowedDomains []string
	// DisallowedURLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches a URL, the request
	// will be stopped. DisallowedURLFilters will be evaluated
	// before URLFilters.
	// Leave it blank to allow any URLs to be visited
	DisallowedURLFilters []*regexp.Regexp
	// URLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches a URL, the request
	// won't be stopped. DisallowedURLFilters will be evaluated
	// before URLFilters.
	// Leave it blank to allow any URLs to be visited
	URLFilters []*regexp.Regexp
	// AllowURLRevisit allows multiple downloads of the same URL
	AllowURLRevisit bool
	// MaxBodySize is the limit of the retrieved response body in bytes.
	// 0 means unlimited.
	// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
	MaxBodySize int
	// CacheDir specifies a location where GET requests are cached as files.
	// When it's not defined, caching is disabled.
	CacheDir string
	// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
	// the target host's robots.txt file. See http://www.robotstxt.org/ for more
	// information.
	IgnoreRobotsTxt bool
	// Async turns on asynchronous network communication. Use Collector.Wait() to
	// be sure all requests have been finished.
	Async bool
	// ParseHTTPErrorResponse allows parsing HTTP responses with non-2xx status codes.
	// By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse
	// to true to enable it.
	ParseHTTPErrorResponse bool
	// ID is the unique identifier of a collector
	ID uint32
	// DetectCharset can enable character encoding detection for non-utf8 response bodies
	// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
	DetectCharset bool
	// redirectHandler allows control over how a redirect is handled.
	// Use c.SetRedirectHandler to set this value.
	redirectHandler func(req *http.Request, via []*http.Request) error
	// CheckHead performs a HEAD request before every GET to pre-validate the response
	CheckHead bool
	// TraceHTTP enables capturing and reporting request performance for crawler tuning.
	// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
	TraceHTTP bool
	// Context is the context that will be used for HTTP requests. You can set this
	// to support clean cancellation of scraping.
	Context context.Context
}
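// Illustrative sketch, not part of the original source: the exported fields
// above can also be set directly after construction. The domain and filter
// values here are made up for the example.
func exampleConfigure() *Collector {
	c := NewCollector()
	c.MaxDepth = 2                             // follow links at most two levels deep
	c.AllowedDomains = []string{"example.com"} // whitelist a single domain
	c.DisallowedURLFilters = []*regexp.Regexp{
		regexp.MustCompile(`\.pdf$`), // evaluated before URLFilters; matching requests are stopped
	}
	c.Async = true // remember to call c.Wait() before exiting
	return c
}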
// NewCollector creates a new Collector instance with default configuration
func NewCollector(options ...CollectorOption) *Collector {
	c := &Collector{}
	c.Init()

	for _, f := range options {
		f(c)
	}

	c.parseSettingsFromEnv()

	return c
}
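// Illustrative sketch, not part of the original source: options are plain
// functions, so callers can pass named helpers (such as the hypothetical
// WithUserAgent above) or inline literals.
func exampleNewCollector() *Collector {
	return NewCollector(
		WithUserAgent("my-crawler/1.0"),
		func(c *Collector) { c.CheckHead = true }, // inline option literal
	)
}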
func (c *Collector) parseSettingsFromEnv() {
	for _, e := range os.Environ() {
		if !strings.HasPrefix(e, "COLLY_") {
			continue
		}
		pair := strings.SplitN(e[6:], "=", 2) // strip the six-byte "COLLY_" prefix, then split key from value
		if f, ok := envMap[pair[0]]; ok {
			f(c, pair[1])
		} else {
			log.Println("Unknown environment variable:", pair[0])
		}
	}
}
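// Illustrative sketch, not part of the original source: because NewCollector
// calls parseSettingsFromEnv, a COLLY_-prefixed environment variable set
// before construction configures the Collector. The keys accepted by envMap
// are not shown in this section; "USER_AGENT" is an assumption.
func exampleEnvConfig() *Collector {
	os.Setenv("COLLY_USER_AGENT", "env-configured-agent/1.0") // hypothetical key
	return NewCollector()
}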