[Daily] The Go Programming Language (Go语言圣经) -- A Concurrent Web Crawler
2022-04-04 23:53:07
Two versions. crawler.go bounds the number of concurrent fetches with a counting semaphore (a buffered channel with 20 slots), while crawler2.go instead starts a fixed pool of 20 long-lived crawler goroutines and feeds them unseen links over a channel.
crawler.go
package main

import (
    "fmt"
    "links"
    "log"
    "os"
)

func main() {
    worklist := make(chan []string)

    // Start with the command-line arguments.
    go func() { worklist <- os.Args[1:] }()

    // Crawl the web concurrently.
    seen := make(map[string]bool)
    for list := range worklist {
        for _, link := range list {
            if !seen[link] {
                seen[link] = true
                go func(link string) {
                    worklist <- crawl(link)
                }(link)
            }
        }
    }
}

// tokens is a counting semaphore used to
// enforce a limit of 20 concurrent requests.
var tokens = make(chan struct{}, 20)

// crawl extracts all of the URLs in a given page,
// holding a token while the HTTP request is in flight.
func crawl(url string) []string {
    fmt.Println(url)
    tokens <- struct{}{} // acquire a token
    list, err := links.Extract(url)
    <-tokens // release the token
    if err != nil {
        log.Print(err)
    }
    return list
}
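Both programs import a links package that the post does not show. A minimal sketch of it, modeled on the book's gopl.io/ch5/links and assuming golang.org/x/net/html is available on the import path:

// Package links provides a link-extraction function.
package links

import (
    "fmt"
    "net/http"

    "golang.org/x/net/html"
)

// Extract makes an HTTP GET request to the given URL, parses the
// response as HTML, and returns the links found in the document,
// resolved relative to the request URL.
func Extract(url string) ([]string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
    }
    doc, err := html.Parse(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
    }
    var links []string
    var visit func(n *html.Node)
    visit = func(n *html.Node) {
        if n.Type == html.ElementNode && n.Data == "a" {
            for _, a := range n.Attr {
                if a.Key == "href" {
                    // Resolve relative URLs against the request URL.
                    if link, err := resp.Request.URL.Parse(a.Val); err == nil {
                        links = append(links, link.String())
                    }
                }
            }
        }
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            visit(c)
        }
    }
    visit(doc)
    return links, nil
}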
crawler2.go
package main

import (
    "fmt"
    "links"
    "log"
    "os"
)

func main() {
    worklist := make(chan []string)  // lists of URLs, may contain duplicates
    unseenLinks := make(chan string) // de-duplicated URLs

    // Start with the command-line arguments.
    go func() { worklist <- os.Args[1:] }()

    // Create 20 crawler goroutines to fetch each unseen link.
    for i := 0; i < 20; i++ {
        go func() {
            for link := range unseenLinks {
                // To restrict the crawl to a single site, filter here,
                // e.g. with strings.HasPrefix(link, "http://www.lypeng.com").
                foundLinks := crawl(link)
                // Send from a new goroutine to avoid deadlock: the main
                // goroutine may be blocked sending on unseenLinks and
                // therefore not ready to receive from worklist.
                go func() { worklist <- foundLinks }()
            }
        }()
    }

    // The main goroutine de-duplicates worklist items
    // and sends the unseen ones to the crawlers.
    seen := make(map[string]bool)
    for list := range worklist {
        for _, link := range list {
            if !seen[link] {
                seen[link] = true
                unseenLinks <- link
            }
        }
    }
}

// crawl extracts all of the URLs in a given page.
func crawl(url string) []string {
    fmt.Println(url)
    list, err := links.Extract(url)
    if err != nil {
        log.Print(err)
    }
    return list
}
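As written, both versions never terminate: even after every reachable link has been crawled, main keeps waiting on its channels. The book's final variant (gopl.io/ch8/crawl3) addresses this by counting pending sends to worklist and exiting the loop when the count drops to zero. A minimal sketch of that main, assuming the crawl function and tokens semaphore from crawler.go above:

package main

import "os"

func main() {
    worklist := make(chan []string)
    var n int // number of pending sends to worklist

    // Start with the command-line arguments.
    n++
    go func() { worklist <- os.Args[1:] }()

    // Crawl the web concurrently; stop once no work remains.
    seen := make(map[string]bool)
    for ; n > 0; n-- {
        list := <-worklist
        for _, link := range list {
            if !seen[link] {
                seen[link] = true
                n++
                go func(link string) {
                    worklist <- crawl(link) // crawl as defined in crawler.go
                }(link)
            }
        }
    }
}

Either program takes its seed URLs from the command line, for example: go run crawler.go http://gopl.io/ (with the links package on the import path).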