go-百度贴吧-纵向爬取
程序员文章站
2022-05-22 21:42:17
百度贴吧纵向爬取 上一个是横向爬取的,这个纵向爬取,具体怎么做的看代码 ......
百度贴吧纵向爬取
上一篇介绍的是横向爬取，这一篇介绍纵向爬取，具体实现请看下面的代码。
// Command main downloads listing pages of the Baidu Tieba "vue" forum
// concurrently, extracts the "last replier" name of every thread on each
// page, and writes the names of page N to the file "第N页.txt" in the
// current working directory.
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"time"
)

// replierpattern captures, in submatch 1, the text that follows
// "最后回复人:" inside the title attribute of each replier span.
// It is compiled once here instead of once per crawled page.
var replierpattern = regexp.MustCompile(`<span class="tb_icon_author_rely j_replyer" title="最后回复人:(?s:(.*?))"`)

// httpclient is shared by all page fetches. The timeout keeps a stalled
// connection from blocking a crawler goroutine (and therefore towork)
// indefinitely.
var httpclient = &http.Client{Timeout: 30 * time.Second}

// httpgetdb fetches url with an HTTP GET and returns the whole response
// body as a string. On any failure (transport error, non-200 status, read
// error) it returns an empty result and a non-nil error.
func httpgetdb(url string) (result string, err error) {
	resp, err := httpclient.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// io.ReadAll replaces the manual 4 KiB read loop, which grew the result
	// with += on every chunk and tested n == 0 before looking at the error.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

// save2file writes one page of matches to "第<idx>页.txt".
//
// filename holds regexp submatch rows as produced by FindAllStringSubmatch:
// element 0 of a row is the full match and element 1 is the captured name.
// The file starts with the header line "名称", followed by one captured name
// per line. Rows with fewer than two elements are skipped. Errors are
// reported on standard output; nothing is returned.
func save2file(idx int, filename [][]string) {
	path := "第" + strconv.Itoa(idx) + "页" + ".txt"
	f, err := os.Create(path)
	if err != nil {
		fmt.Println("os.create err", err)
		return
	}
	defer func() {
		// A failed Close can mean buffered data never reached the disk.
		if cerr := f.Close(); cerr != nil {
			fmt.Println("file close err", cerr)
		}
	}()

	if _, err := f.WriteString("名称\n"); err != nil {
		fmt.Println("file write err", err)
		return
	}
	for _, row := range filename {
		if len(row) < 2 {
			continue
		}
		if _, err := f.WriteString(row[1] + "\n"); err != nil {
			fmt.Println("file write err", err)
			return
		}
	}
}

// spiderpagedb crawls listing page idx (1-based) and saves the extracted
// replier names with save2file.
//
// It always sends idx on page exactly once before returning, including when
// the download fails (the failure itself is printed). towork receives one
// value per started goroutine, so skipping the send on the error path would
// leave it blocked forever.
func spiderpagedb(idx int, page chan int) {
	defer func() { page <- idx }()

	// The pn query parameter is built as (idx-1)*50, i.e. it advances by 50
	// for each page.
	url := "https://tieba.baidu.com/f?kw=vue&ie=utf-8&pn=" + strconv.Itoa((idx-1)*50)
	result, err := httpgetdb(url)
	if err != nil {
		fmt.Println("httpget2 err", err)
		return
	}

	save2file(idx, replierpattern.FindAllStringSubmatch(result, -1))
}

// towork crawls pages start through end (inclusive), one goroutine per page,
// and prints a line for each page as it finishes. It returns once every
// started goroutine has reported back. If start > end no page is crawled.
func towork(start, end int) {
	fmt.Printf("正在爬取%d到%d页。。。\n", start, end)

	page := make(chan int)
	for i := start; i <= end; i++ {
		go spiderpagedb(i, page)
	}
	for i := start; i <= end; i++ {
		// Printf, not Print: the message contains a %d verb.
		fmt.Printf("第%d页爬取完成\n", <-page)
	}
}

// main reads the first and last page numbers from standard input, checks
// them against the ranges stated in the prompts, and starts the crawl.
func main() {
	var start, end int

	fmt.Print("请输入起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("输入无效:", err)
		return
	}
	fmt.Print("请输入终止页(>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("输入无效:", err)
		return
	}
	if start < 1 || end < start {
		fmt.Println("输入无效: 起始页必须>=1且终止页必须>=起始页")
		return
	}

	towork(start, end)
}