Go: scraping jokes
The crawler scrapes funny jokes with a horizontal crawl plus a vertical crawl: horizontally it walks the list pages by page number, and vertically it fetches every joke on each page.
```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
)

// HttpGet fetches url and returns the response body as a string.
func HttpGet(url string) (result string, err error) {
	resp, err1 := http.Get(url)
	if err1 != nil {
		err = err1
		return
	}
	defer resp.Body.Close()

	buf := make([]byte, 4096)
	for {
		n, err2 := resp.Body.Read(buf)
		if n == 0 {
			break
		}
		if err2 != nil && err2 != io.EOF {
			err = err2
			return
		}
		result += string(buf[:n])
	}
	return
}

// SaveJoke2File writes one page's joke titles and contents to a text file.
func SaveJoke2File(idx int, fileTitle, fileContent []string) {
	path := "第" + strconv.Itoa(idx) + "页.txt" // e.g. "第1页.txt" ("page 1.txt")
	f, err := os.Create(path)
	if err != nil {
		fmt.Println("err:", err)
		return
	}
	defer f.Close()

	n := len(fileTitle)
	for i := 0; i < n; i++ {
		f.WriteString(fileTitle[i] + "\n" + fileContent[i] + "\n")
		f.WriteString("-----------------------------\n")
	}
}

// SpiderPage crawls one list page: it extracts every joke's URL, fetches
// each joke's title and content, and saves the results to a file.
func SpiderPage(idx int, page chan int) {
	url := "https://m.pengfue.com/xiaohua_" + strconv.Itoa(idx) + ".html"
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("HttpGet err", err)
		return
	}

	// Match the link of each joke on the list page.
	ret := regexp.MustCompile(`<h1 class="f18"><a href="(?s:(.*?))"`)
	alls := ret.FindAllStringSubmatch(result, -1)

	fileTitle := make([]string, 0)
	fileContent := make([]string, 0)
	for _, jokeURL := range alls {
		title, content, err := SpiderJokePage(jokeURL[1])
		if err != nil {
			fmt.Println("err:", err)
			continue
		}
		fileTitle = append(fileTitle, title)
		fileContent = append(fileContent, content)
	}

	SaveJoke2File(idx, fileTitle, fileContent)
	page <- idx // signal that this page is finished
}

// toWork starts one goroutine per page and waits until all of them report back.
func toWork(start, end int) {
	fmt.Printf("正在爬取%d到%d页。。。\n", start, end) // "crawling pages %d to %d..."
	page := make(chan int)
	for i := start; i <= end; i++ {
		go SpiderPage(i, page)
	}
	for i := start; i <= end; i++ {
		fmt.Printf("第%d个页面爬取完成\n", <-page) // "page %d crawled"
	}
}

// SpiderJokePage fetches a single joke page and extracts its title and body.
func SpiderJokePage(url string) (title, content string, err error) {
	result, err1 := HttpGet(url)
	if err1 != nil {
		err = err1
		return
	}

	ret1 := regexp.MustCompile(`<title>(?s:(.*?))</title>`)
	alls := ret1.FindAllStringSubmatch(result, 1) // the page has two matches; take the first
	for _, tmpTitle := range alls {
		title = strings.Replace(tmpTitle[1], " ", "", -1)
		title = strings.Replace(title, "\n", "", -1)
		break
	}

	ret2 := regexp.MustCompile(`<div class="con-txt">(?s:(.*?))</div>`)
	alls2 := ret2.FindAllStringSubmatch(result, 1) // likewise, take the first match
	for _, tmpContent := range alls2 {
		content = strings.Replace(tmpContent[1], " ", "", -1)
		content = strings.Replace(content, "\n", "", -1)
		break
	}
	return
}

func main() {
	var start, end int
	fmt.Print("请输入起始页。。。") // "enter the start page"
	fmt.Scan(&start)
	fmt.Print("请输入终止页。。。") // "enter the end page"
	fmt.Scan(&end)
	toWork(start, end)
}
```
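A note on the design: the unbuffered `page` channel plays two roles here. It keeps `toWork` blocked until every page goroutine has finished, and it reports which page just completed. When per-page progress reporting isn't needed, the same fan-out/wait pattern is commonly written with `sync.WaitGroup`. Below is a minimal sketch under that assumption; `crawl` is a hypothetical stand-in for `SpiderPage`:

```go
package main

import (
	"fmt"
	"sync"
)

// crawl is a hypothetical stand-in for SpiderPage's per-page work.
func crawl(idx int) {
	fmt.Printf("page %d done\n", idx)
}

func toWork(start, end int) {
	var wg sync.WaitGroup
	for i := start; i <= end; i++ {
		wg.Add(1)
		go func(idx int) {
			defer wg.Done() // mark this page finished even on an early return
			crawl(idx)
		}(i)
	}
	wg.Wait() // block until every page goroutine has called Done
}

func main() {
	toWork(1, 3)
}
```

The channel version in the article prints a message the moment each page finishes; the WaitGroup version trades that feedback for simpler bookkeeping. Either way, the regexp-based extraction is fragile: it breaks as soon as the site changes its markup, so a longer-lived crawler would usually move to a real HTML parser.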