Implementing a Web Crawler in Go

The program below crawls the joke listings at https://www.xiaohua.com/duanzi page by page, one goroutine per page, follows the link to each individual joke, extracts its title and content with regular expressions, and writes each page's results to a numbered .txt file.
package main

import (
	"fmt"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
)
func main() {
	var start, end int
	fmt.Print("Enter the start page (>=1): ")
	fmt.Scan(&start)
	fmt.Print("Enter the end page (>= start page): ")
	fmt.Scan(&end)
	// Start crawling
	DoWork(start, end)
}
func DoWork(start, end int) {
	fmt.Printf("Crawling pages %d through %d.\n", start, end)
	page := make(chan int)
	for i := start; i <= end; i++ {
		// Crawl each listing page in its own goroutine
		go SpiderPage(i, page)
	}
	// Receive one page number per goroutine, so the program
	// does not exit before every page has finished
	for i := start; i <= end; i++ {
		fmt.Printf("Page %d has been crawled.\n", <-page)
	}
}
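DoWork uses an unbuffered channel purely as a completion counter: each SpiderPage goroutine sends its page number when it finishes, and the second loop receives exactly one value per page. The same synchronization is often written with sync.WaitGroup instead. Below is a minimal standalone sketch of that variant; DoWorkWG and its stub body are hypothetical, not part of the original program.

	package main

	import (
		"fmt"
		"sync"
	)

	// DoWorkWG is a hypothetical WaitGroup-based variant of DoWork.
	func DoWorkWG(start, end int) {
		var wg sync.WaitGroup
		for i := start; i <= end; i++ {
			wg.Add(1)
			go func(page int) {
				defer wg.Done()
				// The real per-page crawl (SpiderPage) would run here.
				fmt.Printf("Page %d has been crawled.\n", page)
			}(i)
		}
		// Block until every goroutine has called Done
		wg.Wait()
	}

	func main() {
		DoWorkWG(1, 3)
	}

A WaitGroup is the idiomatic choice when all you need is "wait for N goroutines"; the channel version in DoWork additionally tells you which page finished, which the program uses for its progress message.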
func SpiderPage(i int, page chan int) {
	// Build the URL of the listing page to crawl; the next page is page+1:
	// https://www.xiaohua.com/duanzi?page=1
	url := "https://www.xiaohua.com/duanzi?page=" + strconv.Itoa(i)
	fmt.Printf("Crawling page %d: %s\n", i, url)
	// Fetch the listing page's HTML
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("HttpGet err = ", err)
		return
	}
	// Extract fragments of the form:
	// <p class="fonts">
	// <a href=" URL of one joke ">
	// Note: regexp.MustCompile panics on an invalid pattern and never
	// returns nil, so no nil check is needed.
	re := regexp.MustCompile(`<p class="fonts">(?s:(.*?))">`)
	// Grab the key information: in each match, index 0 holds the full
	// match and index 1 holds the first capture group
	joyUrls := re.FindAllStringSubmatch(result, -1)
	fileTitle := make([]string, 0)
	fileContent := make([]string, 0)
	for _, data := range joyUrls {
		// The link to one joke, cleaned of markup and whitespace
		url := data[1]
		url = strings.Replace(url, "<a href=\"", "", -1)
		url = strings.Replace(url, "\r", "", -1)
		url = strings.Replace(url, "\n", "", -1)
		url = strings.Replace(url, " ", "", -1)
		url = "https://www.xiaohua.com" + url
		// Crawl the individual joke page
		title, content, err := SpiderOneJoy(url)
		if err != nil {
			fmt.Println("SpiderOneJoy err = ", err)
			continue
		}
		// Collect the title and the content
		fileTitle = append(fileTitle, title)
		fileContent = append(fileContent, content)
	}
	// Write everything collected for this page to a file
	StoreJoyToFile(i, fileTitle, fileContent)
	// Report this page number back to DoWork
	page <- i
}
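To see what the extraction step produces, here is a small standalone demo that runs the same pattern and clean-up over a hand-written fragment. The fragment and the /detail/123 path are made up for illustration and do not come from the site.

	package main

	import (
		"fmt"
		"regexp"
		"strings"
	)

	func main() {
		// A hypothetical fragment in the shape the crawler expects
		html := `<p class="fonts">
	<a href="/detail/123">`
		re := regexp.MustCompile(`<p class="fonts">(?s:(.*?))">`)
		for _, m := range re.FindAllStringSubmatch(html, -1) {
			// m[0] is the full match; m[1] is the capture group
			link := strings.Replace(m[1], "<a href=\"", "", -1)
			link = strings.TrimSpace(link)
			fmt.Println("https://www.xiaohua.com" + link)
		}
	}

Running it prints https://www.xiaohua.com/detail/123, the absolute-URL form that SpiderOneJoy receives in the real program.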
func HttpGet(url string) (result string, err error) {
	// Send the GET request
	resp, err1 := http.Get(url)
	if err1 != nil {
		err = err1
		return
	}
	defer resp.Body.Close()
	// Read the response body in 4 KB chunks
	buf := make([]byte, 1024*4)
	for {
		n, _ := resp.Body.Read(buf)
		if n == 0 {
			break
		}
		// Append the chunk just read
		result += string(buf[:n])
	}
	return
}
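The manual read loop works, but it silently drops read errors (only n == 0 ends the loop) and repeated string concatenation copies the accumulated result on every iteration. On Go 1.16 or newer, io.ReadAll does the same job in one call. A minimal sketch; HttpGetSimple is a hypothetical name, not part of the original program:

	package main

	import (
		"fmt"
		"io"
		"net/http"
	)

	// HttpGetSimple fetches a URL and returns the whole body as a string.
	func HttpGetSimple(url string) (string, error) {
		resp, err := http.Get(url)
		if err != nil {
			return "", err
		}
		defer resp.Body.Close()
		// io.ReadAll reads until EOF and surfaces any read error
		body, err := io.ReadAll(resp.Body)
		if err != nil {
			return "", err
		}
		return string(body), nil
	}

	func main() {
		result, err := HttpGetSimple("https://example.com")
		if err != nil {
			fmt.Println("HttpGetSimple err =", err)
			return
		}
		fmt.Printf("fetched %d bytes\n", len(result))
	}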
// Crawl a single joke page and extract its title and content
func SpiderOneJoy(url string) (title, content string, err error) {
	// Fetch the joke page's HTML
	result, err1 := HttpGet(url)
	if err1 != nil {
		err = err1
		return
	}
	// Extract the key information.
	// The title (the author name) sits inside markup like:
	// <div class="one-cont-title clearfix">
	// <div class="one-cont-time"><span></span></div>
	// <div class="one-cont-font clearfix">
	// <a href="/user/1761">
	// <em>
	// <img id="imgIco" data-default="portrait" class="lazy js_img" alt="" src="https://img.xiaohua.com/User/0/1/1761.jpg" />
	// </em>
	// <i>
	// title
	// </i>
	re1 := regexp.MustCompile(`<div class="one-cont-title clearfix">(?s:(.*?))</i>`)
	tmpTitle := re1.FindAllStringSubmatch(result, -1)
	for _, data := range tmpTitle {
		title = data[1]
		title = strings.Replace(title, " ", "", -1)
		// Deliberately append a sentinel "a" so a second pass can
		// capture everything between <i> and the sentinel
		title = title + "a"
		re1Tmp := regexp.MustCompile(`<i>(?s:(.*?))a`)
		tmpTitle2 := re1Tmp.FindAllStringSubmatch(title, -1)
		for _, dataTmp := range tmpTitle2 {
			title = dataTmp[1]
			title = strings.Replace(title, "\r\n", "", -1)
			break
		}
		break
	}
	// Extract the content: <p class="fonts">content</p>
	re2 := regexp.MustCompile(`<p class="fonts">(?s:(.*?))</p>`)
	tmpContent := re2.FindAllStringSubmatch(result, -1)
	for _, data := range tmpContent {
		content = data[1]
		content = strings.Replace(content, "<br>", "", -1)
		break
	}
	return
}
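The sentinel-"a" trick is fragile: the second pattern stops at the first "a" after <i>, so it truncates any title that itself contains that letter. A single pattern can scope to the title <div> and capture the <i> text directly. A standalone sketch against a hand-written fragment (the fragment and the author name are hypothetical):

	package main

	import (
		"fmt"
		"regexp"
		"strings"
	)

	func main() {
		// Hypothetical fragment shaped like the title markup shown above
		html := `<div class="one-cont-title clearfix">
	<a href="/user/1761"><em></em><i>
	Jasmine
	</i></a></div>`
		// Scope to the title div and capture the <i> text in one pass
		re := regexp.MustCompile(`<div class="one-cont-title clearfix">(?s:.*?)<i>(?s:(.*?))</i>`)
		if m := re.FindStringSubmatch(html); m != nil {
			fmt.Printf("title = %q\n", strings.TrimSpace(m[1]))
		}
	}

With a title like "Jasmine", the sentinel version would stop at the first "a" and keep only "J"; the single-pattern version returns the full text.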
// Write the collected jokes of one page to a file
func StoreJoyToFile(i int, fileTitle, fileContent []string) {
	fileName := strconv.Itoa(i) + ".txt"
	f, err := os.Create(fileName)
	if err != nil {
		fmt.Println("os.Create err = ", err)
		return
	}
	// Close the file when the function returns
	defer f.Close()
	// Write each numbered title followed by its content
	n := len(fileTitle)
	for i := 0; i < n; i++ {
		// Write the title
		f.WriteString(strconv.Itoa(i+1) + ")" + fileTitle[i] + "\n")
		// Write the content
		f.WriteString(fileContent[i] + "\n")
		f.WriteString("\n===========================================================\n")
	}
}
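For completeness, the file-writing step can also build the whole text in memory and write it in a single call. A minimal sketch, assuming Go 1.16+ for os.WriteFile; StoreJoyToFileV2 is a hypothetical name:

	package main

	import (
		"fmt"
		"os"
		"strconv"
		"strings"
	)

	// StoreJoyToFileV2 writes one page's jokes with a single file write.
	func StoreJoyToFileV2(page int, titles, contents []string) error {
		var b strings.Builder
		for i := range titles {
			b.WriteString(strconv.Itoa(i+1) + ")" + titles[i] + "\n")
			b.WriteString(contents[i] + "\n")
			b.WriteString("\n===========================================================\n")
		}
		// 0644: owner read/write, group and others read-only
		return os.WriteFile(strconv.Itoa(page)+".txt", []byte(b.String()), 0644)
	}

	func main() {
		err := StoreJoyToFileV2(1, []string{"demo title"}, []string{"demo content"})
		fmt.Println("err =", err)
	}

Either way, running the crawler for pages 1 through N produces 1.txt through N.txt in the current directory.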