Golang --- a goquery crawler that scrapes topic data from the Go Chinese community site (studygolang.com) and saves it to MySQL
I have been reading posts on the Go Chinese community site (studygolang.com) a lot recently, so I decided to write a crawler in Go that scrapes the topic information and saves it to MySQL.
The complete code follows:
// goquery_spider_test project main.go
package main

import (
	"database/sql"
	"fmt"
	"log"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
	_ "github.com/go-sql-driver/mysql"
)
var (
	commonurl = "https://studygolang.com"
	userid    = ""
	// One scraped topic: title, topic URL, author, author URL, reply count
	topicinfo = []string{"", "", "", "", ""}
)

type DbWorker struct {
	// MySQL data source name
	Dsn string
}
func main() {
	// Connect to MySQL
	dbw := DbWorker{
		// DSN format: user:password@tcp(host:port)/dbname -- substitute your own password
		Dsn: "root:password@tcp(localhost:3306)/studygolang_topic?charset=utf8",
	}
	db, err := sql.Open("mysql", dbw.Dsn)
	if err != nil {
		panic(err)
	}
	defer db.Close()
	fmt.Println("Connected to the database successfully!")
	// Walk the topic list pages one by one until an empty page is reached
	var pagelasturl string
	var index int
	pageurl := "https://studygolang.com/topics?p="
	for {
		index++
		pagelasturl = pageurl + strconv.Itoa(index)
		doc, err := goquery.NewDocument(pagelasturl)
		if err != nil {
			log.Fatal(err)
		}
		// An empty topic list means we have run past the last page
		if doc.Find("div.topic").Text() == "" {
			break
		}
		getpageinfo(db, pagelasturl)
		fmt.Println(pagelasturl)
	}
}
func getpageinfo(db *sql.DB, pagefullurl string) {
	doc, err := goquery.NewDocument(pagefullurl)
	if err != nil {
		log.Fatal(err)
	}
	doc.Find("div.topic").Each(func(i int, contentSelection *goquery.Selection) {
		// Topic title
		title := contentSelection.Find("div.title a").Text()
		// Topic URL
		topicurl, _ := contentSelection.Find("div.title a").Eq(0).Attr("href")
		// Topic author
		userid = contentSelection.Find("div.meta a").Find("strong").Text()
		// Author URL
		userurl, _ := contentSelection.Find("dt.avatar a").Eq(0).Attr("href")
		// Reply count
		click := contentSelection.Find("div.pull-right a").Text()
		if click == "" {
			click = "0"
		}
		topicinfo[0] = title
		topicinfo[1] = commonurl + topicurl
		topicinfo[2] = userid
		topicinfo[3] = commonurl + userurl
		topicinfo[4] = click
		// Strip characters that would break the INSERT statement
		topicinfo = splitstring(topicinfo)
		// Save the row
		dbmanager(db, topicinfo)
	})
}
func splitstring(pageinfo []string) []string {
	spilitinfo := pageinfo
	for i := 0; i < 5; i++ {
		// Chain each Replace on the previous result so every substitution takes effect
		spilitinfo[i] = strings.Replace(pageinfo[i], "'''", " ", -1)
		spilitinfo[i] = strings.Replace(spilitinfo[i], "''", " ", -1)
		spilitinfo[i] = strings.Replace(spilitinfo[i], "'", " ", -1)
		spilitinfo[i] = strings.Replace(spilitinfo[i], "’", " ", -1)
		spilitinfo[i] = strings.Replace(spilitinfo[i], "‘", " ", -1)
		spilitinfo[i] = strings.Replace(spilitinfo[i], "“", " ", -1)
		spilitinfo[i] = strings.Replace(spilitinfo[i], "”", " ", -1)
		spilitinfo[i] = strings.Replace(spilitinfo[i], ",", " ", -1)
		spilitinfo[i] = strings.Replace(spilitinfo[i], "?", " ", -1)
	}
	return spilitinfo
}
func dbmanager(db *sql.DB, info []string) {
	// Build the INSERT by concatenation; splitstring has already removed the quote
	// characters that would otherwise break the statement
	sqlinfo := "INSERT INTO golang_topic VALUES('" + info[0] + "','" + info[1] + "','" + info[2] + "','" + info[3] + "','" + info[4] + "')"
	stmt, err := db.Prepare(sqlinfo)
	if err != nil {
		fmt.Printf("insert data error: %v\n", err)
		return
	}
	defer stmt.Close()
	if _, err := stmt.Exec(); err != nil {
		fmt.Printf("exec insert error: %v\n", err)
	}
}
Steps:
1. Connect to MySQL.
2. Define the scraping rules: parse the HTML pages with goquery and extract the fields of interest.
3. Save the data to MySQL. The columns are topic_name (topic title), topic_url (topic URL), topic_userid (topic author), topic_userurl (author URL), and topic_click (number of replies); a sketch of a matching table definition follows below.
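The article does not show the table definition itself, so here is a minimal sketch of a golang_topic table that matches the five quoted values inserted by dbmanager; the column names follow the list above, while the types and lengths are my own assumptions:

// table_sketch.go -- assumed schema for the golang_topic table (not from the article)
package main

import (
	"database/sql"

	_ "github.com/go-sql-driver/mysql"
)

// createTopicTable creates the table the crawler writes into. All five columns
// are stored as strings because dbmanager inserts every value as quoted text.
func createTopicTable(db *sql.DB) error {
	_, err := db.Exec(`CREATE TABLE IF NOT EXISTS golang_topic (
		topic_name    VARCHAR(255),
		topic_url     VARCHAR(255),
		topic_userid  VARCHAR(64),
		topic_userurl VARCHAR(255),
		topic_click   VARCHAR(16)
	) DEFAULT CHARSET = utf8`)
	return err
}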
Since I have only just started learning Go, I am not yet fluent with its string functions. When inserting into MySQL, some escape characters and other special characters caused the INSERT to fail, so I worked around it with some clumsy strings.Replace() calls. Later on I plan to keep studying and improve both the string handling and the scale of the crawler.
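As a reference for that later improvement, here is a small sketch (not the article's code) of a dbmanager variant that uses placeholders; the MySQL driver then escapes quotes and other special characters itself, so the manual strings.Replace() cleanup in splitstring is no longer needed. It can drop into the same file, which already has the required imports:

// dbmanagerPrepared is a hypothetical alternative to dbmanager that relies on
// parameterized queries instead of string concatenation.
func dbmanagerPrepared(db *sql.DB, info []string) error {
	_, err := db.Exec(
		"INSERT INTO golang_topic (topic_name, topic_url, topic_userid, topic_userurl, topic_click) VALUES (?, ?, ?, ?, ?)",
		info[0], info[1], info[2], info[3], info[4],
	)
	return err
}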
The code depends on a few third-party packages, which I will not describe in detail here; leave a comment if you need help, and we can learn together.
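For reference, the two third-party packages used above can be installed with go get (assuming a GOPATH-style setup like the original code):

go get github.com/PuerkitoBio/goquery
go get github.com/go-sql-driver/mysql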
Below is a screenshot of the scraped data in MySQL.
The Go Chinese community site has 81 pages of topics in total, 3,416 records; in MySQL they are displayed 1,000 rows per page (Navicat for MySQL).
Since I am new to Go, please point out any shortcomings so we can learn from each other.