欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

Golang---goquery爬虫获取golang语言中文网页面信息并保存MySQL

程序员文章站 2022-04-05 09:53:41
...

    由于最近一直在golang语言中文网上看帖子,所以打算使用golang写一个爬虫把帖子信息抓取下来,并保存到mysql中。

    以下是完整代码:

// goquery_spider_test project main.go
package main

import (
	"database/sql"
	"fmt"
	"goquery-master"
	"log"
	//	"net/http"
	"strconv"
	"strings"

	_ "github.com/go-sql-driver/mysql"
)

var (
	// Site root; prepended to the relative hrefs scraped from topic pages.
	commonurl = "https://studygolang.com"
	// Author name of the topic most recently parsed by getpageinfo.
	userid = ""
	// Scratch record for the topic currently being processed: five string
	// fields, all empty at start-up.
	topicinfo = make([]string, 5)
)

// DbWorker holds the configuration needed to open the MySQL connection.
type DbWorker struct {
	// Dsn is the MySQL data source name that main passes to sql.Open.
	Dsn string
}

// main connects to MySQL and then walks the paginated topic list on
// studygolang.com (p=1, 2, ...), handing every page to getpageinfo.
// The crawl stops at the first page that has no "div.topic" text or that
// cannot be fetched.
func main() {
	// Connect to MySQL.
	// NOTE(review): the network part of this DSN ("qq.com") looks like it was
	// mangled by e-mail obfuscation; go-sql-driver/mysql expects the form
	// user:password@tcp(host:port)/dbname. Confirm the real credentials.
	dbw := DbWorker{
		Dsn: "root:aaa@qq.com(localhost:3306)/studygolang_topic?charset=utf8",
	}
	db, err := sql.Open("mysql", dbw.Dsn)
	if err != nil {
		panic(err)
	}
	// Registered only after the error check, and only once.
	defer db.Close()

	// sql.Open merely validates its arguments; Ping is what actually
	// establishes a connection, so the success message is printed after it.
	if err := db.Ping(); err != nil {
		panic(err)
	}
	fmt.Println("数据库链接成功!")

	// Base URL of the paginated topic list; the page number is appended.
	pageurl := "https://studygolang.com/topics?p="

	for index := 1; ; index++ {
		pagelasturl := pageurl + strconv.Itoa(index)

		// Probe the page first: an empty topic list marks the end of the
		// pagination.
		doc, err := goquery.NewDocument(pagelasturl)
		if err != nil {
			// A failed fetch yields no document to inspect; stop crawling
			// instead of dereferencing it.
			log.Printf("fetching %s: %v", pagelasturl, err)
			break
		}
		if doc.Find("div.topic").Text() == "" {
			break
		}

		getpageinfo(db, pagelasturl)
		fmt.Println(pagelasturl)
	}
}

// getpageinfo downloads the topic-list page at pagefullurl and, for every
// "div.topic" element on it, fills the package-level topicinfo slice with
// title, topic URL, author, author URL and reply count, passes it through
// splitstring and hands the result to dbmanager. A download failure ends the
// program through log.Fatal.
func getpageinfo(db *sql.DB, pagefullurl string) {
	page, fetchErr := goquery.NewDocument(pagefullurl)
	if fetchErr != nil {
		log.Fatal(fetchErr)
	}

	// storeTopic extracts one topic entry and writes it to the database.
	storeTopic := func(_ int, topic *goquery.Selection) {
		// Title text and link both come from the same anchor selection.
		titleLinks := topic.Find("div.title a")
		title := titleLinks.Text()
		topicurl, _ := titleLinks.Eq(0).Attr("href")

		// Author name (kept in the package-level userid) and profile link.
		userid = topic.Find("div.meta a").Find("strong").Text()
		userurl, _ := topic.Find("dt.avatar a").Eq(0).Attr("href")

		// Reply count; an empty text is stored as "0".
		replies := topic.Find("div.pull-right a").Text()
		if len(replies) == 0 {
			replies = "0"
		}

		topicinfo[0] = title
		topicinfo[1] = commonurl + topicurl
		topicinfo[2] = userid
		topicinfo[3] = commonurl + userurl
		topicinfo[4] = replies

		// Strip the characters splitstring replaces, then persist the row.
		topicinfo = splitstring(topicinfo)
		dbmanager(db, topicinfo)
	}

	page.Find("div.topic").Each(storeTopic)
}

// unsafeCharReplacer turns every quote-like character, comma and question
// mark into a single space. A run of three ASCII single quotes is listed
// first so it collapses to one space, as the original sequential
// replacements did; a separate "''" rule is not needed because every
// remaining single quote is already replaced on its own.
var unsafeCharReplacer = strings.NewReplacer(
	"'''", " ",
	"'", " ",
	"’", " ",
	"‘", " ",
	"“", " ",
	"”", " ",
	",", " ",
	"?", " ",
)

// splitstring replaces the characters handled by unsafeCharReplacer in every
// element of pageinfo.
//
// The slice is modified in place and the same slice is returned, so callers
// may use either the argument or the return value. Slices of any length,
// including nil and empty ones, are accepted.
func splitstring(pageinfo []string) []string {
	for i, field := range pageinfo {
		pageinfo[i] = unsafeCharReplacer.Replace(field)
	}
	return pageinfo
}

// dbmanager inserts one topic row into the golang_topic table.
//
// info must hold at least five fields, which are bound in order to the
// table's five columns (as filled by getpageinfo: title, topic URL, author,
// author URL, reply count). Shorter slices are rejected with a message
// instead of panicking. Errors are reported on standard output and the row is
// skipped; nothing is returned to the caller.
func dbmanager(db *sql.DB, info []string) {
	const fieldCount = 5
	if len(info) < fieldCount {
		fmt.Printf("insert data error: expected %d fields, got %d\n", fieldCount, len(info))
		return
	}

	// Placeholders let the driver handle quoting, so scraped text can never
	// change the statement. db.Exec also releases the statement it prepares,
	// unlike a Prepare call whose result is never closed.
	const query = "INSERT INTO golang_topic VALUES(?,?,?,?,?)"
	if _, err := db.Exec(query, info[0], info[1], info[2], info[3], info[4]); err != nil {
		fmt.Printf("insert data error: %v\n", err)
	}
}


步骤:

1、链接mysql

2、设置抓取规则,使用goquery解析html网页,获取想要的字段信息

3、保存mysql (字段为:topic_name(帖子名称), topic_url(帖子url), topic_userid (帖子发布作者), topic_userurl (作者url), topic_click (帖子回复数量))

      由于刚开始学习golang,对字符串相关函数的使用还不熟练。在向mysql插入数据时,一些转义字符或特殊字符会导致插入失败,后来通过strings.Replace()函数进行了笨拙的替换。后期会继续学习,并优化函数处理以及爬虫规模。

     代码中需要加载一些包,在此就不详细描述,如有需要,可留言回复,共同学习。

下面是抓取的mysql截图

Golang---goquery爬虫获取golang语言中文网页面信息并保存MySQL

    Go语言中文网页面总共有81页,3416条数据,抓取到mysql以每页1000条显示(Navicat for MySQL)

   由于刚接触golang,如有不足,请互相交流学习。

Golang---goquery爬虫获取golang语言中文网页面信息并保存MySQL