An infoq.com Crawler Based on pyspider
Goal
To keep up with what is happening in the worldwide IT industry, InfoQ is one of the best entry points. The plan: crawl the English edition of InfoQ on a schedule and display the articles on my own site.
First, the end result:
[Screenshot: the crawled articles rendered on the site]
Overall Architecture
pyspider does the crawling of InfoQ content, with PhantomJS enabled for JavaScript rendering. Crawled articles are inserted into MySQL, and a PHP front end reads the database to display them.
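The read side is simple: the front end just selects the most recent rows from the news_latest table (defined below). The real front end is PHP, but here is a minimal sketch of the same query in Python/MySQLdb, for consistency with the crawler (the LIMIT of 20 is an arbitrary choice for illustration):

import MySQLdb

# Minimal sketch of the front end's read path (the real front end is PHP).
# Credentials match the crawler script below; LIMIT 20 is arbitrary.
conn = MySQLdb.connect(host='127.0.0.1', user='crawl', passwd='mypass',
                       db='spiderdb', charset='utf8')
cursor = conn.cursor()
cursor.execute(
    "SELECT title, url, publish_date, author "
    "FROM news_latest ORDER BY gmt_create DESC LIMIT 20")
for title, url, publish_date, author in cursor.fetchall():
    print("%s (%s) %s" % (title, publish_date, url))
conn.close()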
Scripts
First, the MySQL table that stores the crawled articles:
CREATE TABLE `news_latest` (
  `id` int(22) NOT NULL AUTO_INCREMENT,
  `url` varchar(1024) DEFAULT NULL,
  `title` varchar(1024) DEFAULT NULL,
  `brief` varchar(5192) DEFAULT NULL,
  `publish_date` varchar(64) DEFAULT NULL,
  `content` text,
  `author` varchar(128) DEFAULT NULL,
  `source` varchar(32) DEFAULT '',
  `log_id` varchar(32) DEFAULT '' COMMENT 'baidu summary service',
  `tags` varchar(128) DEFAULT NULL,
  `gmt_create` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2015 DEFAULT CHARSET=utf8mb4;
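Before reading the pyspider script, it helps to see what its SQL.insert() helper will actually send to MySQL. A standalone reproduction of its query building (the sample values are made up; nothing touches the database):

# Reproduces the string handling in SQL.insert() below; the values
# themselves are passed separately to cursor.execute(), so MySQLdb
# handles all quoting.
values = {"url": "https://www.infoq.com/news/example/",
          "title": "Example title",
          "source": "InfoQ.com"}
params = ",".join(["%s"] * len(values))
columns = ",".join(values)
print("insert into news_latest(%s) values (%s)" % (columns, params))
# -> insert into news_latest(url,title,source) values (%s,%s,%s)

The pyspider script itself: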
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2020-04-27 19:18:06
# Project: InfoQ
from pyspider.libs.base_handler import *
import MySQLdb
import re
class SQL():
    # database helper: opens a connection on construction
    def __init__(self):
        # connection settings
        hosts = '127.0.0.1'
        username = 'crawl'
        password = 'mypass'
        database = 'spiderdb'
        charsets = 'utf8'
        self.connection = False
        try:
            # NOTE: the table is utf8mb4; connecting with charset='utf8'
            # cannot store 4-byte characters such as emoji
            self.conn = MySQLdb.connect(host=hosts, user=username, passwd=password,
                                        db=database, charset=charsets)
            self.cursor = self.conn.cursor()
            self.cursor.execute("set names " + charsets)
            self.connection = True
        except Exception as e:
            print("Cannot connect to MySQL!\n", e)
    def escape(self, string):
        # NOTE: despite the name, this performs no real escaping; names come
        # from trusted code and row values are parameterized in insert()
        return '%s' % string

    # insert one row: column names come from the keyword arguments,
    # values are passed to the driver as query parameters
    def insert(self, tablename, **values):
        if self.connection:
            tablename = self.escape(tablename)
            list_value = list(values.values())
            params = ",".join(['%s'] * len(list_value))
            columns = ",".join(list(values))
            sql_query = "insert into %s(%s) values (%s)" % (tablename, columns, params)
            print(list_value)
            print(sql_query)
            try:
                self.cursor.execute(sql_query, list_value)
                self.conn.commit()
                return True
            except Exception as e:
                print("An error occurred:", e)
                print(list_value)
                print(sql_query)
                return False

class Handler(BaseHandler):
    crawl_config = {
    }

    def __init__(self):
        self.siteUrl = 'https://www.infoq.com/'
        self.contentSelector = '.article__content'
        self.authorSelector = '.author__link:first'
        self.dateSelector = '.date:first'
        self.source = 'InfoQ.com'
        # URLs that carry article content (news, podcasts, articles, minibooks)
        self.contentUrlRegx = r'http(.*)infoq.com/(news|podcasts|articles|minibooks)/(.+)'
        # any infoq.com URL: followed to discover more links
        self.followUrlRegx = r'http(.*)infoq.com/(.*)'
        self.tagsSelector = '.related__topic'
        self.briefLength = 800

    @every(minutes=3 * 60)
    def on_start(self):
        self.crawl(self.siteUrl, fetch_type='js', callback=self.index_page,
                   age=3 * 60 * 60, auto_recrawl=True)

    @config(age=3 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            # NOTE: content URLs also match followUrlRegx, so nearly every
            # infoq.com link takes this branch and the elif below rarely
            # fires; article pages are still recorded via detail_page
            if re.match(self.followUrlRegx, each.attr.href) is not None:
                self.crawl(each.attr.href,
                           cookies=response.cookies,
                           fetch_type='js',
                           # scroll to the bottom after 2s so lazily loaded
                           # links are rendered before the page is parsed
                           js_script="""
                           setTimeout(function() {
                               window.scrollTo(0, document.body.scrollHeight);
                           }, 2000);
                           """,
                           callback=self.detail_page, itag=each.attr.href)
            elif re.match(self.contentUrlRegx, each.attr.href) is not None:
                url = self.strip_param(each.attr.href)
                self.crawl(url, cookies=response.cookies, callback=self.record, itag=url)

    @config(priority=2)
    def detail_page(self, response):
        # queue any content links found on this page, then record the page itself
        for each in response.doc('a[href^="http"]').items():
            url = self.strip_param(each.attr.href)
            if re.match(self.contentUrlRegx, url) is not None:
                self.crawl(url, cookies=response.cookies, callback=self.record, itag=url)
            else:
                print("pass %s" % each.attr.href)
        return self.record(response)

    def on_result(self, result):
        # only persist pages that parsed an author; index and listing pages
        # come through with empty fields and are skipped
        if result and result['author']:
            sql = SQL()
            sql.insert('news_latest', **result)

    def strip_param(self, u):
        # drop the query string so the same article is not crawled twice
        q = u.find('?')
        if q > 0:
            return u[0:q]
        else:
            return u

    def record(self, response):
        match = re.match(self.contentUrlRegx, response.url)
        if match is None:
            print("url does not match the content regex: %s" % response.url)
            return None
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "brief": response.doc(self.contentSelector).text()[0:self.briefLength],
            "content": response.doc(self.contentSelector).text(),
            "publish_date": response.doc(self.dateSelector).text(),
            "author": response.doc(self.authorSelector).text(),
            "source": self.source,
            "tags": ",".join([item.text() for item in response.doc(self.tagsSelector).items()])
        }
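To see how the two regexes divide InfoQ's link space, here is a quick standalone check (the sample URLs are made up for illustration):

import re

contentUrlRegx = r'http(.*)infoq.com/(news|podcasts|articles|minibooks)/(.+)'
followUrlRegx = r'http(.*)infoq.com/(.*)'

samples = [
    "https://www.infoq.com/news/2020/04/example-story/",  # content page
    "https://www.infoq.com/articles/example-article/",    # content page
    "https://www.infoq.com/development/",                 # hub page: follow only
    "https://example.com/news/other/",                    # off-site: ignored
]
for url in samples:
    print("%-52s content=%-5s follow=%s" % (
        url,
        re.match(contentUrlRegx, url) is not None,
        re.match(followUrlRegx, url) is not None))

Note that every content URL also matches the follow pattern; since index_page checks followUrlRegx first, article pages normally reach the database through detail_page's record() call rather than through the elif branch.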