
An infoq.com Crawler Based on pyspider


Goal

I want to keep better track of what is happening in the worldwide IT space, and InfoQ is one of the best entry points for that. The plan is to crawl the English edition of InfoQ on a schedule and display the articles on my own site.

A screenshot of the crawler's output first:
[Screenshot: crawled InfoQ articles rendered on my site]

Overall Architecture

pyspider crawls the InfoQ content, with phantomjs enabled for JavaScript rendering. Crawled articles are inserted into MySQL, and a PHP front end reads the database to display them.

Scripts

The MySQL table that stores the crawled articles:

CREATE TABLE `news_latest` (
  `id` int(22) NOT NULL AUTO_INCREMENT,
  `url` varchar(1024) DEFAULT NULL,
  `title` varchar(1024) DEFAULT NULL,
  `brief` varchar(5192) DEFAULT NULL,
  `publish_date` varchar(64) DEFAULT NULL,
  `content` text,
  `author` varchar(128) DEFAULT NULL,
  `source` varchar(32) DEFAULT '',
  `log_id` varchar(32) DEFAULT '' COMMENT 'baidu summary service',
  `tags` varchar(128) DEFAULT NULL,
  `gmt_create` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2015 DEFAULT CHARSET=utf8mb4
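
For reference, the front end only needs a "latest first" query against this table. The original front end is PHP; the sketch below does the equivalent read in Python (hypothetical sample code, reusing the connection settings from the crawler script), handy for checking what the crawler has stored:

import MySQLdb

# Connect with the same account the crawler uses (assumed values).
conn = MySQLdb.connect(host='127.0.0.1', user='crawl', passwd='mypass',
                       db='spiderdb', charset='utf8mb4')
cur = conn.cursor()

# Newest articles first, as the front page would list them.
cur.execute("SELECT title, url, publish_date, author "
            "FROM news_latest ORDER BY gmt_create DESC LIMIT %s", (10,))
for title, url, publish_date, author in cur.fetchall():
    print(title, url, publish_date, author)

conn.close()
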
The pyspider handler script:

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2020-04-27 19:18:06
# Project: InfoQ

from pyspider.libs.base_handler import *


import MySQLdb
import re

class SQL():
    # Thin MySQL helper: opens a connection and inserts crawled rows.
    def __init__(self):
        # Connection settings
        hosts    = '127.0.0.1'
        username = 'crawl'
        password = 'mypass'
        database = 'spiderdb'
        charsets = 'utf8mb4'  # match the utf8mb4 charset of the table above

        self.connection = False
        try:
            self.conn = MySQLdb.connect(host = hosts,user = username,passwd = password,db = database,charset = charsets)
            self.cursor = self.conn.cursor()
            self.cursor.execute("set names "+charsets)
            self.connection = True
        except Exception as e:
            print("Cannot connect to MySQL!\n", e)

    def escape(self, string):
        # Note: this does not actually escape anything; it is only used for the
        # table name below. Column values go through a parameterized query instead.
        return '%s' % string

    # Insert one crawled article into the given table.
    def insert(self,tablename,**values):
        if self.connection:
            tablename = self.escape(tablename)
            list_value = list(values.values())
            params = ",".join(['%s']*len(list_value))
            columns = ",".join(list(values))
            sql_query = "insert into %s(%s) values (%s)" % (tablename, columns, params)

            print(list_value)
            print(sql_query)

            try:
                self.cursor.execute(sql_query,list_value)
                self.conn.commit()
                return True
            except Exception as e:
                print("An error occurred:", e)
                print(list_value)
                print(sql_query)
                return False
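
# Quick standalone check of the SQL helper (hypothetical sample values).
# insert() builds a parameterized statement such as
#   insert into news_latest(url,title,author,source) values (%s,%s,%s,%s)
# and leaves escaping of the values to MySQLdb:
#
#   sql = SQL()
#   sql.insert('news_latest',
#              url='https://www.infoq.com/news/2020/04/example/',
#              title='Example title',
#              author='Example Author',
#              source='InfoQ.com')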

            
class Handler(BaseHandler):
    crawl_config = {

    }
    
    def __init__(self):      
        self.siteUrl = 'https://www.infoq.com/'
        self.contentSelector = '.article__content'
        self.authorSelector = '.author__link:first'
        self.dateSelector = '.date:first'
        self.source = 'InfoQ.com'
        self.contentUrlRegx = r'http(.*)infoq.com/(news|podcasts|articles|minibooks)/(.+)'
        self.followUrlRegx = r'http(.*)infoq.com/(.*)'
        self.tagsSelector = '.related__topic'
        self.briefLength = 800
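        # For illustration: contentUrlRegx matches article-style pages such as
        #   https://www.infoq.com/news/2020/04/some-story/
        #   https://www.infoq.com/articles/some-title/
        # while followUrlRegx matches any infoq.com URL, so index and landing
        # pages are also followed and scanned for further article links.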
        
    @every(minutes=3 * 60)
    def on_start(self):
        self.crawl(self.siteUrl, fetch_type = 'js', callback=self.index_page, age=3*60*60, auto_recrawl=True)

    @config(age=3 * 60 * 60)
    def index_page(self, response):
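        # Scan the home page for absolute links. Links matching followUrlRegx are
        # re-crawled with phantomjs into detail_page; the injected js_script scrolls
        # to the bottom after 2s so lazily loaded content renders before links are
        # extracted. Remaining article-style links are recorded directly.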
        for each in response.doc('a[href^="http"]').items():
            if re.match(self.followUrlRegx, each.attr.href) is not None:
                self.crawl(each.attr.href,
                           cookies = response.cookies,                            
                           fetch_type = 'js',
                           js_script="""
                               setTimeout(function() {
                                   window.scrollTo(0,document.body.scrollHeight);
                               }, 2000);
                               """,
                           callback=self.detail_page,  itag=each.attr.href)
            elif (re.match(self.contentUrlRegx, each.attr.href) is not None):
                url = self.strip_param(each.attr.href)
                self.crawl(url, cookies = response.cookies, callback=self.record, itag=url)

    @config(priority=2)
    def detail_page(self, response):
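        # Queue any article links found on this page, then record the page itself
        # (record() returns None for URLs that are not article pages).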
        for each in response.doc('a[href^="http"]').items():
            url = self.strip_param(each.attr.href)
            if (re.match(self.contentUrlRegx, url) is not None):
                self.crawl(url, cookies = response.cookies, callback=self.record, itag=url)
            else:
                print ("pass %s") % (each.attr.href)
        return self.record(response)
    
    def on_result(self,result):
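        # pyspider passes every callback's return value to on_result; only pages
        # that yielded an author are written to MySQL.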
        if result and result['author']:
            sql = SQL()
            sql.insert('news_latest',**result) 


    def strip_param(self, u):
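        # Drop the query string so the same article is not crawled once per
        # tracking parameter.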
        q = u.find('?')
        if (q > 0):
            return u[0:q]
        else:
            return u
              
 
    def record(self, response):
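        # Parse an article page into a dict whose keys match the news_latest columns.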
        match = re.match( self.contentUrlRegx, response.url)
        if match is None:
            print ("url do not match record reg: %s" % (response.url))
            return None
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "brief": response.doc(self.contentSelector).text()[0:self.briefLength],
            "content": response.doc(self.contentSelector).text(),
            "publish_date": response.doc(self.dateSelector).text(),
            "author": response.doc(self.authorSelector).text(),
            "source" : self.source,
            "tags" : ",".join([item.text() for item in response.doc(self.tagsSelector).items()])
        }          
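
The URL handling can be sanity-checked outside pyspider. A minimal sketch (the sample URLs below are made up for illustration):

import re

contentUrlRegx = r'http(.*)infoq.com/(news|podcasts|articles|minibooks)/(.+)'

def strip_param(u):
    # Same logic as Handler.strip_param above.
    q = u.find('?')
    return u[0:q] if q > 0 else u

samples = [
    'https://www.infoq.com/news/2020/04/example-story/?itm_source=home',
    'https://www.infoq.com/articles/example-title/',
    'https://www.infoq.com/about/',
]
for u in samples:
    u = strip_param(u)
    print(u, bool(re.match(contentUrlRegx, u)))
# Expected output: True for the first two URLs, False for the last one.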
        