[Python爬虫]爬取新浪理财师股票问答
本文将与大家分享如何爬取新浪理财师股票问答。
一.背景介绍
1)爬取顺序:
在这里,根据已有的股票id列表,按照顺序,依次爬取每只股票下面的股票问答。
股票id格式:
lines = ['300592.XSHE', '300604.XSHE', '002852.XSHE', '603603.XSHG', '603239.XSHG',...]
2)页面结构:
以浦发银行(sh600000)为例:
通过页面的结构,可以确定爬虫爬取的顺序为:
股票A的所有问答->股票A的单页问答->股票A的某一个具体问答
3)翻页原理
以浦发银行为例,第一页的网址为:http://licaishi.sina.com.cn/web/searchNew?s=sh600000&t=2
第二页的网址为:http://licaishi.sina.com.cn/web/searchNew?s=sh600000&t=2&ind_id=1&all=0&trim_ext=0&page=2
第三页的网址为:http://licaishi.sina.com.cn/web/searchNew?s=sh600000&t=2&ind_id=1&all=0&trim_ext=0&page=3
因此,通过循环进行翻页即可。
二.模块设计
这里设计了三个模块:
1)URL管理器
作用:管理待抓取的URL集合和已抓取的URL集合。
2)网页下载器
作用:将URL对应的网页下载到本地。
3)网页解析器
作用:从下载好的网页或者字符串中,提取Url或者有价值的数据。
三.具体实现
1.调度程序
spider_main.py:
# coding:utf8
import url_manager, html_downloader, html_parser
import time
class SpiderMain(object):
    """Scheduler that drives the crawl.

    It owns a URL manager (the queue of stock landing pages), a downloader
    and a parser, and feeds every queued URL through download -> parse.
    """

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        # The previous version also built html_outputer.HtmlOutputer() here.
        # That module is never imported by this file, so construction failed
        # with NameError, and the attribute was never used afterwards.

    def craw(self, delay=1):
        """Crawl every stock landing page handed out by the URL manager.

        Args:
            delay: Seconds to sleep after each successfully parsed page.
                Defaults to 1, the pause that used to be hard-coded.

        A failure while handling one URL is printed and the loop moves on to
        the next URL; in that case the pause and the counter increment are
        skipped, exactly as before.
        """
        count = 0
        self.urls.get_init_urls()
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('crow %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                # The text before the first '&' ends with the 8-character
                # stock id, e.g. '...searchNew?s=sh600000' -> 'sh600000'.
                stock_id = new_url.split('&')[0][-8:]
                self.parser.parse(stock_id, new_url, html_cont)
                time.sleep(delay)
                print(count)
                count = count + 1
            except Exception as e:
                # Best effort: report the problem and continue with the next URL.
                print(str(e))
if __name__ == "__main__":
    # Script entry point: build the scheduler and start crawling immediately.
    SpiderMain().craw()
步骤:
1)添加所有股票的主页网址
2)每循环一次,取出一只股票的主页网址
3)对当前股票的主页网址进行解析,获取我们需要的内容
4)间隔1秒(time.sleep(1)),再进行下一次循环
2.Url管理器
url_manager.py:
# coding:utf8
import os
class UrlManager(object):
    """Tracks which stock landing-page URLs are pending and which were served.

    URLs live in sets, so duplicates are dropped; as a consequence
    get_new_url() hands URLs out in arbitrary order, not insertion order.
    """

    # Stock ids shown in the accompanying article. The article elides the
    # rest of its list, so extend this tuple or pass ids to get_init_urls().
    DEFAULT_STOCK_IDS = (
        '300592.XSHE', '300604.XSHE', '002852.XSHE',
        '603603.XSHG', '603239.XSHG', '603331.XSHG',
    )

    def __init__(self):
        self.new_urls = set()    # URLs waiting to be crawled
        self.old_urls = set()    # URLs already handed out by get_new_url()
        self.crawled_urls = []   # never filled in this class; only consulted

    def get_init_urls(self, lines=None):
        """Queue the search page of every stock id in *lines*.

        Args:
            lines: Iterable of ids such as '603603.XSHG' or '300592.XSHE'.
                Defaults to DEFAULT_STOCK_IDS. (The earlier code referenced
                an undefined name ``lines`` and raised NameError.)

        An id containing 'XSHG' becomes 'sh' + code, one containing 'XSHE'
        becomes 'sz' + code; any other id is used unchanged.
        """
        if lines is None:
            lines = self.DEFAULT_STOCK_IDS
        for raw_id in lines:
            # Strip before prefixing so stray whitespace cannot end up in
            # the middle of the generated URL.
            stock_id = str(raw_id).strip()
            if 'XSHG' in stock_id:
                stock_id = 'sh' + stock_id.split('.')[0]
            elif 'XSHE' in stock_id:
                stock_id = 'sz' + stock_id.split('.')[0]
            self.add_new_url('http://licaishi.sina.com.cn/web/searchNew?s=' + stock_id + '&t=2')
            print(stock_id)

    def add_new_url(self, url):
        """Queue *url* unless it is None or already known."""
        if url is None:
            return
        if url in self.new_urls or url in self.old_urls or url in self.crawled_urls:
            return
        self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in *urls*; None or an empty collection is a no-op."""
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return True while at least one URL is still pending."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Remove one pending URL, record it as served and return it.

        Raises:
            KeyError: If no URL is pending.
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
1)由于要爬取的股票id是已知的,因此,在这里,用lines数组,将要爬取的股票id记录下来:
lines = ['300592.XSHE', '300604.XSHE', '002852.XSHE', '603603.XSHG', '603239.XSHG', '603331.XSHG', ...]
2)为了避免重复,选择set()集合进行记录。
3.网页下载器
html_downloader.py:
# coding:utf8
import urllib2
import socket
import sys
# Reload sys and switch the interpreter-wide default string encoding to
# UTF-8 (Python 2 only). Presumably needed so the unicode()/encode() round
# trip in download() copes with Chinese text -- TODO confirm it is still
# required here.
reload(sys)
sys.setdefaultencoding('utf-8')
class HtmlDownloader(object):
    """Downloads a page and returns its body re-encoded as UTF-8."""

    def download(self, url):
        """Fetch *url* and return the response body.

        Args:
            url: Address to fetch; None yields None.

        Returns:
            The body decoded as UTF-8 (undecodable bytes dropped) and encoded
            back to UTF-8, or None when *url* is None or the status code is
            not 200.

        Errors raised by urllib2.urlopen are not caught here.
        """
        # Process-wide socket timeout, in seconds; applies to the request below.
        socket.setdefaulttimeout(200)
        if url is None:
            return None
        response = urllib2.urlopen(url)
        try:
            if response.getcode() != 200:
                return None
            # Decode leniently, then re-encode, so the caller always gets
            # valid UTF-8 even if the page contains broken byte sequences.
            return unicode(response.read(), 'UTF-8', 'ignore').encode('UTF-8')
        finally:
            # The response used to be left open; release the connection.
            response.close()
解决中文乱码问题:先将下载内容按UTF-8解码(忽略无法解码的字节),然后再重新编码为UTF-8。
unicode(response.read(), 'UTF-8', 'ignore').encode('UTF-8')
4.网页解析器
html_parser.py:
# coding:utf8
import html_downloader
from bs4 import BeautifulSoup
import urlparse
import pandas as pd
import sys
import time
import datetime
# Reload sys and switch the interpreter-wide default string encoding to
# UTF-8 (Python 2 only). Presumably what lets the str(...).encode('utf-8')
# chains in HtmlParser work on non-ASCII text -- TODO confirm.
reload(sys)
sys.setdefaultencoding('utf-8')
class HtmlParser(object):
    """Parses stock Q&A pages and writes every question/answer to a CSV file.

    Call chain:
        parse                      entry point for one stock
        _get_new_data              landing page plus its pagination
        _get_single_page_data      one page of search results
        _get_single_article_data   one question/answer page
    """

    def __init__(self):
        self.downloader = html_downloader.HtmlDownloader()
        # Base used to resolve the links found on a result page.
        self.ask_url = "http://licaishi.sina.com.cn/ask/"
        # Stock id of the page being parsed; set by parse().
        self.gupiaoid = ''
        # Rebuilt for every article: ask id, stock id and the two timestamps.
        self.columns_list = []

    def _get_single_article_data(self, page_url, soup):
        """Extract the fields of one Q&A page and save them as a CSV file.

        Args:
            page_url: URL of the Q&A page; its last path segment is used in
                the output file name.
            soup: BeautifulSoup tree of that page.

        Returns:
            Always True.

        Every field is extracted independently: when one cannot be read, the
        error is printed and the string "none" is stored instead. The result
        is written, transposed (one field per row), to
        ``newplans/<stock id>_<last URL segment>.csv``.
        """
        import os  # local import: only needed to create the output directory

        res_data = []
        res_column = []
        self.columns_list = []
        # NOTE(review): the published listing lost its indentation; this guard
        # is assumed to cover the whole extraction, mirroring the URL guards
        # in the sibling methods.
        if "ask" not in page_url:
            return True
        self.columns_list.append(page_url.split('/')[-1])
        self.columns_list.append(self.gupiaoid)

        # question
        try:
            question = str(soup.find('h1', class_="hd").get_text(strip=True))
            res_data.append(question.encode('utf-8'))
        except Exception as e:
            print("question" + str(e))
            res_data.append("none")
        res_column.append("question")

        # question_time
        try:
            question_time = self.formalTime(str(soup.find('span', class_="it s d_time").get_text(strip=True)))
            # Validation only: a value whose year or month is not numeric is
            # rejected and recorded as "none".
            date_parts = question_time.split(" ")[0].split('-')
            int(date_parts[0])
            int(date_parts[1])
            res_data.append(question_time.encode('utf-8'))
            self.columns_list.append(question_time.encode('utf-8'))
        except Exception as e:
            print("question_time" + str(e))
            res_data.append("none")
            self.columns_list.append("")
        res_column.append("question_time")

        # answer (the optional "secret_content" part is appended when present)
        try:
            answer = str(soup.find('div', class_="content").get_text(strip=True))
            secret_answer_node = soup.find('div', class_="secret_content")
            if secret_answer_node is not None:
                answer = answer + str(secret_answer_node.get_text(strip=True))
            res_data.append(answer.encode('utf-8'))
        except Exception as e:
            print("answer" + str(e))
            res_data.append("none")
        res_column.append("answer")

        # answer_time
        try:
            answer_time = self.formalTime(str(soup.find('div', class_="c").find('span', class_="time").get_text(strip=True)))
            # Fixed: this column used to receive question_time by mistake.
            res_data.append(answer_time.encode('utf-8'))
            self.columns_list.append(answer_time.encode('utf-8'))
        except Exception as e:
            print("answer_time" + str(e))
            res_data.append("none")
            self.columns_list.append("")
        res_column.append("answer_time")

        # extras: numbered follow-up questions ("aask"), then numbered
        # follow-up answers ("aans"), one per line
        try:
            extras = ''
            for css_class in ("aask", "aans"):
                for count, node in enumerate(soup.find_all('div', class_=css_class), 1):
                    extras = extras + str(count) + ':' + str(node.get_text(strip=True)) + '\n'
            res_data.append(extras.encode('utf-8'))
        except Exception as e:
            print("extras" + str(e))
            res_data.append("none")
        res_column.append("extras")

        # rate: number of <i> elements inside <span class="xx">, '' if absent
        try:
            rate = ''
            rate_nodes = soup.find('span', class_="xx")
            if rate_nodes is not None:
                rate = len(rate_nodes.find_all('i'))
            res_data.append(str(rate).encode('utf-8'))
        except Exception as e:
            print("rate" + str(e))
            res_data.append("none")
        res_column.append("rate")

        # planner_name; expected markup:
        #   <div class="c">
        #     <a href="http://licaishi.sina.com.cn/planner/3695694092/1" node-type="avatar">NAME</a>
        #     <span class="time">07-06 13:54</span>
        #   </div>
        try:
            planner_name = str(soup.find('div', class_="c").find('a').get_text(strip=True))
            res_data.append(planner_name.encode('utf-8'))
        except Exception as e:
            print("planner_name" + str(e))
            res_data.append("none")
        res_column.append("planner_name")

        # planner_company: look on the planner's profile page first and fall
        # back to the company name shown on the Q&A page. The profile lookup
        # is guarded like every other field so a missing link or a failed
        # download no longer aborts the remaining articles of the page.
        planner_company = "none"
        try:
            person_link = "http://licaishi.sina.com.cn/planner/1430448040/1"
            person_url = str(soup.find('div', class_="c").find('a')['href'])
            person_url = urlparse.urljoin(person_link, person_url)
            person_cont = self.downloader.download(person_url)
            person_soup = BeautifulSoup(person_cont, 'html.parser', from_encoding='utf-8')
            planner_company = self._get_personal_page(person_soup)
        except Exception as e:
            print("planner_company" + str(e))
        if planner_company == "none":
            try:
                planner_company = str(soup.find('p', class_="company_name").get_text(strip=True))
            except Exception as e:
                planner_company = "none"
                print("planner_company - none" + str(e))
        res_data.append(planner_company.encode('utf-8'))
        res_column.append("planner_company")

        # planner_url: last 12 characters of the profile link with '/'
        # replaced, e.g. ".../planner/1882067524/1" -> "1882067524_1"
        try:
            planner_url = str(soup.find('div', class_="c").find('a')['href'])[-12:].replace('/', '_')
            res_data.append(planner_url.encode('utf-8'))
        except Exception as e:
            print("planner_url" + str(e))
            res_data.append("none")
        res_column.append("planner_url")

        # save
        df = pd.DataFrame(columns=res_column)
        df.loc[0] = res_data
        try:
            # Without the directory every write would fail and only be printed.
            if not os.path.isdir("newplans"):
                os.makedirs("newplans")
            df.T.to_csv("newplans/" + self.gupiaoid + "_" + str(page_url.strip().split('/')[-1]) + '.csv')
        except Exception as e:
            print(str(e))
        return True

    def _get_personal_page(self, soup):
        """Return the text of the first <dd> in <dl class="w_lcs_info">.

        Returns "none" (after printing the error) when the node is missing.
        """
        try:
            company_name = soup.find('dl', class_="w_lcs_info").find('dd').get_text(strip=True)
            return company_name
        except Exception as e:
            print("_get_personal_page" + str(e))
            return "none"

    def _get_single_page_data(self, page_url, soup):
        """Download and process every Q&A linked from one result page.

        Args:
            page_url: URL of the result page; pages whose URL lacks
                "searchNew?s" are skipped.
            soup: BeautifulSoup tree of the result page.

        Returns:
            False if an article handler returned False, otherwise True.
            An exception while handling an article is printed and ends the
            processing of this page.
        """
        print("page_url:" + page_url)
        if "searchNew?s" in page_url:
            try:
                for article_node in soup.find_all('div', class_="wt"):
                    link = article_node.find('a')['href']
                    new_full_url = urlparse.urljoin(self.ask_url, link)
                    print(new_full_url)
                    new_html_cont = self.downloader.download(new_full_url)
                    new_soup = BeautifulSoup(new_html_cont, 'html.parser', from_encoding='utf-8')
                    flag = self._get_single_article_data(new_full_url, new_soup)
                    if flag is False:
                        return False
            except Exception as e:
                print("_get_single_page_data" + str(e))
        return True

    def _get_new_data(self, page_url, soup):
        """Process a stock's landing page and then pages 2..N of its results.

        The page count is read from the second-to-last link inside the
        element with id "pagesNode"; further pages are requested by appending
        "&page=<n>" to *page_url*. Errors are printed, not raised.
        """
        if "searchNew?s" not in page_url:
            return
        self._get_single_page_data(page_url, soup)
        try:
            div_node = soup.find('div', id='pagesNode')
            if div_node is None:
                return
            page_node = div_node.find_all('a')
            if not page_node:
                return
            totalPages = int(page_node[len(page_node) - 2].get_text().strip())
            for page in range(2, totalPages + 1):
                new_page_url = page_url + "&page=" + str(page)
                new_page_html_cont = self.downloader.download(new_page_url)
                new_soup = BeautifulSoup(new_page_html_cont, 'html.parser', from_encoding='utf-8')
                flag = self._get_single_page_data(new_page_url, new_soup)
                if flag is False:
                    break
        except Exception as e:
            print("_get_new_data" + str(e))

    def unescape(self, s):
        """Replace the HTML entities &lt;, &gt; and &amp; with <, > and &.

        The published listing had these entities already decoded, which
        turned all three replacements into no-ops.
        """
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        # '&amp;' goes last so that '&amp;lt;' becomes '&lt;' and not '<'.
        s = s.replace("&amp;", "&")
        return s

    def formalTime(self, recentReplyTime, year=None):
        """Normalize a timestamp as shown on the site to 'YYYY-MM-DD HH:MM:SS'.

        Args:
            recentReplyTime: One of
                'N分钟前'      N minutes before now
                'MM-DD HH:MM'  11 characters, year missing
                'HH:MM'        5 characters, taken as today
                'YYYY-MM-DD'   10 characters, midnight assumed
                ''             mapped to '0000-00-00 00:00:00'
                Anything else is returned unchanged.
            year: Year used for the 'MM-DD HH:MM' form. Defaults to the
                current year (previously hard-coded to 2017).
                NOTE(review): assumes the site omits the year only for
                current-year posts -- confirm.

        Returns:
            The normalized timestamp string.
        """
        if "分钟前" in recentReplyTime:
            delay = int("".join(ch for ch in recentReplyTime if ch.isdigit()))
            moment = datetime.datetime.now() - datetime.timedelta(minutes=delay)
            # Same separators as every other branch (was "%H-%M-%S").
            return moment.strftime("%Y-%m-%d %H:%M:%S")
        length = len(recentReplyTime)
        if length == 11:
            # e.g. 01-05 20:29
            if year is None:
                year = datetime.datetime.now().year
            return str(year) + '-' + recentReplyTime + ':00'
        if length == 5:
            # e.g. 11:53
            return time.strftime("%Y-%m-%d", time.localtime(time.time())) + ' ' + recentReplyTime + ':00'
        if length == 10:
            # e.g. 2014-10-16
            return recentReplyTime + ' 00:00:00'
        if length == 0:
            return '0000-00-00 00:00:00'
        return recentReplyTime

    def parse(self, id, page_url, html_cont):
        """Parse one stock's landing page and everything reachable from it.

        Args:
            id: Stock id such as 'sh600000'; used in output file names.
            page_url: URL the page was downloaded from.
            html_cont: Downloaded HTML.

        Does nothing when *page_url* or *html_cont* is None.
        """
        if page_url is None or html_cont is None:
            return
        self.gupiaoid = id
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        self._get_new_data(page_url, soup)
这一部分是整个爬虫部分的精华:
函数之间调用的层级关系如下:
-parse 解析器入口
-_get_new_data 解析某股票主页
-_get_single_page_data 解析该股票某一页的数据
-_get_single_article_data 解析该股票某一页的某个具体问答
将每一个问答都保存为一个单独的.csv文件。
保存格式如下:
四.程序运行
将spider_main.py作为程序入口
可以看到程序跑起来的结果:
完整代码可参考:新浪理财师问答爬虫