[Python Crawler] Scraping Stock Q&A from Sina Licaishi
This article shows how to crawl the stock Q&A section of Sina Licaishi (licaishi.sina.com.cn).
I. Background
1) Crawl order
Given a pre-built list of stock ids, the crawler visits each stock in order and collects every Q&A item posted under it.
Stock id format:
lines = ['300592.XSHE', '300604.XSHE', '002852.XSHE', '603603.XSHG', '603239.XSHG',...]
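The ids use the exchange-suffix form: .XSHG marks a Shanghai-listed stock and .XSHE a Shenzhen-listed one, while Sina's search URLs expect an 'sh'/'sz' prefix instead. A stand-alone sketch of the conversion that the URL manager performs later:

for sid in ['300592.XSHE', '603603.XSHG']:
    if 'XSHG' in sid:
        sid = 'sh' + sid.split('.')[0]   # Shanghai-listed
    if 'XSHE' in sid:
        sid = 'sz' + sid.split('.')[0]   # Shenzhen-listed
    print sid   # prints: sz300592, then sh603603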
2) Page structure
Take Shanghai Pudong Development Bank (sh600000) as an example.
From the page structure, the crawl order is:
all Q&A for stock A -> one page of Q&A for stock A -> one specific Q&A item for stock A
3) Pagination
Again using sh600000 as an example, the first page is at: http://licaishi.sina.com.cn/web/searchNew?s=sh600000&t=2
The second page: http://licaishi.sina.com.cn/web/searchNew?s=sh600000&t=2&ind_id=1&all=0&trim_ext=0&page=2
The third page: http://licaishi.sina.com.cn/web/searchNew?s=sh600000&t=2&ind_id=1&all=0&trim_ext=0&page=3
So walking through the pages only requires looping over the page parameter, as sketched below.
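A minimal sketch of that loop; the crawler itself simply appends &page=N to the base search URL (total_pages is a placeholder here, the real crawler reads it from the page's '#pagesNode' pagination bar, see html_parser.py below):

base_url = 'http://licaishi.sina.com.cn/web/searchNew?s=sh600000&t=2'
total_pages = 3   # placeholder; read from the pagination bar in the real crawler
for page in range(2, total_pages + 1):
    page_url = base_url + '&page=' + str(page)
    print page_url   # ...&page=2, ...&page=3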
II. Module Design
The crawler is split into three modules:
1) URL manager
Role: keeps track of the set of URLs still to be crawled and the set of URLs that have already been crawled.
2) HTML downloader
Role: downloads the page behind a given URL.
3) HTML parser
Role: extracts new URLs and the data we care about from a downloaded page or HTML string.
III. Implementation
1. Scheduler
spider_main.py:
# coding:utf8
import url_manager, html_downloader, html_parser
import time

class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()

    def craw(self):
        count = 0
        self.urls.get_init_urls()
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                #new_url = 'http://licaishi.sina.com.cn/web/searchNew?s=sh600000&t=2'
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                # new_url.split('&')[0][-8:] is the stock id, e.g. 'sh600000'
                self.parser.parse(new_url.split('&')[0][-8:], new_url, html_cont)
                time.sleep(1)   # pause one second between stocks
                print count
                count = count + 1
            except Exception, e:
                print str(e)

if __name__ == "__main__":
    obj_spider = SpiderMain()
    obj_spider.craw()
Steps:
1) Add the search-page URL of every stock to the URL manager.
2) On each loop iteration, take one stock's search-page URL out of the manager.
3) Parse that page and extract the data we need.
4) Sleep for one second (time.sleep(1)), then move on to the next URL.
2. URL Manager
url_manager.py:
# coding:utf8

# The stock ids to crawl (suffix .XSHG = Shanghai-listed, .XSHE = Shenzhen-listed):
lines = ['300592.XSHE', '300604.XSHE', '002852.XSHE', '603603.XSHG', '603239.XSHG', '603331.XSHG', '603787.XSHG', '002849.XSHE', '002877.XSHE', '002831.XSHE', '300672.XSHE', '603165.XSHG', '002774.XSHE', '603335.XSHG', '002845.XSHE', '603536.XSHG', '603389.XSHG', '603826.XSHG', '300617.XSHE', '300629.XSHE', '002832.XSHE', '300633.XSHE', '603628.XSHG', '603330.XSHG', '603042.XSHG', '603679.XSHG', '603980.XSHG', '300669.XSHE', '601200.XSHG', '300632.XSHE', '300656.XSHE', '300622.XSHE', '300685.XSHE', '300643.XSHE', '300615.XSHE', '603728.XSHG', '300675.XSHE', '300597.XSHE', '300680.XSHE', '300671.XSHE', '603938.XSHG', '300575.XSHE', '603656.XSHG', '603990.XSHG', '300661.XSHE', '300666.XSHE', '603855.XSHG', '603638.XSHG', '300576.XSHE', '300581.XSHE', '600996.XSHG', '601881.XSHG', '002847.XSHE', '603585.XSHG', '603496.XSHG', '300588.XSHE', '300625.XSHE', '300651.XSHE', '603218.XSHG', '300620.XSHE', '603197.XSHG', '002855.XSHE', '603758.XSHG', '603388.XSHG', '300650.XSHE', '603825.XSHG', '603233.XSHG', '300613.XSHE', '603360.XSHG', '603178.XSHG', '300630.XSHE', '002878.XSHE', '603488.XSHG', '603096.XSHG', '300649.XSHE', '603602.XSHG', '002886.XSHE', '603690.XSHG', '300627.XSHE', '600939.XSHG', '603228.XSHG', '300641.XSHE', '603833.XSHG', '300596.XSHE', '300687.XSHE', '603501.XSHG', '300514.XSHE', '603579.XSHG', '300593.XSHE', '601858.XSHG', '002880.XSHE', '603196.XSHG', '603286.XSHG', '002829.XSHE', '300635.XSHE', '300609.XSHE', '603538.XSHG', '300668.XSHE', '300667.XSHE', '603232.XSHG', '603058.XSHG', '603978.XSHG', '300606.XSHE', '002836.XSHE', '603039.XSHG', '300681.XSHE', '603926.XSHG', '603357.XSHG', '300579.XSHE', '002857.XSHE', '603823.XSHG', '603626.XSHG', '603380.XSHG', '300637.XSHE', '300631.XSHE', '603811.XSHG', '603298.XSHG', '603041.XSHG', '300602.XSHE', '603920.XSHG', '300682.XSHE', '603316.XSHG', '603385.XSHG', '300584.XSHE', '300590.XSHE', '603505.XSHG', '603985.XSHG', '603928.XSHG', '603637.XSHG', '002856.XSHE', '300595.XSHE', '300679.XSHE', '603326.XSHG', '300673.XSHE', '002859.XSHE', '603429.XSHG', '603896.XSHG', '300585.XSHE', '300688.XSHE', '002828.XSHE', '603881.XSHG', '300580.XSHE', '300645.XSHE', '603878.XSHG', '300655.XSHE', '002846.XSHE', '603933.XSHG', '603676.XSHG', '601228.XSHG', '300619.XSHE', '300639.XSHE', '002865.XSHE', '603444.XSHG', '603358.XSHG', '603133.XSHG', '603238.XSHG', '603880.XSHG', '603886.XSHG', '603269.XSHG', '603038.XSHG', '603757.XSHG', '300642.XSHE', '002872.XSHE', '603416.XSHG', '002885.XSHE', '002848.XSHE', '300603.XSHE', '603345.XSHG', '601366.XSHG', '603266.XSHG', '300554.XSHE', '002835.XSHE', '601619.XSHG', '002838.XSHE', '300612.XSHE', '603208.XSHG', '603801.XSHG', '002851.XSHE', '002867.XSHE', '603903.XSHG', '300663.XSHE', '300605.XSHE', '603767.XSHG', '603186.XSHG', '603595.XSHG', '300578.XSHE', '603078.XSHG', '603665.XSHG', '603179.XSHG', '300657.XSHE', '002842.XSHE', '300683.XSHE', '300591.XSHE', '002837.XSHE', '603877.XSHG', '300623.XSHE', '603860.XSHG', '603677.XSHG', '300616.XSHE', '603040.XSHG', '300640.XSHE', '002890.XSHE', '002863.XSHE', '603577.XSHG', '300636.XSHE', '603639.XSHG', '300618.XSHE', '603689.XSHG', '603050.XSHG', '603305.XSHG', '300670.XSHE', '300638.XSHE', '002889.XSHE', '300678.XSHE', '603630.XSHG', '603906.XSHG', '603730.XSHG', '603768.XSHG', '300583.XSHE', '603991.XSHG', '002860.XSHE', '002830.XSHE', '300689.XSHE', '002833.XSHE', '300582.XSHE', '002868.XSHE', '300611.XSHE', '601212.XSHG', '603586.XSHG', '603043.XSHG', '603113.XSHG', '603229.XSHG', '002862.XSHE', '002866.XSHE', '300601.XSHE', '603180.XSHG', 
'300658.XSHE', '002876.XSHE', '002824.XSHE', '603089.XSHG', '603617.XSHG', '603138.XSHG', '603580.XSHG', '603037.XSHG', '300571.XSHE', '603817.XSHG', '300608.XSHE', '603387.XSHG', '603139.XSHG', '300660.XSHE', '603707.XSHG', '603303.XSHG', '002841.XSHE', '300665.XSHE', '603337.XSHG', '002875.XSHE', '300653.XSHE', '603929.XSHG', '002827.XSHE', '300587.XSHE', '601952.XSHG', '603032.XSHG', '603839.XSHG', '603803.XSHG', '300648.XSHE', '603966.XSHG', '603908.XSHG', '603797.XSHG', '603612.XSHG', '601878.XSHG', '603615.XSHG', '002861.XSHE', '601375.XSHG', '603035.XSHG', '603955.XSHG', '300598.XSHE', '002879.XSHE', '300573.XSHE', '002858.XSHE', '300577.XSHE', '002826.XSHE', '300621.XSHE', '300628.XSHE', '300599.XSHE', '603036.XSHG', '300607.XSHE', '002888.XSHE', '603717.XSHG', '300600.XSHE', '300586.XSHE', '603960.XSHG', '603226.XSHG', '300626.XSHE', '300652.XSHE', '002882.XSHE', '603320.XSHG', '002840.XSHE', '603177.XSHG', '603458.XSHG', '603535.XSHG', '002883.XSHE', '002869.XSHE', '603063.XSHG', '603081.XSHG', '603098.XSHG', '300686.XSHE', '603879.XSHG', '300610.XSHE', '300677.XSHE', '002843.XSHE', '300589.XSHE', '603708.XSHG', '002881.XSHE', '300676.XSHE', '002850.XSHE', '002853.XSHE', '603383.XSHG', '002870.XSHE', '603578.XSHG', '002887.XSHE', '300690.XSHE', '603086.XSHG', '300647.XSHE', '002871.XSHE', '300662.XSHE', '603668.XSHG', '603225.XSHG', '002873.XSHE', '300659.XSHE', '002884.XSHE', '603200.XSHG', '002839.XSHE', '603517.XSHG']

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()     # URLs waiting to be crawled
        self.old_urls = set()     # URLs already handed out for crawling
        self.crawled_urls = []

    def get_init_urls(self):
        # Convert each stock id to its Sina prefix form and queue the search URL.
        for id in lines:
            if 'XSHG' in id:
                id = 'sh' + id.split('.')[0]
            if 'XSHE' in id:
                id = 'sz' + id.split('.')[0]
            self.add_new_url('http://licaishi.sina.com.cn/web/searchNew?s=' + str(id).strip() + '&t=2')
            print id

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls and url not in self.crawled_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
1) Since the stock ids to crawl are known in advance, they are simply hard-coded in the lines list:
lines = ['300592.XSHE', '300604.XSHE', '002852.XSHE', '603603.XSHG', '603239.XSHG', '603331.XSHG', ...]
2) To avoid fetching the same URL twice, new_urls and old_urls are kept as set() collections, and add_new_url() only queues a URL that is in neither.
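A quick illustration of the de-duplication (not part of the crawler itself):

m = UrlManager()
m.add_new_url('http://licaishi.sina.com.cn/web/searchNew?s=sh600000&t=2')
m.add_new_url('http://licaishi.sina.com.cn/web/searchNew?s=sh600000&t=2')   # duplicate, ignored
print len(m.new_urls)   # 1
url = m.get_new_url()   # moves the URL into old_urls
m.add_new_url(url)      # ignored again, it is already in old_urls
print m.has_new_url()   # False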
3. HTML Downloader
html_downloader.py:
# coding:utf8
import urllib2
import socket
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class HtmlDownloader(object):
    def download(self, url):
        socket.setdefaulttimeout(200)   # avoid hanging forever on a slow response
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        # Decode as UTF-8 (dropping invalid bytes), then re-encode as UTF-8.
        return unicode(response.read(), 'UTF-8', 'ignore').encode('UTF-8')
To avoid garbled Chinese text, the downloaded bytes are first decoded as UTF-8 (silently dropping any invalid bytes) and then re-encoded as UTF-8:
unicode(response.read(), 'UTF-8', 'ignore').encode('UTF-8')
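For example, with a made-up byte string that contains one invalid byte, the 'ignore' flag drops the bad byte instead of raising UnicodeDecodeError:

raw = '\xe6\xb5\xa6\xe5\x8f\x91\xe9\x93\xb6\xe8\xa1\x8c\xff'   # UTF-8 bytes for "浦发银行" plus a stray 0xFF
print unicode(raw, 'UTF-8', 'ignore').encode('UTF-8')          # 浦发银行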
4. HTML Parser
html_parser.py:
# coding:utf8
import html_downloader
from bs4 import BeautifulSoup
import urlparse
import pandas as pd
import sys
import time
import datetime
reload(sys)
sys.setdefaultencoding('utf-8')

class HtmlParser(object):
    def __init__(self):
        self.downloader = html_downloader.HtmlDownloader()
        self.ask_url = "http://licaishi.sina.com.cn/ask/"
        self.gupiaoid = ''
        self.columns_list = []

    #http://licaishi.sina.com.cn/ask/
    def _get_single_article_data(self, page_url, soup):
        res_data = []
        res_column = []
        self.columns_list = []
        if "ask" in page_url:
            self.columns_list.append(page_url.split('/')[-1])
            self.columns_list.append(self.gupiaoid)
        # question
        try:
            question = str(soup.find('h1', class_="hd").get_text(strip=True))
            res_data.append(question.encode('utf-8'))
            res_column.append("question")
        except Exception, e:
            print "question" + str(e)
            res_data.append("none")
            res_column.append("question")
        # question_time
        try:
            question_time = self.formalTime(str(soup.find('span', class_="it s d_time").get_text(strip=True)))
            times = question_time.split(" ")[0].split('-')
            year = int(times[0])
            month = times[1]
            if month[0] == '0':
                month = month[1]
            month = int(month)
            res_data.append(question_time.encode('utf-8'))
            res_column.append("question_time")
            self.columns_list.append(question_time.encode('utf-8'))
        except Exception, e:
            print "question_time" + str(e)
            res_data.append("none")
            res_column.append("question_time")
            self.columns_list.append("")
        # answer (the visible reply plus any "secret" reply content)
        try:
            answer = str(soup.find('div', class_="content").get_text(strip=True))
            secret_answer_node = soup.find('div', class_="secret_content")
            if secret_answer_node != None:
                answer = answer + str(secret_answer_node.get_text(strip=True))
            res_data.append(answer.encode('utf-8'))
            res_column.append("answer")
        except Exception, e:
            print "answer" + str(e)
            res_data.append("none")
            res_column.append("answer")
        # answer_time
        try:
            answer_time = self.formalTime(str(soup.find('div', class_="c").find('span', class_="time").get_text(strip=True)))
            res_data.append(answer_time.encode('utf-8'))
            res_column.append("answer_time")
            self.columns_list.append(answer_time.encode('utf-8'))
        except Exception, e:
            print "answer_time" + str(e)
            res_data.append("none")
            res_column.append("answer_time")
            self.columns_list.append("")
        # extra follow-up questions and answers
        try:
            extras = ''
            extra_question_nodes = soup.find_all('div', class_="aask")
            if extra_question_nodes != None:
                count = 1
                for extra_question_node in extra_question_nodes:
                    extras = extras + str(count) + ':' + str(extra_question_node.get_text(strip=True)) + '\n'
                    count = count + 1
            extra_answer_nodes = soup.find_all('div', class_="aans")
            if extra_answer_nodes != None:
                count = 1
                for extra_answer_node in extra_answer_nodes:
                    extras = extras + str(count) + ':' + str(extra_answer_node.get_text(strip=True)) + '\n'
                    count = count + 1
            res_data.append(extras.encode('utf-8'))
            res_column.append("extras")
        except Exception, e:
            print "extras" + str(e)
            res_data.append("none")
            res_column.append("extras")
        # rate (number of star icons)
        try:
            rate = ''
            rate_nodes = soup.find('span', class_="xx")
            if rate_nodes != None:
                rate = len(rate_nodes.find_all('i'))
            res_data.append(str(rate).encode('utf-8'))
            res_column.append("rate")
        except Exception, e:
            print "rate" + str(e)
            res_data.append("none")
            res_column.append("rate")
        # planner_name
        try:
            # <div class="c">
            #   <a href="http://licaishi.sina.com.cn/planner/3695694092/1" node-type="avatar">陈赛赛</a>
            #   <span class="time">07-06 13:54</span>
            # </div>
            planner_name = str(soup.find('div', class_="c").find('a').get_text(strip=True))
            res_data.append(planner_name.encode('utf-8'))
            res_column.append("planner_name")
        except Exception, e:
            print "planner_name" + str(e)
            res_data.append("none")
            res_column.append("planner_name")
        # planner_company (taken from the planner's profile page, falling back to the Q&A page)
        person_link = "http://licaishi.sina.com.cn/planner/1430448040/1"
        person_url = str(soup.find('div', class_="c").find('a')['href'])
        person_url = urlparse.urljoin(person_link, person_url)
        person_cont = self.downloader.download(person_url)
        person_soup = BeautifulSoup(person_cont, 'html.parser', from_encoding='utf-8')
        planner_company = self._get_personal_page(person_soup)
        if planner_company == "none":
            try:
                planner_company = str(soup.find('p', class_="company_name").get_text(strip=True))
            except Exception, e:
                planner_company = "none"
                print "planner_company - none" + str(e)
        res_data.append(planner_company.encode('utf-8'))
        res_column.append("planner_company")
        # planner_url, e.g. http://licaishi.sina.com.cn/planner/1882067524/1
        try:
            planner_url = str(soup.find('div', class_="c").find('a')['href'])[-12:].replace('/', '_')
            res_data.append(planner_url.encode('utf-8'))
            res_column.append("planner_url")
        except Exception, e:
            print "planner_url" + str(e)
            res_data.append("none")
            res_column.append("planner_url")
        # save one Q&A item per .csv file (the newplans/ directory must already exist)
        df = pd.DataFrame(columns=res_column)
        df.loc[0] = res_data
        try:
            df.T.to_csv("newplans/" + self.gupiaoid + "_" + str(page_url.strip().split('/')[-1]) + '.csv')
        except Exception, e:
            print str(e)
        return True
    def _get_personal_page(self, soup):
        try:
            company_name = soup.find('dl', class_="w_lcs_info").find('dd').get_text(strip=True)
            return company_name
        except Exception, e:
            print "_get_personal_page" + str(e)
            return "none"
    def _get_single_page_data(self, page_url, soup):
        print "page_url:" + page_url
        if "searchNew?s" in page_url:
            try:
                # Each question summary on the search page sits in a <div class="wt">.
                for article_node in soup.find_all('div', class_="wt"):
                    link = article_node.find('a')['href']
                    new_full_url = urlparse.urljoin(self.ask_url, link)
                    print new_full_url
                    new_html_cont = self.downloader.download(new_full_url)
                    new_soup = BeautifulSoup(new_html_cont, 'html.parser', from_encoding='utf-8')
                    flag = self._get_single_article_data(new_full_url, new_soup)
                    if flag == False:
                        return False
            except Exception, e:
                print "_get_single_page_data" + str(e)
        return True
    def _get_new_data(self, page_url, soup):
        if "searchNew?s" in page_url:
            # Parse page 1, then read the total page count from the pagination bar
            # and walk through pages 2..totalPages.
            self._get_single_page_data(page_url, soup)
            try:
                div_node = soup.find('div', id='pagesNode')
                if div_node != None:
                    page_node = div_node.find_all('a')
                    if page_node != None and len(page_node) != 0:
                        totalPages = int(page_node[len(page_node) - 2].get_text().strip())
                        for page in range(2, totalPages + 1):
                            new_page_url = page_url + "&page=" + str(page)
                            new_page_html_cont = self.downloader.download(new_page_url)
                            new_soup = BeautifulSoup(new_page_html_cont, 'html.parser', from_encoding='utf-8')
                            flag = self._get_single_page_data(new_page_url, new_soup)
                            if flag == False:
                                break
            except Exception, e:
                print "_get_new_data" + str(e)
    def unescape(self, s):
        # Undo basic HTML entity escaping.
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&amp;", "&")
        return s
    def formalTime(self, recentReplyTime):
        # Normalize the various time formats on the page to "YYYY-MM-DD HH:MM:SS".
        if "分钟前" in recentReplyTime:   # "N minutes ago"
            delay = int(filter(str.isdigit, recentReplyTime))
            recentReplyTime = (datetime.datetime.now() - datetime.timedelta(minutes=delay)).strftime("%Y-%m-%d %H:%M:%S")
        # "01-05 20:29" -- month-day plus time; the year is assumed to be 2017
        if len(recentReplyTime) == 11:
            recentReplyTime = '2017-' + recentReplyTime + ':00'
        # "11:53" -- time only, assume today
        if len(recentReplyTime) == 5:
            recentReplyTime = time.strftime("%Y-%m-%d", time.localtime(time.time())) + ' ' + recentReplyTime + ':00'
        # "2014-10-16" -- date only
        if len(recentReplyTime) == 10:
            recentReplyTime = recentReplyTime + ' 00:00:00'
        if len(recentReplyTime) == 0:
            recentReplyTime = '0000-00-00 00:00:00'
        return recentReplyTime
    def parse(self, id, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        self.gupiaoid = id   # e.g. 'sh600000'
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        self._get_new_data(page_url, soup)
This parser is the heart of the crawler. The call hierarchy between its methods is:
- parse: parser entry point
  - _get_new_data: parses a stock's search page and walks through all of its result pages
    - _get_single_page_data: parses one page of search results for that stock
      - _get_single_article_data: parses one specific Q&A item on that page
Each Q&A item is saved as a separate .csv file.
The saved format looks like this:
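Because the one-row DataFrame is transposed (df.T) before writing, each CSV stores the field names in the first column and the values in the second, under the file name newplans/<stock id>_<question id>.csv. The values below are made up for illustration; only the field names and layout come from the code:

,0
question,...
question_time,2017-07-06 10:23:00
answer,...
answer_time,2017-07-06 13:54:00
extras,1:...
rate,5
planner_name,陈赛赛
planner_company,...
planner_url,3695694092_1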
IV. Running the Program
spider_main.py is the program's entry point.
While it runs, the console shows the crawler working through the stocks:
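Roughly, the output looks like the sketch below: get_init_urls() first prints every queued stock id, then each loop iteration prints the crawl counter, the search-page URL and the individual question URLs (abbreviated here; because new_urls is a set, the stocks come out in no particular order):

sz300592
sz300604
...
craw 0 : http://licaishi.sina.com.cn/web/searchNew?s=sh603239&t=2
page_url:http://licaishi.sina.com.cn/web/searchNew?s=sh603239&t=2
http://licaishi.sina.com.cn/ask/...
http://licaishi.sina.com.cn/ask/...
0
craw 1 : http://licaishi.sina.com.cn/web/searchNew?s=sz300604&t=2
...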
The complete code is available at: 新浪理财师问答爬虫 (Sina Licaishi Q&A crawler).