spider
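A small requests-based crawler for Xueqiu (雪球): it pages through the Shanghai A-share stock screener API, then walks a stock's comment stream page by page and appends each comment (user name, user description, title, text) to a local JSON-lines file.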
# -*- coding: utf-8 -*-
# Created : 2018/8/26 18:33
# author  : GuoLi
import json
import time

import requests
import urllib3

# verify=False is used on every request below, so silence the TLS warning spam
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class Xueqiuspider:
    def __init__(self):
        # Stock screener API: 30 Shanghai A-share quotes per page, sorted by percent change
        self.start_url = ('https://xueqiu.com/service/v5/stock/screener/quote/list'
                          '?page={}&size=30&order=desc&order_by=percent'
                          '&exchange=CN&market=CN&type=sha&')
        # Xueqiu rejects bare requests, so send a browser User-Agent plus a logged-in cookie
        self.headers = {
            "Host": "xueqiu.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
            "Referer": "https://xueqiu.com/hq",
            "Cookie": "_ga=GA1.2.1804883770.1533901856; device_id=9daeb88d3fe360a4954c39c2f91c7589; s=fj11jnjt2u; bid=269d98283aafb9910fb4cab2ed6e57c8_jnh0f9ih; __utmz=1.1540014007.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAANWgrkrJqAEA0N5b01LvpneYg0ww; _gid=GA1.2.1135578950.1558965826; Hm_lvt_1db88642e346389874251b5a1eded6e3=1558234025,1558356119,1558612972,1558965826; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=eef8c420d0d08452a1c1462ccbf971ce02ff7358; xq_a_token.sig=tZr1IzubXKomDqLYeFF8QF8PPzI; xqat=eef8c420d0d08452a1c1462ccbf971ce02ff7358; xqat.sig=1Sxp5ruHh-YyWFuoSQPEaMSfwiI; xq_r_token=2eda2b6d8fb0ceef9123c6204054dc70797dede8; xq_r_token.sig=WSt4LWT_BPiKyAowEon2kdQRId4; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=3094390085; u.sig=EsGua0pSN_rwwiOGMVwjjbUXF_s; __utma=1.1804883770.1533901856.1544886463.1558966864.6; __utmc=1; __utmt=1; __utmb=1.1.10.1558966864; _gat=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1558967034",
        }

    def parse(self):
        """Page through the screener and yield each page's list of stocks."""
        for i in range(100):
            response = requests.get(self.start_url.format(i + 1),
                                    headers=self.headers, verify=False)
            data = response.json()['data']
            if i * 30 < data['count']:
                yield data['list']
                time.sleep(5)  # stay polite between pages
            else:
                break  # past the last page of results

    def parse_all_url(self, res):
        """Walk one stock's comment pages and save every comment."""
        # symbol = res['symbol']  # per-stock crawl; hard-coded to one symbol here
        symbol = 510880
        print(symbol)
        count = 100  # placeholder until the first page reports the real total
        for i in range(100):
            if i * 10 >= count:
                break  # every reported comment has been fetched
            detail_url = ("https://xueqiu.com/statuses/search.json"
                          "?count=10&comment=0&symbol={}&hl=0&source=user"
                          "&sort=time&page={}&q=").format(symbol, i + 1)
            print(detail_url)
            try:
                content_list, count = self.parse_comment_url(detail_url)
            except Exception as e:
                print("Error:", e)
                time.sleep(5)
                # retry once, unpacking the (list, count) tuple this time as well
                content_list, count = self.parse_comment_url(detail_url)
            time.sleep(5)
            self.save_file(content_list)

    def parse_comment_url(self, url):
        """Fetch one comment page; return the extracted comments and the total count."""
        response = requests.get(url, headers=self.headers, verify=False)
        data = response.json()
        content_list = []
        for res in data['list']:
            content_list.append({
                'user_name': res['user']['screen_name'],
                'user_description': res['user']['description'],
                'comment_title': res['title'],
                'comment_text': res['text'],
            })
        return content_list, data['count']

    def save_file(self, content_list):
        """Append each comment to xueqiu2.json as one UTF-8 JSON line."""
        with open('xueqiu2.json', 'a', encoding='utf-8') as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")

    def run(self):
        for res_list in self.parse():
            for res in res_list:
                self.parse_all_url(res)


if __name__ == '__main__':
    xueqiu = Xueqiuspider()
    xueqiu.run()
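The original listing imports pymongo but never uses it, which suggests the comments were meant to end up in MongoDB rather than a flat file. A minimal sketch of that variant, assuming a local mongod instance and hypothetical database/collection names xueqiu / comments:

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")  # assumed local instance
collection = client["xueqiu"]["comments"]                   # hypothetical names

def save_mongo(content_list):
    # drop-in replacement for save_file(): bulk-insert one page of comments
    if content_list:
        collection.insert_many(content_list)

Inserting page by page keeps one failed request from losing earlier pages, and each item is already a flat dict, so no transformation is needed before insert_many.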