欢迎您访问程序员文章站,本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

spider

程序员文章站 2022-05-05 15:06:07
...
# -*- coding: utf-8 -*-
# Created : 2018/8/26 18:33
# author :GuoLi

# -*- coding: utf-8 -*-
# Created : 2018/8/26 18:33
# author :GuoLi
import pymongo
import requests
import time
import urllib3

# The spider below sends its requests with verify=False; silence the
# InsecureRequestWarning that urllib3 would otherwise emit for them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import json


class Xueqiuspider:
    """Scrape user comments from xueqiu.com for the stocks of a screener query.

    ``parse`` pages through the screener listing configured in ``start_url``,
    ``parse_all_url`` pages through the comment search for one listed stock,
    and ``save_file`` appends the extracted comments to ``OUTPUT_PATH`` as one
    JSON object per line.  ``run`` ties the three together.

    The class attributes below are tunables; override them on a subclass or
    an instance to change the crawl without touching the methods.
    """

    # Stocks per screener page; must agree with ``size=30`` in ``start_url``.
    PAGE_SIZE = 30
    # Upper bound on pages requested per listing and per stock symbol.
    MAX_PAGES = 100
    # Seconds to pause between consecutive HTTP requests.
    REQUEST_DELAY = 5
    # Seconds after which a stalled HTTP request is abandoned.
    REQUEST_TIMEOUT = 30
    # File that ``save_file`` appends to.
    OUTPUT_PATH = 'xueqiu2.json'

    def __init__(self):
        # ``{}`` is filled with the 1-based screener page number.
        self.start_url = 'https://xueqiu.com/service/v5/stock/screener/quote/list?page={}&size=30&order=desc&order_by=percent&exchange=CN&market=CN&type=sha&'
        # NOTE(review): the Cookie below embeds login/session tokens directly
        # in source control; it should be loaded from configuration instead.
        self.headers = {
            "Host": "xueqiu.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
            "Referer": "https://xueqiu.com/hq",
            "Cookie": "_ga=GA1.2.1804883770.1533901856; device_id=9daeb88d3fe360a4954c39c2f91c7589; s=fj11jnjt2u; bid=269d98283aafb9910fb4cab2ed6e57c8_jnh0f9ih; __utmz=1.1540014007.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAANWgrkrJqAEA0N5b01LvpneYg0ww; _gid=GA1.2.1135578950.1558965826; Hm_lvt_1db88642e346389874251b5a1eded6e3=1558234025,1558356119,1558612972,1558965826; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=eef8c420d0d08452a1c1462ccbf971ce02ff7358; xq_a_token.sig=tZr1IzubXKomDqLYeFF8QF8PPzI; xqat=eef8c420d0d08452a1c1462ccbf971ce02ff7358; xqat.sig=1Sxp5ruHh-YyWFuoSQPEaMSfwiI; xq_r_token=2eda2b6d8fb0ceef9123c6204054dc70797dede8; xq_r_token.sig=WSt4LWT_BPiKyAowEon2kdQRId4; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=3094390085; u.sig=EsGua0pSN_rwwiOGMVwjjbUXF_s; __utma=1.1804883770.1533901856.1544886463.1558966864.6; __utmc=1; __utmt=1; __utmb=1.1.10.1558966864; _gat=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1558967034",
        }

    def _get_json(self, url):
        """Fetch *url* with the spider's headers and return the decoded JSON body.

        Raises whatever ``requests`` raises for connection problems, timeouts
        and non-2xx responses, and a ``ValueError`` subclass when the body is
        not valid JSON.
        """
        # verify=False keeps the original behaviour of skipping TLS
        # certificate checks; the timeout stops a stalled connection from
        # hanging the whole crawl.
        response = requests.get(
            url,
            headers=self.headers,
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def parse(self):
        """Yield the stock list of each screener page, one request per page.

        Paging stops as soon as the pages fetched so far cover the total
        reported in the response's ``count`` field, or after ``MAX_PAGES``
        pages.  Nothing is yielded when ``count`` is 0.
        """
        for page in range(1, self.MAX_PAGES + 1):
            data = self._get_json(self.start_url.format(page))['data']
            total = data['count']
            # This page starts beyond the last stock: nothing left to yield.
            if (page - 1) * self.PAGE_SIZE >= total:
                break
            yield data['list']
            # The page just yielded was the last one; skip the extra request.
            if page * self.PAGE_SIZE >= total:
                break
            time.sleep(self.REQUEST_DELAY)

    def parse_all_url(self, res):
        """Fetch and save the comment pages for one stock.

        :param res: one entry of a screener page; its ``'symbol'`` value is
            used as the ``symbol`` query parameter of the comment search.

        Each page is requested once and retried once after a pause if the
        first attempt fails; a second failure propagates to the caller.
        Paging stops at the first page without comments or after
        ``MAX_PAGES`` pages.
        """
        symbol = res['symbol']
        print(symbol)
        for page in range(1, self.MAX_PAGES + 1):
            detail_url = "https://xueqiu.com/statuses/search.json?count=10&comment=0&symbol={}&hl=0&source=user&sort=time&page={}&q=".format(
                symbol, page)
            print(detail_url)
            try:
                content_list, _ = self.parse_comment_url(detail_url)
            except Exception as e:
                # Network and payload errors are both treated as transient:
                # report, back off, and give the same URL one more chance.
                print("Error:", e)
                time.sleep(self.REQUEST_DELAY)
                content_list, _ = self.parse_comment_url(detail_url)
            # Throttle after every request, including the final empty page.
            time.sleep(self.REQUEST_DELAY)
            if not content_list:
                break
            self.save_file(content_list)

    def parse_comment_url(self, url):
        """Download one comment-search page and extract its comments.

        :param url: full URL of a comment-search page.
        :returns: ``(content_list, count)`` where ``content_list`` holds one
            dict per comment with the keys ``user_name``,
            ``user_description``, ``comment_title`` and ``comment_text``, and
            ``count`` is the response's ``count`` field, passed through
            unchanged.
        :raises KeyError: if the response lacks one of the expected fields.
        """
        data = self._get_json(url)
        count = data['count']
        content_list = [
            {
                'user_name': res['user']['screen_name'],
                'user_description': res['user']['description'],
                'comment_title': res['title'],
                'comment_text': res['text'],
            }
            for res in data['list']
        ]
        return content_list, count

    def save_file(self, content_list):
        """Append every item of *content_list* to ``OUTPUT_PATH``.

        Each item is written as one JSON document per line, UTF-8 encoded,
        so non-ASCII text is kept readable.  Earlier versions wrote Python
        ``repr`` text stripped to GBK; a file that already holds such lines
        will contain both formats after appending.  An empty
        *content_list* leaves the file untouched.
        """
        if not content_list:
            return
        # errors='ignore' keeps the best-effort policy for characters that
        # cannot be encoded (e.g. lone surrogates) instead of aborting.
        with open(self.OUTPUT_PATH, 'a', encoding='utf-8', errors='ignore') as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")

    def run(self):
        """Crawl every stock of every screener page and save its comments."""
        for res_list in self.parse():
            for res in res_list:
                self.parse_all_url(res)


if __name__ == '__main__':
    # Start the crawl only when the file is executed as a script,
    # never as a side effect of importing it.
    Xueqiuspider().run()