spider

程序员文章站 2022-05-05 15:06:07

...

# -*- coding: utf-8 -*-
# Created : 2018/8/26 18:33
# author ：GuoLi

# -*- coding: utf-8 -*-
# Created : 2018/8/26 18:33
# author ：GuoLi
import pymongo
import requests
import time
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import json


class Xueqiuspider:
    """Scrape the xueqiu.com quote list and the user comments for a symbol.

    ``run()`` walks the paginated quote list (``parse``), and for every row
    fetches the paginated comment search results (``parse_all_url`` /
    ``parse_comment_url``) and appends them to ``xueqiu2.json``
    (``save_file``).

    The tunables below are class attributes so a subclass or an instance can
    override them; their defaults reproduce the previously hard-coded values.
    """

    # Seconds to pause between consecutive HTTP requests.
    request_delay = 5
    # Upper bound on the number of quote-list pages requested by parse().
    max_list_pages = 100
    # Rows per quote-list page; must match the `size=` parameter in start_url.
    list_page_size = 30
    # Upper bound on the number of comment pages requested per symbol.
    max_comment_pages = 100
    # Symbol whose comments are fetched regardless of the row handed to
    # parse_all_url(). Set to None to use each row's own 'symbol' value.
    symbol_override = 510880

    def __init__(self):
        """Set up the quote-list URL template and the request headers."""
        # a = int(round(time.time(), 3) * 1000)
        self.start_url = 'https://xueqiu.com/service/v5/stock/screener/quote/list?page={}&size=30&order=desc&order_by=percent&exchange=CN&market=CN&type=sha&'
        # NOTE(review): the Cookie below is a hard-coded logged-in session
        # (it carries xq_a_token / xq_r_token values). It is a credential
        # committed to source; consider loading it from the environment.
        self.headers = {
            "Host": "xueqiu.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
            "Referer": "https://xueqiu.com/hq",
            "Cookie": "_ga=GA1.2.1804883770.1533901856; device_id=9daeb88d3fe360a4954c39c2f91c7589; s=fj11jnjt2u; bid=269d98283aafb9910fb4cab2ed6e57c8_jnh0f9ih; __utmz=1.1540014007.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAANWgrkrJqAEA0N5b01LvpneYg0ww; _gid=GA1.2.1135578950.1558965826; Hm_lvt_1db88642e346389874251b5a1eded6e3=1558234025,1558356119,1558612972,1558965826; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=eef8c420d0d08452a1c1462ccbf971ce02ff7358; xq_a_token.sig=tZr1IzubXKomDqLYeFF8QF8PPzI; xqat=eef8c420d0d08452a1c1462ccbf971ce02ff7358; xqat.sig=1Sxp5ruHh-YyWFuoSQPEaMSfwiI; xq_r_token=2eda2b6d8fb0ceef9123c6204054dc70797dede8; xq_r_token.sig=WSt4LWT_BPiKyAowEon2kdQRId4; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=3094390085; u.sig=EsGua0pSN_rwwiOGMVwjjbUXF_s; __utma=1.1804883770.1533901856.1544886463.1558966864.6; __utmc=1; __utmt=1; __utmb=1.1.10.1558966864; _gat=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1558967034",
        }

    def parse(self):
        """Yield the ``data.list`` payload of each quote-list page.

        Pages are requested in order, starting at 1. Iteration stops after
        ``max_list_pages`` pages, or as soon as the page's starting offset
        reaches the ``data.count`` value reported by the response.

        Yields:
            The ``list`` value found under ``data`` in each JSON response.
        """
        for page in range(1, self.max_list_pages + 1):
            # NOTE(review): verify=False disables TLS certificate checks.
            response = requests.get(self.start_url.format(page), headers=self.headers, verify=False)
            # Decode the body once; every page used to be fetched and decoded
            # twice, with the first copy of the list thrown away.
            data = response.json()['data']
            if (page - 1) * self.list_page_size >= data['count']:
                break
            yield data['list']
            # Throttle before the next page request.
            time.sleep(self.request_delay)

    def parse_all_url(self, res):
        """Fetch and save every comment page for one quote-list row.

        Args:
            res: One row from the quote list. Its ``'symbol'`` entry is only
                read when ``symbol_override`` is None.

        Raises:
            Exception: whatever ``parse_comment_url`` raises when the single
                retry of a page fails as well.
        """
        symbol = res['symbol'] if self.symbol_override is None else self.symbol_override
        print(symbol)
        for page in range(1, self.max_comment_pages + 1):
            detail_url = "https://xueqiu.com/statuses/search.json?count=10&comment=0&symbol={}&hl=0&source=user&sort=time&page={}&q=".format(
                symbol, page)
            print(detail_url)
            try:
                content_list, _ = self.parse_comment_url(detail_url)
            except Exception as e:
                # Best-effort: any failure (network, JSON, missing key) gets
                # one retry after a pause; a second failure propagates.
                print("Error:", e)
                time.sleep(self.request_delay)
                # Unpack the tuple here too. The retry used to bind the whole
                # (content_list, count) pair, which save_file then wrote out
                # as two bogus records.
                content_list, _ = self.parse_comment_url(detail_url)
            time.sleep(self.request_delay)
            self.save_file(content_list)

    def parse_comment_url(self, url):
        """Download one comment page and extract the fields of interest.

        Args:
            url: Fully formatted comment-search URL.

        Returns:
            A ``(content_list, count)`` tuple: ``content_list`` holds one dict
            per entry of the response's ``list`` (keys ``user_name``,
            ``user_description``, ``comment_title``, ``comment_text``), and
            ``count`` is the response's ``count`` field.
        """
        #proxies = self.get_proxy()
        # NOTE(review): verify=False disables TLS certificate checks.
        response = requests.get(url, headers=self.headers, verify=False)
        payload = response.json()  # decode the body once, not per field
        res_list = payload['list']
        count = payload['count']
        content_list = [
            {
                'user_name': res['user']['screen_name'],
                'user_description': res['user']['description'],
                'comment_title': res['title'],
                'comment_text': res['text'],
            }
            for res in res_list
        ]
        return content_list, count

    def save_file(self, content_list):
        """Append one line per item of ``content_list`` to ``xueqiu2.json``.

        Each item is written as its ``str()`` form followed by a newline.
        Nothing is opened or created when ``content_list`` is empty.

        Args:
            content_list: Iterable of records, normally the dicts built by
                ``parse_comment_url``.
        """
        if not content_list:
            return
        # Open once per batch instead of once per record.
        with open('xueqiu2.json', 'a') as f:
            for content in content_list:
                # NOTE(review): the GBK round-trip silently drops characters
                # GBK cannot represent, and the lines are Python reprs rather
                # than JSON. Both are kept so existing output stays stable.
                f.write(str(content).encode("gbk", 'ignore').decode("gbk", "ignore"))
                f.write("\n")

    def run(self):
        """Crawl every quote-list page and scrape comments for each row."""
        for res_list in self.parse():
            for res in res_list:
                self.parse_all_url(res)


# Script entry point: build the spider and start the crawl when this file is
# executed directly (nothing runs on import).
if __name__ == '__main__':
    Xueqiuspider().run()

上一篇： url spider

下一篇： JAVA之爬虫jsoup实现

spider

百度为什么不收录？spider抓取环节影响线上展现的因素分析详解

spider抓取篇：百度不收录原因分析

Scrapy的Spider类和CrawlSpider类

详解node字体压缩插件font-spider的用法

爬虫(十六)：Scrapy框架(三) Spider Middleware、Item Pipeline、对接Selenium

Scrapy框架-Spider和CrawlSpider的区别

Scrapy框架-Spider

创建编码一个spider的具体步骤

Python爬虫框架之Scrapy中Spider的用法

百度为什么不收录？spider抓取环节影响线上展现的因素分析详解