Crawlers, Part 2

2022-03-02 20:55:20

import requests
from lxml import etree


class Spider(object):
    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
        }
        # A dict keeps only one value per key, so listing several "https"
        # proxies in one dict silently discards all but the last entry.
        self.proxies = {
            "https": "https://117.88.176.179:3000",
            "http": "http://218.27.136.169:8085",
        }
        # self.url = ["http://www.1ppt.com/article/{}.html".format(i) for i in range(60000, 61200)]

    def Response(self):
        url_list = ["http://www.1ppt.com/xiazai/jianli/ppt_jianli_{}.html".format(i) for i in range(1, 7)]
        for url in url_list:
            response2 = requests.get(url=url, headers=self.headers)
            html2 = etree.HTML(response2.content)
            # Relative links to each template's detail page
            title_list1 = html2.xpath("//h2//a/@href")
            title_list = ["http://www.1ppt.com{}".format(i) for i in title_list1]
            for title_url in title_list:
                response1 = requests.get(url=title_url, headers=self.headers)
                html1 = etree.HTML(response1.content)
                down_list = html1.xpath('//ul[@class="downurllist"]/li/a/@href')
                # Pair the current detail page with each of its download links
                for down in down_list:
                    print({"title": title_url, "down": down})


if __name__ == '__main__':
    data = Spider()
    data.Response()
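requests accepts only one proxy per URL scheme in its proxies dict, so the multi-proxy dict above would silently keep just the last entry. To actually rotate among several proxies, pick one per request from a list. A minimal sketch, reusing the addresses above (free proxies like these may well be dead by now):

import random

import requests

PROXY_POOL = [
    "https://117.88.176.179:3000",
    "https://120.83.111.153:9999",
    "https://117.57.90.191:9999",
    "https://117.57.91.36:9999",
]


def get_with_random_proxy(url, headers, timeout=10):
    # Route both http and https traffic through one randomly chosen proxy.
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers,
                        proxies={"http": proxy, "https": proxy},
                        timeout=timeout)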
##########################################
import urllib.request
import json
import jsonpath
import time

end_page = int(input('Enter the last page number to crawl: '))
fp = open('京东.txt', 'a', encoding='utf8')
for i in range(0, end_page + 1):
    print('Crawling page %s ------' % (i + 1))
    url = ('https://sclub.jd.com/comment/productPageComments.action'
           '?callback=fetchJSON_comment98vv385&productId=52322470877'
           '&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1')
    url = url.format(i)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
        'Referer': 'https://item.jd.com/52322470877.html'
    }

    request = urllib.request.Request(url=url, headers=headers)
    content = urllib.request.urlopen(request).read().decode('gbk')
    # The response is JSONP: fetchJSON_comment98vv385({...});
    # slice out the JSON payload between the outermost parentheses.
    content = content[content.find('(') + 1:content.rfind(')')]
    obj = json.loads(content)
    comments = obj['comments']
    for comment in comments:
        # comment timestamp
        creationTime = comment['creationTime']
        # commenter's nickname
        nickname = comment['nickname']
        # comment body
        contents = comment['content']
        # attached images, if any
        if 'images' in comment:
            img_src = jsonpath.jsonpath(comment, '$..images[*].imgUrl')
            img_src = 'https:' + str(img_src).strip('[]')
        else:
            img_src = 'no image'
        item = {
            'time': creationTime,
            'user': nickname,
            'content': contents,
            'images': img_src,
        }
        fp.write(str(item) + '\n')
    print('Page %s done ----------' % (i + 1))
    time.sleep(4)
fp.close()
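Slicing between the outermost parentheses works, but a reusable helper makes the JSONP unwrapping explicit and fails loudly on unexpected responses. A sketch (the helper name is mine, not part of any library):

import json
import re


def jsonp_to_json(text):
    # Matches wrappers like fetchJSON_comment98vv385({...}); and captures the payload.
    match = re.search(r'^\s*[\w$.]+\s*\(\s*(.*)\s*\)\s*;?\s*$', text, re.S)
    if match is None:
        raise ValueError('not a JSONP response: %r' % text[:80])
    return json.loads(match.group(1))

With it, the loop above would simply call obj = jsonp_to_json(content).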
##############################################################
import requests
from lxml import etree

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
}

# url = "https://list.youku.com/category/show/c_96.html"  # unused

# Douban Top250 spans ten pages; the start parameter steps by 25 (0, 25, ..., 225).
url_list = ["https://movie.douban.com/top250?start=" + str(i) for i in range(0, 250, 25)]

for url in url_list:
    response = requests.get(url=url, headers=header)
    html = etree.HTML(response.text)
    img_list = html.xpath("//img/@src")
    for num, i in enumerate(img_list):
        if num == len(img_list) - 1:
            continue  # skip the last <img> on the page, which is not a movie poster
        else:
            print("img : " + i)
    name_list = html.xpath('//span[@class="title"][1]/text()')
    for i in name_list:
        print("name : " + i)
    comment_list = html.xpath('//span[@class="inq"]/text()')
    for i in comment_list:
        print("comment : " + i)
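Collecting posters, titles, and quotes as three separate flat lists loses the pairing whenever a movie has no quote. Scoping the XPath queries to each movie's own div keeps the fields aligned; a sketch for a single page:

import requests
from lxml import etree

header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
response = requests.get("https://movie.douban.com/top250?start=0", headers=header)
html = etree.HTML(response.text)

# Each movie sits in its own <div class="item">; querying relative to that
# subtree guarantees img, name, and quote belong to the same movie.
for item in html.xpath('//div[@class="item"]'):
    img = item.xpath('.//img/@src')
    name = item.xpath('.//span[@class="title"][1]/text()')
    inq = item.xpath('.//span[@class="inq"]/text()')
    print({
        "img": img[0] if img else None,
        "name": name[0] if name else None,
        "comment": inq[0] if inq else None,  # some movies have no quote
    })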
##################################################
import requests
from lxml import etree


class Data(object):
    def __init__(self):
        self.headers = {
            "Cookie": 'douban-fav-remind=1; ll="108298"; bid=vBm7Gy7w8MQ; _vwo_uuid_v2=D18B0595A558A9F72535543F7B037A02B|97bdf052ca68694e864689d0676333be; gr_user_id=2d757664-b854-484b-b66c-dfce323e7f1b; viewed="34859246_34857213_34879050_34841131_34880452"; __yadk_uid=ktT8c9ZmjwOy7FjaA4R1JZnQG2LN0cSS; __gads=Test; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1575698260%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DHVv6dn2owgf_vgkzfDJnHcO4FibAmywgjqMs0sOwh9MqOy8NCyZfB1Hs19Q8IU_N%26wd%3D%26eqid%3Dcc7af70000017d7e000000065de9f2a6%22%5D; _pk_ses.100001.4cf6=*; __utma=223695111.570648179.1531317756.1575698261.1575698595.8; __utmb=223695111.0.10.1575698595; __utmz=223695111.1575698595.8.4.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=30149280; __utmt=1; __utma=30149280.354220863.1531317756.1575698667.1575698696.12; __utmb=30149280.0.10.1575698696; __utmz=30149280.1575698696.12.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=223695111; _pk_id.100001.4cf6=7f924d4bc583765a.1531317756.6.1575698708.1575691096.',
            "Host": "movie.douban.com",
            "Referer": "https://movie.douban.com/top250?start=0&filter=",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
        }

    def get_response(self):
        return requests.get(self.url, headers=self.headers)

    def get_html(self, res):
        return etree.HTML(res.text)

    def get_data(self, html):
        # The union XPath returns img/title/quote in document order; note that
        # a movie without an "inq" quote will shift the 3-at-a-time grouping.
        items = html.xpath('//div[@class="pic"]//img/@src | //span[@class="title"][1]/text() | //span[@class="inq"]/text()')
        j = 0
        while j + 2 < len(items):
            info = {
                "img": items[j],
                "name": items[j + 1],
                "comment": items[j + 2]
            }
            print(info)
            j = j + 3

    def main(self):
        for i in range(0, 250, 25):
            self.url = "https://movie.douban.com/top250?start=" + str(i)
            res = self.get_response()
            html = self.get_html(res)
            self.get_data(html)


if __name__ == '__main__':
    data = Data()
    data.main()
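Storing the URL on self and re-sending the full header set on every call works, but requests.Session expresses the same idea more directly: headers (including the long Cookie, if it is needed at all) are registered once and the underlying connection is reused. A minimal sketch:

import requests
from lxml import etree

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    # "Cookie": "...",  # only needed when the site expects login state
})

for start in range(0, 250, 25):
    res = session.get("https://movie.douban.com/top250?start=%d" % start)
    html = etree.HTML(res.text)
    print(len(html.xpath('//div[@class="item"]')), "items at start =", start)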
##########################################
import requests
from lxml import etree


class Spider(object):
    def __init__(self):
        self.headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
        self.url = "http://www.1ppt.com/"
        self.proxies = {"https": "https://117.88.176.179:3000"}

    def response(self):
        response = requests.get(url=self.url, headers=self.headers, proxies=self.proxies)
        html = etree.HTML(response.content)
        # "list" would shadow the builtin, so use a descriptive name instead
        title_list = html.xpath("//h4//a/text()")
        for title in title_list:
            print(title)


if __name__ == "__main__":
    spider = Spider()
    spider.response()
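None of these spiders checks the HTTP status code, so a 404 or an anti-bot block page gets parsed as if it were real content. A small fetch helper (the name is mine) that fails loudly instead:

import requests


def fetch(url, headers, proxies=None, timeout=10):
    # raise_for_status() turns 4xx/5xx responses into requests.HTTPError.
    response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    response.raise_for_status()
    return response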
###########################################
import requests
from lxml import etree


class Spider(object):
    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
        }
        self.proxies = {"https": "https://117.88.176.179:3000"}
        self.url = ["http://www.1ppt.com/article/{}.html".format(i) for i in range(60000, 61200)]

    def Response(self):
        for url in self.url:
            response = requests.get(url=url, headers=self.headers, proxies=self.proxies)
            html = etree.HTML(response.text)
            downlist = html.xpath('//ul[@class="downurllist"]/li/a/@href')
            if len(downlist) == 0:
                continue
            else:
                print(downlist)


if __name__ == '__main__':
    data = Spider()
    data.Response()
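Sweeping 1200 consecutive article IDs will inevitably hit missing pages and can hammer the server. A sketch that tolerates failures and paces itself (the small ID range is only for illustration):

import time

import requests
from lxml import etree

headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}

for i in range(60000, 60010):  # small range for illustration
    url = "http://www.1ppt.com/article/{}.html".format(i)
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        continue  # skip missing or unreachable articles
    html = etree.HTML(response.text)
    downlist = html.xpath('//ul[@class="downurllist"]/li/a/@href')
    if downlist:
        print(downlist)
    time.sleep(1)  # pause between requests to stay polite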

####################################################
import requests
from lxml import etree


class Spider(object):
    def __init__(self):
        self.header = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
        self.proxie = {"https": "https://117.88.176.179:3000"}
        self.url = ["http://www.cssmoban.com/tags.asp?page={}&n=css3".format(i) for i in range(1, 23)]

    def Response(self):
        for url in self.url:
            response = requests.get(url=url, headers=self.header, proxies=self.proxie)
            html = etree.HTML(response.text)
            imglist = html.xpath("//img/@src")
            print(imglist)


if __name__ == '__main__':
    data = Spider()
    data.Response()
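//img/@src usually returns relative paths, so the printed lists are not directly fetchable. urllib.parse.urljoin resolves them against the page URL; a sketch with hypothetical src values:

from urllib.parse import urljoin

page_url = "http://www.cssmoban.com/tags.asp?page=1&n=css3"
srcs = ["/images/sample.png", "images/other.png", "http://cdn.example.com/a.png"]  # hypothetical values

print([urljoin(page_url, src) for src in srcs])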
###################################################
import requests
from lxml import etree


class Spider(object):
    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
        }
        self.proxies = {"https": "https://117.88.176.179:3000"}
        # Pay close attention to the URL: check that it is correct, and whether
        # it is a list of URLs or a single string.
        self.url = ["http://www.1ppt.com/article/{}.html".format(i) for i in range(60000, 61200)]
        # self.url = "http://www.1ppt.com/"

    def Response(self):
        for url in self.url:
            response = requests.get(url=url, headers=self.headers, proxies=self.proxies)
            html = etree.HTML(response.text)
            downlist = html.xpath('//ul[@class="downurllist"]/li/a/@href')
            if len(downlist) == 0:
                continue
            else:
                print(downlist)


if __name__ == '__main__':
    data = Spider()
    data.Response()
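All of the 1ppt spiders stop at printing the download hrefs. To actually save a file, a streaming download keeps memory flat regardless of file size; a sketch (the function name and target directory are mine):

import os

import requests


def download(url, dest_dir="downloads", headers=None):
    # Stream the body in chunks instead of loading it into memory at once.
    os.makedirs(dest_dir, exist_ok=True)
    filename = os.path.join(dest_dir, url.rsplit("/", 1)[-1] or "download.bin")
    with requests.get(url, headers=headers, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return filename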