Crawler 2
import requests
from lxml import etree


class Spider(object):
    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
        }
        # A dict keeps only one value per key, so listing several proxies under
        # the same "https" key silently keeps just the last one. Keep them in a
        # list instead and pick one per request.
        self.proxy_pool = [
            {"https": "https://117.88.176.179:3000"},
            {"https": "https://120.83.111.153:9999"},
            {"https": "https://117.57.90.191:9999"},
            {"https": "https://117.57.91.36:9999"},
            {"http": "http://218.27.136.169:8085"},
        ]
        # self.url = ["http://www.1ppt.com/article/{}.html".format(i) for i in range(60000, 61200)]

    def Response(self):
        url_list = ["http://www.1ppt.com/xiazai/jianli/ppt_jianli_{}.html".format(i) for i in range(1, 7)]
        for url in url_list:
            response2 = requests.get(url=url, headers=self.headers)
            html2 = etree.HTML(response2.content)
            title_list1 = html2.xpath("//h2//a/@href")
            # print(title_list1)
            # The hrefs are site-relative (they start with "/"), so join without
            # adding a second slash.
            title_list = ["http://www.1ppt.com{}".format(i) for i in title_list1]
            for detail_url in title_list:
                response1 = requests.get(url=detail_url, headers=self.headers)
                html1 = etree.HTML(response1.content)
                down_list = html1.xpath('//ul[@class="downurllist"]/li/a/@href')
                # Pair this detail page with its own download links; zipping the
                # whole title_list against one page's links would misalign them.
                for down in down_list:
                    print({"title": detail_url, "down": down})


if __name__ == '__main__':
    data = Spider()
    data.Response()
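The proxy_pool above only helps if a proxy is actually chosen per request. A minimal rotation sketch, assuming the free proxies listed above are still alive (they very likely are not, so a direct fallback is included; the fetch helper is mine, not part of the original):

import random
import requests

PROXY_POOL = [
    {"https": "https://117.88.176.179:3000"},
    {"https": "https://120.83.111.153:9999"},
    {"http": "http://218.27.136.169:8085"},
]

def fetch(url, headers, retries=3):
    # Try a few randomly chosen proxies before falling back to a direct request.
    for _ in range(retries):
        proxy = random.choice(PROXY_POOL)
        try:
            return requests.get(url, headers=headers, proxies=proxy, timeout=5)
        except requests.RequestException:
            continue  # dead proxy: pick another
    return requests.get(url, headers=headers, timeout=5)  # last resort: go direct

# resp = fetch("http://www.1ppt.com/", {"user-agent": "Mozilla/5.0"})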
##########################################
import urllib.request
import json
import jsonpath
import time

end_page = int(input('Enter the last page number to crawl: '))
fp = open('京东.txt', 'a', encoding='utf8')
for i in range(0, end_page + 1):
    print('Crawling page %s ------' % (i + 1))
    url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv385&productId=52322470877&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1'
    # The same endpoint for another product, kept for reference:
    # 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv319&productId=10421264905&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
    url = url.format(i)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
        'Referer': 'https://item.jd.com/52322470877.html'
    }
    request = urllib.request.Request(url=url, headers=headers)
    content = urllib.request.urlopen(request).read().decode('gbk')
    # The response is JSONP: fetchJSON_comment98vv385({...});
    # str.strip() removes a *set of characters*, not a prefix/suffix string,
    # so slice out the JSON between the outermost parentheses instead.
    content = content[content.find('(') + 1:content.rfind(')')]
    obj = json.loads(content)
    comments = obj['comments']
    for comment in comments:
        # comment timestamp
        creationTime = comment['creationTime']
        # reviewer
        nickname = comment['nickname']
        # comment text
        contents = comment['content']
        # comment images, if any
        if 'images' in comment:
            img_src = jsonpath.jsonpath(comment, '$..images[*].imgUrl')
            img_src = 'https:' + str(img_src).strip('[]')
        else:
            img_src = 'no images'
        item = {
            'time': creationTime,
            'user': nickname,
            'content': contents,
            'images': img_src,
        }
        fp.write(str(item) + '\n')
    print('Page %s done ----------' % (i + 1))
    time.sleep(4)
fp.close()
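Slicing between the parentheses works here; a slightly more defensive sketch that unwraps any JSONP payload with a regular expression (the helper name is mine, not from the original):

import json
import re

def jsonp_to_obj(text):
    # Capture everything between the outermost parentheses of callback({...});
    match = re.search(r'\((.*)\)\s*;?\s*$', text, re.S)
    if match is None:
        raise ValueError('no JSONP payload found')
    return json.loads(match.group(1))

# obj = jsonp_to_obj(content)  # drop-in replacement for the slicing above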
##############################################################
from lxml import etree
import requests

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
}
# url = "https://list.youku.com/category/show/c_96.html"  # leftover from an earlier attempt
# Top250 has ten pages of 25 entries, so the last start offset is 225.
url_list = ["https://movie.douban.com/top250?start=" + str(i) for i in range(0, 250, 25)]
for url in url_list:
    # print(url)
    response = requests.get(url=url, headers=header)
    # print(response.text)
    html = etree.HTML(response.text)
    img_list = html.xpath("//img/@src")
    for num, i in enumerate(img_list):
        # Skip the last <img>, which is not a movie poster.
        if num == len(img_list) - 1:
            continue
        else:
            print("img : " + i)
    name_list = html.xpath('//span[@class="title"][1]/text()')
    for i in name_list:
        print("name : " + i)
    comment_list = html.xpath('//span[@class="inq"]/text()')
    for i in comment_list:
        print("comment : " + i)
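Printing three flat lists makes it hard to tell which poster belongs to which title, and entries without a one-line quote shift the lists out of step. A sketch that scopes every XPath query to one entry, assuming each movie sits in a div with class "item" as the Top250 markup did at the time:

import requests
from lxml import etree

response = requests.get("https://movie.douban.com/top250?start=0",
                        headers={"User-Agent": "Mozilla/5.0"})
html = etree.HTML(response.text)
for entry in html.xpath('//div[@class="item"]'):
    img = entry.xpath('.//img/@src')
    name = entry.xpath('.//span[@class="title"][1]/text()')
    quote = entry.xpath('.//span[@class="inq"]/text()')
    print({
        "img": img[0] if img else None,
        "name": name[0] if name else None,
        "comment": quote[0] if quote else None,  # not every entry has a quote
    })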
##################################################
import requests
from lxml import etree


class Data(object):
    def __init__(self):
        self.headers = {
            "Cookie": 'douban-fav-remind=1; ll="108298"; bid=vBm7Gy7w8MQ; _vwo_uuid_v2=D18B0595A558A9F72535543F7B037A02B|97bdf052ca68694e864689d0676333be; gr_user_id=2d757664-b854-484b-b66c-dfce323e7f1b; viewed="34859246_34857213_34879050_34841131_34880452"; __yadk_uid=ktT8c9ZmjwOy7FjaA4R1JZnQG2LN0cSS; __gads=Test; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1575698260%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DHVv6dn2owgf_vgkzfDJnHcO4FibAmywgjqMs0sOwh9MqOy8NCyZfB1Hs19Q8IU_N%26wd%3D%26eqid%3Dcc7af70000017d7e000000065de9f2a6%22%5D; _pk_ses.100001.4cf6=*; __utma=223695111.570648179.1531317756.1575698261.1575698595.8; __utmb=223695111.0.10.1575698595; __utmz=223695111.1575698595.8.4.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=30149280; __utmt=1; __utma=30149280.354220863.1531317756.1575698667.1575698696.12; __utmb=30149280.0.10.1575698696; __utmz=30149280.1575698696.12.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=223695111; _pk_id.100001.4cf6=7f924d4bc583765a.1531317756.6.1575698708.1575691096.',
            "Host": "movie.douban.com",
            "Referer": "https://movie.douban.com/top250?start=0&filter=",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
        }

    def get_response(self):
        return requests.get(self.url, headers=self.headers)

    def get_html(self, res):
        return etree.HTML(res.text)

    def get_data(self, html):
        items = html.xpath('//div[@class="pic"]//img/@src | //span[@class="title"][1]/text() | //span[@class="inq"]/text()')
        # Walk the flat result list three fields at a time; stop before running
        # off the end (the original hard-coded 72 assumed exactly 24 complete
        # entries and breaks when an entry has no quote).
        j = 0
        while j + 2 < len(items):
            info = {
                "img": items[j],
                "name": items[j + 1],
                "comment": items[j + 2]
            }
            print(info)
            j = j + 3

    def main(self):
        for i in range(0, 250, 25):
            self.url = "https://movie.douban.com/top250?start=" + str(i)
            res = self.get_response()
            html = self.get_html(res)
            self.get_data(html)


if __name__ == '__main__':
    data = Data()
    data.main()
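Each of the ten page fetches above opens a fresh connection and repeats the header setup. A small sketch reusing one requests.Session across pages (a common pattern, not part of the original class):

import requests

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
})

for start in range(0, 250, 25):
    res = session.get("https://movie.douban.com/top250", params={"start": start})
    res.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page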
##########################################
import requests
from lxml import etree


class Spider(object):
    def __init__(self):
        self.headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
        self.url = "http://www.1ppt.com/"
        self.proxies = {"https": "https://117.88.176.179:3000"}

    def response(self):
        response = requests.get(url=self.url, headers=self.headers, proxies=self.proxies)
        html = etree.HTML(response.content)
        # Named titles rather than list, to avoid shadowing the built-in list().
        titles = html.xpath("//h4//a/text()")
        for i in titles:
            print(i)


if __name__ == "__main__":
    spider = Spider()
    spider.response()
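The homepage headlines are more useful together with their links. A short sketch querying the <a> elements themselves so text and href stay paired (same page and XPath as above):

import requests
from lxml import etree

response = requests.get("http://www.1ppt.com/",
                        headers={"user-agent": "Mozilla/5.0"})
html = etree.HTML(response.content)
for a in html.xpath("//h4//a"):
    # string(.) flattens nested text nodes; .get() reads the attribute
    print(a.xpath("string(.)"), "->", a.get("href"))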
###########################################
import requests
from lxml import etree


class Spider(object):
    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
        }
        self.proxies = {"https": "https://117.88.176.179:3000"}
        self.url = ["http://www.1ppt.com/article/{}.html".format(i) for i in range(60000, 61200)]

    def Response(self):
        for url in self.url:
            response = requests.get(url=url, headers=self.headers, proxies=self.proxies)
            # print(response.status_code)
            html = etree.HTML(response.text)
            downlist = html.xpath('//ul[@class="downurllist"]/li/a/@href')
            if len(downlist) == 0:
                continue
            else:
                print(downlist)


if __name__ == '__main__':
    data = Spider()
    data.Response()
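Probing 1,200 consecutive article IDs without any delay or error handling is fragile and unfriendly to the site. A hedged sketch of the same scan with a timeout, a status check, and a throttle (the delay and timeout values are arbitrary choices of mine):

import time
import requests
from lxml import etree

def scan_articles(ids, headers, delay=0.5):
    # Probe article pages one by one, skipping IDs that error out or 404.
    for i in ids:
        url = "http://www.1ppt.com/article/{}.html".format(i)
        try:
            response = requests.get(url, headers=headers, timeout=5)
        except requests.RequestException:
            continue  # network/proxy hiccup: move on
        if response.status_code != 200:
            continue  # no article behind this ID
        html = etree.HTML(response.text)
        downlist = html.xpath('//ul[@class="downurllist"]/li/a/@href')
        if downlist:
            print(downlist)
        time.sleep(delay)  # throttle between requests

# scan_articles(range(60000, 61200), {"user-agent": "Mozilla/5.0"})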
####################################################
import requests
from lxml import etree


class Spider(object):
    def __init__(self):
        self.header = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
        self.proxie = {"https": "https://117.88.176.179:3000"}
        self.url = ["http://www.cssmoban.com/tags.asp?page={}&n=css3".format(i) for i in range(1, 23)]

    def Response(self):
        for url in self.url:
            response = requests.get(url=url, headers=self.header, proxies=self.proxie)
            html = etree.HTML(response.text)
            imglist = html.xpath("//img/@src")
            print(imglist)
if __name__ == '__main__':
    data = Spider()
    data.Response()
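The scraped src attributes on a page like this are often relative paths. A sketch resolving them into absolute URLs with the standard-library urljoin (the sample src values are illustrative, not scraped):

from urllib.parse import urljoin

page_url = "http://www.cssmoban.com/tags.asp?page=1&n=css3"
imglist = ["/images/logo.png", "https://cdn.example.com/a.jpg"]  # illustrative values
for src in imglist:
    # Relative paths resolve against the page URL; absolute ones pass through.
    print(urljoin(page_url, src))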
###################################################
import requests
from lxml import etree


class Spider(object):
    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
        }
        self.proxies = {"https": "https://117.88.176.179:3000"}
        # Pay close attention to the url: check that it is correct, and whether
        # it is a list of URLs or a single URL.
        self.url = ["http://www.1ppt.com/article/{}.html".format(i) for i in range(60000, 61200)]
        # self.url = "http://www.1ppt.com/"

    def Response(self):
        for url in self.url:
            response = requests.get(url=url, headers=self.headers, proxies=self.proxies)
            # print(response.status_code)
            html = etree.HTML(response.text)
            # titlelist = html.xpath("//h4//a/@href")
            # titlelist = html.xpath("//h4//a/text()")
            downlist = html.xpath('//ul[@class="downurllist"]/li/a/@href')
            if len(downlist) == 0:
                continue
            else:
                print(downlist)
if __name__ == '__main__':
    data = Spider()
    data.Response()
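The comment in __init__ warns that self.url is sometimes a single string and sometimes a list; iterating over a bare string yields characters, not URLs. A tiny helper sketch (the name is mine) that normalizes both cases:

def as_url_list(url):
    # A lone string would otherwise be iterated character by character.
    return [url] if isinstance(url, str) else list(url)

for u in as_url_list("http://www.1ppt.com/"):
    print(u)  # one whole URL, not twenty characters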