
[Repost] Web Crawler


https://www.jianshu.com/p/ddb45c8e3399

 


import requests
import lxml
import time

from bs4 import BeautifulSoup

from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.request import Request


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# data = bytes(urlencode({'word':'hello'}), encoding='utf-8')
# response = urlopen("http://www.baidu.com", data=data)
# print(type(response))
# print(response.status)
# print(response.read().decode('utf-8'))

# response = urlopen("http://www.baidu.com")
# print(type(response))
# print(response.status)
# print(response.getheaders())
# print(response.getheader('Server'))
# html = response.read()
# print(html)


# Open a URL with a timeout argument
# try:
#     # response = urlopen('http://httpbin.org/get', timeout=1)
#     response = urlopen('http://httpbin.org/get', timeout=0.1)
#     print(response.read())
# except Exception as e:
#     print(e)
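
# The bare Exception catch above hides what actually failed; a minimal sketch
# of checking for the timeout specifically (same httpbin URL assumed):
# import socket
# from urllib.error import URLError
# try:
#     response = urlopen('http://httpbin.org/get', timeout=0.1)
#     print(response.read())
# except URLError as e:
#     if isinstance(e.reason, socket.timeout):
#         print('TIME OUT')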

# Open a URL by building a Request object (example 1)
# request = Request('https://python.org')
# response = urlopen(request)
# print(response.read().decode('utf-8'))

# Open a URL by building a Request object (example 2)
# url = 'http://httpbin.org/post'
# headers = {
#     'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
#     'Host': 'httpbin.org'
# }
# dict = {'name': 'Germey'}
# data = bytes(urlencode(dict), encoding='utf8')
# req = Request(url=url, data=data, headers=headers, method='POST')
# response = urlopen(req)
# print(response.read().decode('utf-8'))

# url = 'http://httpbin.org/post'
# dict = {'name': 'Germey'}
# data = bytes(urlencode(dict), encoding='utf8')
# req = Request(url=url, data=data, method='POST')
# response = urlopen(req)
# print(response.read().decode('utf-8'))

# Using urlencode()
# url = 'http://httpbin.org/post'
# data = {'first':'true', 'pn':1, 'kd':'Python'}
# data = urlencode(data).encode('utf-8')
# req = Request(url=url, data=data, method='POST')
# page = urlopen(req).read()  # data is already attached to the Request
# print(page)


# requests -> params: dict or byte sequence  --> the added data shows up under "args" in the response
# payload = {'key1':'value', 'key2':'value2'}
# r = requests.request('GET', 'http://httpbin.org/get', params=payload)
# print(r.url)
# print(r.text)


# requests -> data: dict, byte sequence, or file object  --> the added data shows up under "form"
# payload = {'key1':'value', 'key2':'value2'}
# r = requests.request('POST', 'http://httpbin.org/post', data=payload)
# print(r.url)
# print(r.text)


# requests -> json: JSON-formatted data  --> the added data shows up under "json"
# payload =  {'key1':'value', 'key2':'value2'}
# r = requests.request('POST', 'http://httpbin.org/post', json=payload)
# print(r.url)
# print(r.text)
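
# A minimal sketch (assuming httpbin.org is reachable) confirming where each
# kind of payload is echoed back: "args" for params, "form" for data and
# "json" for json:
# payload = {'key1': 'value1', 'key2': 'value2'}
# r = requests.get('http://httpbin.org/get', params=payload)
# print(r.json()['args'])
# r = requests.post('http://httpbin.org/post', data=payload)
# print(r.json()['form'])
# r = requests.post('http://httpbin.org/post', json=payload)
# print(r.json()['json'])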


# payload = {'key1': 'value1', 'key2': 'value2'}
# headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"}
# r = requests.request('GET', 'http://httpbin.org/get', params=payload, headers=headers)
# print(r.url)
# print(r.text)


# cs_user = 'username'
# cs_psw = 'password'
# r = requests.request('GET', 'https://api.github.com', auth=(cs_user, cs_psw))
# print(r.url)
# print(r.text)

# proxies = {
#     'https': 'http://41.118.132.69:4433'
# }
#
# r = requests.request('GET', 'https://api.github.com', proxies = proxies)
# print(r.url)
# print(r.text)


# url = 'http://www.amazon.cn/gp/product/B01M8L5Z3Y'
# try:
#     kv = {'User-Agent': 'Mozilla/5.0'}
#     r = requests.get(url, headers=kv)  # send our own request headers
#     r.raise_for_status()
#     r.encoding = r.apparent_encoding
#     print(r.text[1000:2000])  # print a slice of the page
# except Exception:
#     print("request failed")
#

# keyword = 'python'
# try:
#     kv = {'wd': keyword}
#     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"}
#     r = requests.get('https://www.baidu.com/s', params=kv, headers=headers)
#     r.raise_for_status()
#     r.encoding = r.apparent_encoding
#     # print(len(r.text))
#     print(r.text)
# except Exception:
#     print("request failed")


# try:
#     url = "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1567500795590&di=9f0c7eda33448553c317d5526dcc00d1&imgtype=0&src=http%3A%2F%2Fimg4.duitang.com%2Fuploads%2Fblog%2F201402%2F12%2F20140212200704_WcTiF.thumb.700_0.jpeg"
#     root = "/Users/synyi/PycharmProjects/For_testing/Crawler/"
#     path = root + url.split("%2F")[-1]  # the last path segment already ends in .jpeg
#     r = requests.get(url)
#     r.raise_for_status()
#     with open(path, 'wb') as f:
#         f.write(r.content)
# except Exception:
#     print('download failed')
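
# A hedged alternative for larger binaries: stream the download instead of
# holding the whole body in memory (reuses the url and path variables above):
# r = requests.get(url, stream=True)
# r.raise_for_status()
# with open(path, 'wb') as f:
#     for chunk in r.iter_content(chunk_size=1024):
#         f.write(chunk)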


# beautiful soup
# html_doc = """
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <p class="title"><b>The Dormouse's story</b></p>
#
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
#
# <p class="story">...</p>
# """
# soup = BeautifulSoup(html_doc, 'lxml')
# print(soup.prettify())
# print('\n------------\n')
# # Find the link (href) of every <a> tag
# for link in soup.find_all('a'):
#     print(link.get('href'))
#
# # All of the text content
# print(soup.get_text())
#
# # Combined with a regular expression
# import re
# for tag in soup.find_all(href=re.compile("elsie")):
#     print(tag)
#
# for tag in soup.find_all("a", class_ = "sister"):
#     print(tag)
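
# # The same queries can also be written as CSS selectors with select();
# # a minimal sketch against the html_doc soup above:
# for tag in soup.select('p.story > a.sister'):
#     print(tag.get('href'))
# print(soup.select_one('#link2').get_text())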


# example
# html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
# bsObj = BeautifulSoup(html.read(), 'lxml')
# print(bsObj)
# print(bsObj.div)

# html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
# bsObj = BeautifulSoup(html, 'lxml')
# print(bsObj.prettify())
# nameList = bsObj.findAll('span', {'class':'green'})
# for name in nameList:
#     print(name.get_text())

# html = urlopen("http://www.pythonscraping.com/pages/page3.html")
# bsObj = BeautifulSoup(html, 'lxml')
# print(bsObj.prettify())

# html = urlopen(r'https://en.wikipedia.org/wiki/Kevin_Bacon')
# bsObj = BeautifulSoup(html, 'lxml')
# print(bsObj.prettify())
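
# A short follow-up sketch in the spirit of the pythonscraping examples:
# collect only internal /wiki/ article links from the same Kevin Bacon page
# (the regex is an assumption about what counts as an article link):
# import re
# html = urlopen(r'https://en.wikipedia.org/wiki/Kevin_Bacon')
# bsObj = BeautifulSoup(html, 'lxml')
# for link in bsObj.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
#     print(link.attrs['href'])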



# browser = webdriver.Chrome()
#
# try:
#     browser.get("http://www.baidu.com/")
#     input_box = browser.find_element_by_id('kw')
#     input_box.send_keys('Python')
#     input_box.send_keys(Keys.ENTER)
#     wait = WebDriverWait(browser, 10)
#     wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
#
#     print(browser.current_url)
#     print('\n-------------------------------\n')
#     print(browser.get_cookies())
#     print('\n-------------------------------\n')
#     print(browser.page_source)
#
# finally:
#     browser.quit()

# driver = webdriver.PhantomJS(executable_path = "/Users/synyi/Documents/phantomjs-2.1.1-macosx/bin/phantomjs")
# driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
# driver.get("https://db.yaozh.com/goods?p=7&pageSize=30")
# a = 1
# print(driver.find_element_by_id("content").text)

# time.sleep(3)
# print(driver.find_element_by_id("content").text)
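
# A hedged alternative to the fixed time.sleep(3): wait explicitly for the
# Ajax call to finish (this assumes the demo page adds an element with
# id "loadedButton" once loading completes, as in the pythonscraping examples):
# driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
# WebDriverWait(driver, 10).until(
#     EC.presence_of_element_located((By.ID, 'loadedButton')))
# print(driver.find_element_by_id("content").text)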



# from selenium.common.exceptions import StaleElementReferenceException
# def waitForLoad(driver):
#     # Keep a reference to the current <html> element; once the page
#     # redirects, that reference goes stale and raises
#     # StaleElementReferenceException, which signals the new page is in.
#     elem = driver.find_element_by_tag_name("html")
#     count = 0
#     while True:
#         count += 1
#         if count > 20:
#             print('Timing out after 10 seconds and returning')
#             return
#         time.sleep(.5)
#         try:
#             elem == driver.find_element_by_tag_name("html")
#         except StaleElementReferenceException:
#             return
#
# driver = webdriver.PhantomJS(executable_path = "/Users/synyi/Documents/phantomjs-2.1.1-macosx/bin/phantomjs")
# driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
# print(driver.page_source)
# print('--------------------')
# waitForLoad(driver)
# print(driver.page_source)

# from PIL import Image, ImageFilter
# kitten = Image.open("/Users/synyi/PycharmProjects/For_testing/Crawler/totoro.jpeg")
# blurryKitten = kitten.filter(ImageFilter.GaussianBlur)
# blurryKitten.save("/Users/synyi/PycharmProjects/For_testing/Crawler/totoro-2.jpeg")
# blurryKitten.show()
# a = 1

# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--disable-gpu")
#
# driver = webdriver.Chrome(chrome_options=chrome_options)
# driver.get('https://www.baidu.com/')
# print('open chrome')
# a = 1
# print(driver.title)
# driver.find_element_by_id("kw").send_keys("测试")
# print('closing browser')
# driver.quit()
# print("ok")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.set_window_size(1366, 768)
# get() blocks until the page has loaded before the program continues;
# in tests people often just add time.sleep(2) here instead
driver.get("http://www.baidu.com/")
# Get the text of the element whose id is 'wrapper'
data = driver.find_element_by_id('wrapper').text
# Print the scraped text, then shut the browser down
print(data)
driver.quit()