
[Repost] Web Crawler


https://www.jianshu.com/p/ddb45c8e3399

 


import requests
import lxml
import time

from bs4 import BeautifulSoup

from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.request import Request


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# data = bytes(urlencode({'word':'hello'}), encoding='utf-8')
# response = urlopen("http://www.baidu.com", data=data)
# print(type(response))
# print(response.status)
# print(response.read().decode('utf-8'))

# response = urlopen("http://www.baidu.com")
# print(type(response))
# print(response.status)
# print(response.getheaders())
# print(response.getheader('Server'))
# html = response.read()
# print(html)


# Open a URL with a timeout argument
# try:
#     # response = urlopen('http://httpbin.org/get', timeout=1)
#     response = urlopen('http://httpbin.org/get', timeout=0.1)
#     print(response.read())
# except Exception as e:
#     print(e)
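
# The bare Exception catch above hides what actually failed; a minimal sketch
# of checking for the timeout specifically (same httpbin URL assumed):
# import socket
# from urllib.error import URLError
# try:
#     response = urlopen('http://httpbin.org/get', timeout=0.1)
#     print(response.read())
# except URLError as e:
#     if isinstance(e.reason, socket.timeout):
#         print('TIME OUT')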

# Open a URL by building a Request object (example 1)
# request = Request('https://python.org')
# response = urlopen(request)
# print(response.read().decode('utf-8'))

# Open a URL by building a Request object (example 2)
# url = 'http://httpbin.org/post'
# headers = {
#     'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
#     'Host': 'httpbin.org'
# }
# dict = {'name': 'Germey'}
# data = bytes(urlencode(dict), encoding='utf8')
# req = Request(url=url, data=data, headers=headers, method='POST')
# response = urlopen(req)
# print(response.read().decode('utf-8'))

# url = 'http://httpbin.org/post'
# dict = {'name': 'Germey'}
# data = bytes(urlencode(dict), encoding='utf8')
# req = Request(url=url, data=data, method='POST')
# response = urlopen(req)
# print(response.read().decode('utf-8'))

# Using urlencode()
# url = 'http://httpbin.org/post'
# data = {'first':'true', 'pn':1, 'kd':'Python'}
# data = urlencode(data).encode('utf-8')
# req = Request(url=url, data=data, method='POST')
# page = urlopen(req).read()  # data is already attached to the Request
# print(page)


# requests -> params: dict or byte sequence  --> the added data shows up under "args" in the response
# payload = {'key1':'value', 'key2':'value2'}
# r = requests.request('GET', 'http://httpbin.org/get', params=payload)
# print(r.url)
# print(r.text)


# requests -> data: dict, byte sequence, or file object  --> the added data shows up under "form"
# payload = {'key1':'value', 'key2':'value2'}
# r = requests.request('POST', 'http://httpbin.org/post', data=payload)
# print(r.url)
# print(r.text)


# requests -> json: JSON-formatted data  --> the added data shows up under "json"
# payload =  {'key1':'value', 'key2':'value2'}
# r = requests.request('POST', 'http://httpbin.org/post', json=payload)
# print(r.url)
# print(r.text)
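
# A minimal sketch (assuming httpbin.org is reachable) confirming where each
# kind of payload is echoed back: "args" for params, "form" for data and
# "json" for json:
# payload = {'key1': 'value1', 'key2': 'value2'}
# r = requests.get('http://httpbin.org/get', params=payload)
# print(r.json()['args'])
# r = requests.post('http://httpbin.org/post', data=payload)
# print(r.json()['form'])
# r = requests.post('http://httpbin.org/post', json=payload)
# print(r.json()['json'])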


# payload = {'key1': 'value1', 'key2': 'value2'}
# headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"}
# r = requests.request('GET', 'http://httpbin.org/get', params=payload, headers=headers)
# print(r.url)
# print(r.text)


# cs_user = 'username'
# cs_psw = 'password'
# r = requests.request('GET', 'https://api.github.com', auth=(cs_user, cs_psw))
# print(r.url)
# print(r.text)

# proxies = {
#     'https': 'http://41.118.132.69:4433'
# }
#
# r = requests.request('GET', 'https://api.github.com', proxies = proxies)
# print(r.url)
# print(r.text)


# url = 'http://www.amazon.cn/gp/product/B01M8L5Z3Y'
# try:
#     kv = {'User-Agent': 'Mozilla/5.0'}
#     r = requests.get(url, headers=kv)  # send our own request headers
#     r.raise_for_status()
#     r.encoding = r.apparent_encoding
#     print(r.text[1000:2000])  # print a slice of the page
# except Exception:
#     print("request failed")
#

# keyword = 'python'
# try:
#     kv = {'wd': keyword}
#     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"}
#     r = requests.get('https://www.baidu.com/s', params=kv, headers=headers)
#     r.raise_for_status()
#     r.encoding = r.apparent_encoding
#     # print(len(r.text))
#     print(r.text)
# except Exception:
#     print("request failed")


# try:
#     url = "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1567500795590&di=9f0c7eda33448553c317d5526dcc00d1&imgtype=0&src=http%3A%2F%2Fimg4.duitang.com%2Fuploads%2Fblog%2F201402%2F12%2F20140212200704_WcTiF.thumb.700_0.jpeg"
#     root = "/Users/synyi/PycharmProjects/For_testing/Crawler/"
#     path = root + url.split("%2F")[-1]  # the last path segment already ends in .jpeg
#     r = requests.get(url)
#     r.raise_for_status()
#     with open(path, 'wb') as f:
#         f.write(r.content)
# except Exception:
#     print('download failed')
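
# A hedged alternative for larger binaries: stream the download instead of
# holding the whole body in memory (reuses the url and path variables above):
# r = requests.get(url, stream=True)
# r.raise_for_status()
# with open(path, 'wb') as f:
#     for chunk in r.iter_content(chunk_size=1024):
#         f.write(chunk)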


# beautiful soup
# html_doc = """
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <p class="title"><b>The Dormouse's story</b></p>
#
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
#
# <p class="story">...</p>
# """
# soup = BeautifulSoup(html_doc, 'lxml')
# print(soup.prettify())
# print('\n------------\n')
# # Find the link (href) of every <a> tag
# for link in soup.find_all('a'):
#     print(link.get('href'))
#
# # All of the text content
# print(soup.get_text())
#
# # Combined with a regular expression
# import re
# for tag in soup.find_all(href=re.compile("elsie")):
#     print(tag)
#
# for tag in soup.find_all("a", class_ = "sister"):
#     print(tag)
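
# # The same queries can also be written as CSS selectors with select();
# # a minimal sketch against the html_doc soup above:
# for tag in soup.select('p.story > a.sister'):
#     print(tag.get('href'))
# print(soup.select_one('#link2').get_text())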


# example
# html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
# bsObj = BeautifulSoup(html.read(), 'lxml')
# print(bsObj)
# print(bsObj.div)

# html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
# bsObj = BeautifulSoup(html, 'lxml')
# print(bsObj.prettify())
# nameList = bsObj.findAll('span', {'class':'green'})
# for name in nameList:
#     print(name.get_text())

# html = urlopen("http://www.pythonscraping.com/pages/page3.html")
# bsObj = BeautifulSoup(html, 'lxml')
# print(bsObj.prettify())

# html = urlopen(r'https://en.wikipedia.org/wiki/Kevin_Bacon')
# bsObj = BeautifulSoup(html, 'lxml')
# print(bsObj.prettify())
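
# A short follow-up sketch in the spirit of the pythonscraping examples:
# collect only internal /wiki/ article links from the same Kevin Bacon page
# (the regex is an assumption about what counts as an article link):
# import re
# html = urlopen(r'https://en.wikipedia.org/wiki/Kevin_Bacon')
# bsObj = BeautifulSoup(html, 'lxml')
# for link in bsObj.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
#     print(link.attrs['href'])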



# browser = webdriver.Chrome()
#
# try:
#     browser.get("http://www.baidu.com/")
#     input_box = browser.find_element_by_id('kw')
#     input_box.send_keys('Python')
#     input_box.send_keys(Keys.ENTER)
#     wait = WebDriverWait(browser, 10)
#     wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
#
#     print(browser.current_url)
#     print('\n-------------------------------\n')
#     print(browser.get_cookies())
#     print('\n-------------------------------\n')
#     print(browser.page_source)
#
# finally:
#     browser.quit()

# driver = webdriver.PhantomJS(executable_path = "/Users/synyi/Documents/phantomjs-2.1.1-macosx/bin/phantomjs")
# driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
# driver.get("https://db.yaozh.com/goods?p=7&pageSize=30")
# a = 1
# print(driver.find_element_by_id("content").text)

# time.sleep(3)
# print(driver.find_element_by_id("content").text)
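
# A hedged alternative to the fixed time.sleep(3): wait explicitly for the
# Ajax call to finish (this assumes the demo page adds an element with
# id "loadedButton" once loading completes, as in the pythonscraping examples):
# driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
# WebDriverWait(driver, 10).until(
#     EC.presence_of_element_located((By.ID, 'loadedButton')))
# print(driver.find_element_by_id("content").text)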



# from selenium.common.exceptions import StaleElementReferenceException
# def waitForLoad(driver):
#     # Keep a reference to the current <html> element; once the page
#     # redirects, that reference goes stale and raises
#     # StaleElementReferenceException, which signals the new page is in.
#     elem = driver.find_element_by_tag_name("html")
#     count = 0
#     while True:
#         count += 1
#         if count > 20:
#             print('Timing out after 10 seconds and returning')
#             return
#         time.sleep(.5)
#         try:
#             elem == driver.find_element_by_tag_name("html")
#         except StaleElementReferenceException:
#             return
#
# driver = webdriver.PhantomJS(executable_path = "/Users/synyi/Documents/phantomjs-2.1.1-macosx/bin/phantomjs")
# driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
# print(driver.page_source)
# print('--------------------')
# waitForLoad(driver)
# print(driver.page_source)

# from PIL import Image, ImageFilter
# kitten = Image.open("/Users/synyi/PycharmProjects/For_testing/Crawler/totoro.jpeg")
# blurryKitten = kitten.filter(ImageFilter.GaussianBlur)
# blurryKitten.save("/Users/synyi/PycharmProjects/For_testing/Crawler/totoro-2.jpeg")
# blurryKitten.show()
# a = 1

# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--disable-gpu")
#
# driver = webdriver.Chrome(chrome_options=chrome_options)
# driver.get('https://www.baidu.com/')
# print('open chrome')
# a = 1
# print(driver.title)
# driver.find_element_by_id("kw").send_keys("测试")
# print('closing browser')
# driver.quit()
# print("ok")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.set_window_size(1366, 768)
# get() blocks until the page has loaded before the program continues;
# in tests people often just add time.sleep(2) here instead
driver.get("http://www.baidu.com/")
# Get the text of the element whose id is 'wrapper'
data = driver.find_element_by_id('wrapper').text
# Print the scraped text, then shut the browser down
print(data)
driver.quit()