(1)爬虫笔记备份
程序员文章站
2022-06-26 08:46:38
'''第一天import requestsfrom urllib.request import urlopenurl = 'http://quote.eastmoney.com/us/BIDU.html?from=BaiduAladdin'response = urlopen(url)info = response.read()print(info.decode())print(response.info())''''''动态UApip install fake_userag...
'''
第一天
import requests
from urllib.request import urlopen
url = 'http://quote.eastmoney.com/us/BIDU.html?from=BaiduAladdin'
response = urlopen(url)
info = response.read()
print(info.decode())
print(response.info())
'''
'''
动态UA
pip install fake_useragent
from fake_useragent import UserAgent
ua=UserAgent()
print(ua.chrome)
from urllib.request import urlopen
from urllib.request import Request
from random import choice
url = 'http://quote.eastmoney.com/us/BIDU.html?from=BaiduAladdin'
user_agents=['Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',
'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
'Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11']
print(choice(user_agents))
headers={
'User-Agent':choice(user_agents)
}
request=Request(url,headers=headers)
response=urlopen(request)
info=response.read()
print(info.decode())
'''
'''
搜索中文转码1
from urllib.request import urlopen
from urllib.request import Request
from urllib.parse import quote
print(quote('历史'))
url = 'https://www.baidu.com/s?wd={}'.format(quote('历史'))
headers={
'User-Agent':'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'
}
print(url)
request = Request(url,headers=headers)
response= urlopen(request)
print(response.read().decode())
'''
'''
搜索中文转码2
from urllib.request import urlopen
from urllib.request import Request
from urllib.parse import urlencode
arg={
'wd':'历史',
'ie':'utf-8'
}
print(urlencode(arg))
url = 'https://www.baidu.com/s?{}'.format(urlencode(arg))
headers={
'User-Agent':'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'
}
print(url)
request = Request(url,headers=headers)
response= urlopen(request)
print(response.read().decode())
'''
'''
爬贴吧
'''
from urllib.request import urlopen
from urllib.request import Request
from urllib.parse import urlencode
from random import choice
def get_html(url):
    """Fetch ``url`` and return the raw response body.

    A ``User-Agent`` header is chosen at random from a small fixed pool on
    every call, then the request is sent with ``urlopen``.

    Args:
        url: Address to request; handed unchanged to ``Request``.

    Returns:
        The undecoded bytes produced by ``response.read()``.

    Exceptions raised by ``urlopen`` are not caught here and propagate to
    the caller.
    """
    user_agents = [
        'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',
        'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
        'Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11',
    ]
    headers = {
        'User-Agent': choice(user_agents),
    }
    request = Request(url, headers=headers)
    # Open the response as a context manager so it is closed as soon as the
    # body has been read, instead of being left open until garbage collection.
    with urlopen(request) as response:
        return response.read()
def save_html(filename, html_bytes):
    """Write ``html_bytes`` to ``filename`` in binary mode.

    The file is opened with mode ``'wb'``, so an existing file of the same
    name is overwritten.

    Args:
        filename: Path of the file to create or overwrite.
        html_bytes: Bytes to store; written as-is, without decoding.
    """
    with open(filename, 'wb') as f:
        f.write(html_bytes)
def main():
    """Interactively download pages of a Baidu Tieba forum to local files.

    Reads a keyword and a page count from standard input. For each page
    index ``pn`` in ``range(int(num))`` it builds a query string with
    ``pn`` set to ``pn * 50`` and ``kw`` set to the keyword, prints the
    resulting URL, fetches it with ``get_html`` and stores the bytes with
    ``save_html`` in a file named after the 1-based page number.

    A non-numeric page count makes ``int(num)`` raise ``ValueError``; that
    error is not handled here.
    """
    content = input('download')
    num = input('num')
    base_url = 'https://tieba.baidu.com/f?ie=utf-8&{}'
    for pn in range(int(num)):
        query = urlencode({
            'pn': pn * 50,
            'kw': content,
        })
        # Build the URL once and reuse it for both the log line and the fetch.
        url = base_url.format(query)
        print(url)
        html_bytes = get_html(url)
        filename = '第' + str(pn + 1) + '页.html'
        print('正在下载' + filename)
        save_html(filename, html_bytes)
# Run the interactive downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
本文地址:https://blog.csdn.net/qq_42830971/article/details/107154486
上一篇: 通用网页播放器