网络爬虫的简易实现(1)
程序员文章站
2022-06-05 18:46:19
...
这个爬虫主要实现对http://pic.yesky.com这个网站图片的爬取;
import urllib
import urllib2
import re
import time
from bs4 import BeautifulSoup
# HTTP request headers that make the crawler look like a normal desktop browser.
send_headers = {}
# The Host header must be the bare host name — no "http://" scheme prefix
# (the original value "http://pic.yesky.com" is not a valid Host value).
send_headers["Host"] = "pic.yesky.com"
send_headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.3228.1 Safari/537.36"
send_headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
send_headers["Connection"] = "keep-alive"
# Fetch the gallery front page with the spoofed browser headers, then parse it.
# BUG FIX: the original built send_headers but never attached it to the
# request, so the "header spoofing" never actually happened.
req = urllib2.Request("http://pic.yesky.com", headers=send_headers)
f = urllib2.urlopen(req)
try:
    html = f.read()
finally:
    f.close()  # release the socket even if read() fails
print(html)
# Name the parser explicitly so BeautifulSoup does not guess (and warn).
soup = BeautifulSoup(html, "html.parser")
yy = soup.select("img")  # collect every <img> tag on the page
print(yy)
ruffix = "jpg"  # fallback file suffix when one cannot be extracted from the URL
i = 1  # sequential number used to name the saved files
for temp in yy:
    print('-' * 50)
    print(temp.prettify())
    # Encode the unicode src to a byte string before slicing for the suffix.
    # (Renamed from `str`, which shadowed the builtin.)
    src = temp['src'].encode('gbk')
    dot = src.rfind('.')
    # Guard against URLs with no dot: the original slice str[-1:3] would
    # silently produce a bogus suffix; keep the previous/fallback one instead.
    if dot != -1:
        ruffix = src[dot:dot + 4]  # e.g. ".jpg", ".png"
    print(ruffix)
    print(type(src))
    # Only absolute http(s) URLs can be fetched directly; skip relative ones.
    if src.find("htt") != -1:
        print(temp['src'])
        # Save the image to the local collection directory, numbered in order.
        urllib.urlretrieve(temp['src'], filename="/home/lxt/Desktop/pach/4399Pic/%d%s" % (i, ruffix))
        i += 1
        time.sleep(1)  # be polite to the server: at most one download per second