欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页

网络爬虫的简易实现(1)

程序员文章站 2022-06-05 18:46:19
...

这个爬虫主要实现对 http://pic.yesky.com 这个网站上图片的爬取(代码基于 Python 2,使用 urllib2 和 BeautifulSoup):

import os
import re
import time
import urllib
import urllib2

from bs4 import BeautifulSoup
# Request headers that make the crawler look like an ordinary desktop browser.
send_headers = {}
# The Host header carries only the host name; a "http://" prefix is not a
# valid Host value.
send_headers["Host"] = "pic.yesky.com"
send_headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.3228.1 Safari/537.36"
send_headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
send_headers["Connection"] = "keep-alive"


# Build the request with the disguised headers attached; without the
# `headers` argument the dictionary above is never sent.
req = urllib2.Request("http://pic.yesky.com", headers=send_headers)
f = urllib2.urlopen(req)
try:
    html = f.read()
finally:
    # Release the connection even if reading the body fails.
    f.close()
print(html)
# Name the parser explicitly so the parse result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(html, "html.parser")
yy = soup.select("img")    # look up elements by tag name: every <img> on the page
print(yy)
ruffix = "jpg"     # file suffix used when saving; reassigned per image later


def _image_suffix(url, default=".jpg"):
    """Return the file extension (with leading dot) of the file named by *url*.

    The query string and fragment are ignored, and only the last path
    segment is inspected, so dots in the host name or in parameters are
    never mistaken for an extension. *default* is returned when the URL
    has no path or its last segment has no usable extension.
    """
    path = url.split("#", 1)[0].split("?", 1)[0]
    location = path.split("://", 1)[-1]
    if "/" not in location:
        # Host only, e.g. "http://example.com": there is no file name.
        return default
    basename = location.rsplit("/", 1)[1]
    dot = basename.rfind(".")
    if dot <= 0 or dot == len(basename) - 1:
        return default
    return basename[dot:]


# Directory the images are saved into; create it so the first download does
# not fail on a machine where it does not exist yet.
save_dir = "/home/lxt/Desktop/pach/4399Pic"
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

i = 1    # running number used as the local file name
for temp in yy:
    print('-'*50)
    print(temp.prettify())
    # <img> tags are not guaranteed to carry a src attribute, so use .get()
    # instead of indexing.
    src = temp.get('src')
    # Only absolute http(s) URLs can be fetched directly.
    if not src or not src.startswith(("http://", "https://")):
        continue
    ruffix = _image_suffix(src)
    print(ruffix)
    print(src)
    try:
        # Save the image to the local directory.
        urllib.urlretrieve(src, filename=os.path.join(save_dir, "%d%s" % (i, ruffix)))
    except IOError as err:
        # One unreachable image should not abort the whole crawl.
        print("download failed: %s (%s)" % (src, err))
        continue
    i += 1
    time.sleep(1)    # pause between downloads


相关标签: python