Learning notes on scraping static web pages
#Suppress warnings (e.g. BeautifulSoup's "no parser was explicitly specified" warning)
import warnings
warnings.filterwarnings("ignore")
First look at web scraping
Creating a BeautifulSoup object
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://pythonscraping.com/pages/page1.html")
bs0bj=BeautifulSoup(html.read())
print(bs0bj)
print(bs0bj.h1)
#print(bs0bj.h1) prints the first <h1> tag on the page together with its contents
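#The same <h1> can also be reached by walking the tree explicitly; a minimal sketch (the parser is named explicitly here, which is optional):
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://pythonscraping.com/pages/page1.html")
bs0bj=BeautifulSoup(html.read(),"html.parser")
#All of these point at the same first <h1> tag
print(bs0bj.h1)
print(bs0bj.html.body.h1)
print(bs0bj.body.h1)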
Advanced HTML parsing
findAll()
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html)
#创建一个beautifulsoup对象
namelist=bs0bj.findAll("span",{"class":"green"})
for name in namelist:
    print(name.get_text())
#Extract the span tags; the .get_text() function strips out all tags and returns a string containing only the text
#namelist1=bs0bj.findAll(text="the price")
BeautifulSoup's find() and findAll()
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html)
allText=bs0bj.findAll("",{"id":"text"})
for i in allText:
    print(i.get_text())
#findAll(tag, attributes, recursive, text, limit, keywords)
#find(tag, attributes, recursive, text, keywords)   #find is simply findAll with limit=1
#The three lines above are equivalent to using a keyword argument:
#allText=bs0bj.findAll(id="text")
#print(allText[0].get_text())
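#A short sketch of the limit and keyword-argument forms, assuming the same warandpeace.html page as above:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html,"html.parser")
#limit=2 stops after the first two matches; find() is just findAll(...) with limit=1
firstTwo=bs0bj.findAll("span",{"class":"green"},limit=2)
print(len(firstTwo))
#Keyword arguments filter on attributes directly (class_ avoids Python's reserved word "class")
greens=bs0bj.findAll(class_="green")
print(greens[0].get_text())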
Handling child tags (.children)
#BeautifulSoup functions always operate on the descendants of the currently selected tag
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)
for child in bs0bj.find("table",{"id":"giftList"}).children:
    print(child)
Handling sibling tags (.next_siblings)
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)
for sibling in bs0bj.find("table",{"id":"giftList"}).tr.next_siblings:
    print(sibling)
#next_sibling and previous_sibling work similarly but return a single tag rather than a group of tags
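#A minimal sketch contrasting .next_siblings with the single-node .next_sibling (same page3.html table assumed):
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj=BeautifulSoup(html,"html.parser")
firstRow=bs0bj.find("table",{"id":"giftList"}).tr
#.next_sibling returns only the very next node, which is often a whitespace text node rather than a tag
print(repr(firstRow.next_sibling))
#.next_siblings is a generator over everything that follows the current tag
print(len(list(firstRow.next_siblings)))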
Handling parent tags (.parent)
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)
print(bs0bj.find("img",{"src":"../img/gifts/img1.jpg"
}).parent.previous_sibling.get_text())
Regular expressions and BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)
images=bs0bj.findAll("img",{"src":re.compile("\.\.\/img\/gifts\/img.*\.jpg")})
for image in images:
    print(image["src"])
Lambda expressions
#soup.findAll(lambda tag: len(tag.attrs)==2)
#In the expression above:
#myTag.attrs retrieves all of a tag's attributes
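#A runnable sketch of the lambda form; here the lambda reproduces the earlier class-based search by hand (warandpeace.html assumed):
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html,"html.parser")
#findAll also accepts a function that takes a tag and returns True/False
greens=bs0bj.findAll(lambda tag: tag.name=="span" and tag.get("class")==["green"])
for tag in greens:
    print(tag.get_text())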
Starting to crawl
Traversing a single domain
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj=BeautifulSoup(html)
for line in bs0bj.findAll("a"):
    if "href" in line.attrs:
        print(line.attrs["href"])
#Only keep the links that point to article pages (ignore the sidebar, header, and footer links)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj=BeautifulSoup(html)
for line in bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile("^\/wiki\/((?!:).)*$")}):
    if "href" in line.attrs:
        print(line.attrs["href"])
#find() is used for the div because it returns the single bodyContent tag;
#findAll() is then used for the links because it returns a list of every matching <a> tag,
#which is exactly what the loop iterates over to read line.attrs["href"] from
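#A tiny illustration of that difference (same Kevin_Bacon page assumed): find() gives one Tag, findAll() gives a list-like ResultSet
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj=BeautifulSoup(html,"html.parser")
body=bs0bj.find("div",{"id":"bodyContent"})   #a single Tag
links=body.findAll("a")                       #a ResultSet (behaves like a list) of Tags
print(type(body))
print(type(links),len(links))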
#Seed the random-number generator and wrap the link extraction in a function so we can keep pulling links from each new page
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import datetime
random.seed(datetime.datetime.now())
def getlines(articleUrl):
    html=urlopen("http://en.wikipedia.org"+articleUrl)
    bs0bj=BeautifulSoup(html)
    return bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile("^\/wiki\/((?!:).)*$")})
links=getlines("/wiki/Kevin_Bacon")
print(links)
while len(links)>0:
    newArticle=links[random.randint(0,len(links)-1)].attrs["href"]
    print(newArticle)
    links=getlines(newArticle)
Crawling an entire site
#To avoid collecting the same page twice, the links need to be deduplicated
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
pages=set()
def getlinks(pageUrl):
    global pages
    html=urlopen("http://en.wikipedia.org"+pageUrl)
    bs0bj=BeautifulSoup(html)
    for link in bs0bj.findAll("a",{"href":re.compile("^(\/wiki\/)")}):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                newpage=link.attrs["href"]
                print(newpage)
                pages.add(newpage)
                getlinks(newpage)
getlinks("")
#global lets the function assign to the pages variable defined outside the function
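#A minimal standalone sketch of how global works (hypothetical names):
counter=0
def bump():
    global counter   #without this line, counter=counter+1 below would raise UnboundLocalError
    counter=counter+1
bump()
bump()
print(counter)   #2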
#Collect data across the entire site
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
pages=set()
def getlinks(pageUrl):
    global pages
    html=urlopen("http://en.wikipedia.org"+pageUrl)
    bs0bj=BeautifulSoup(html)
    try:
        print(bs0bj.h1.get_text())
        print(bs0bj.find(id="mw-content-text").findAll("p")[0])
        print(bs0bj.find(id="ca-edit").find("span").find("a").attrs["href"])
    except AttributeError:
        print("This page is missing some attributes! No worries, though.")
    for link in bs0bj.findAll("a",{"href":re.compile("^(\/wiki\/)")}):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                newpage=link.attrs["href"]
                print(newpage)
                pages.add(newpage)
                getlinks(newpage)
getlinks("")
#The print statements are ordered by how likely each piece of data is to appear on a page
#Pick a random external link; if a page has none, look around the site for one
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random
pages = set()
random.seed(datetime.datetime.now())
#Retrieves a list of all Internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme+"://"+urlparse(includeUrl).netloc
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith("/")):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
#Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    #Finds all links that start with "http" or "www" that do
    #not contain the current URL
    for link in bsObj.findAll("a", href=re.compile(
            "^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html,"html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links, looking around the site for one")
        domain = urlparse(startingPage).scheme+"://"+urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: "+externalLink)
    followExternalOnly(externalLink)
followExternalOnly("http://oreilly.com")
#bsObj = BeautifulSoup(html,"html.parser")
#The trailing "html.parser" argument can be omitted, but BeautifulSoup then warns that it has to guess the parser (which is why warnings are filtered at the top)
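#A small sketch of naming the parser explicitly; "lxml" and "html5lib" are optional third-party parsers and are assumed to be installed separately (pip install lxml html5lib):
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/page1.html").read()
print(BeautifulSoup(html,"html.parser").h1)   #built-in parser, no extra install
#print(BeautifulSoup(html,"lxml").h1)          #faster, needs lxml installed
#print(BeautifulSoup(html,"html5lib").h1)      #most forgiving of broken HTML, needs html5lib installed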
#Collect every external link found across a site
allExtLinks = set()
allIntLinks = set()
def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = urlparse(siteUrl).scheme+"://"+urlparse(siteUrl).netloc
    bsObj = BeautifulSoup(html, "html.parser")
    internalLinks = getInternalLinks(bsObj, domain)
    externalLinks = getExternalLinks(bsObj, domain)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)
allIntLinks.add("http://oreilly.com")
getAllExternalLinks("http://oreilly.com")
#Crawling with Scrapy... this part is very powerful, so I'm setting it aside for now
from scrapy.selector import Selector
from scrapy import Spider
from typapa.typapa.items import Article
class ArticleSpider(Spider):
    name="article"
    allowed_domains=["en.wikipedia.org"]
    start_urls=["http://en.wikipedia.org/wiki/Main_Page",
                "http://en.wikipedia.org/wiki/Python_%28programming_language%29"]
    def parse(self,response):
        item=Article()
        title=response.xpath("//h1/text()")[0].extract()
        print("Title is: "+title)
        item["title"]=title
        return item
Parsing JSON data
#Find which country an IP address belongs to
import json
from urllib.request import urlopen
def getCountry(ipAddress):
    response=urlopen("http://freegeoip.net/json/"+ipAddress).read().decode("utf-8")
    responseJson=json.loads(response)
    return responseJson.get("country_code")
print(getCountry("50.78.253.58"))
import json
jsonString='{"array0fNums":[{"number":0},{"number":1},{"number":2}],"array0fFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}'
json0bj=json.loads(jsonString)
print(json0bj.get("array0fNums"))
print(json0bj.get("array0fNums")[1])
print(json0bj.get("array0fNums")[1].get("number")+
json0bj.get("array0fNums")[2].get("number"))
print(json0bj.get("array0fFruits")[2].get("fruit"))
#getLinks parses a page and returns the /wiki/ article links we need
#getHistoryIPs pulls the anonymous editors' IP addresses out of an article's revision-history page
#getCountry resolves an IP address to the place it actually belongs to
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json
import re
import datetime
import random
random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html=urlopen("http://en.wikipedia.org"+articleUrl)
    bs0bj=BeautifulSoup(html)
    return bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile("^(\/wiki\/)((?!:).)*$")})
def getHistoryIPs(pageUrl):
    pageUrl=pageUrl.replace("/wiki/","")
    historyUrl="http://en.wikipedia.org/w/index.php?title="+pageUrl+"&action=history"
    print("history url is: "+historyUrl)
    html=urlopen(historyUrl)
    bs0bj=BeautifulSoup(html)
    Addresses=bs0bj.findAll("a",{"class":"mw-anonuserLink"})
    addressList=set()
    for Address in Addresses:
        addressList.add(Address.get_text())   #store the IP string itself, not the whole <a> tag
    return addressList
def getCountry(ipAddress):
    try:
        response = urlopen("http://freegeoip.net/json/"+ipAddress).read().decode('utf-8')
    except HTTPError:
        return None
    responseJson = json.loads(response)
    return responseJson["region_name"]
links = getLinks("/wiki/Python_(programming_language)")
while(len(links) > 0):
    for link in links:
        print("-------------------")
        historyIPs = getHistoryIPs(link.attrs["href"])
        for historyIP in historyIPs:
            country = getCountry(historyIP)
            if country is not None:
                print(historyIP+" is from "+country)
    newLink = links[random.randint(0, len(links)-1)].attrs["href"]
    links = getLinks(newLink)
Storing data
Media files
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/")
bs0bj=BeautifulSoup(html)
imageLocation=bs0bj.find("a",{"id":"logo"}).find("img").attrs["src"]
urlretrieve(imageLocation,"logo.jpg")
#Haha, also tried downloading an image from youxiake.com
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.youxiake.com/")
bs0bj=BeautifulSoup(html)
imageLocation=bs0bj.find("a",{"href":"http://www.youxiake.com/hotel/yunhe"}).find("img").attrs["src"]
print(imageLocation)
urlretrieve(imageLocation,"logo1.jpg")
Storing data in CSV
import csv
csvFile=open("C:/Users/dell-pc/Documents/Python Scripts/testa.csv","wt", newline ="")
try:
    writer=csv.writer(csvFile)
    writer.writerow(("number","number plus 2","number times 2"))
    for i in range(10):
        writer.writerow((i,i+2,i*2))
finally:
    csvFile.close()
#`newline=""` keeps the csv module from writing an extra blank line after every row
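#A quick way to see the effect of newline="" (hypothetical file names); on Windows the second file ends up with a blank line between rows:
import csv
with open("with_newline.csv","wt",newline="") as f:
    writer=csv.writer(f)
    writer.writerow(("a","b"))
    writer.writerow(("c","d"))
with open("without_newline.csv","wt") as f:
    writer=csv.writer(f)
    writer.writerow(("a","b"))
    writer.writerow(("c","d"))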
#Fetch an HTML table and write it to a CSV file
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
html=urlopen("https://en.wikipedia.org/wiki/Comparison_of_text_editors")
bj0bs=BeautifulSoup(html)
table=bj0bs.findAll("table",{"class":"wikitable"})[0]
rows=table.findAll("tr")
csvFile=open("C:/Users/dell-pc/Documents/Python Scripts/test1.csv","wt",newline="",encoding="utf-8")
writer=csv.writer(csvFile)
#The two lines above (opening the file and creating the writer) must be in place before any rows are written
try:
    for row in rows:
        csvRow=[]
        for cell in row.findAll(["td","th"]):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
        #The line above writes one table row to the CSV file
finally:
    csvFile.close()