
These are learning notes on scraping static pages.

#suppress warnings (e.g. the warning BeautifulSoup prints when no HTML parser is specified)
import warnings
warnings.filterwarnings("ignore")

Your First Web Scraper

Creating a BeautifulSoup object

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://pythonscraping.com/pages/page1.html")
bs0bj=BeautifulSoup(html.read())

print(bs0bj)
print(bs0bj.h1) 
#print(bs0bj.h1) outputs the first <h1> tag and its contents
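For reference, the same h1 can be reached through several equivalent attribute chains on page1.html (a small sketch reusing the bs0bj created above):

#these chained lookups all resolve to the same first <h1> tag
print(bs0bj.html.body.h1)
print(bs0bj.body.h1)
print(bs0bj.html.h1)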

Advanced HTML Parsing

findAll()

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html)
#create a BeautifulSoup object

namelist=bs0bj.findAll("span",{"class":"green"})
for name in namelist:
    print(name.get_text())
#pull out the span tags; `.get_text()` strips out all tags and returns a string containing only the text

#namelist1=bs0bj.findAll(text="the prince")

BeautifulSoup's find() and findAll()

from urllib.request import urlopen
from bs4 import BeautifulSoup 

html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html)

allText=bs0bj.findAll("",{"id":"text"})
for i in allText:
    print(i.get_text())

#findAll(tag,attributes,recursive,text,limit,keywords)
#find(tag,attributes,recursive,text,keywords)  #find defaults to limit=1
#the three lines of code above are equivalent to the keyword form:
#allText=bs0bj.findAll(id="text")
#print(allText[0].get_text())
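The text and limit arguments from the signature above can be sketched on the same warandpeace.html page (a minimal illustration, not part of the original notes):

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html)

#text= matches tag content instead of tag names or attributes
princeList=bs0bj.findAll(text="the prince")
print(len(princeList))   #how many times "the prince" appears on the page

#limit= caps the number of results; find() is simply findAll() with limit=1
firstTwoGreens=bs0bj.findAll("span",{"class":"green"},limit=2)
for tag in firstTwoGreens:
    print(tag.get_text())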

Dealing with child tags (.children)

#BeautifulSoup functions always operate on the descendants of the current tag
from urllib.request import urlopen
from bs4 import BeautifulSoup 

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)

for child in bs0bj.find("table",{"id":"giftList"}).children:
    print(child)

Dealing with sibling tags (.next_siblings)

from urllib.request import urlopen
from bs4 import BeautifulSoup 

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)

for sibling in bs0bj.find("table",{"id":"giftList"}).tr.next_siblings:
    print(sibling)

#next_sibling and previous_sibling work similarly but return a single node instead of a list of tags
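A minimal sketch of the singular forms on the same page3.html table (note that the immediate sibling of a tag is often a whitespace text node, so find_next_sibling() is the safer way to reach the next tag):

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj=BeautifulSoup(html)

firstRow=bs0bj.find("table",{"id":"giftList"}).tr
print(firstRow.next_sibling)             #a single node (may be just a newline of text)
print(firstRow.find_next_sibling("tr"))  #the next <tr> tag, skipping text nodes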

Dealing with parent tags (.parent)

from urllib.request import urlopen
from bs4 import BeautifulSoup 

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)

print(bs0bj.find("img",{"src":"../img/gifts/img1.jpg"
                       }).parent.previous_sibling.get_text())

Regular expressions and BeautifulSoup

from urllib.request import urlopen
from bs4 import BeautifulSoup 
import re

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)

images=bs0bj.findAll("img",{"src":re.compile("\.\.\/img\/gifts\/img.*\.jpg")})
for image in images:
    print(image["src"])

Lambda expressions

#soup.findAll(lambda tag:len(tag.attrs)==2)

#in the expression above:
#myTag.attrs returns all of a tag's attributes (as a dict)
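A runnable sketch of the lambda filter above, reusing page3.html (choosing tags with exactly two attributes is only an illustration):

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj=BeautifulSoup(html)

#findAll() also accepts a function that takes a tag and returns True or False
twoAttrTags=bs0bj.findAll(lambda tag:len(tag.attrs)==2)
for tag in twoAttrTags:
    print(tag.name,tag.attrs)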

Starting to Crawl

Traversing a single domain

from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj=BeautifulSoup(html)

for line in bs0bj.findAll("a"):
    if "href" in line.attrs:
        print(line.attrs["href"])
#we only want links that point to article pages (not the sidebar, header, or footer)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj=BeautifulSoup(html)

for line in bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile("^\/wiki\/((?!:).)*$")}):
    if "href" in line.attrs:
        print(line.attrs["href"])

#the first call must be find() and the second findAll(): find() returns a single Tag (the bodyContent div)
#that can be searched again, while findAll() returns a list-like ResultSet that cannot; iterating that
#ResultSet is also what yields the Tag objects whose line.attrs we read, so findAll() is what you want here
#next, seed the random number generator and use a helper function to fetch the links on each new page

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import datetime

random.seed(datetime.datetime.now())
def getlines(articleUrl):
    html=urlopen("http://en.wikipedia.org"+articleUrl)
    bs0bj=BeautifulSoup(html)
    return bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile("^\/wiki\/((?!:).)*$")})

links=getlines("/wiki/Kevin_Bacon")
print(links)
while len(links)>0:
    newArticle=links[random.randint(0,len(links)-1)].attrs["href"]
    print(newArticle)
    links=getlines(newArticle)

Crawling an entire site

#to avoid collecting the same page twice, the links need to be de-duplicated

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages=set()
def getlinks(pageUrl):
    global pages
    html=urlopen("http://en.wikipedia.org"+ pageUrl)
    bs0bj=BeautifulSoup(html)
    for link in bs0bj.findAll("a",{"href":re.compile("^(\/wiki\/)")}):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                newpage=link.attrs["href"]
                print(newpage)
                pages.add(newpage)
                getlinks(newpage)
getlinks("")

#global lets the function assign to a variable defined outside the function
#collect data across the entire site

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages=set()
def getlinks(pageUrl):
    global pages
    html=urlopen("http://en.wikipedia.org"+ pageUrl)
    bs0bj=BeautifulSoup(html)
    try:
        print(bs0bj.h1.get_text())
        print(bs0bj.find(id="mw-content-text").findAll("p")[0])
        print(bs0bj.find(id="ca-edit").find("span").find("a").attrs["href"])
    except AttributeError:
        print("页面缺少一些属性!不过不用担心")
    for link in bs0bj.findAll("a",{"href":re.compile("^(\/wiki\/)")}):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                newpage=link.attrs["href"]
                print(newpage)
                pages.add(newpage)
                getlinks(newpage)
getlinks("")

#the print statements are ordered by how likely each piece of data is to appear on the page
#pick a random external link; if a page has no external links, look for one elsewhere on the site

from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

#Retrieves a list of all Internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme+"://"+urlparse(includeUrl).netloc
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith("/")):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

#Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    #Finds all links that start with "http" or "www" that do
    #not contain the current URL
    for link in bsObj.findAll("a", href=re.compile(
                                "^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html,"html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links, looking around the site for one")
        domain = urlparse(startingPage).scheme+"://"+urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: "+externalLink)
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")

#bsObj = BeautifulSoup(html,"html.parser")
#the "html.parser" argument can be omitted; BeautifulSoup then just warns that it is guessing a parser
#(which is why warnings are filtered at the top of these notes)


#collect every external link on a site
allExtLinks = set()
allIntLinks = set()
def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = urlparse(siteUrl).scheme+"://"+urlparse(siteUrl).netloc
    bsObj = BeautifulSoup(html, "html.parser")
    internalLinks = getInternalLinks(bsObj,domain)
    externalLinks = getExternalLinks(bsObj,domain)

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)

allIntLinks.add("http://oreilly.com")
getAllExternalLinks("http://oreilly.com")
#crawling with Scrapy: this part is powerful, so I'm setting it aside for now

from scrapy.selector import Selector
from scrapy import Spider
from typapa.typapa.items import Article

class ArticleSpider(Spider):
    name="article"
    allowed_domains=["en.wikipedia.org"]
    start_urls=["http://en.wikipedia.org/wiki/Main_Page",
                "http://en.wikipedia.org/wiki/Python_%28programming_language%29"]

    def parse(self,response):
        item=Article()
        title=response.xpath("//h1/text()")[0].extract()
        print("Title is: "+title)
        item["title"]=title
        return item
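The Article class imported above lives in the Scrapy project's items.py; a minimal sketch of what it might contain (the typapa package path is this notebook's own project name):

#items.py inside the typapa project; only the title field is used by the spider
import scrapy

class Article(scrapy.Item):
    title=scrapy.Field()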

Parsing JSON data

#look up which country an IP address belongs to

import json 
from urllib.request import urlopen

def getCountry(ipAddress):
    response=urlopen("http://freegeoip.net/json/"+ipAddress).read().decode("utf-8")
    responseJson=json.loads(response)
    return responseJson.get("country_code")
print (getCountry("50.78.253.58"))
import json
jsonString='{"array0fNums":[{"number":0},{"number":1},{"number":2}],"array0fFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}'
json0bj=json.loads(jsonString)

print(json0bj.get("array0fNums"))
print(json0bj.get("array0fNums")[1])
print(json0bj.get("array0fNums")[1].get("number")+
     json0bj.get("array0fNums")[2].get("number"))
print(json0bj.get("array0fFruits")[2].get("fruit"))
#getLinks parses a page and finds the wiki URLs we need
#getHistoryIPs pulls the IP addresses out of a page's revision history
#getCountry maps an IP address to its real-world country/region

from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json
import re 
import datetime 
import random

random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html=urlopen("http://en.wikipedia.org"+articleUrl)
    bs0bj=BeautifulSoup(html)
    return bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile("^(\/wiki\/)((?!:).)*$")})

def getHistoryIPs(pageUrl):
    pageUrl=pageUrl.replace("/wiki/","")
    historyUrl = "http://en.wikipedia.org/w/index.php?title="+pageUrl+"&action=history"
    print("history url is: "+historyUrl)
    html=urlopen(historyUrl)
    bs0bj=BeautifulSoup(html)
    Addresses=bs0bj.findAll("a",{"class":"mw-anonuserlink"})
    addressList=set()
    for Address in Addresses:
        addressList.add(Address.get_text())  #store the IP string, not the Tag object
    return addressList

def getCountry(ipAddress):
    try:
        response = urlopen("http://freegeoip.net/json/"+ipAddress).read().decode('utf-8')
    except HTTPError:
        return None
    responseJson = json.loads(response)
    return responseJson["region_name"]

links = getLinks("/wiki/Python_(programming_language)")


while(len(links) > 0):
    for link in links:
        print("-------------------") 
        historyIPs = getHistoryIPs(link.attrs["href"])
        for historyIP in historyIPs:
            country = getCountry(historyIP)
            if country is not None:
                print(historyIP+" is from "+country)
    newLink = links[random.randint(0, len(links)-1)].attrs["href"]
    links = getLinks(newLink)

Storing Data

Media files

from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.pythonscraping.com/")
bs0bj=BeautifulSoup(html)
imageLocation=bs0bj.find("a",{"id":"logo"}).find("img").attrs["src"]
urlretrieve(imageLocation,"logo.jpg")
#also tried downloading an image from youxiake.com for fun

from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

html=urlopen("http://www.youxiake.com/")
bs0bj=BeautifulSoup(html)
imageLocation=bs0bj.find("a",{"href":"http://www.youxiake.com/hotel/yunhe"}).find("img").attrs["src"]
print(imageLocation)
urlretrieve(imageLocation,"logo1.jpg")

Storing data to CSV

import csv

csvFile=open("C:/Users/dell-pc/Documents/Python Scripts/testa.csv","wt", newline ="")
try:
    writer=csv.writer(csvFile)
    writer.writerow(("number","number plus 2","number times 2"))
    for i in range(10):
        writer.writerow((i,i+2,i*2))
finally:
    csvFile.close()

#`newline=""` removes the blank line otherwise written after each row
#grab an HTML table and write it to a CSV file

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

html=urlopen("https://en.wikipedia.org/wiki/Comparison_of_text_editors")
bj0bs=BeautifulSoup(html)
table=bj0bs.findAll("table",{"class":"wikitable"})[0]
rows=table.findAll("tr")

csvFile=open("C:/Users/dell-pc/Documents/Python Scripts/test1.csv","wt",newline="",encoding="utf-8")
writer=csv.writer(csvFile)
#the two lines above must be in place before any rows are written

try:
    for row in rows:
        csvRow=[]
        for cell in row.findAll(["td","th"]):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
        #the line above writes one row to the file
finally:
    csvFile.close()