Learning notes on scraping static web pages
#Suppress warnings (e.g. BeautifulSoup's "no parser was explicitly specified" warning)
import warnings
warnings.filterwarnings("ignore")
First look at web scraping
Creating a BeautifulSoup object
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://pythonscraping.com/pages/page1.html")
bs0bj=BeautifulSoup(html.read())
print(bs0bj)
print(bs0bj.h1)
#print(bs0bj.h1) prints the first <h1> tag on the page together with its contents
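#The same <h1> can also be reached by walking the tree explicitly; a minimal sketch (the parser is named explicitly here, which is optional):
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://pythonscraping.com/pages/page1.html")
bs0bj=BeautifulSoup(html.read(),"html.parser")
#All of these point at the same first <h1> tag
print(bs0bj.h1)
print(bs0bj.html.body.h1)
print(bs0bj.body.h1)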
Advanced HTML parsing
findAll()
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html)
#创建一个beautifulsoup对象
namelist=bs0bj.findAll("span",{"class":"green"})
for name in namelist:
    print(name.get_text())
#Extract the span tags; the .get_text() function strips out all tags and returns a string containing only the text
#namelist1=bs0bj.findAll(text="the price")
BeautifulSoup's find() and findAll()
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html)
allText=bs0bj.findAll("",{"id":"text"})
for i in allText:
    print(i.get_text())
#findAll(tag, attributes, recursive, text, limit, keywords)
#find(tag, attributes, recursive, text, keywords)   #find is simply findAll with limit=1
#The three lines above are equivalent to using a keyword argument:
#allText=bs0bj.findAll(id="text")
#print(allText[0].get_text())
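#A short sketch of the limit and keyword-argument forms, assuming the same warandpeace.html page as above:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html,"html.parser")
#limit=2 stops after the first two matches; find() is just findAll(...) with limit=1
firstTwo=bs0bj.findAll("span",{"class":"green"},limit=2)
print(len(firstTwo))
#Keyword arguments filter on attributes directly (class_ avoids Python's reserved word "class")
greens=bs0bj.findAll(class_="green")
print(greens[0].get_text())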
Handling child tags (.children)
#BeautifulSoup functions always operate on the descendants of the currently selected tag
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)
for child in bs0bj.find("table",{"id":"giftList"}).children:
    print(child)
Handling sibling tags (.next_siblings)
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)
for sibling in bs0bj.find("table",{"id":"giftList"}).tr.next_siblings:
    print(sibling)
#next_sibling and previous_sibling work similarly but return a single tag rather than a group of tags
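#A minimal sketch contrasting .next_siblings with the single-node .next_sibling (same page3.html table assumed):
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj=BeautifulSoup(html,"html.parser")
firstRow=bs0bj.find("table",{"id":"giftList"}).tr
#.next_sibling returns only the very next node, which is often a whitespace text node rather than a tag
print(repr(firstRow.next_sibling))
#.next_siblings is a generator over everything that follows the current tag
print(len(list(firstRow.next_siblings)))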
Handling parent tags (.parent)
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)
print(bs0bj.find("img",{"src":"../img/gifts/img1.jpg"
}).parent.previous_sibling.get_text())
Regular expressions and BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bs0bj= BeautifulSoup(html)
images=bs0bj.findAll("img",{"src":re.compile("\.\.\/img\/gifts\/img.*\.jpg")})
for image in images:
    print(image["src"])
Lambda expressions
#soup.findAll(lambda tag: len(tag.attrs)==2)
#In the expression above:
#myTag.attrs retrieves all of a tag's attributes
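#A runnable sketch of the lambda form; here the lambda reproduces the earlier class-based search by hand (warandpeace.html assumed):
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bs0bj=BeautifulSoup(html,"html.parser")
#findAll also accepts a function that takes a tag and returns True/False
greens=bs0bj.findAll(lambda tag: tag.name=="span" and tag.get("class")==["green"])
for tag in greens:
    print(tag.get_text())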
Starting to crawl
Traversing a single domain
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj=BeautifulSoup(html)
for line in bs0bj.findAll("a"):
    if "href" in line.attrs:
        print(line.attrs["href"])
#Only keep the links that point to article pages (ignore the sidebar, header, and footer links)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj=BeautifulSoup(html)
for line in bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile("^\/wiki\/((?!:).)*$")}):
    if "href" in line.attrs:
        print(line.attrs["href"])
#find() is used for the div because it returns the single bodyContent tag;
#findAll() is then used for the links because it returns a list of every matching <a> tag,
#which is exactly what the loop iterates over to read line.attrs["href"] from
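#A tiny illustration of that difference (same Kevin_Bacon page assumed): find() gives one Tag, findAll() gives a list-like ResultSet
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj=BeautifulSoup(html,"html.parser")
body=bs0bj.find("div",{"id":"bodyContent"})   #a single Tag
links=body.findAll("a")                       #a ResultSet (behaves like a list) of Tags
print(type(body))
print(type(links),len(links))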
#Seed the random-number generator and wrap the link extraction in a function so we can keep pulling links from each new page
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import datetime
random.seed(datetime.datetime.now())
def getlines(articleUrl):
    html=urlopen("http://en.wikipedia.org"+articleUrl)
    bs0bj=BeautifulSoup(html)
    return bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile("^\/wiki\/((?!:).)*$")})
links=getlines("/wiki/Kevin_Bacon")
print(links)
while len(links)>0:
    newArticle=links[random.randint(0,len(links)-1)].attrs["href"]
    print(newArticle)
    links=getlines(newArticle)
Crawling an entire site
#To avoid collecting the same page twice, the links need to be deduplicated
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
pages=set()
def getlinks(pageUrl):
    global pages
    html=urlopen("http://en.wikipedia.org"+pageUrl)
    bs0bj=BeautifulSoup(html)
    for link in bs0bj.findAll("a",{"href":re.compile("^(\/wiki\/)")}):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                newpage=link.attrs["href"]
                print(newpage)
                pages.add(newpage)
                getlinks(newpage)
getlinks("")
#global lets the function assign to the pages variable defined outside the function
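#A minimal standalone sketch of how global works (hypothetical names):
counter=0
def bump():
    global counter   #without this line, counter=counter+1 below would raise UnboundLocalError
    counter=counter+1
bump()
bump()
print(counter)   #2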
#Collect data across the entire site
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
pages=set()
def getlinks(pageUrl):
    global pages
    html=urlopen("http://en.wikipedia.org"+pageUrl)
    bs0bj=BeautifulSoup(html)
    try:
        print(bs0bj.h1.get_text())
        print(bs0bj.find(id="mw-content-text").findAll("p")[0])
        print(bs0bj.find(id="ca-edit").find("span").find("a").attrs["href"])
    except AttributeError:
        print("This page is missing some attributes! No worries, though.")
    for link in bs0bj.findAll("a",{"href":re.compile("^(\/wiki\/)")}):
        if "href" in link.attrs:
            if link.attrs["href"] not in pages:
                newpage=link.attrs["href"]
                print(newpage)
                pages.add(newpage)
                getlinks(newpage)
getlinks("")
#The print statements are ordered by how likely each piece of data is to appear on a page
#Pick a random external link; if a page has none, look around the site for one
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random
pages = set()
random.seed(datetime.datetime.now())
#Retrieves a list of all Internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme+"://"+urlparse(includeUrl).netloc
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith("/")):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
#Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    #Finds all links that start with "http" or "www" that do
    #not contain the current URL
    for link in bsObj.findAll("a", href=re.compile(
            "^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html,"html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links, looking around the site for one")
        domain = urlparse(startingPage).scheme+"://"+urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: "+externalLink)
    followExternalOnly(externalLink)
followExternalOnly("http://oreilly.com")
#bsObj = BeautifulSoup(html,"html.parser")
#The trailing "html.parser" argument can be omitted, but BeautifulSoup then warns that it has to guess the parser (which is why warnings are filtered at the top)
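#A small sketch of naming the parser explicitly; "lxml" and "html5lib" are optional third-party parsers and are assumed to be installed separately (pip install lxml html5lib):
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/pages/page1.html").read()
print(BeautifulSoup(html,"html.parser").h1)   #built-in parser, no extra install
#print(BeautifulSoup(html,"lxml").h1)          #faster, needs lxml installed
#print(BeautifulSoup(html,"html5lib").h1)      #most forgiving of broken HTML, needs html5lib installed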
#Collect every external link found across a site
allExtLinks = set()
allIntLinks = set()
def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = urlparse(siteUrl).scheme+"://"+urlparse(siteUrl).netloc
    bsObj = BeautifulSoup(html, "html.parser")
    internalLinks = getInternalLinks(bsObj, domain)
    externalLinks = getExternalLinks(bsObj, domain)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)
allIntLinks.add("http://oreilly.com")
getAllExternalLinks("http://oreilly.com")
#Crawling with Scrapy... this part is very powerful, so I'm setting it aside for now
from scrapy.selector import Selector
from scrapy import Spider
from typapa.typapa.items import Article
class ArticleSpider(Spider):
    name="article"
    allowed_domains=["en.wikipedia.org"]
    start_urls=["http://en.wikipedia.org/wiki/Main_Page",
                "http://en.wikipedia.org/wiki/Python_%28programming_language%29"]
    def parse(self,response):
        item=Article()
        title=response.xpath("//h1/text()")[0].extract()
        print("Title is: "+title)
        item["title"]=title
        return item
Parsing JSON data
#Find which country an IP address belongs to
import json
from urllib.request import urlopen
def getCountry(ipAddress):
    response=urlopen("http://freegeoip.net/json/"+ipAddress).read().decode("utf-8")
    responseJson=json.loads(response)
    return responseJson.get("country_code")
print(getCountry("50.78.253.58"))
import json
jsonString='{"array0fNums":[{"number":0},{"number":1},{"number":2}],"array0fFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}'
json0bj=json.loads(jsonString)
print(json0bj.get("array0fNums"))
print(json0bj.get("array0fNums")[1])
print(json0bj.get("array0fNums")[1].get("number")+
json0bj.get("array0fNums")[2].get("number"))
print(json0bj.get("array0fFruits")[2].get("fruit"))
#getLinks parses a page and returns the /wiki/ article links we need
#getHistoryIPs pulls the anonymous editors' IP addresses out of an article's revision-history page
#getCountry resolves an IP address to the place it actually belongs to
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json
import re
import datetime
import random
random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html=urlopen("http://en.wikipedia.org"+articleUrl)
    bs0bj=BeautifulSoup(html)
    return bs0bj.find("div",{"id":"bodyContent"}).findAll("a",{"href":re.compile("^(\/wiki\/)((?!:).)*$")})
def getHistoryIPs(pageUrl):
    pageUrl=pageUrl.replace("/wiki/","")
    historyUrl="http://en.wikipedia.org/w/index.php?title="+pageUrl+"&action=history"
    print("history url is: "+historyUrl)
    html=urlopen(historyUrl)
    bs0bj=BeautifulSoup(html)
    Addresses=bs0bj.findAll("a",{"class":"mw-anonuserLink"})
    addressList=set()
    for Address in Addresses:
        addressList.add(Address.get_text())   #store the IP string itself, not the whole <a> tag
    return addressList
def getCountry(ipAddress):
    try:
        response = urlopen("http://freegeoip.net/json/"+ipAddress).read().decode('utf-8')
    except HTTPError:
        return None
    responseJson = json.loads(response)
    return responseJson["region_name"]
links = getLinks("/wiki/Python_(programming_language)")
while(len(links) > 0):
    for link in links:
        print("-------------------")
        historyIPs = getHistoryIPs(link.attrs["href"])
        for historyIP in historyIPs:
            country = getCountry(historyIP)
            if country is not None:
                print(historyIP+" is from "+country)
    newLink = links[random.randint(0, len(links)-1)].attrs["href"]
    links = getLinks(newLink)
Storing data
Media files
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.pythonscraping.com/")
bs0bj=BeautifulSoup(html)
imageLocation=bs0bj.find("a",{"id":"logo"}).find("img").attrs["src"]
urlretrieve(imageLocation,"logo.jpg")
#Haha, also tried downloading an image from youxiake.com
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("http://www.youxiake.com/")
bs0bj=BeautifulSoup(html)
imageLocation=bs0bj.find("a",{"href":"http://www.youxiake.com/hotel/yunhe"}).find("img").attrs["src"]
print(imageLocation)
urlretrieve(imageLocation,"logo1.jpg")
Storing data in CSV
import csv
csvFile=open("C:/Users/dell-pc/Documents/Python Scripts/testa.csv","wt", newline ="")
try:
    writer=csv.writer(csvFile)
    writer.writerow(("number","number plus 2","number times 2"))
    for i in range(10):
        writer.writerow((i,i+2,i*2))
finally:
    csvFile.close()
#`newline=""` keeps the csv module from writing an extra blank line after every row
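#A quick way to see the effect of newline="" (hypothetical file names); on Windows the second file ends up with a blank line between rows:
import csv
with open("with_newline.csv","wt",newline="") as f:
    writer=csv.writer(f)
    writer.writerow(("a","b"))
    writer.writerow(("c","d"))
with open("without_newline.csv","wt") as f:
    writer=csv.writer(f)
    writer.writerow(("a","b"))
    writer.writerow(("c","d"))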
#Fetch an HTML table and write it to a CSV file
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
html=urlopen("https://en.wikipedia.org/wiki/Comparison_of_text_editors")
bj0bs=BeautifulSoup(html)
table=bj0bs.findAll("table",{"class":"wikitable"})[0]
rows=table.findAll("tr")
csvFile=open("C:/Users/dell-pc/Documents/Python Scripts/test1.csv","wt",newline="",encoding="utf-8")
writer=csv.writer(csvFile)
#The two lines above (opening the file and creating the writer) must be in place before any rows are written
try:
    for row in rows:
        csvRow=[]
        for cell in row.findAll(["td","th"]):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
        #The line above writes one table row to the CSV file
finally:
    csvFile.close()