Crawler Program 2

程序员文章站 2022-01-28 21:52:53
import re

import requests
from bs4 import BeautifulSoup
from lxml import etree
def get_url(url):
    # Fetch the front page, using the same browser-like headers defined below.
    r = requests.get(url, headers=headers)
    text = r.text
    # Parse the source with etree.HTML, then extract content with XPath.
    html = etree.HTML(text)
    # Each expression takes, from every <div> with the given class, the href
    # of the <a> inside the div's second <span>.
    content1 = html.xpath('//div[@class="zx-yiyuan-top"]/span[2]/a/@href')
    content2 = html.xpath('//div[@class="zx-yichi-top"]/span[2]/a/@href')
    all_interlinkage1 = content1 + content2
    return all_interlinkage1
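
# A minimal sketch of what the XPath above selects, run on hypothetical
# markup (the sample HTML is illustrative, not copied from kq36.com):
#
#   sample = ('<div class="zx-yiyuan-top"><span>title</span>'
#             '<span><a href="/yiyuan/?area=bj">Beijing</a></span></div>')
#   etree.HTML(sample).xpath('//div[@class="zx-yiyuan-top"]/span[2]/a/@href')
#   # -> ['/yiyuan/?area=bj']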
    
# The headers make the request look like a normal browser visit, so the site
# is less likely to treat it as a crawler; the Cookie and User-Agent values
# were copied from a real browser session.
headers = {
    'Cookie': 'ASPSESSIONIDACRQDSBT=IKHIFBFCKPAKOCNCLJFOKBDB; ASPSESSIONIDQABSASAR=NMIIFBFCCINBABDKJMGDMJLA; fikker-VSPC-05re=MiR9hWJ92ShLIUv9D5PcBOrCRgNgvwpe; fikker-VSPC-05re=MiR9hWJ92ShLIUv9D5PcBOrCRgNgvwpe; Hm_lvt_c58e42b54acb40ab70d48af7b1ce0d6a=1540193271,1540193667,1540194766,1540194772; Hm_lpvt_c58e42b54acb40ab70d48af7b1ce0d6a=1540195322',
    'Host': 'www.kq36.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
}
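
# To verify what is actually sent, httpbin.org echoes request headers back
# (a quick check, assuming that service is reachable):
#
#   print(requests.get('https://httpbin.org/headers', headers=headers).json())
#
# Without the custom User-Agent, requests identifies itself as
# 'python-requests/<version>', which many sites block outright.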
url = 'http://www.kq36.com/'
all_interlinkage1 = get_url(url)
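# all_interlinkage1 is now a list of category links; illustrative shape only
# (the example values are assumptions, not verified against the live site):
#   ['/yiyuan/?area=bj', '/yichi/?area=sh', ...]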
# get_content turns each category link into 99 paginated URLs (the query
# string is rewritten to insert a 'page' parameter), then fetches every page;
# 'li_title' is the class of the <div> wrapping each entry, and the variable
# b pulls the entry's href out with a regular expression.
def get_content(all_interlinkage1):
    d = []
    urls = []
    for interlinkage in all_interlinkage1:
        # 'http://.../list?area=bj' becomes 'http://.../list?page=1&area=bj'
        # (this assumes every link already carries a query string).
        for i in range(1, 100):
            url = interlinkage.split('?')[0] + '?page=' + str(i) + '&' + interlinkage.split('?')[1]
            urls.append(url)
    for url in urls:
        try:
            r = requests.get(url, headers=headers)
            text = r.text
            soup = BeautifulSoup(text, 'html.parser')
            content = soup.find_all('div', 'li_title')
            for i in content:
                # Capture the href attribute from the entry's HTML.
                b = re.search(r'.*<a href="(.*?)" ', str(i)).group(1)
                d.append(b)
                print(b)
        except Exception:
            # Skip pages that fail to download or entries without a link.
            pass
    return d
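
Putting the two functions together, a minimal driver could look like the sketch below. The __main__ guard, the printed count, and the links.txt output file are illustrative additions, not part of the original script:

# Hypothetical entry point: crawl every listing page and save the links.
if __name__ == '__main__':
    links = get_content(all_interlinkage1)
    print('collected %d links' % len(links))
    with open('links.txt', 'w') as f:
        f.write('\n'.join(links))

As a design note, running the regex over str(i) is fragile: it only matches when href is followed by another attribute. BeautifulSoup can return the attribute directly, e.g. i.find('a')['href'], which tolerates attribute reordering and quoting differences.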
Tags: web crawler