Web Crawler Program 2
from bs4 import BeautifulSoup
import requests
from lxml import etree
import re
def get_url(url):
    r = requests.get(url)
    text = r.text
    # Parse the page source with etree.HTML, then extract the links with XPath.
    html = etree.HTML(text)
    # Each expression targets the href of the <a> inside the second <span>
    # of the matching div (XPath indexes from 1, so span[2] is the second span).
    content1 = html.xpath('//div[@class="zx-yiyuan-top"]/span[2]/a/@href')
    content2 = html.xpath('//div[@class="zx-yichi-top"]/span[2]/a/@href')
    all_interlinkage1 = content1 + content2
    return all_interlinkage1
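To see why the XPath is written this way, here is a minimal sketch against made-up HTML (the markup below is an assumption for illustration, not the site's real structure):

# Hypothetical markup to illustrate the XPath above; not the site's real HTML.
sample = '''
<div class="zx-yiyuan-top">
  <span><a href="/rank">rank</a></span>
  <span><a href="/yiyuan/list.asp?type=1">more</a></span>
</div>
'''
doc = etree.HTML(sample)
# span[2]/a/@href walks to the second <span>, into its <a>, and returns the
# href attribute value itself, so xpath() yields a list of link strings.
print(doc.xpath('//div[@class="zx-yiyuan-top"]/span[2]/a/@href'))
# -> ['/yiyuan/list.asp?type=1']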
# The headers make the request look like an ordinary browser visit,
# so the site is less likely to recognize and block it as a crawler.
headers = {
    'Cookie': 'ASPSESSIONIDACRQDSBT=IKHIFBFCKPAKOCNCLJFOKBDB; ASPSESSIONIDQABSASAR=NMIIFBFCCINBABDKJMGDMJLA; fikker-VSPC-05re=MiR9hWJ92ShLIUv9D5PcBOrCRgNgvwpe; fikker-VSPC-05re=MiR9hWJ92ShLIUv9D5PcBOrCRgNgvwpe; Hm_lvt_c58e42b54acb40ab70d48af7b1ce0d6a=1540193271,1540193667,1540194766,1540194772; Hm_lpvt_c58e42b54acb40ab70d48af7b1ce0d6a=1540195322',
    'Host': 'www.kq36.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
}
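A quick way to check whether the disguise matters is to compare responses (that the site rejects bare requests at all is my assumption, not something the article verifies):

# Sketch: compare a bare request with one that sends the browser-like headers.
# Whether www.kq36.com actually treats them differently is an assumption.
bare = requests.get('http://www.kq36.com/')
disguised = requests.get('http://www.kq36.com/', headers=headers)
print(bare.status_code, disguised.status_code)  # e.g. 403 vs. 200 if blocked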
url = 'http://www.kq36.com/'
all_interlinkage1 = get_url(url)
# get_content rewrites each section link to add a page parameter (to walk the
# listing pages), finds every <div class="li_title"> entry, and pulls the
# entry's href out with a regular expression (the variable b below).
def get_content(all_interlinkage1):
    d = []
    urls = []
    # Splice a page number into each section link: pages 1 through 99.
    for interlinkage in all_interlinkage1:
        for i in range(1, 100):
            url = interlinkage.split('?')[0] + '?page=' + str(i) + '&' + interlinkage.split('?')[1]
            urls.append(url)
    for url in urls:
        try:
            r = requests.get(url, headers=headers)
            text = r.text
            soup = BeautifulSoup(text, 'html.parser')
            # Every listing entry sits in a <div class="li_title">.
            content = soup.find_all('div', 'li_title')
            for i in content:
                # Extract the href from the entry's HTML with a regex.
                b = re.search(r'.*<a href="(.*?)" ', str(i)).group(1)
                d.append(b)
                print(b)
        except Exception:
            # Skip failed requests and entries the regex does not match.
            pass
    return d
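Two small sketches tie this together: worked examples of the URL rewrite and of the regex (the link and markup below are hypothetical), followed by the call that actually runs the scrape:

# Worked example of the pagination rewrite (hypothetical section link):
#   split('?') -> ['http://www.kq36.com/list.asp', 'type=1']
#   page 2     -> 'http://www.kq36.com/list.asp?page=2&type=1'
interlinkage = 'http://www.kq36.com/list.asp?type=1'
print(interlinkage.split('?')[0] + '?page=2&' + interlinkage.split('?')[1])

# Worked example of the regex (hypothetical entry markup): the trailing
# quote-plus-space in the pattern requires another attribute after href.
entry = '<div class="li_title"><a href="/news/show.asp?id=1" target="_blank">t</a></div>'
print(re.search(r'.*<a href="(.*?)" ', entry).group(1))  # /news/show.asp?id=1

# Run the scrape over the links collected above and report the total.
all_links = get_content(all_interlinkage1)
print(len(all_links), 'links collected')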