天眼查反爬的曲线救国道路-爬取红盾网企业信息(Python爬虫实战)
程序员文章站
2022-04-08 23:20:15
...
先在这里给出红盾网抓取企业信息的代码,有时间再去研究如何对天眼查进行企业信息抓取,后续更新…
import requests
import time
from lxml import etree
# Request headers sent with every page fetch; the only entry is a
# desktop-browser user-agent string.
headers = {
    'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3730.400 QQBrowser/10.5.3805.400',
}

# Shared output file. Append mode keeps rows written by earlier runs.
f = open(file="company_info.csv", mode="a", encoding="utf-8")
def _first_text(node, path):
    """Return the first result of XPath *path* evaluated on *node*, or '' if none."""
    matches = node.xpath(path)
    return matches[0] if matches else ""


def paser_detail(url, timeout=10):
    """Fetch one listing page and append a tab-separated row per company to ``f``.

    Each row holds, in order: region title, company name, code, legal
    representative, address. The same fields are also printed to stdout.

    NOTE(review): the indentation of the original snippet was lost. This
    assumes the title loop and the row loop run one after the other, so the
    text of the last ``a.name`` anchor that has any text is used as the title
    of every row on the page -- TODO confirm against the page layout.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds to wait for the server before giving up on the page.

    Returns:
        None. A page that cannot be fetched or has no content is reported on
        stdout and skipped instead of aborting the caller's loop.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url=url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print("request failed:", url, exc)
        return
    finally:
        # Pause between requests whether or not this one succeeded.
        time.sleep(1)

    if response.status_code != 200 or not response.text.strip():
        print("skipping page:", url, response.status_code)
        return

    items = etree.HTML(response.text)
    if items is None:
        print("skipping page:", url, "no parsable HTML")
        return

    # Region title; stays empty when no anchor carries text, so the row loop
    # below never hits an unbound name.
    title = ""
    for anchor in items.xpath('//a[@class="name"]'):
        title = _first_text(anchor, './text()') or title

    for li in items.xpath('//*[@id="list-container"]/ul/li'):
        mingcheng = _first_text(li, './div/a/text()')  # company name
        if not mingcheng:
            # No name text in this item: nothing useful to record for it.
            continue
        # Missing optional fields become empty strings rather than raising
        # IndexError and killing the whole run.
        daima = _first_text(li, './div/p[1]/a/span[1]/text()')  # code
        person = _first_text(li, './div/p[1]/a/span[2]/text()')  # legal representative
        address = _first_text(li, './div/p[2]/a/span/text()')  # address
        f.write("\t".join((title, mingcheng, daima, person, address)) + "\n")
        print(mingcheng, title, daima, person, address)

    # Push this page's rows to disk so they survive a later crash.
    f.flush()
if __name__ == '__main__':
    # Crawl listing pages 51..999; change the range start to resume from
    # a different page.
    try:
        for i in range(51, 1000):
            print("第"+str(i)+"页")
            paser_detail("https://www.ubaike.cn/class_204/"+str(i)+".html")
    finally:
        # Close the module-level output file so buffered rows reach disk even
        # when the crawl is interrupted or a page raises.
        f.close()