python爬虫基础 使用urllib库抓取高德接口边界信息
程序员文章站
2022-05-03 19:59:14
...
使用的是python3
需要 pip install pymysql(代码中 import 的是 pymysql,而不是 MySQL)
#爬取逻辑
1.根据name查询匹配建筑物
2.通过接口返回的建筑物信息获取建筑物ID
3.用建筑物ID查询边界信息
4.数据返回格式是json字符串,直接转成json处理
5.遍历出边界信息保存
6.存储边界信息
#coding=utf-8
import urllib.request
from urllib.parse import quote
import time
import json
import pymysql
import random
import string
# Module-level MySQL connection shared by all cursors below.
# NOTE(review): DB credentials and the AMap API key are hard-coded — move to config.
COON = pymysql.connect(
host='127.0.0.1',
port=3306,
user='root',
passwd='root',
db='navi_scrapy',
charset='utf8')
# NOTE(review): appears unused anywhere in this file.
pagestart = 1
# Usage notes (translation of the Chinese string below):
# 1. add a `border` column to the data table
# 2. adjust the database connection settings
# 3. adjust the corresponding `name` and `city` fields
# 4. run run_gaode_border.py
'''
1.数据表增加border字段
2.修改数据库链接
3.修改对应中的name和city字段
4.启动run_gaode_border.py
'''
start_url = 'https://restapi.amap.com/v3/place/text?key=4b86820a7590de60e4f81f53e59ae17f&citylimit=true&output=json&' # AMap place text-search endpoint
url = "https://ditu.amap.com/detail/get/detail?id="  # AMap detail endpoint; its response carries the boundary shape
def hello():
    """Crawl entry point: for every pending DB row, query AMap for the
    matching POI and hand the response to parse() for boundary lookup.

    Reads rows via get_data(); each row is expected to be
    (id, ..., name, city, ...) — name at index 2, city at index 3.
    """
    citys = get_data()  # rows whose border column is still NULL
    print(citys)
    for city in citys:
        # One failing row must not abort the whole crawl, so each
        # request/parse cycle is isolated in its own try block.
        try:
            # quote() with safe=string.printable keeps the ASCII URL skeleton
            # intact and percent-encodes only the non-ASCII (Chinese) text.
            tempurl = quote(start_url + "keywords=" + str(city[2]) + "&city=" + str(city[3]),
                            safe=string.printable)
            request = urllib.request.Request(url=tempurl, headers=get_header(), method='GET')
            time.sleep(0.8)  # throttle to stay under the API rate limit
            response = urllib.request.urlopen(request)
            parse(response, {"id": city[0], "name": city[2]})
        except Exception as e:
            print("请求失败 id=%s error: %s" % (city[0], e))
def parse(response, meta):
    """Extract the POI id from an AMap text-search response and request
    the detail page for it.

    :param response: file-like HTTP response whose body is the AMap
        place/text JSON (``status`` == "1" on success, POIs in ``pois``).
    :param meta: dict carrying ``id`` (our DB row id) and ``name``.
    """
    try:
        data = json.loads(response.read().decode("utf8"))
        print(data)
        if data["status"] == "1":
            poi = data["pois"][0]  # the first hit is usually the queried place
            print(poi)
            # AMap returns ``parent`` as a non-empty value when the POI sits
            # inside a larger building; the boundary lives on the parent.
            if poi["parent"] != []:
                print("查询parent ID~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
                poi_id = poi["parent"]  # renamed: don't shadow builtin ``id``
            else:
                poi_id = poi["id"]
            gaode_url = url + str(poi_id)
            print(gaode_url)
            request = urllib.request.Request(url=gaode_url, headers=get_header(), method='GET')
            time.sleep(0.8)  # throttle to stay under the API rate limit
            response = urllib.request.urlopen(request)
            info(response, {"id": meta["id"], "url": gaode_url})
        else:
            print("接口返回异常~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~·")
    except Exception as e:
        # BUG FIX: ``"..." + e`` raised TypeError (str + Exception),
        # masking the real error; convert explicitly.
        print("查询失败~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~·" + str(e))
def info(response, meta):
    """Pull the boundary polygon out of an AMap detail response and save it.

    :param response: file-like HTTP response whose body is the detail JSON;
        the boundary is at ``data.spec.mining_shape.shape``.
    :param meta: dict carrying ``id`` (our DB row id) and ``url`` (for logs).
    """
    try:
        data = json.loads(response.read().decode("utf8"))
        print(data)
        if data["status"] == "1":
            spec = data["data"]["spec"]
            border = spec["mining_shape"]["shape"]
            print("border :~~~~~~~~~~~~~~~~~~~", border)
            update_data(meta["id"], border)
    except Exception as e:
        # BUG FIX: ``"..." + e`` raised TypeError (str + Exception),
        # masking the real error; convert explicitly.
        print("查询错误~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + meta["url"] + " error: " + str(e))
def get_data():
    """Return every ``new_shopping`` row whose ``border`` column is NULL.

    :return: tuple of row tuples from ``fetchall()``; an EMPTY tuple on
        failure so the caller can always iterate the result.
    """
    # create a cursor on the shared connection
    cursor = COON.cursor()
    try:
        sql = "SELECT * FROM new_shopping WHERE border is null"
        cursor.execute(sql)
        return cursor.fetchall()
    except Exception as e:
        print(e)
        # BUG FIX: previously fell through returning None, which made the
        # caller crash with "NoneType is not iterable".
        return ()
    finally:
        # always release the cursor, success or not
        cursor.close()
def update_data(id, border):
    """Persist *border* for the ``new_shopping`` row *id*.

    :param id: primary key of the row to update.
    :param border: boundary shape string returned by the AMap detail API.
    """
    # create a cursor on the shared connection
    cursor = COON.cursor()
    try:
        # BUG FIX: the old string-concatenated SQL was both injectable and
        # malformed (`border='X 'where id=...`); use a parameterized query.
        sql = "update new_shopping set border=%s where id=%s"
        cursor.execute(sql, (str(border), str(id)))
        # BUG FIX: pymysql autocommit is off by default, so without an
        # explicit commit no update was ever persisted.
        COON.commit()
    except Exception as e:
        COON.rollback()  # keep the connection usable after a failed statement
        print(e)
    finally:
        # always release the cursor, success or not
        cursor.close()
def get_header():
    """Build HTTP request headers with a randomly chosen User-Agent.

    :return: dict with Connection, Accept, Accept-Language and a
        User-Agent picked at random from a pool of real browser strings.
    """
    head_connection = ['Keep-Alive', 'close']
    head_accept = ['text/html, application/xhtml+xml, */*',
                   'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8']
    head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
    head_user_agent = ['Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                       'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                       'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
                       'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                       'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                       'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
                       'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
                       ]
    result = {
        'Connection': head_connection[0],
        'Accept': head_accept[0],
        'Accept-Language': head_accept_language[1],
        # idiom: random.choice replaces manual randrange(0, len(...)) indexing
        'User-Agent': random.choice(head_user_agent)
    }
    return result
# Guard the entry point so importing this module does not start the crawl.
if __name__ == "__main__":
    hello()