欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

python爬虫基础 使用urllib库抓取高德接口边界信息

程序员文章站 2022-05-03 19:59:14
...

使用的是python3

需要先安装pymysql: pip install pymysql

#爬取逻辑

1.根据name查询匹配建筑物

2.通过接口返回的建筑物信息获取建筑物ID

3.用建筑物ID查询边界信息

4.接口返回的数据是json字符串,先用json.loads解析成字典再处理

5.遍历出边界信息保存

6.存储边界信息

#coding=utf-8
import urllib.request
from urllib.parse import quote
import time
import json
import pymysql
import random
import string

# Shared MySQL connection, opened as soon as the module is loaded and used by
# get_data() and update_data().
# NOTE(review): host, user and password are hard-coded; consider reading them
# from configuration or the environment.
# NOTE(review): autocommit is not passed here; with pymysql's documented
# default (off) every write needs an explicit commit - confirm callers do so.
COON = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    passwd='root',
    db='navi_scrapy',
    charset='utf8')

# Not referenced anywhere in the visible part of this file.
pagestart = 1

# Setup notes, kept as a bare string literal (it has no runtime effect):
#   1. add a `border` column to the data table
#   2. adjust the database connection settings
#   3. adjust the corresponding name and city fields
#   4. start run_gaode_border.py
'''
1.数据表增加border字段
2.修改数据库链接
3.修改对应中的name和city字段
4.启动run_gaode_border.py

'''


# Base URL of the Amap place text-search API; hello() appends the `keywords`
# and `city` query parameters to it.
# NOTE(review): the API key is embedded in source - confirm that is intended.
start_url = 'https://restapi.amap.com/v3/place/text?key=4b86820a7590de60e4f81f53e59ae17f&citylimit=true&output=json&'    # start URL

# Base URL of the Amap POI detail endpoint; parse() appends the POI id to it.
url = "https://ditu.amap.com/detail/get/detail?id="



def hello():
    """Crawl entry point: search the Amap place API for every pending row.

    Rows come from ``get_data()``. Column 0 is used as the row id, column 2
    as the search keyword and column 3 as the city filter. Each search
    response is handed to ``parse()`` together with the row id and name.

    A failed request for one row is reported and skipped so the remaining
    rows are still processed.
    """
    # get_data() may yield None when its query fails; treat that as "no rows".
    citys = get_data() or ()
    print(citys)
    for city in citys:
        # Percent-encode each value on its own so that spaces, '&', '#' or
        # '=' inside a name cannot corrupt the query string.
        keywords = quote(str(city[2]), safe='')
        city_name = quote(str(city[3]), safe='')
        tempurl = start_url + "keywords=" + keywords + "&city=" + city_name
        request = urllib.request.Request(url=tempurl, headers=get_header(), method='GET')
        # Pause before every request (presumably to stay under the API's
        # rate limit - confirm).
        time.sleep(0.8)
        try:
            # The context manager closes the HTTP response once parsed.
            with urllib.request.urlopen(request) as response:
                parse(response, {"id": city[0], "name": city[2]})
        except OSError as e:
            # URLError/HTTPError and socket errors are OSError subclasses.
            print("search request failed for " + tempurl + ": " + str(e))


def parse(response,meta):
    """Pick a POI id out of a place-search response and fetch its detail page.

    Args:
        response: file-like object whose ``read()`` returns the UTF-8 encoded
            JSON body of the place-search request.
        meta: dict describing the database row the search was made for; only
            ``meta["id"]`` is used here and it is forwarded to ``info()``.

    Returns:
        None. Any failure is printed and swallowed so that one bad record
        does not stop the crawl.
    """
    try:
        data = json.loads(response.read().decode("utf8"))
        print(data)
        if data["status"] != "1":
            print("接口返回异常~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~·")
            return

        pois = data.get("pois")
        if not pois:
            # Nothing matched the keyword; there is no id to look up.
            print("no POI matched, skipping row " + str(meta["id"]))
            return

        poi = pois[0]  # the first hit is usually the one being looked for
        print( poi  )
        # Prefer the parent POI when one is reported. Truthiness covers a
        # missing key, an empty list and an empty string alike.
        parent = poi.get("parent")
        if parent:
            print("查询parent ID~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            poi_id = parent
        else:
            poi_id = poi["id"]

        gaode_url = url + str(poi_id)
        print(gaode_url)
        request = urllib.request.Request(url=gaode_url, headers=get_header(), method='GET')
        time.sleep(0.8)
        # The context manager closes the detail response after info() read it.
        with urllib.request.urlopen(request) as detail_response:
            info(detail_response,{"id": meta["id"], "url": gaode_url})
    except Exception as e:
        # Deliberate best-effort boundary. The exception must be converted
        # with str(): concatenating it directly raises TypeError.
        print("查询失败~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~·" + str(e))


def info(response,meta):
    """Extract the border shape from a POI detail response and store it.

    Args:
        response: file-like object whose ``read()`` returns the UTF-8 encoded
            JSON body of the detail request.
        meta: dict with the database row ``"id"`` to update and the ``"url"``
            that was requested (used only in the error message).

    Returns:
        None. When the response status is ``"1"`` the value found at
        ``data.spec.mining_shape.shape`` is passed to ``update_data()``;
        any failure is printed and swallowed.
    """
    try:
        data = json.loads(response.read().decode("utf8"))
        print(data)
        if data["status"] == "1":
            border = data["data"]["spec"]["mining_shape"]["shape"]
            print("border :~~~~~~~~~~~~~~~~~~~",border)
            update_data(meta["id"], border)
    except Exception as e:
        # str(e) is required: concatenating the exception object itself
        # raises TypeError and would hide the real error.
        print("查询错误~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + meta["url"] + " error: "+ str(e))

def get_data():
    """Fetch every ``new_shopping`` row whose ``border`` column is still NULL.

    Returns:
        The rows produced by ``cursor.fetchall()``. When the query fails the
        error is printed and an empty tuple is returned, so callers can
        always iterate over the result.
    """
    # Open a cursor on the shared module-level connection.
    cursor = COON.cursor()
    try:
        cursor.execute("SELECT * FROM new_shopping WHERE border is null")
        return cursor.fetchall()
    except Exception as e:
        # Best-effort: report the problem and hand back "no rows" instead of
        # an implicit None that would break iteration in the caller.
        print(e)
        return ()
    finally:
        # Always release the cursor.
        cursor.close()

def update_data(id,border):
    """Store the border shape for one ``new_shopping`` row.

    Args:
        id: primary-key value of the row to update.
        border: border shape taken from the detail API; it is converted with
            ``str()`` before being written.

    Returns:
        None. Errors are printed and swallowed (best-effort).
    """
    # Open a cursor on the shared module-level connection.
    cursor = COON.cursor()
    try:
        # Let the driver escape the values: the border text comes from an
        # external API and may contain quotes. Binding it also avoids the
        # stray trailing space the concatenated statement used to store.
        cursor.execute(
            "update new_shopping set border=%s where id=%s",
            (str(border), id),
        )
        # pymysql does not autocommit by default; without this the update is
        # discarded when the connection closes.
        COON.commit()
    except Exception as e:
        print(e)
    finally:
        # Always release the cursor.
        cursor.close()

def get_header():
    """Build the HTTP request headers with a randomly chosen User-Agent.

    :return: dict whose ``Connection``, ``Accept`` and ``Accept-Language``
        entries are fixed and whose ``User-Agent`` is drawn at random from
        the built-in list of browser identification strings.
    """
    connection_options = ('Keep-Alive', 'close')
    accept_options = (
        'text/html, application/xhtml+xml, */*',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    )
    language_options = (
        'zh-CN,fr-FR;q=0.5',
        'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    )
    user_agents = (
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    )

    # Only the User-Agent varies between calls; the other headers always use
    # the same entry of their option tuple.
    agent_index = random.randrange(len(user_agents))

    headers = {}
    headers['Connection'] = connection_options[0]
    headers['Accept'] = accept_options[0]
    headers['Accept-Language'] = language_options[1]
    headers['User-Agent'] = user_agents[agent_index]
    return headers

# Module-level call: the crawl starts as soon as this file is executed
# (or imported).
hello()