Sample Python code for crawling all Ctrip flight tickets
Open the Ctrip site and search for a flight, for example Guangzhou to Chengdu.
The URL then becomes: http://flights.ctrip.com/booking/can-ctu-day-1.html?ddate1=2018-06-15
Here, can stands for Guangzhou, ctu for Chengdu, and the date "2018-06-15" is self-explanatory. An ordinary crawler would simply substitute these values and iterate over routes and dates. Looking more closely, however, there is a link that returns all of the current page's data in JSON format, as shown below.
The same city codes and date appear in that link. It returns a JSON document holding the data of the current page; within it, the "fis" field contains the flight information.
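For reference, here is a minimal sketch of requesting that JSON endpoint for a single route, assuming the search URL and the GBK encoding used in the full script further down; the extra logtoken/rk/ck/r parameters of the full URL are session values copied from the browser and are omitted here, so the exact response may differ:

import json
import urllib.request

# Hypothetical one-off request: Guangzhou (can) -> Chengdu (ctu) on 2018-06-15.
url = ('http://flights.ctrip.com/domesticsearch/search/searchfirstrouteflights'
       '?dcity1=can&acity1=ctu&searchtype=s&ddate1=2018-06-15')
headers = {
    "host": "flights.ctrip.com",
    "referer": "https://flights.ctrip.com/booking/can-ctu-day-1.html?ddate1=2018-06-15",
    "user-agent": "mozilla/5.0 (windows nt 6.3; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/66.0.3359.139 safari/537.36",
}

req = urllib.request.Request(url, headers=headers)
body = urllib.request.urlopen(req, timeout=30).read().decode('gbk')   # the page is GBK-encoded
jsondata = json.loads(body.strip("'<>() ").replace('\'', '\"'))       # same clean-up as the full script

print(len(jsondata["fis"]))   # "fis": list of flights for this route and date
print(jsondata["als"])        # "als": airline code -> airline name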
For each crawl, only the city codes and the date need to be replaced. I compiled the city-code mapping by hand:
city={"yie":"阿尔山","aku":"阿克苏","rht":"阿拉善右旗","axf":"阿拉善左旗","aat":"阿勒泰","ngq":"阿里","mfm":"澳门" ,"aqg":"安庆","ava":"安顺","aog":"鞍山","rlk":"巴彦淖尔","aeb":"百色","bav":"包头","bsd":"保山","bhy":"北海","bjs":"北京" ,"dbc":"白城","nbs":"白山","bfj":"毕节","bpl":"博乐","ckg":"重庆","bpx":"昌都","cgd":"常德","czx":"常州" ,"chg":"朝阳","ctu":"成都","juh":"池州","cif":"赤峰","swa":"潮州","cgq":"长春","csx":"长沙","cih":"长治","cde":"承德" ,"cwj":"沧源","dax":"达州","dlu":"大理","dlc":"大连","dqa":"大庆","dat":"大同","ddg":"丹东","dcy":"稻城","doy":"东营" ,"dnh":"敦煌","dax":"达县","lum":"德宏","ejn":"额济纳旗","dsn":"鄂尔多斯","enh":"恩施","erl":"二连浩特","fuo":"佛山" ,"foc":"福州","fyj":"抚远","fug":"阜阳","kow":"赣州","goq":"格尔木","gyu":"固原","gys":"广元","can":"广州","kwe":"贵阳" ,"kwl":"桂林","hrb":"哈尔滨","hmi":"哈密","hak":"海口","hld":"海拉尔","hdg":"邯郸","hzg":"汉中","hgh":"杭州","hfe":"合肥" ,"htn":"和田","hek":"黑河","het":"呼和浩特","hia":"淮安","hjj":"怀化","txn":"黄山","huz":"惠州","jxa":"鸡西","tna":"济南" ,"jng":"济宁","jgd":"加格达奇","jmu":"佳木斯","jgn":"嘉峪关","swa":"揭阳","jic":"金昌","knh":"金门","jnz":"锦州" ,"cyi":"嘉义","jhg":"景洪","jsj":"建三江","jjn":"晋江","jgs":"井冈山","jdz":"景德镇","jiu":"九江","jzh":"九寨沟","khg":"喀什" ,"kjh":"凯里","kgt":"康定","kry":"克拉玛依","kca":"库车","krl":"库尔勒","kmg":"昆明","lxa":"拉萨","lhw":"兰州","hzh":"黎平" ,"ljg":"丽江","llb":"荔波","lyg":"连云港","lpf":"六盘水","lfq":"临汾","lzy":"林芝","lnj":"临沧","lyi":"临沂","lzh":"柳州" ,"lzo":"泸州","lya":"洛阳","llv":"吕梁","jmj":"澜沧","lcx":"龙岩","nzh":"满洲里","lum":"芒市","mxz":"梅州","mig":"绵阳" ,"ohe":"漠河","mdg":"牡丹江","mfk":"马祖" ,"khn":"南昌","nao":"南充","nkg":"南京","nng":"南宁","ntg":"南通","nny":"南阳" ,"ngb":"宁波","nlh":"宁蒗","pzi":"攀枝花","sym":"普洱","ndg":"齐齐哈尔","jiq":"黔江","iqm":"且末","bpe":"秦皇岛","tao":"青岛" ,"iqn":"庆阳","juz":"衢州","rkz":"日喀则","riz":"日照","syx":"三亚","xmn":"厦门","sha":"上海","szx":"深圳","hpg":"神农架" ,"she":"沈阳","sjw":"石家庄","tcg":"塔城","hyn":"台州","tyn":"太原","yty":"泰州","tvs":"唐山","tcz":"腾冲","tsn":"天津" ,"thq":"天水","tgo":"通辽","ten":"铜仁","tlq":"吐鲁番","wxn":"万州","weh":"威海","wef":"潍坊","wnz":"温州","wnh":"文山" ,"wua":"乌海","hlh":"乌兰浩特","urc":"乌鲁木齐","wux":"无锡","wuz":"梧州","wuh":"武汉","wus":"武夷山","sia":"西安","xic":"西昌" ,"xnn":"西宁","jhg":"西双版纳","xil":"锡林浩特","dig":"香格里拉(迪庆)","xfn":"襄阳","acx":"兴义","xuz":"徐州","hkg":"香港" ,"ynt":"烟台","eny":"延安","ynj":"延吉","ynz":"盐城","yty":"扬州","lds":"伊春","yin":"伊宁","ybp":"宜宾","yih":"宜昌" ,"yic":"宜春","yiw":"义乌","inc":"银川","llf":"永州","uyn":"榆林","yus":"玉树","ycu":"运城","zha":"湛江","dyg":"张家界" ,"zqz":"张家口","yzy":"张掖","zat":"昭通","cgo":"郑州","zhy":"中卫","hsn":"舟山","zuh":"珠海","wmt":"遵义(茅台)","zyi":"遵义(新舟)"}
To avoid HTTP 429 responses from requesting too often, I also gathered a number of User-Agent strings and pick one at random for each request. Requests that are still too frequent trigger a CAPTCHA, so the crawler pauses for 10 seconds after finishing each departure city.
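The throttling itself is simple: pick a User-Agent at random for every request and sleep between departure cities. A stripped-down sketch of that idea (the full script below integrates it into the crawl loop; the city codes here are only examples):

import time
import random

useragent = [
    "mozilla/5.0 (windows nt 6.3; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/66.0.3359.139 safari/537.36",
    "mozilla/5.0 (windows nt 6.1; wow64) applewebkit/535.7 (khtml, like gecko) chrome/16.0.912.36 safari/535.7",
]

for fromcode in ("can", "sha", "bjs"):                   # a few departure cities as examples
    headers = {"user-agent": random.choice(useragent)}   # a fresh random User-Agent per departure city
    # ... request every destination for this departure city with these headers ...
    time.sleep(10)                                       # pause 10 seconds before the next departure city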
First, create a table to store the data; SQL Server is used here:
create table kkflight(
    id int identity(1,1),                    -- auto-increment id
    itinerardate date,                       -- itinerary date
    airline varchar(100),                    -- airline name
    airlinecode varchar(100),                -- airline code
    flightnumber varchar(20),                -- flight number
    flightnumbers varchar(20),               -- shared flight number (actual flight)
    aircraft varchar(50),                    -- aircraft model
    aircraftsize char(2),                    -- aircraft size (l large; m medium; s small)
    airporttax decimal(10,2),                -- airport construction fee
    fueloiltax decimal(10,2),                -- fuel surcharge
    fromcity varchar(50),                    -- departure city
    fromcitycode varchar(10),                -- departure city code
    fromairport varchar(50),                 -- departure airport
    fromterminal varchar(20),                -- departure terminal
    fromdatetime datetime,                   -- departure time
    tocity varchar(50),                      -- arrival city
    tocitycode varchar(10),                  -- arrival city code
    toairport varchar(50),                   -- arrival airport
    toterminal varchar(20),                  -- arrival terminal
    todatetime datetime,                     -- arrival time
    durationhour int,                        -- duration (hours)
    durationminute int,                      -- duration (minutes)
    duration varchar(20),                    -- duration (string, e.g. 2h15m)
    currency varchar(10),                    -- currency
    ticketprices decimal(10,2),              -- ticket price
    discount decimal(4,2),                   -- discount applied
    punctualityrate decimal(4,2),            -- punctuality rate
    aircraftcabin char(1),                   -- cabin class (f first; c business; y economy)
    insertdate datetime default(getdate())   -- insert time
)
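Once rows start arriving, a quick way to sanity-check the table is to read a few records back through the same connection string the crawler uses (a sketch; swap the kk:kk@hzc/myspider credentials for your own server):

import pandas as pd
import sqlalchemy

engine = sqlalchemy.create_engine("mssql+pymssql://kk:kk@hzc/myspider")

# Pull the ten most recently inserted flights to eyeball the data.
check = pd.read_sql("select top 10 * from kkflight order by insertdate desc", engine)
print(check[["itinerardate", "flightnumber", "fromcity", "tocity", "ticketprices"]])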
Since all cities are crawled, the cities themselves are not restricted; only the date range is, i.e. which days to crawl. The complete script follows:
#-*- coding: utf-8 -*-
# python 3.5.0
import json
import time
import random
import datetime
import sqlalchemy
import urllib.request
import pandas as pd
from operator import itemgetter
from dateutil.parser import parse

class Flight(object):
    def __init__(self):
        self.airline = {}   # airline code -> airline name
        self.engine = sqlalchemy.create_engine("mssql+pymssql://kk:kk@hzc/myspider")
        self.url = ''
        self.headers = {}
        self.city = {"aat":"阿勒泰","acx":"兴义","aeb":"百色","aku":"阿克苏","aog":"鞍山","aqg":"安庆","ava":"安顺","axf":"阿拉善左旗","bav":"包头","bfj":"毕节","bhy":"北海"
            ,"bjs":"北京","bpe":"秦皇岛","bpl":"博乐","bpx":"昌都","bsd":"保山","can":"广州","cde":"承德","cgd":"常德","cgo":"郑州","cgq":"长春","chg":"朝阳","cif":"赤峰"
            ,"cih":"长治","ckg":"重庆","csx":"长沙","ctu":"成都","cwj":"沧源","cyi":"嘉义","czx":"常州","dat":"大同","dax":"达县","dbc":"白城","dcy":"稻城","ddg":"丹东"
            ,"dig":"香格里拉(迪庆)","dlc":"大连","dlu":"大理","dnh":"敦煌","doy":"东营","dqa":"大庆","dsn":"鄂尔多斯","dyg":"张家界","ejn":"额济纳旗","enh":"恩施"
            ,"eny":"延安","erl":"二连浩特","foc":"福州","fug":"阜阳","fuo":"佛山","fyj":"抚远","goq":"格尔木","gys":"广元","gyu":"固原","hak":"海口","hdg":"邯郸"
            ,"hek":"黑河","het":"呼和浩特","hfe":"合肥","hgh":"杭州","hia":"淮安","hjj":"怀化","hkg":"香港","hld":"海拉尔","hlh":"乌兰浩特","hmi":"哈密","hpg":"神农架"
            ,"hrb":"哈尔滨","hsn":"舟山","htn":"和田","huz":"惠州","hyn":"台州","hzg":"汉中","hzh":"黎平","inc":"银川","iqm":"且末","iqn":"庆阳","jdz":"景德镇"
            ,"jgd":"加格达奇","jgn":"嘉峪关","jgs":"井冈山","jhg":"西双版纳","jic":"金昌","jiq":"黔江","jiu":"九江","jjn":"晋江","jmj":"澜沧","jmu":"佳木斯","jng":"济宁"
            ,"jnz":"锦州","jsj":"建三江","juh":"池州","juz":"衢州","jxa":"鸡西","jzh":"九寨沟","kca":"库车","kgt":"康定","khg":"喀什","khn":"南昌","kjh":"凯里","kmg":"昆明"
            ,"knh":"金门","kow":"赣州","krl":"库尔勒","kry":"克拉玛依","kwe":"贵阳","kwl":"桂林","lcx":"龙岩","lds":"伊春","lfq":"临汾","lhw":"兰州","ljg":"丽江","llb":"荔波"
            ,"llf":"永州","llv":"吕梁","lnj":"临沧","lpf":"六盘水","lum":"芒市","lxa":"拉萨","lya":"洛阳","lyg":"连云港","lyi":"临沂","lzh":"柳州","lzo":"泸州"
            ,"lzy":"林芝","mdg":"牡丹江","mfk":"马祖","mfm":"澳门","mig":"绵阳","mxz":"梅州","nao":"南充","nbs":"白山","ndg":"齐齐哈尔","ngb":"宁波","ngq":"阿里"
            ,"nkg":"南京","nlh":"宁蒗","nng":"南宁","nny":"南阳","ntg":"南通","nzh":"满洲里","ohe":"漠河","pzi":"攀枝花","rht":"阿拉善右旗","riz":"日照","rkz":"日喀则"
            ,"rlk":"巴彦淖尔","sha":"上海","she":"沈阳","sia":"西安","sjw":"石家庄","swa":"揭阳","sym":"普洱","syx":"三亚","szx":"深圳","tao":"青岛","tcg":"塔城","tcz":"腾冲"
            ,"ten":"铜仁","tgo":"通辽","thq":"天水","tlq":"吐鲁番","tna":"济南","tsn":"天津","tvs":"唐山","txn":"黄山","tyn":"太原","urc":"乌鲁木齐","uyn":"榆林","wef":"潍坊"
            ,"weh":"威海","wmt":"遵义(茅台)","wnh":"文山","wnz":"温州","wua":"乌海","wuh":"武汉","wus":"武夷山","wux":"无锡","wuz":"梧州","wxn":"万州","xfn":"襄阳","xic":"西昌"
            ,"xil":"锡林浩特","xmn":"厦门","xnn":"西宁","xuz":"徐州","ybp":"宜宾","ycu":"运城","yic":"宜春","yie":"阿尔山","yih":"宜昌","yin":"伊宁","yiw":"义乌","ynj":"延吉"
            ,"ynt":"烟台","ynz":"盐城","yty":"扬州","yus":"玉树","yzy":"张掖","zat":"昭通","zha":"湛江","zhy":"中卫","zqz":"张家口","zuh":"珠海","zyi":"遵义(新舟)"}
        """{"kji":"布尔津"}"""
        self.useragent = [
            "mozilla/5.0 (windows nt 6.3; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/66.0.3359.139 safari/537.36",
            "mozilla/5.0 (windows nt 6.1; wow64) applewebkit/535.7 (khtml, like gecko) chrome/16.0.912.36 safari/535.7",
            "mozilla/5.0 (windows nt 6.2; win64; x64; rv:16.0) gecko/16.0 firefox/16.0",
            "mozilla/5.0 (macintosh; intel mac os x 10_7_3) applewebkit/534.55.3 (khtml, like gecko) version/5.1.3 safari/534.53.10",
            "mozilla/5.0 (compatible; msie 9.0; windows nt 6.1; win64; x64; trident/5.0; .net clr 3.5.30729; .net clr 3.0.30729; .net clr 2.0.50727; media center pc 6.0)",
            "mozilla/5.0 (compatible; msie 8.0; windows nt 6.0; trident/4.0; wow64; trident/4.0; slcc2; .net clr 2.0.50727; .net clr 3.5.30729; .net clr 3.0.30729; .net clr 1.0.3705; .net clr 1.1.4322)",
            "mozilla/5.0 (windows nt 6.2; wow64) applewebkit/537.36 (khtml, like gecko) chrome/27.0.1500.55 safari/537.36",
            "mozilla/5.0 (macintosh; intel mac os x 10_8_2) applewebkit/537.17 (khtml, like gecko) chrome/24.0.1309.0 safari/537.17",
            "mozilla/5.0 (windows nt 6.1; win64; x64; rv:2.0b13pre) gecko/20110307 firefox/4.0b13pre",
            "mozilla/5.0 (x11; ubuntu; linux x86_64; rv:16.0) gecko/20100101 firefox/16.0",
            "mozilla/5.0 (windows nt 6.1; wow64) applewebkit/537.11 (khtml, like gecko) chrome/23.0.1271.64 safari/537.11"
        ]

    # Walk every date between the two dates and crawl every city pair for each day
    def set_url_headers(self, startdate, enddate):
        startdate = datetime.datetime.strptime(startdate, '%Y-%m-%d')
        enddate = datetime.datetime.strptime(enddate, '%Y-%m-%d')
        while startdate <= enddate:
            today = startdate.strftime('%Y-%m-%d')
            for fromcode, fromcity in sorted(self.city.items(), key=itemgetter(0)):
                for tocode, tocity in sorted(self.city.items(), key=itemgetter(0)):
                    if fromcode != tocode:
                        self.url = 'http://flights.ctrip.com/domesticsearch/search/searchfirstrouteflights?dcity1=%s&acity1=%s&searchtype=s&ddate1=%s&isnearairportrecommond=0&logtoken=027e478a47494975ad74857b18283e12&rk=4.381066884522498182534&ck=9fc7881e8f373585c0e5f89152bc143d&r=0.24149333708195565406316' % (fromcode, tocode, today)
                        self.headers = {
                            "host": "flights.ctrip.com",
                            "user-agent": random.choice(self.useragent),
                            "referer": "https://flights.ctrip.com/booking/%s-%s-day-1.html?ddate1=%s" % (fromcode, tocode, today),
                            "connection": "keep-alive",
                        }
                        print("%s : %s(%s) ==> %s(%s) " % (today, fromcity, fromcode, tocity, tocode))
                        self.get_parse_json_data(today)
                time.sleep(10)   # pause after each departure city to avoid CAPTCHAs
            startdate += datetime.timedelta(days=1)

    # Fetch the JSON data of one page
    def get_one_page_json_data(self):
        req = urllib.request.Request(self.url, headers=self.headers)
        body = urllib.request.urlopen(req, timeout=30).read().decode('gbk')
        jsondata = json.loads(body.strip("'<>() ").replace('\'', '\"'))
        return jsondata

    # Fetch one page of data, parse it and save it to the database
    def get_parse_json_data(self, today):
        jsondata = self.get_one_page_json_data()
        df = pd.DataFrame(columns=['itinerardate','airline','airlinecode','flightnumber','flightnumbers','aircraft','aircraftsize'
            ,'airporttax','fueloiltax','fromcity','fromcitycode','fromairport','fromterminal','fromdatetime','tocity','tocitycode','toairport'
            ,'toterminal','todatetime','durationhour','durationminute','duration','currency','ticketprices','discount','punctualityrate','aircraftcabin'])
        if bool(jsondata["fis"]):
            # Collect airline codes and names
            company = jsondata["als"]
            for k in company.keys():
                if k not in self.airline:
                    self.airline[k] = company[k]
            index = 0
            for data in jsondata["fis"]:
                df.loc[index,'itinerardate'] = today                   # itinerary date
                #df.loc[index,'airline'] = self.airline[data["alc"].strip()]   # airline
                df.loc[index,'airline'] = self.airline[data["alc"].strip()] if (data["alc"].strip() in self.airline) else None   # airline
                df.loc[index,'airlinecode'] = data["alc"].strip()      # airline code
                df.loc[index,'flightnumber'] = data["fn"]              # flight number
                df.loc[index,'flightnumbers'] = data["sdft"]           # shared flight number (actual flight)
                df.loc[index,'aircraft'] = data["cf"]["c"]             # aircraft model
                df.loc[index,'aircraftsize'] = data["cf"]["s"]         # aircraft size (l large; m medium; s small)
                df.loc[index,'airporttax'] = data["tax"]               # airport construction fee
                df.loc[index,'fueloiltax'] = data["of"]                # fuel surcharge
                df.loc[index,'fromcity'] = data["acn"]                 # departure city
                df.loc[index,'fromcitycode'] = data["acc"]             # departure city code
                df.loc[index,'fromairport'] = data["apbn"]             # departure airport
                df.loc[index,'fromterminal'] = data["asmsn"]           # departure terminal
                df.loc[index,'fromdatetime'] = data["dt"]              # departure time
                df.loc[index,'tocity'] = data["dcn"]                   # arrival city
                df.loc[index,'tocitycode'] = data["dcc"]               # arrival city code
                df.loc[index,'toairport'] = data["dpbn"]               # arrival airport
                df.loc[index,'toterminal'] = data["dsmsn"]             # arrival terminal
                df.loc[index,'todatetime'] = data["at"]                # arrival time
                df.loc[index,'durationhour'] = int((parse(data["at"]) - parse(data["dt"])).seconds / 3600)          # duration (hours)
                df.loc[index,'durationminute'] = int((parse(data["at"]) - parse(data["dt"])).seconds % 3600 / 60)   # duration (minutes)
                df.loc[index,'duration'] = str(df.loc[index,'durationhour']) + 'h' + str(df.loc[index,'durationminute']) + 'm'   # duration (string)
                df.loc[index,'currency'] = None                        # currency
                df.loc[index,'ticketprices'] = data["lp"]              # ticket price
                df.loc[index,'discount'] = None                        # discount applied
                df.loc[index,'punctualityrate'] = None                 # punctuality rate
                df.loc[index,'aircraftcabin'] = None                   # cabin class (f first; c business; y economy)
                index = index + 1
            df.to_sql("kkflight", self.engine, index=False, if_exists='append')
            print("done!~")

if __name__ == "__main__":
    fly = Flight()
    fly.set_url_headers('2018-06-16', '2018-06-16')
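To crawl a span of days instead of a single date, pass a wider range, for example fly.set_url_headers('2018-06-16', '2018-06-20'); the outer loop then steps through each day in turn.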
Summary
That covers crawling all Ctrip flight tickets with Python; I hope it is helpful.