xpath的优化
程序员文章站
2022-03-03 10:27:23
...
# -*- coding: UTF-8 -*-
__author__ = 'ZhengXiang'
__time__ = '2020/2/10 20:26'
import requests
import xlwt
import redis
from lxml import etree
from urllib import parse
import pandas as pd
import pymysql
# Relative search-result links to convert into detail-page URLs.
# Note that each query string carries "DbCode= CJFD" with a space before the value.
urls = [
'/kns/detail/detail.aspx?QueryID=0&CurRec=1017&DbCode= CJFD&dbname=CJFDLAST2015&filename=TWXB201501009&urlid=10.15940/j.cnki.0001-5245.2015.01.009&yx=Y',
'/kns/detail/detail.aspx?QueryID=0&CurRec=1017&DbCode= CJFD&dbname=CJFDLAST2016&filename=CHKD201602035&urlid=&yx=',
'/kns/detail/detail.aspx?QueryID=0&CurRec=1068&DbCode= CJFD&dbname=CJFD2000&filename=ZGTB200002021&urlid=&yx=',
'/kns/detail/detail.aspx?QueryID=0&CurRec=1078&DbCode= CJFD&dbname=CJFD2002&filename=DLYJ200202004&urlid=&yx='
]
def get_detail_url(url):
    """Build the absolute detail-page URL for a search-result link.

    Args:
        url: A link whose query string carries ``DbCode``, ``dbname`` and
            ``filename`` parameters.

    Returns:
        ``http://kns.cnki.net/KCMS/detail/detail.aspx`` followed by a query
        string holding only those three parameters.

    Raises:
        KeyError: If any of the three parameters is missing or empty.
    """
    query = parse.parse_qs(parse.urlparse(url).query)
    # parse_qs maps every key to a list of values; take the first of each.
    params = {
        # The input links contain "DbCode= CJFD", so trim the stray space.
        'DbCode': query['DbCode'][0].strip(),
        'dbname': query['dbname'][0],
        'filename': query['filename'][0],
    }
    # urlencode escapes the values, which plain concatenation would not.
    return 'http://kns.cnki.net/KCMS/detail/detail.aspx?' + parse.urlencode(params)
# Convert every raw result link into its detail-page URL, echoing each one.
l = []
for raw_link in urls:
    detail_link = get_detail_url(raw_link)
    print(detail_link)
    l.append(detail_link)
# One accumulator per output column; the scraping loop appends one entry to
# each of them per page, so position k in every list describes the same page.
Data1, Data2, Data3 = [], [], []  # title, author, address
Data4, Data5, Data6 = [], [], []  # abstract, keywords, page URL
Funding, Time = [], []  # funding, text of the issue link
error = []  # not written to anywhere in the visible code
def clean(text):
    """Normalise text extracted from a page.

    Removes carriage returns, line feeds, the literal substring ``space``
    and ordinary spaces, collapses every run of semicolons to a single
    ``;``, and strips any remaining leading/trailing whitespace.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Deleting '\r' and '\n' one at a time already covers '\r\n'.
    # NOTE(review): 'space' is removed as a literal word, which also alters
    # English text such as "spacecraft" — confirm this is intended.
    for unwanted in ('\r', '\n', 'space', ' '):
        text = text.replace(unwanted, '')
    # A single replace() pass turns ';;;' into ';;', so repeat until stable.
    while ';;' in text:
        text = text.replace(';;', ';')
    return text.strip()
# XPath expressions for the fields read from each detail page. They are the
# same for every page, so they are defined once instead of per iteration.
xpath_ti = '//*[@id="mainArea"]/div[3]/div[1]/h2//text()'
xpath_au = '//*[@id="mainArea"]/div[3]/div[1]/div[1]/span/a//text()'
xpath_ad = '//*[@id="mainArea"]/div[3]/div[1]/div[2]/span/a//text()'
# '//text()' also collects text inside nested tags; '/text()' would only take
# the span's direct text children.
xpath_abs = '//span[@id="ChDivSummary"]//text()'
# Step up from the label to its parent, then select the sibling <a> texts.
xpath_key = '//label[@id="catalog_KEYWORD"]/../a/text()'
xpath_fund = '//label[@id="catalog_FUND"]/../a/text()'
# Match the anchor by a substring of its onclick attribute.
xpath_time = '//a[contains(@onclick,"getKns55NaviLinkIssue")]/text()'

for i in l:
    # Without a timeout one stalled connection would block the run forever.
    req = requests.get(i, timeout=30)
    html = etree.HTML(req.content)

    # Single-valued fields are concatenated; multi-valued ones joined by ';'.
    ti = clean(''.join(html.xpath(xpath_ti)))
    Data1.append(ti)

    au = clean(';'.join(html.xpath(xpath_au)))
    Data2.append(au)

    ad = clean(';'.join(html.xpath(xpath_ad)))
    Data3.append(ad)

    # Named 'abstract' so the builtin abs() is not shadowed.
    abstract = clean(''.join(html.xpath(xpath_abs)))
    Data4.append(abstract)

    key = clean(';'.join(html.xpath(xpath_key)))
    Data5.append(key)

    tt = clean(''.join(html.xpath(xpath_time)))
    Time.append(tt)

    # A page without a funding label yields an empty list, hence ''.
    Funding.append(clean(';'.join(html.xpath(xpath_fund))))

    Data6.append(i)
    print('\t'.join((ti, au, abstract, ad, key)))
# Assemble the collected columns into one table; dict order sets column order.
df1 = pd.DataFrame(dict(
    title=Data1,
    author=Data2,
    address=Data3,
    abstract=Data4,
    key_word=Data5,
    time=Time,
    funding=Funding,
    url=Data6,
))

# Write the table without the index column (header=False would also drop the
# column-name row).
# NOTE(review): newer pandas releases no longer accept `encoding` here and
# cannot write the legacy .xls format — confirm the installed pandas version.
df1.to_excel('c.xls', encoding='utf-8', index=False)
选取某个属性值包含指定字符串的节点:
//a[contains(@onclick,"getKns55NaviLinkIssue")]/text()
先用 .. 回到某个节点的父节点,再从父节点重新向下选择:
//label[@id="catalog_FUND"]/../a/text()
上一篇: 常用反射工具类