Scraping WoS article titles, publication years, times cited, authors, cited references and other data
I wrote this code, adapting another blogger's approach, to scrape WoS article titles, publication years, times cited, authors, cited references and other data. After finishing it I realized that everything it does can be obtained by exporting records directly from WoS (which only takes a short piece of Selenium code) and that route is more convenient, so this code is kept here as a practice reference.
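For reference, a minimal sketch of that Selenium-based export route might look like the following. It assumes chromedriver is available, and the element locators ('exportButton', 'exportTxt') are hypothetical placeholders that must be read from the actual WoS results page; logging in and running the search are left to be done by hand in the opened browser window.

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get('https://www.webofscience.com/')          # or the WebVPN-proxied address
time.sleep(60)                                       # log in and run the search by hand
driver.find_element(By.ID, 'exportButton').click()   # hypothetical locator
driver.find_element(By.ID, 'exportTxt').click()      # hypothetical locator
time.sleep(10)                                       # give the download time to finish
driver.quit()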
The scraped txt file has the following format (one record per line, fields separated by commas; the cited-reference titles in the last field are separated by '/'):
Learning Deep Architectures for AI,Bengio Y.,Foundations and Trends in Machine Learning, 2009,3847,,
A study of cross-validation and bootstrap for accuracy estimation and model selection,Kohavi R.,IJCAI-95. Proceedings of the Fourteenth International Joint Conference on Artificial Intelligence, 1995,4001,,
Feature selection based on mutual information: Criteria of max-dependency max-relevance and min-redundancy,Peng HC (Peng HC); Long FH (Long FH); Ding C (Ding C),IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE,AUG 2005,4749,feature selection; mutual information; minimal redundancy; maximal relevance; maximal dependency; classification,Distinct types of diffuse large B-cell lymphoma identified by gene expression profiling/A tutorial on Support Vector Machines for pattern recognition/BEST 2 INDEPENDENT MEASUREMENTS ARE NOT 2 BEST/Minimum redundancy feature selection from microarray gene expression data/Experiments with classifier combining rules/Registration of localization images by maximization of mutual information/A Bayesian morphometry algorithm/A comparison of methods for multiclass support vector machines/Fast ICA by a fixed-point algorithm that maximizes non-Gaussianity/Feature selection for multiclass discrimination via mixed-integer linear programming/Improved gene selection for classification of microarrays./Feature selection: Evaluation application and small sample performance/Statistical pattern recognition: A review/Wrappers for feature subset selection/Jensen's Inequality/Input feature selection by mutual information based on Parzen window/Selection of Relevant Features in Machine Learning/ESTIMATION OF A PROBABILITY DENSITY-FUNCTION AND MODE/Bayesian Clustering Methods for Morphological Analysis of MR Images/Structure search and stability enhancement of Bayesian networks/FLOATING SEARCH METHODS IN FEATURE-SELECTION/Systematic variation in gene expression patterns in human cancer cell lines/A gene expression database for the molecular pharmacology of cancer/Controlling the Generalization Ability of Learning Processes/Biomarker identification by feature wrappers/……
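Because commas inside each field are replaced with spaces before writing (see the write call at the end of get_papers below), the exported lines can be split back into fields with a plain comma split. A minimal sketch (not part of the original post), assuming the same Results/wos_data7.txt path as the script:

import io

with io.open('Results/wos_data7.txt', 'r', encoding='utf-8') as rf:
    for line in rf:
        fields = line.rstrip('\n').split(',')
        if len(fields) < 7:
            continue
        title, author, journal, year, cited, keywords, refs = fields[:7]
        cited_refs = [r for r in refs.split('/') if r]   # cited-reference titles
        print('%s (%s) cited %s times' % (title, year, cited))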
The qid, SID, cookies, url and similar parameters in the code must be taken from the result pages of whatever query you are scraping.
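One way to grab qid and SID: they appear in the query string of the results-page URL (for example the summary.do address used in the script), so they can be pulled out with the standard library. A small helper sketch, not part of the original script:

try:
    from urlparse import urlparse, parse_qs       # Python 2
except ImportError:
    from urllib.parse import urlparse, parse_qs   # Python 3

def extract_wos_params(result_url):
    # Return the qid and SID found in a results-page URL copied from the browser.
    qs = parse_qs(urlparse(result_url).query)
    return {'qid': qs['qid'][0], 'SID': qs['SID'][0]}

The complete scraper script follows.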
#encoding:utf-8
from bs4 import BeautifulSoup
import time
import requests
import threading
import io
import re
import sys
reload(sys)
sys.setdefaultencoding('utf8')  # Python 2 idiom: make implicit str/unicode conversions use UTF-8
requests.DEFAULT_RETRIES = 5
# Ts: titles of papers already scraped; wf: output file to write to
def get_papers(page, Ts, wf):
    docs = range(1, 11)
    params = {'product': 'UA',
              'search_mode': 'AdvancedSearch',
              'qid': '1',
              'SID': '5C3zRPgr9fF4oEgLThG',
              'page': page}
    headers = {'Connection': 'keep-alive',
               'Cache-Control': 'max-age=0',
               'Upgrade-Insecure-Requests': '1',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 XXXX',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
               'Sec-Fetch-Site': 'same-origin',
               'Sec-Fetch-Mode': 'navigate',
               'Sec-Fetch-User': '?1',
               'Sec-Fetch-Dest': 'document',
               'Accept-Language': 'zh-CN,zh;q=0.9',
               'Cookie': 'show_vpn=0; wengine_vpn_ticket=c6b8e875d7a181aa; refresh=0'}
    url = 'https://webvpn.dlut.edu.cn/http/77726476706e69737468656265737421f1e7518f69276d52710e82a297422f30a0c6fa320a29ae/summary.do?product=UA&doc=1&qid=1&SID=6BNBhXDfAMTcyuJurHC&search_mode=AdvancedSearch&update_back2search_link_param=yes'
    htm1 = requests.get(url, headers=headers, params=params, verify=True)
    soup = BeautifulSoup(htm1.text, 'lxml')
    for doc in docs:
        t = soup.find('a', attrs={'class': 'smallV110 snowplow-full-record',
                                  'href': '/http/77726476706e69737468656265737421f1e7518f69276d52710e82a297422f30a0c6fa320a29ae/full_record.do?'+'product='+params['product']+'&search_mode='+params['search_mode']+'&qid='+params['qid']+'&SID='+params['SID']+'&page='+str(page)+'&doc='+str(doc+10*(page-1))}).value.text  # if this step raises an error, refresh the cookie
        title = re.sub(",", " ", str(t))
        print title
        if title not in Ts:  # skip articles that have already been scraped
            # title, authors, journal, publication year, times cited, keywords, cited references
            url2 = '/http/77726476706e69737468656265737421f1e7518f69276d52710e82a297422f30a0c6fa320a29ae/full_record.do?'+'product='+params['product']+'&search_mode='+params['search_mode']+'&qid='+params['qid']+'&SID='+params['SID']+'&page='+str(page)+'&doc='+str(doc+10*(page-1))
            a = 'https://webvpn.dlut.edu.cn'
            url2 = "".join([a, url2])
            try:
                htm2 = requests.get(url2, headers=headers, params=params, verify=True, timeout=30)
            except:
                print('timeout,page:', page, 'doc:', doc)
                continue
            soup2 = BeautifulSoup(htm2.text, 'lxml')
            # authors: taken from the first FR_field block; when it contains <sup> markers,
            # only the "(Last, First)" forms are kept
            if soup2.find('p', attrs={'class': 'FR_field'}).sup != None:
                s = soup2.find('p', attrs={'class': 'FR_field'}).text
                s1 = re.findall(r"\(\w+\,\s\w+\)", s)
                a = "".join(s1)
                author = re.sub(r"\,", "", a)
            else:
                s = soup2.find('p', attrs={'class': 'FR_field'}).text
                a = re.sub(r"\,", "", str(s))
                author = re.sub(r"作者:", "", a)  # strip the "Author:" label of the Chinese WoS interface
            #print author
            #print type(author)
            # source title (journal / proceedings); the markup varies between record pages
            if soup2.find('span', attrs={'class': 'sourceTitle'}) != None:
                s3 = soup2.find('span', attrs={'class': 'sourceTitle'}).value.text
            elif soup2.find('p', attrs={'class': 'sourceTitle'}) != None:
                s3 = soup2.find('p', attrs={'class': 'sourceTitle'}).text
            else:
                s3 = soup2.find('div', attrs={'class': 'block-record-info block-record-info-source'}).a.text
            journal = re.sub(",", " ", str(s3))
            #print journal
            #print type(journal)
            # publication year: the FR_field whose label is '出版年' ("Publication Year" on the Chinese interface)
            for s in soup2.find_all(attrs={'class': 'FR_field'}):
                if s.find(class_='FR_label') and s.find(class_='FR_label').text.find('出版年') >= 0:
                    if s.select('value') != []:
                        s1 = str(s.select('value'))
                    else:
                        s1 = str(s.get_text())
                    s2 = re.findall(r"\w*\s?\d*\s?\d+", s1)
                    publish_time = "".join(s2)
                    #print publish_time
                    #print type(publish_time)
            # times cited
            if soup2.find('a', attrs={'class': 'snowplow-times-cited-within-see-more-data-fields'}) != None:
                t = soup2.find('a', attrs={'class': 'snowplow-times-cited-within-see-more-data-fields'}).b.text
            else:
                t = soup2.find('span', attrs={'class': 'large-number'}).text
            t1 = re.sub(",", "", str(t))
            times_been_cited = t1.decode('utf-8')
            #print times_been_cited
            #print type(times_been_cited)
            # author keywords: the FR_field whose label is '作者关键词:' ("Author Keywords" on the Chinese interface)
            key_words = ""
            for s in soup2.find_all(attrs={'class': 'FR_field'}):
                if s.find(class_='FR_label') and s.find(class_='FR_label').text.find('作者关键词:') >= 0:
                    s1 = str(s.get_text())
                    key_word = re.sub(r"作者关键词:", "", s1)
                    key_words = re.sub(",", " ", str(key_word))
            #print key_words
            # cited references: follow the "view all cited references" link, if present
            cite_title = ""
            if soup2.find('a', attrs={'class': 'view-all-link snowplow-view-all-in-cited-references-page-bottom'},
                          href=True) != None:
                cite_number = soup2.find('a', attrs={'class': 'snowplow-citation-network-cited-reference-count-link'}).span.text  # number of cited references, used to work out how many pages url3 has
                cite_number = int(cite_number)
                #print cite_number
                url3 = soup2.find('a', attrs={'class': 'view-all-link snowplow-view-all-in-cited-references-page-bottom'},
                                  href=True).attrs['href']
                b = 'https://webvpn.dlut.edu.cn/http/77726476706e69737468656265737421f1e7518f69276d52710e82a297422f30a0c6fa320a29ae/'
                url3 = "".join([b, url3])
                #print url3  # page listing the cited references
                c = ""
                for i in range(1, int(cite_number/30)+2):  # cited references are paginated, 30 per page
                    #print i
                    if i != 1:
                        url3 = re.sub("page="+str(i-1), "page="+str(i), url3)
                    #print url3
                    htm3 = requests.get(url3, headers=headers, verify=True)
                    soup3 = BeautifulSoup(htm3.text, 'lxml')
                    for s in soup3.find_all('span', attrs={'class': 'reference-title'}):
                        if s.value != None:
                            c = c + s.value.text + '/'
                            cite_title = re.sub(",", " ", str(c))
                            #print cite_title
                        else:
                            continue
            else:
                cite_title = ""
            #print cite_title
            #print type(cite_title)
            print(page, doc, title)
            wf.write(",".join([title, author, journal, publish_time, times_been_cited, key_words, cite_title]).replace('\n', '') + '\n')
def check_titles():  # collect the titles already scraped so they can be skipped later
    Ts = []
    with io.open('Results/wos_data7.txt', 'r', encoding='utf-8') as rf:
        for line in rf.readlines():
            Ts.extend([line.strip().split(',')[0]])
    print 'Already scraped: {}'.format(len(Ts))
    return Ts
if __name__ == '__main__':
    Ts = check_titles()
    threads = []  # worker threads, one per result page in the range below
    all_pages = int(573642/10) + 1  # total number of result pages
    wf = io.open('Results/wos_data7.txt', 'a+', encoding='utf-8')  # create or open the txt file under the Results directory
    for i in range(601, 701):
        threads.append(threading.Thread(target=get_papers, args=(i, Ts, wf)))  # one thread per call to get_papers
        #get_papers(i,Ts,wf)
    num = 0
    while num < len(threads):
        if threading.activeCount() <= 10:  # start a new thread only while at most 10 are active
            threads[num].start()
            num += 1
        else:
            time.sleep(10)  # otherwise sleep 10 s and let running threads finish