Scraping WoS article titles, publication years, times cited, authors, cited references and other data
I wrote this code, adapting another blogger's approach, to scrape WoS article titles, publication years, times cited, authors, cited references and other data. After finishing it I realized that everything it does can be obtained by exporting records directly from WoS (which only takes a short piece of Selenium code) and that route is more convenient, so this code is kept here as a practice reference.
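For reference, a minimal sketch of that Selenium-based export route might look like the following. It assumes chromedriver is available, and the element locators ('exportButton', 'exportTxt') are hypothetical placeholders that must be read from the actual WoS results page; logging in and running the search are left to be done by hand in the opened browser window.

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get('https://www.webofscience.com/')          # or the WebVPN-proxied address
time.sleep(60)                                       # log in and run the search by hand
driver.find_element(By.ID, 'exportButton').click()   # hypothetical locator
driver.find_element(By.ID, 'exportTxt').click()      # hypothetical locator
time.sleep(10)                                       # give the download time to finish
driver.quit()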
The scraped txt file has the following format (one record per line, fields separated by commas; the cited-reference titles in the last field are separated by '/'):
Learning Deep Architectures for AI,Bengio Y.,Foundations and Trends in Machine Learning, 2009,3847,,
A study of cross-validation and bootstrap for accuracy estimation and model selection,Kohavi R.,IJCAI-95. Proceedings of the Fourteenth International Joint Conference on Artificial Intelligence, 1995,4001,,
Feature selection based on mutual information: Criteria of max-dependency max-relevance and min-redundancy,Peng HC (Peng HC); Long FH (Long FH); Ding C (Ding C),IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE,AUG 2005,4749,feature selection; mutual information; minimal redundancy; maximal relevance; maximal dependency; classification,Distinct types of diffuse large B-cell lymphoma identified by gene expression profiling/A tutorial on Support Vector Machines for pattern recognition/BEST 2 INDEPENDENT MEASUREMENTS ARE NOT 2 BEST/Minimum redundancy feature selection from microarray gene expression data/Experiments with classifier combining rules/Registration of localization images by maximization of mutual information/A Bayesian morphometry algorithm/A comparison of methods for multiclass support vector machines/Fast ICA by a fixed-point algorithm that maximizes non-Gaussianity/Feature selection for multiclass discrimination via mixed-integer linear programming/Improved gene selection for classification of microarrays./Feature selection: Evaluation application and small sample performance/Statistical pattern recognition: A review/Wrappers for feature subset selection/Jensen's Inequality/Input feature selection by mutual information based on Parzen window/Selection of Relevant Features in Machine Learning/ESTIMATION OF A PROBABILITY DENSITY-FUNCTION AND MODE/Bayesian Clustering Methods for Morphological Analysis of MR Images/Structure search and stability enhancement of Bayesian networks/FLOATING SEARCH METHODS IN FEATURE-SELECTION/Systematic variation in gene expression patterns in human cancer cell lines/A gene expression database for the molecular pharmacology of cancer/Controlling the Generalization Ability of Learning Processes/Biomarker identification by feature wrappers/……
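Because commas inside each field are replaced with spaces before writing (see the write call at the end of get_papers below), the exported lines can be split back into fields with a plain comma split. A minimal sketch (not part of the original post), assuming the same Results/wos_data7.txt path as the script:

import io

with io.open('Results/wos_data7.txt', 'r', encoding='utf-8') as rf:
    for line in rf:
        fields = line.rstrip('\n').split(',')
        if len(fields) < 7:
            continue
        title, author, journal, year, cited, keywords, refs = fields[:7]
        cited_refs = [r for r in refs.split('/') if r]   # cited-reference titles
        print('%s (%s) cited %s times' % (title, year, cited))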
The qid, SID, cookies, url and similar parameters in the code must be taken from the result pages of whatever query you are scraping.
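One way to grab qid and SID: they appear in the query string of the results-page URL (for example the summary.do address used in the script), so they can be pulled out with the standard library. A small helper sketch, not part of the original script:

try:
    from urlparse import urlparse, parse_qs       # Python 2
except ImportError:
    from urllib.parse import urlparse, parse_qs   # Python 3

def extract_wos_params(result_url):
    # Return the qid and SID found in a results-page URL copied from the browser.
    qs = parse_qs(urlparse(result_url).query)
    return {'qid': qs['qid'][0], 'SID': qs['SID'][0]}

The complete scraper script follows.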
#encoding:utf-8
from bs4 import BeautifulSoup
import time
import requests
import threading
import io
import re
import sys
reload(sys)
sys.setdefaultencoding('utf8')  # Python 2 idiom: make implicit str/unicode conversions use UTF-8
requests.DEFAULT_RETRIES = 5
# Ts: titles of papers already scraped; wf: output file to write to
def get_papers(page, Ts, wf):
    docs = range(1, 11)
    params = {'product': 'UA',
              'search_mode': 'AdvancedSearch',
              'qid': '1',
              'SID': '5C3zRPgr9fF4oEgLThG',
              'page': page}
    headers = {'Connection': 'keep-alive',
               'Cache-Control': 'max-age=0',
               'Upgrade-Insecure-Requests': '1',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 XXXX',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
               'Sec-Fetch-Site': 'same-origin',
               'Sec-Fetch-Mode': 'navigate',
               'Sec-Fetch-User': '?1',
               'Sec-Fetch-Dest': 'document',
               'Accept-Language': 'zh-CN,zh;q=0.9',
               'Cookie': 'show_vpn=0; wengine_vpn_ticket=c6b8e875d7a181aa; refresh=0'}
    url = 'https://webvpn.dlut.edu.cn/http/77726476706e69737468656265737421f1e7518f69276d52710e82a297422f30a0c6fa320a29ae/summary.do?product=UA&doc=1&qid=1&SID=6BNBhXDfAMTcyuJurHC&search_mode=AdvancedSearch&update_back2search_link_param=yes'
    htm1 = requests.get(url, headers=headers, params=params, verify=True)
    soup = BeautifulSoup(htm1.text, 'lxml')
    for doc in docs:
        t = soup.find('a', attrs={'class': 'smallV110 snowplow-full-record',
                                  'href': '/http/77726476706e69737468656265737421f1e7518f69276d52710e82a297422f30a0c6fa320a29ae/full_record.do?'+'product='+params['product']+'&search_mode='+params['search_mode']+'&qid='+params['qid']+'&SID='+params['SID']+'&page='+str(page)+'&doc='+str(doc+10*(page-1))}).value.text  # if this step raises an error, refresh the cookie
        title = re.sub(",", " ", str(t))
        print title
        if title not in Ts:  # skip articles that have already been scraped
            # title, authors, journal, publication year, times cited, keywords, cited references
            url2 = '/http/77726476706e69737468656265737421f1e7518f69276d52710e82a297422f30a0c6fa320a29ae/full_record.do?'+'product='+params['product']+'&search_mode='+params['search_mode']+'&qid='+params['qid']+'&SID='+params['SID']+'&page='+str(page)+'&doc='+str(doc+10*(page-1))
            a = 'https://webvpn.dlut.edu.cn'
            url2 = "".join([a, url2])
            try:
                htm2 = requests.get(url2, headers=headers, params=params, verify=True, timeout=30)
            except:
                print('timeout,page:', page, 'doc:', doc)
                continue
            soup2 = BeautifulSoup(htm2.text, 'lxml')
            # authors: taken from the first FR_field block; when it contains <sup> markers,
            # only the "(Last, First)" forms are kept
            if soup2.find('p', attrs={'class': 'FR_field'}).sup != None:
                s = soup2.find('p', attrs={'class': 'FR_field'}).text
                s1 = re.findall(r"\(\w+\,\s\w+\)", s)
                a = "".join(s1)
                author = re.sub(r"\,", "", a)
            else:
                s = soup2.find('p', attrs={'class': 'FR_field'}).text
                a = re.sub(r"\,", "", str(s))
                author = re.sub(r"作者:", "", a)  # strip the "Author:" label of the Chinese WoS interface
            #print author
            #print type(author)
            # source title (journal / proceedings); the markup varies between record pages
            if soup2.find('span', attrs={'class': 'sourceTitle'}) != None:
                s3 = soup2.find('span', attrs={'class': 'sourceTitle'}).value.text
            elif soup2.find('p', attrs={'class': 'sourceTitle'}) != None:
                s3 = soup2.find('p', attrs={'class': 'sourceTitle'}).text
            else:
                s3 = soup2.find('div', attrs={'class': 'block-record-info block-record-info-source'}).a.text
            journal = re.sub(",", " ", str(s3))
            #print journal
            #print type(journal)
            # publication year: the FR_field whose label is '出版年' ("Publication Year" on the Chinese interface)
            for s in soup2.find_all(attrs={'class': 'FR_field'}):
                if s.find(class_='FR_label') and s.find(class_='FR_label').text.find('出版年') >= 0:
                    if s.select('value') != []:
                        s1 = str(s.select('value'))
                    else:
                        s1 = str(s.get_text())
                    s2 = re.findall(r"\w*\s?\d*\s?\d+", s1)
                    publish_time = "".join(s2)
                    #print publish_time
                    #print type(publish_time)
            # times cited
            if soup2.find('a', attrs={'class': 'snowplow-times-cited-within-see-more-data-fields'}) != None:
                t = soup2.find('a', attrs={'class': 'snowplow-times-cited-within-see-more-data-fields'}).b.text
            else:
                t = soup2.find('span', attrs={'class': 'large-number'}).text
            t1 = re.sub(",", "", str(t))
            times_been_cited = t1.decode('utf-8')
            #print times_been_cited
            #print type(times_been_cited)
            # author keywords: the FR_field whose label is '作者关键词:' ("Author Keywords" on the Chinese interface)
            key_words = ""
            for s in soup2.find_all(attrs={'class': 'FR_field'}):
                if s.find(class_='FR_label') and s.find(class_='FR_label').text.find('作者关键词:') >= 0:
                    s1 = str(s.get_text())
                    key_word = re.sub(r"作者关键词:", "", s1)
                    key_words = re.sub(",", " ", str(key_word))
            #print key_words
            # cited references: follow the "view all cited references" link, if present
            cite_title = ""
            if soup2.find('a', attrs={'class': 'view-all-link snowplow-view-all-in-cited-references-page-bottom'},
                          href=True) != None:
                cite_number = soup2.find('a', attrs={'class': 'snowplow-citation-network-cited-reference-count-link'}).span.text  # number of cited references, used to work out how many pages url3 has
                cite_number = int(cite_number)
                #print cite_number
                url3 = soup2.find('a', attrs={'class': 'view-all-link snowplow-view-all-in-cited-references-page-bottom'},
                                  href=True).attrs['href']
                b = 'https://webvpn.dlut.edu.cn/http/77726476706e69737468656265737421f1e7518f69276d52710e82a297422f30a0c6fa320a29ae/'
                url3 = "".join([b, url3])
                #print url3  # page listing the cited references
                c = ""
                for i in range(1, int(cite_number/30)+2):  # cited references are paginated, 30 per page
                    #print i
                    if i != 1:
                        url3 = re.sub("page="+str(i-1), "page="+str(i), url3)
                    #print url3
                    htm3 = requests.get(url3, headers=headers, verify=True)
                    soup3 = BeautifulSoup(htm3.text, 'lxml')
                    for s in soup3.find_all('span', attrs={'class': 'reference-title'}):
                        if s.value != None:
                            c = c + s.value.text + '/'
                            cite_title = re.sub(",", " ", str(c))
                            #print cite_title
                        else:
                            continue
            else:
                cite_title = ""
            #print cite_title
            #print type(cite_title)
            print(page, doc, title)
            wf.write(",".join([title, author, journal, publish_time, times_been_cited, key_words, cite_title]).replace('\n', '') + '\n')
def check_titles():  # collect the titles already scraped so they can be skipped later
    Ts = []
    with io.open('Results/wos_data7.txt', 'r', encoding='utf-8') as rf:
        for line in rf.readlines():
            Ts.extend([line.strip().split(',')[0]])
    print 'Already scraped: {}'.format(len(Ts))
    return Ts
if __name__ == '__main__':
    Ts = check_titles()
    threads = []  # worker threads, one per result page in the range below
    all_pages = int(573642/10) + 1  # total number of result pages
    wf = io.open('Results/wos_data7.txt', 'a+', encoding='utf-8')  # create or open the txt file under the Results directory
    for i in range(601, 701):
        threads.append(threading.Thread(target=get_papers, args=(i, Ts, wf)))  # one thread per call to get_papers
        #get_papers(i,Ts,wf)
    num = 0
    while num < len(threads):
        if threading.activeCount() <= 10:  # start a new thread only while at most 10 are active
            threads[num].start()
            num += 1
        else:
            time.sleep(10)  # otherwise sleep 10 s and let running threads finish