Scraping WoS article titles, publication years, citation counts, authors, citations and other data

This scraper for WoS article titles, publication years, citation counts, authors, citations and other data was adapted from another blogger's code. After finishing it I realized that everything it collects can be obtained more conveniently by exporting records directly from WoS (which only takes a short Selenium script, sketched below), so the code is kept here as a practice reference.
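A minimal sketch of what such a Selenium export script might look like; it is not the original script, and the result URL and element locators are placeholders that have to be replaced with the ones found on the actual WoS result page:

# a minimal sketch, not the original script; the URL and locators below are placeholders
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get('https://www.webofscience.com/wos/woscc/summary/placeholder-query/relevance/1')  # placeholder result URL
time.sleep(15)  # wait for the result page to load (or log in manually in this window)

driver.find_element(By.ID, 'export-button').click()        # open the export menu (placeholder locator)
time.sleep(2)
driver.find_element(By.ID, 'export-txt-option').click()    # pick the plain-text export (placeholder locator)
time.sleep(15)  # give the download time to finish
driver.quit()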

The scraped data is written to a TXT file with one record per line; the comma-separated fields are title, authors, journal, publication year, times cited, author keywords, and the cited references joined with '/':

Learning Deep Architectures for AI,Bengio Y.,Foundations and Trends in Machine Learning,  2009,3847,,
A study of cross-validation and bootstrap for accuracy estimation and model selection,Kohavi R.,IJCAI-95. Proceedings of the Fourteenth International Joint Conference on Artificial Intelligence,  1995,4001,,
Feature selection based on mutual information: Criteria of max-dependency  max-relevance  and min-redundancy,Peng HC (Peng HC); Long FH (Long FH); Ding C (Ding C),IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE,AUG 2005,4749,feature selection; mutual information; minimal redundancy; maximal relevance; maximal dependency; classification,Distinct types of diffuse large B-cell lymphoma identified by gene expression profiling/A tutorial on Support Vector Machines for pattern recognition/BEST 2 INDEPENDENT MEASUREMENTS ARE NOT 2 BEST/Minimum redundancy feature selection from microarray gene expression data/Experiments with classifier combining rules/Registration of localization images by maximization of mutual information/A Bayesian morphometry algorithm/A comparison of methods for multiclass support vector machines/Fast ICA by a fixed-point algorithm that maximizes non-Gaussianity/Feature selection for multiclass discrimination via mixed-integer linear programming/Improved gene selection for classification of microarrays./Feature selection: Evaluation  application  and small sample performance/Statistical pattern recognition: A review/Wrappers for feature subset selection/Jensen's Inequality/Input feature selection by mutual information based on Parzen window/Selection of Relevant Features in Machine Learning/ESTIMATION OF A PROBABILITY DENSITY-FUNCTION AND MODE/Bayesian Clustering Methods for Morphological Analysis of MR Images/Structure search and stability enhancement of Bayesian networks/FLOATING SEARCH METHODS IN FEATURE-SELECTION/Systematic variation in gene expression patterns in human cancer cell lines/A gene expression database for the molecular pharmacology of cancer/Controlling the Generalization Ability of Learning Processes/Biomarker identification by feature wrappers/……
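If the file needs further processing, each line can be split back into these seven fields. A minimal sketch, assuming the file was produced by the script below (the scraper has already replaced commas inside fields with spaces):

import io

FIELDS = ['title', 'authors', 'journal', 'year', 'times_cited', 'keywords', 'references']

def parse_wos_txt(path):
    records = []
    with io.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split(',')
            if len(parts) < len(FIELDS):   # skip malformed lines
                continue
            rec = dict(zip(FIELDS, parts[:len(FIELDS)]))
            # cited references were joined with '/' by the scraper
            rec['references'] = [r for r in rec['references'].split('/') if r]
            records.append(rec)
    return records

# example: records = parse_wos_txt('Results/wos_data7.txt')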

The qid, SID, Cookie and url values used in the code have to be taken from the result pages you are actually scraping (address bar and browser developer tools).
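For example, qid and SID appear as query parameters of the result-page URL, so they can be read off a URL copied from the browser. A small sketch (the URL here is only a placeholder):

try:
    from urllib.parse import urlparse, parse_qs   # Python 3
except ImportError:
    from urlparse import urlparse, parse_qs       # Python 2

result_url = 'https://webvpn.example.edu/summary.do?product=UA&qid=1&SID=5C3zRPgr9fF4oEgLThG&search_mode=AdvancedSearch'  # placeholder
query = parse_qs(urlparse(result_url).query)
print(query['qid'][0], query['SID'][0])

The full scraping script follows.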

#encoding:utf-8
from bs4 import BeautifulSoup
import time
import requests
import threading
import io
import re
import sys
reload(sys)                      # Python 2 only: re-expose setdefaultencoding
sys.setdefaultencoding('utf8')
requests.DEFAULT_RETRIES = 5     # note: requests does not read this attribute; real retries need an HTTPAdapter(max_retries=...)

# Ts: titles of papers already scraped  wf: file handle the results are written to
def get_papers(page, Ts, wf):
    docs = range(1,11)
    params = {'product': 'UA',
              'search_mode': 'AdvancedSearch',
              'qid': '1',
              'SID': '5C3zRPgr9fF4oEgLThG',
              'page': page}
    headers = {'Connection': 'keep-alive',
               'Cache-Control': 'max-age=0',
               'Upgrade-Insecure-Requests': '1',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 XXXX',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
               'Sec-Fetch-Site': 'same-origin',
               'Sec-Fetch-Mode': 'navigate',
               'Sec-Fetch-User': '?1',
               'Sec-Fetch-Dest': 'document',
               'Accept-Language': 'zh-CN,zh;q=0.9',
               'Cookie': 'show_vpn=0; wengine_vpn_ticket=c6b8e875d7a181aa; refresh=0'}

    url = 'https://webvpn.dlut.edu.cn/http/77726476706e69737468656265737421f1e7518f69276d52710e82a297422f30a0c6fa320a29ae/summary.do?product=UA&doc=1&qid=1&SID=6BNBhXDfAMTcyuJurHC&search_mode=AdvancedSearch&update_back2search_link_param=yes'
    htm1 = requests.get(url, headers=headers, params=params, verify=True)
    soup = BeautifulSoup(htm1.text, 'lxml')
    for doc in docs:
        # relative path of this document's full-record page; reused below to build url2
        record_path = ('/http/77726476706e69737468656265737421f1e7518f69276d52710e82a297422f30a0c6fa320a29ae/full_record.do?'
                       + 'product=' + params['product'] + '&search_mode=' + params['search_mode']
                       + '&qid=' + params['qid'] + '&SID=' + params['SID']
                       + '&page=' + str(page) + '&doc=' + str(doc + 10 * (page - 1)))
        t = soup.find('a', attrs={'class': 'smallV110 snowplow-full-record',
                                  'href': record_path}).value.text  # if this step raises an error, refresh the cookie
        title = re.sub(",", " ", str(t))
        print title
        if title not in Ts:  # skip articles that have already been scraped
            # title, authors, journal, publication year, times cited, keywords, cited references
            url2 = 'https://webvpn.dlut.edu.cn' + record_path
            try:
                htm2 = requests.get(url2, headers=headers, params=params, verify=True,timeout=30)
            except:
                print('timeout,page:',page,'doc:',doc)
                continue
            soup2 = BeautifulSoup(htm2.text, 'lxml')
            if soup2.find('p', attrs={'class': 'FR_field'}).sup!=None:
                s = soup2.find('p', attrs={'class': 'FR_field'}).text
                s1 = re.findall(r"\(\w+\,\s\w+\)", s)
                a = "".join(s1)
                author = re.sub(r"\,", "", a)
            else:
                s = soup2.find('p', attrs={'class': 'FR_field'}).text
                a = re.sub(r"\,", "", str(s))
                author = re.sub(r"作者:", "", a)
            #print author
            #print type(author)
            if soup2.find('span', attrs={'class': 'sourceTitle'})!=None:
                s3 = soup2.find('span', attrs={'class': 'sourceTitle'}).value.text
            elif soup2.find('p', attrs={'class': 'sourceTitle'})!=None:
                s3 = soup2.find('p', attrs={'class': 'sourceTitle'}).text
            else:
                s3 = soup2.find('div', attrs={'class': 'block-record-info block-record-info-source'}).a.text
            journal = re.sub(",", " ", str(s3))
            #print journal
            #print type(journal)
            publish_time = ""   # fallback in case no publication-year field is found
            for s in soup2.find_all(attrs={'class': 'FR_field'}):
                if s.find(class_='FR_label') and s.find(class_='FR_label').text.find('出版年') >= 0:
                    if s.select('value')!=[]:
                        s1 = str(s.select('value'))
                    else:
                        s1 = str(s.get_text())
                    s2 = re.findall(r"\w*\s?\d*\s?\d+", s1)
                    publish_time = "".join(s2)
                    #print publish_time
                    #print type(publish_time)
            if soup2.find('a', attrs={'class': 'snowplow-times-cited-within-see-more-data-fields'})!=None:
                t = soup2.find('a', attrs={'class': 'snowplow-times-cited-within-see-more-data-fields'}).b.text
            else:
                t = soup2.find('span', attrs={'class': 'large-number'}).text
            t1=re.sub(",", "", str(t))
            times_been_cited=t1.decode('utf-8')
            #print times_been_cited
            #print type(times_been_cited)
            key_words = ""
            for s in soup2.find_all(attrs={'class': 'FR_field'}):
                if s.find(class_='FR_label') and s.find(class_='FR_label').text.find('作者关键词:') >= 0:
                    s1 = str(s.get_text())
                    key_word = re.sub(r"作者关键词:", "", s1)
                    key_words = re.sub(",", " ", str(key_word))
            #print key_words
            cite_title = ""
            if soup2.find('a', attrs={'class': 'view-all-link snowplow-view-all-in-cited-references-page-bottom'},
                              href=True)!=None:
                cite_number = soup2.find('a', attrs={'class': 'snowplow-citation-network-cited-reference-count-link'}).span.text  # number of cited references, used to work out how many pages url3 spans
                cite_number = int(cite_number)
                #print cite_number
                url3 = soup2.find('a', attrs={'class': 'view-all-link snowplow-view-all-in-cited-references-page-bottom'},
                              href=True).attrs['href']
                b = 'https://webvpn.dlut.edu.cn/http/77726476706e69737468656265737421f1e7518f69276d52710e82a297422f30a0c6fa320a29ae/'
                url3 = "".join([b, url3])
                #print url3   # page listing the cited references
                c = ""
                for i in range(1,int(cite_number/30)+2):
                    #print i
                    if i!=1:
                        url3=re.sub("page="+str(i-1),"page="+str(i),url3)
                    #print url3
                    htm3 = requests.get(url3, headers=headers,verify=True)
                    soup3 = BeautifulSoup(htm3.text, 'lxml')
                    for s in soup3.find_all('span', attrs={'class': 'reference-title'}):
                        if s.value!=None:
                            c = c + s.value.text + '/'
                            cite_title = re.sub(",", " ", str(c))
                            #print cite_title
                        else:
                            continue
            else:
                cite_title=""
            #print cite_title
            #print type(cite_title)
            print(page, doc, title)
            wf.write(",".join([title, author, journal, publish_time, times_been_cited, key_words, cite_title]).replace('\n','') + '\n')

def check_titles():    # collect the titles already scraped so they can be skipped
    Ts = []
    with io.open('Results/wos_data7.txt', 'r', encoding='utf-8') as rf:
        for line in rf.readlines():
            Ts.extend([line.strip().split(',')[0]])
    print 'Already scraped: {}'.format(len(Ts))
    return Ts

if __name__ == '__main__':
    Ts = check_titles()
    threads = []    # thread list: one thread is created per page in the range below
    all_pages = int(573642/10) + 1   # total number of result pages (the range below only covers part of them)
    wf = io.open('Results/wos_data7.txt','a+', encoding='utf-8')  # create or append to the txt file under Results/
    for i in range(601,701):
        threads.append(threading.Thread(target=get_papers, args=(i,Ts,wf)))   # one get_papers thread per page
        #get_papers(i,Ts,wf)
    num = 0
    while num < len(threads):
        if threading.activeCount() <= 10:   # run at most ~10 threads at a time
            threads[num].start()
            num += 1
        else:
            time.sleep(10)    # otherwise sleep 10 seconds and let the running threads make progress
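One caveat about the threaded version: all worker threads write to the same file handle wf without synchronization, and the main thread never joins them or closes the file. A minimal sketch of how the write could be guarded with a lock and the threads waited on; the lock and the join loop are additions, not part of the original script:

import threading

write_lock = threading.Lock()

def safe_write(wf, line):
    # serialize writes from the worker threads to the shared file handle
    with write_lock:
        wf.write(line)

# in get_papers, the final wf.write(...) call would then become:
#     safe_write(wf, ",".join([title, author, journal, publish_time,
#                              times_been_cited, key_words, cite_title]).replace('\n', '') + '\n')

# and at the end of the main block, after the while loop:
#     for t in threads:
#         t.join()
#     wf.close()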