Python零基础投喂(综合练习:1.论文数据统计)
程序员文章站
2022-07-11 09:46:53
...
1.导⼊package并读取原始数据
# 导⼊所需的package
import seaborn as sns #⽤于画图
from bs4 import BeautifulSoup #⽤于爬取arxiv的数据
import re #⽤于正则表达式,匹配字符串的模式
import requests #⽤于⽹络连接,发送⽹络请求,使⽤域名获取对应信息
import json #读取数据,我们的数据为json格式的
import pandas as pd #数据处理,数据分析
import matplotlib.pyplot as plt #画图⼯具
# 读⼊数据
# The with statement closes the file handle automatically, even if reading or parsing raises.
# encoding="utf-8" is passed explicitly so decoding does not depend on the platform's
# default locale encoding.
with open("E:\\DW学习\\Python 学习代码\\arxiv-metadata-oai-2019.json", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]  # one JSON record per line
data = pd.DataFrame(data)  # convert the list of dicts to a DataFrame for analysis with pandas
data.shape #显示数据大小
(170618, 14)
data.head()
id | submitter | authors | title | comments | journal-ref | doi | report-no | categories | license | abstract | versions | update_date | authors_parsed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0704.0297 | Sung-Chul Yoon | Sung-Chul Yoon, Philipp Podsiadlowski and Step... | Remnant evolution after a carbon-oxygen white ... | 15 pages, 15 figures, 3 tables, submitted to M... | None | 10.1111/j.1365-2966.2007.12161.x | None | astro-ph | None | We systematically explore the evolution of t... | [{'version': 'v1', 'created': 'Tue, 3 Apr 2007... | 2019-08-19 | [[Yoon, Sung-Chul, ], [Podsiadlowski, Philipp,... |
1 | 0704.0342 | Patrice Ntumba Pungu | B. Dugmore and PP. Ntumba | Cofibrations in the Category of Frolicher Spac... | 27 pages | None | None | None | math.AT | None | Cofibrations are defined in the category of ... | [{'version': 'v1', 'created': 'Tue, 3 Apr 2007... | 2019-08-19 | [[Dugmore, B., ], [Ntumba, PP., ]] |
2 | 0704.0360 | Zaqarashvili | T.V. Zaqarashvili and K Murawski | Torsional oscillations of longitudinally inhom... | 6 pages, 3 figures, accepted in A&A | None | 10.1051/0004-6361:20077246 | None | astro-ph | None | We explore the effect of an inhomogeneous ma... | [{'version': 'v1', 'created': 'Tue, 3 Apr 2007... | 2019-08-19 | [[Zaqarashvili, T. V., ], [Murawski, K, ]] |
3 | 0704.0525 | Sezgin Ayg\"un | Sezgin Aygun, Ismail Tarhan, Husnu Baysal | On the Energy-Momentum Problem in Static Einst... | This submission has been withdrawn by arXiv ad... | Chin.Phys.Lett.24:355-358,2007 | 10.1088/0256-307X/24/2/015 | None | gr-qc | None | This paper has been removed by arXiv adminis... | [{'version': 'v1', 'created': 'Wed, 4 Apr 2007... | 2019-10-21 | [[Aygun, Sezgin, ], [Tarhan, Ismail, ], [Baysa... |
4 | 0704.0535 | Antonio Pipino | Antonio Pipino (1,3), Thomas H. Puzia (2,4), a... | The Formation of Globular Cluster Systems in M... | 32 pages (referee format), 9 figures, ApJ acce... | Astrophys.J.665:295-305,2007 | 10.1086/519546 | None | astro-ph | None | The most massive elliptical galaxies show a ... | [{'version': 'v1', 'created': 'Wed, 4 Apr 2007... | 2019-08-19 | [[Pipino, Antonio, ], [Puzia, Thomas H., ], [M... |
2.数据预处理
'''
count:⼀列数据的元素个数;
unique:⼀列数据中元素的种类;
top:⼀列数据中出现频率最⾼的元素;
freq:⼀列数据中出现频率最⾼的元素的个数;
'''
data["categories"].describe()
count 170618
unique 15592
top cs.CV
freq 5559
Name: categories, dtype: object
# 所有的种类(独⽴的)
# Each "categories" value is a space-separated list of category codes: split every value
# and collect the individual codes into a set so that duplicates are dropped.
unique_categories = {
    category
    for category_list in data["categories"]
    for category in category_list.split(' ')
}
len(unique_categories)
172
unique_categories
'''这⾥使⽤了 split 函数将多类别使⽤ “ ”(空格)分开,组成list,并使⽤ for 循环将独⽴出现的类别找出
来,并使⽤ set 类别,将重复项去除得到最终所有的独⽴paper种类。'''
'这⾥使⽤了 split 函数将多类别使⽤ “ ”(空格)分开,组成list,并使⽤ for 循环将独⽴出现的类别找出\n来,并使⽤ set 类别,将重复项去除得到最终所有的独⽴paper种类。'
# Parse the update_date strings (e.g. "2019-02-20") as datetimes and store the
# extracted year in a new "year" column.
data["year"]=pd.to_datetime(data["update_date"]).dt.year
data["year"]
0 2019
1 2019
2 2019
3 2019
4 2019
...
170613 2019
170614 2019
170615 2019
170616 2019
170617 2019
Name: year, Length: 170618, dtype: int64
# Optional clean-up steps, intentionally left disabled:
#   del data["update_date"]            # drop update_date once "year" has been derived from it
#   data = data[data["year"] >= 2019]  # keep only the rows from 2019 onwards
data.reset_index(drop=True, inplace=True)  # renumber the rows, discarding the old index
data #查看结果
id | submitter | authors | title | comments | journal-ref | doi | report-no | categories | license | abstract | versions | update_date | authors_parsed | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0704.0297 | Sung-Chul Yoon | Sung-Chul Yoon, Philipp Podsiadlowski and Step... | Remnant evolution after a carbon-oxygen white ... | 15 pages, 15 figures, 3 tables, submitted to M... | None | 10.1111/j.1365-2966.2007.12161.x | None | astro-ph | None | We systematically explore the evolution of t... | [{'version': 'v1', 'created': 'Tue, 3 Apr 2007... | 2019-08-19 | [[Yoon, Sung-Chul, ], [Podsiadlowski, Philipp,... | 2019 |
1 | 0704.0342 | Patrice Ntumba Pungu | B. Dugmore and PP. Ntumba | Cofibrations in the Category of Frolicher Spac... | 27 pages | None | None | None | math.AT | None | Cofibrations are defined in the category of ... | [{'version': 'v1', 'created': 'Tue, 3 Apr 2007... | 2019-08-19 | [[Dugmore, B., ], [Ntumba, PP., ]] | 2019 |
2 | 0704.0360 | Zaqarashvili | T.V. Zaqarashvili and K Murawski | Torsional oscillations of longitudinally inhom... | 6 pages, 3 figures, accepted in A&A | None | 10.1051/0004-6361:20077246 | None | astro-ph | None | We explore the effect of an inhomogeneous ma... | [{'version': 'v1', 'created': 'Tue, 3 Apr 2007... | 2019-08-19 | [[Zaqarashvili, T. V., ], [Murawski, K, ]] | 2019 |
3 | 0704.0525 | Sezgin Ayg\"un | Sezgin Aygun, Ismail Tarhan, Husnu Baysal | On the Energy-Momentum Problem in Static Einst... | This submission has been withdrawn by arXiv ad... | Chin.Phys.Lett.24:355-358,2007 | 10.1088/0256-307X/24/2/015 | None | gr-qc | None | This paper has been removed by arXiv adminis... | [{'version': 'v1', 'created': 'Wed, 4 Apr 2007... | 2019-10-21 | [[Aygun, Sezgin, ], [Tarhan, Ismail, ], [Baysa... | 2019 |
4 | 0704.0535 | Antonio Pipino | Antonio Pipino (1,3), Thomas H. Puzia (2,4), a... | The Formation of Globular Cluster Systems in M... | 32 pages (referee format), 9 figures, ApJ acce... | Astrophys.J.665:295-305,2007 | 10.1086/519546 | None | astro-ph | None | The most massive elliptical galaxies show a ... | [{'version': 'v1', 'created': 'Wed, 4 Apr 2007... | 2019-08-19 | [[Pipino, Antonio, ], [Puzia, Thomas H., ], [M... | 2019 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
170613 | quant-ph/9904032 | Mikhail Lukin | V. A. Sautenkov, M. D. Lukin, C. J. Bednar, G.... | Enhancement of Magneto-Optic Effects via Large... | None | None | 10.1103/PhysRevA.62.023810 | None | quant-ph | None | We utilize the generation of large atomic co... | [{'version': 'v1', 'created': 'Thu, 8 Apr 1999... | 2019-08-17 | [[Sautenkov, V. A., ], [Lukin, M. D., ], [Bedn... | 2019 |
170614 | solv-int/9511005 | Wen-Xiu Ma | Wen-Xiu Ma, Benno Fuchssteiner | Explicit and Exact Solutions to a Kolmogorov-P... | 14pages, Latex, to appear in Intern. J. Nonlin... | None | 10.1016/0020-7462(95)00064-X | None | solv-int nlin.SI | None | Some explicit traveling wave solutions to a ... | [{'version': 'v1', 'created': 'Tue, 14 Nov 199... | 2019-08-15 | [[Ma, Wen-Xiu, ], [Fuchssteiner, Benno, ]] | 2019 |
170615 | solv-int/9809008 | Victor Enolskii | J C Eilbeck, V Z Enol'skii, V B Kuznetsov, D V... | Linear r-Matrix Algebra for a Hierarchy of One... | plain LaTeX, 28 pages | None | None | None | solv-int nlin.SI | None | We consider a hierarchy of many-particle sys... | [{'version': 'v1', 'created': 'Wed, 2 Sep 1998... | 2019-08-17 | [[Eilbeck, J C, ], [Enol'skii, V Z, ], [Kuznet... | 2019 |
170616 | solv-int/9909010 | Pierre van Moerbeke | M. Adler, T. Shiota and P. van Moerbeke | Pfaff tau-functions | 42 pages | None | None | None | solv-int adap-org hep-th nlin.AO nlin.SI | None | Consider the evolution $$ \frac{\pl m_\iy}{\... | [{'version': 'v1', 'created': 'Wed, 15 Sep 199... | 2019-08-17 | [[Adler, M., ], [Shiota, T., ], [van Moerbeke,... | 2019 |
170617 | solv-int/9909014 | David Fairlie | D.B. Fairlie and A.N. Leznov | The General Solution of the Complex Monge-Amp\... | 13 pages, latex, no figures | None | 10.1088/0305-4470/33/25/307 | None | solv-int nlin.SI | None | A general solution to the Complex Monge-Amp\... | [{'version': 'v1', 'created': 'Thu, 16 Sep 199... | 2019-08-21 | [[Fairlie, D. B., ], [Leznov, A. N., ]] | 2019 |
170618 rows × 15 columns
# Keep only the rows whose year is 2019 or later. reset_index() is called without
# drop=True, so the previous row index is kept as an extra "index" column.
data_2019 = data[data['year']>=2019].reset_index()
data_2019.head()
index | id | submitter | authors | title | comments | journal-ref | doi | report-no | categories | license | abstract | versions | update_date | authors_parsed | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0704.0297 | Sung-Chul Yoon | Sung-Chul Yoon, Philipp Podsiadlowski and Step... | Remnant evolution after a carbon-oxygen white ... | 15 pages, 15 figures, 3 tables, submitted to M... | None | 10.1111/j.1365-2966.2007.12161.x | None | astro-ph | None | We systematically explore the evolution of t... | [{'version': 'v1', 'created': 'Tue, 3 Apr 2007... | 2019-08-19 | [[Yoon, Sung-Chul, ], [Podsiadlowski, Philipp,... | 2019 |
1 | 1 | 0704.0342 | Patrice Ntumba Pungu | B. Dugmore and PP. Ntumba | Cofibrations in the Category of Frolicher Spac... | 27 pages | None | None | None | math.AT | None | Cofibrations are defined in the category of ... | [{'version': 'v1', 'created': 'Tue, 3 Apr 2007... | 2019-08-19 | [[Dugmore, B., ], [Ntumba, PP., ]] | 2019 |
2 | 2 | 0704.0360 | Zaqarashvili | T.V. Zaqarashvili and K Murawski | Torsional oscillations of longitudinally inhom... | 6 pages, 3 figures, accepted in A&A | None | 10.1051/0004-6361:20077246 | None | astro-ph | None | We explore the effect of an inhomogeneous ma... | [{'version': 'v1', 'created': 'Tue, 3 Apr 2007... | 2019-08-19 | [[Zaqarashvili, T. V., ], [Murawski, K, ]] | 2019 |
3 | 3 | 0704.0525 | Sezgin Ayg\"un | Sezgin Aygun, Ismail Tarhan, Husnu Baysal | On the Energy-Momentum Problem in Static Einst... | This submission has been withdrawn by arXiv ad... | Chin.Phys.Lett.24:355-358,2007 | 10.1088/0256-307X/24/2/015 | None | gr-qc | None | This paper has been removed by arXiv adminis... | [{'version': 'v1', 'created': 'Wed, 4 Apr 2007... | 2019-10-21 | [[Aygun, Sezgin, ], [Tarhan, Ismail, ], [Baysa... | 2019 |
4 | 4 | 0704.0535 | Antonio Pipino | Antonio Pipino (1,3), Thomas H. Puzia (2,4), a... | The Formation of Globular Cluster Systems in M... | 32 pages (referee format), 9 figures, ApJ acce... | Astrophys.J.665:295-305,2007 | 10.1086/519546 | None | astro-ph | None | The most massive elliptical galaxies show a ... | [{'version': 'v1', 'created': 'Wed, 4 Apr 2007... | 2019-08-19 | [[Pipino, Antonio, ], [Puzia, Thomas H., ], [M... | 2019 |
# Select the papers in the computer-science area: scrape the arXiv category taxonomy page.
# The timeout keeps the request from hanging indefinitely, and raise_for_status() stops
# on an HTTP error instead of parsing an error page.
response = requests.get('https://arxiv.org/category_taxonomy', timeout=30)
response.raise_for_status()
website_url = response.text  # despite its name, this holds the page's HTML text
soup = BeautifulSoup(website_url, 'lxml')  # parse the HTML with the lxml parser
# Entry point: the <div> that contains the whole taxonomy list.
root = soup.find('div', {'id': 'category_taxonomy_list'})
if root is None:
    raise RuntimeError("category_taxonomy_list element not found in the arXiv taxonomy page")
# Collect every heading (h2/h3/h4) and description paragraph (p) below the entry point.
tags = root.find_all(['h2', 'h3', 'h4', 'p'], recursive=True)
# Current-heading state, updated as the tags are walked.
level_1_name = ""
level_2_name = ""
level_2_code = ""
# The level-3 state is initialised as well, so that a <p> appearing before any <h4>
# cannot raise NameError.
level_3_name = ""
level_3_code = ""
# Output columns: one entry is appended to each list per <p> tag.
level_1_names = []
level_2_codes = []
level_2_names = []
level_3_codes = []
level_3_names = []
level_3_notes = []

for t in tags:
    # t.name is the tag name ("h2", "h3", ...); t.text is the tag's text without markup.
    if t.name == "h2":
        # e.g. <h2 class="accordion-head">Mathematics</h2> -> "Mathematics".
        # A new group also resets the archive name and code to the group name.
        level_1_name = t.text
        level_2_code = t.text
        level_2_name = t.text
    elif t.name == "h3":
        # e.g. <h3>Quantum Physics<br/><span>(quant-ph)</span></h3> -> "Quantum Physics(quant-ph)"
        raw = t.text
        # Pattern (.*)\((.*)\): group 1 is the text before the parenthesised part,
        # group 2 is the text inside the parentheses.
        level_2_code = re.sub(r"(.*)\((.*)\)", r"\2", raw)
        level_2_name = re.sub(r"(.*)\((.*)\)", r"\1", raw)
    elif t.name == "h4":
        # e.g. <h4>stat.TH <span>(Statistics Theory)</span></h4> -> "stat.TH (Statistics Theory)"
        raw = t.text
        level_3_code = re.sub(r"(.*) \((.*)\)", r"\1", raw)
        level_3_name = re.sub(r"(.*) \((.*)\)", r"\2", raw)
    elif t.name == "p":
        # e.g. <p>stat.TH is an alias for math.ST. Asymptotics, Bayesian Inference, ...</p>
        # A description paragraph completes one row: record it together with the
        # heading state captured by the h2/h3/h4 branches above.
        notes = t.text
        level_1_names.append(level_1_name)
        level_2_names.append(level_2_name)
        level_2_codes.append(level_2_code)
        level_3_names.append(level_3_name)
        level_3_codes.append(level_3_code)
        level_3_notes.append(notes)

# Assemble the collected columns into the taxonomy table.
df_taxonomy = pd.DataFrame({
    'group_name': level_1_names,
    'archive_name': level_2_names,
    'archive_id': level_2_codes,
    'category_name': level_3_names,
    'categories': level_3_codes,
    'category_description': level_3_notes,
})
# Group by "group_name" and "archive_name".
# NOTE(review): the GroupBy result is not assigned or used, so this statement leaves
# df_taxonomy unchanged and applies no sorting.
df_taxonomy.groupby(["group_name","archive_name"])
df_taxonomy
group_name | archive_name | archive_id | category_name | categories | category_description | |
---|---|---|---|---|---|---|
0 | Computer Science | Computer Science | Computer Science | Artificial Intelligence | cs.AI | Covers all areas of AI except Vision, Robotics... |
1 | Computer Science | Computer Science | Computer Science | Hardware Architecture | cs.AR | Covers systems organization and hardware archi... |
2 | Computer Science | Computer Science | Computer Science | Computational Complexity | cs.CC | Covers models of computation, complexity class... |
3 | Computer Science | Computer Science | Computer Science | Computational Engineering, Finance, and Science | cs.CE | Covers applications of computer science to the... |
4 | Computer Science | Computer Science | Computer Science | Computational Geometry | cs.CG | Roughly includes material in ACM Subject Class... |
... | ... | ... | ... | ... | ... | ... |
150 | Statistics | Statistics | Statistics | Computation | stat.CO | Algorithms, Simulation, Visualization |
151 | Statistics | Statistics | Statistics | Methodology | stat.ME | Design, Surveys, Model Selection, Multiple Tes... |
152 | Statistics | Statistics | Statistics | Machine Learning | stat.ML | Covers machine learning papers (supervised, un... |
153 | Statistics | Statistics | Statistics | Other Statistics | stat.OT | Work in statistics that does not fit into the ... |
154 | Statistics | Statistics | Statistics | Statistics Theory | stat.TH | stat.TH is an alias for math.ST. Asymptotics, ... |
155 rows × 6 columns
import re

# Sample input: a phone number followed by a trailing "#" comment.
phone = "2004-959-559 # 假设一个电话号码"
# Remove everything from the "#" to the end of the string.
num = re.sub(pattern=r'#.*$', repl="", string=phone)
print ("电话号码 : ", num)
电话号码 : 2004-959-559
# 移除⾮数字的内容
# \D matches any non-digit character; replacing each match with "" keeps only the digits.
num = re.sub(pattern=r'\D', repl="", string=phone)
print ("电话号码 : ", num)
电话号码 : 2004959559
3.数据分析及可视化
# Count the papers per top-level group:
#   1. left-join every paper with the taxonomy on its "categories" value;
#   2. keep one row per (paper id, group) pair;
#   3. count the ids in each group and list the largest groups first.
_df = (
    data.merge(df_taxonomy, on="categories", how="left")
    .drop_duplicates(["id", "group_name"])
    .groupby("group_name")
    .agg({"id": "count"})
    .sort_values(by="id", ascending=False)
    .reset_index()
)
_df
group_name | id | |
---|---|---|
0 | Physics | 38379 |
1 | Mathematics | 24495 |
2 | Computer Science | 18087 |
3 | Statistics | 1802 |
4 | Electrical Engineering and Systems Science | 1371 |
5 | Quantitative Biology | 886 |
6 | Quantitative Finance | 352 |
7 | Economics | 173 |
fig = plt.figure(figsize=(15, 12))
# explode: distance of each wedge from the centre. The three leading wedges are pulled
# out and the others stay in place. The tuple is sized from _df so that it always has
# one entry per wedge, whatever the number of groups.
leading_offsets = (0.3, 0.2, 0.1)
explode = (leading_offsets + (0,) * len(_df))[:len(_df)]
# autopct: format of the percentage drawn on each wedge; a format string or a function
#          can be used ('%1.2f%%' prints two decimal places followed by a percent sign).
# startangle: angle at which drawing starts; by default the wedges are drawn
#          counter-clockwise from the positive x-axis, and 90 would start from the
#          positive y-axis.
plt.pie(_df["id"], labels=_df["group_name"], autopct='%1.2f%%',
        startangle=160, explode=explode)
plt.tight_layout()  # automatically adjust the subplot parameters so the plot fills the figure area
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-i1fqKZQY-1610551534663)(output_20_0.png)]
group_name = "Computer Science"
# Join the papers with the taxonomy on "categories" and keep only the rows of the chosen
# group. Inside the query string, "@group_name" refers to the Python variable above,
# while the bare "group_name" is the DataFrame column.
cats = data.merge(df_taxonomy, on="categories").query("group_name == @group_name")
# Count the rows per (year, category_name) and pivot so that each category is a row,
# each year a column, and the cell holds the count taken from the "id" column.
cats.groupby(["year", "category_name"]).count().reset_index().pivot(
    index="category_name", columns="year", values="id"
)
year | 2019 |
---|---|
category_name | |
Artificial Intelligence | 558 |
Computation and Language | 2153 |
Computational Complexity | 131 |
Computational Engineering, Finance, and Science | 108 |
Computational Geometry | 199 |
Computer Science and Game Theory | 281 |
Computer Vision and Pattern Recognition | 5559 |
Computers and Society | 346 |
Cryptography and Security | 1067 |
Data Structures and Algorithms | 711 |
Databases | 282 |
Digital Libraries | 125 |
Discrete Mathematics | 84 |
Distributed, Parallel, and Cluster Computing | 715 |
Emerging Technologies | 101 |
Formal Languages and Automata Theory | 152 |
General Literature | 5 |
Graphics | 116 |
Hardware Architecture | 95 |
Human-Computer Interaction | 420 |
Information Retrieval | 245 |
Logic in Computer Science | 470 |
Machine Learning | 177 |
Mathematical Software | 27 |
Multiagent Systems | 85 |
Multimedia | 76 |
Networking and Internet Architecture | 864 |
Neural and Evolutionary Computing | 235 |
Numerical Analysis | 40 |
Operating Systems | 36 |
Other Computer Science | 67 |
Performance | 45 |
Programming Languages | 268 |
Robotics | 917 |
Social and Information Networks | 202 |
Software Engineering | 659 |
Sound | 7 |
Symbolic Computation | 44 |
Systems and Control | 415 |
上一篇: [Python]自学笔记34:论一只爬虫的自我修养1
下一篇: 安装docker的心路历程