第4.3章 用requests爬取小学3000词语
程序员文章站
2022-01-20 23:14:55
...
爬虫并不一定要用 Scrapy 框架，下面介绍的这个例子就是直接通过 requests 获取页面内容的，代码如下：
生成田字格的代码请参考第4.1章《给小朋友写的飞鸟集打乱后组词的爬虫》。
import requests
import os
import re
from pyquery import PyQuery as pq
from word_deal.primary_spelling import to_doc,duplicate_removal
# Output directory path. Not referenced by the code visible in this file
# (presumably consumed elsewhere or kept for manual runs -- TODO confirm).
OUT_PATH = 'G:\\dzmfile\\pythonwork\\small_routine\\others\\out\\'
def gen_yuwen_txt(xx_name):
    """Download the word-list article and save its numbered paragraphs.

    Fetches the article page, collects the text of every node matching
    ``#artibody p>font``, keeps only the paragraphs that contain at least
    one digit, removes the substring '识字' from each, and writes them to
    *xx_name* (UTF-8, one paragraph per line).

    Args:
        xx_name: Path of the text file to create or overwrite.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
        OSError: If *xx_name* cannot be opened for writing.
    """
    # A timeout keeps the script from hanging forever on a dead connection;
    # raise_for_status() stops an error page from being parsed as the article.
    r = requests.get(
        'http://k.sina.com.cn/article_6429307123_17f3770f30010033nv.html?from=baby',
        timeout=30,
    )
    r.raise_for_status()
    soup = pq(r.content)
    paras = [pq(node).text() for node in soup('#artibody p>font')]
    # Keep only paragraphs containing a digit: per the original author's
    # note, in this article only those carry real content.
    paras = [para for para in paras if re.search(r'\d', para)]
    with open(xx_name, 'w', encoding='utf-8') as file:
        for para in paras:
            # Original author's note: some typos in the article need
            # correcting -- the stray '识字' is dropped here.
            file.write(para.replace('识字', '') + '\n')
def get_lines(xx_name):
    """Return every line of the UTF-8 text file *xx_name* as a list.

    Line terminators are kept, exactly as ``file.readlines()`` yields them.

    Args:
        xx_name: Path of the text file to read.

    Returns:
        list[str]: The file's lines, in order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even if reading raises.
    with open(xx_name, encoding='utf-8') as file:
        return file.readlines()
def gen_by_nianji(xx_name):
    """Split the saved word list by school term and export each term.

    Reads *xx_name* with ``get_lines``, locates the heading line of every
    term (grades one to six, first and second volume), extracts the words
    between consecutive headings with ``get_words`` and hands each group
    to ``to_pinyin`` together with the term label.

    Args:
        xx_name: Path of the text file previously written by
            ``gen_yuwen_txt``.

    Raises:
        ValueError: If one of the expected heading lines is not present
            in the file.
    """
    # (heading line exactly as it appears in the text file, output label).
    # The headings are matched verbatim, including their inconsistent
    # punctuation, so they must not be normalised.
    sections = (
        ('一年级上册生字: 100个\n', '一年级上册'),
        ('一年级下册生字:250个\n', '一年级下册'),
        ('二年级上册生字:350个\n', '二年级上册'),
        ('二年级下册生字:300个\n', '二年级下册'),
        ('三年级上册生字300个\n', '三年级上册'),
        ('三年级下册生字300个\n', '三年级下册'),
        ('四年级上册生字200个\n', '四年级上册'),
        ('四年级下册生字200个\n', '四年级下册'),
        ('五年级上册生字150个\n', '五年级上册'),
        ('五年级下册生字150个\n', '五年级下册'),
        ('六年级上册生字80个\n', '六年级上册'),
        ('六年级下册生字80个\n', '六年级下册'),
    )
    lines = get_lines(xx_name)

    # Resolve every heading first so a malformed file fails before any
    # document is produced.
    starts = []
    for heading, _ in sections:
        try:
            starts.append(lines.index(heading))
        except ValueError:
            raise ValueError(
                'heading {!r} not found in {!r}'.format(heading, xx_name)
            ) from None

    # Each section runs up to the next heading; the last one runs to the
    # end of the file (slice end of None).
    stops = starts[1:] + [None]
    word_groups = [
        get_words(lines[begin:end]) for begin, end in zip(starts, stops)
    ]
    for (_, label), words in zip(sections, word_groups):
        to_pinyin(words, label)
def get_words(lines):
    """Extract the Chinese text from the lines that start with a digit.

    Lines not beginning with a digit are skipped. For every remaining
    line, all runs of characters in the range U+4E00..U+9FA5 are collected
    and the resulting list is converted with ``str()``.

    Args:
        lines: Iterable of text lines.

    Returns:
        list[str]: One entry per digit-led line; each entry is the
        ``str()`` of that line's list of Chinese runs, e.g.
        ``"['爸爸', '妈妈']"``.
    """
    starts_with_digit = re.compile(r'^\d')
    chinese_run = re.compile(r'[\u4e00-\u9fa5]+')
    # NOTE(review): each entry is the repr of a list, not the words
    # themselves; kept as-is because the downstream helper that consumes
    # it is not visible here -- confirm this is the format it expects.
    return [
        str(chinese_run.findall(line))
        for line in lines
        if starts_with_digit.match(line)
    ]
def to_pinyin(paragraphs, file_name):
    """Export *paragraphs* to a ``.docx`` file named after *file_name*.

    Passes *paragraphs* through the imported ``duplicate_removal`` helper,
    then gives the result (as a list) and ``file_name + '.docx'`` to the
    imported ``to_doc`` helper.

    Args:
        paragraphs: Word entries, as produced by ``get_words``.
        file_name: Output name without extension; '.docx' is appended.
    """
    # Presumably removes repeated entries (going by the helper's name;
    # its implementation lives in word_deal.primary_spelling) -- TODO confirm.
    words = duplicate_removal(paragraphs)
    doc_name = file_name + '.docx'
    to_doc(list(words), doc_name)
if __name__ == '__main__':
    # Script entry point: download the article into the text file, then
    # split that file by school term and export each term.
    xx_name = 'G:\\dzmfile\\pythonwork\\small_routine\\others\\in\\xx.txt'
    gen_yuwen_txt(xx_name)
    gen_by_nianji(xx_name)