第4.3章 用requests爬取小学3000词语

程序员文章站 2022-01-20 23:14:55
...
爬虫并不一定要用scrapy框架，下面介绍的这个例子就是通过requests直接获取网页内容的，代码如下。
生成田字格的代码请参考第4.1章“给小朋友写的飞鸟集打乱后组词的爬虫”。
import requests
import os
import re
from pyquery import PyQuery as pq
from word_deal.primary_spelling import to_doc,duplicate_removal

# Hard-coded Windows-style output directory.
# NOTE(review): nothing in the code visible here references OUT_PATH —
# confirm whether another module relies on it before removing it.
OUT_PATH = 'G:\\dzmfile\\pythonwork\\small_routine\\others\\out\\'

def gen_yuwen_txt(
    xx_name,
    url='http://k.sina.com.cn/article_6429307123_17f3770f30010033nv.html?from=baby',
    timeout=30,
):
    """Download the vocabulary article and save its useful paragraphs.

    The page is fetched with ``requests``; the text of every element matched
    by ``#artibody p>font`` is extracted, only paragraphs that contain at
    least one digit are kept, the substring '识字' is removed from them, and
    each one is written as a line of ``xx_name``.

    Args:
        xx_name: Path of the UTF-8 text file to (over)write.
        url: Address of the article to download. Defaults to the article the
            script was originally written for.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        OSError: If ``xx_name`` cannot be opened for writing.
    """
    # Without a timeout requests may block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than parsing an error page into an empty file.
    response.raise_for_status()

    doc = pq(response.content)
    paras = [pq(node).text() for node in doc('#artibody p>font')]

    # Only paragraphs containing a digit are kept; per the original author's
    # note, those are the ones carrying the real content of the article.
    has_digit = re.compile(r'\d')

    # The file is opened only after the download succeeded, and the context
    # manager closes it even if a write fails.
    with open(xx_name, 'w', encoding='utf-8') as out:
        for para in paras:
            if not has_digit.search(para):
                continue
            # The original author treats '识字' as stray text to be removed.
            out.write(para.replace('识字', '') + '\n')

def get_lines(xx_name):
    """Return every line of the UTF-8 text file ``xx_name``.

    Args:
        xx_name: Path of the file to read.

    Returns:
        A list with one string per line; line terminators are kept.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed even when reading
    # or decoding raises.
    with open(xx_name, encoding='utf-8') as src:
        return src.readlines()

def gen_by_nianji(xx_name):
    """Split the saved article by grade/volume and export each section.

    The file is read with ``get_lines``. Each heading line below marks the
    start of one volume; a section runs from its heading up to (but not
    including) the next heading, and the last section runs to the end of the
    file. Every section is passed through ``get_words`` and the result is
    handed to ``to_pinyin`` together with the volume title.

    Args:
        xx_name: Path of the text file produced by ``gen_yuwen_txt``.

    Raises:
        ValueError: If one of the expected heading lines is not present in
            the file.
    """
    lines = get_lines(xx_name)

    # (title passed to to_pinyin, exact heading line expected in the file).
    # The headings must match the article text character for character,
    # including its inconsistent punctuation and spacing.
    sections = (
        ('一年级上册', '一年级上册生字： 100个\n'),
        ('一年级下册', '一年级下册生字：250个\n'),
        ('二年级上册', '二年级上册生字：350个\n'),
        ('二年级下册', '二年级下册生字：300个\n'),
        ('三年级上册', '三年级上册生字300个\n'),
        ('三年级下册', '三年级下册生字300个\n'),
        ('四年级上册', '四年级上册生字200个\n'),
        ('四年级下册', '四年级下册生字200个\n'),
        ('五年级上册', '五年级上册生字150个\n'),
        ('五年级下册', '五年级下册生字150个\n'),
        ('六年级上册', '六年级上册生字80个\n'),
        ('六年级下册', '六年级下册生字80个\n'),
    )

    # Resolve every heading before exporting anything, so a missing heading
    # aborts the run without producing a partial set of documents.
    starts = []
    for _, heading in sections:
        try:
            starts.append(lines.index(heading))
        except ValueError as err:
            raise ValueError(
                'heading {!r} not found in {!r}'.format(heading, xx_name)
            ) from err

    # Each section ends where the next one starts; the last ends at EOF.
    ends = starts[1:] + [len(lines)]

    for (title, _), start, end in zip(sections, starts, ends):
        to_pinyin(get_words(lines[start:end]), title)

def get_words(lines):
    """Extract the Chinese text from the lines that start with a digit.

    Lines that do not begin with a digit are skipped. For every remaining
    line, all runs of characters in the range U+4E00..U+9FA5 are collected
    into a list, and the ``str()`` form of that list (for example
    ``"['天地', '人']"``) is appended to the result.

    Args:
        lines: Iterable of text lines.

    Returns:
        A list of strings, one per line that starts with a digit.
    """
    starts_with_digit = re.compile(r'^\d')
    chinese_run = re.compile(r'[\u4e00-\u9fa5]+')
    return [
        str(chinese_run.findall(text))
        for text in lines
        if starts_with_digit.match(text)
    ]

def to_pinyin(paragraphs,file_name):
    """Deduplicate ``paragraphs`` and export them to ``<file_name>.docx``.

    The paragraphs are passed through ``duplicate_removal``; the result is
    converted to a list and handed to ``to_doc`` together with ``file_name``
    suffixed with '.docx'.

    NOTE(review): both helpers come from ``word_deal.primary_spelling`` and
    are not visible here — presumably ``to_doc`` writes the Word document;
    confirm against that module.

    Args:
        paragraphs: Items to deduplicate, as produced by ``get_words``.
        file_name: Output name without the '.docx' extension.
    """
    unique_items = duplicate_removal(paragraphs)
    doc_name = file_name + '.docx'
    to_doc(list(unique_items), doc_name)

if __name__ == '__main__':
    # Hard-coded path of the intermediate text file shared by both steps.
    xx_name = 'G:\\dzmfile\\pythonwork\\small_routine\\others\\in\\xx.txt'
    # Step 1: download the article and write its digit-containing
    # paragraphs to xx_name.
    gen_yuwen_txt(xx_name)
    # Step 2: read the file back, split it at the grade/volume headings and
    # pass each section on to to_pinyin.
    gen_by_nianji(xx_name)