欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页

第4.3章 用requests爬取小学3000词语

程序员文章站 2022-01-20 23:14:55
...

爬虫并不一定要用scrapy框架。下面介绍的这个例子就是通过requests直接获取网页内容的,代码如下:
生成田字格的代码请参考第4.1章《给小朋友写的飞鸟集打乱后组词的爬虫》。

import requests
import os
import re
from pyquery import PyQuery as pq
from word_deal.primary_spelling import to_doc,duplicate_removal

# Output directory (Windows path). It is not referenced by the functions shown
# here; presumably it is used by the word_deal helpers or kept for later use.
OUT_PATH = 'G:\\dzmfile\\pythonwork\\small_routine\\others\\out\\'

def gen_yuwen_txt(xx_name):
    """Download the Sina article and save its digit-bearing paragraphs.

    Fetches the article page, takes the text of every element matched by the
    ``#artibody p>font`` selector, keeps only the entries that contain at
    least one digit, removes the substring '识字' from each, and writes them
    one per line (UTF-8) to ``xx_name``.

    Args:
        xx_name: Path of the text file to create or overwrite.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the output file cannot be opened or written.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(
        'http://k.sina.com.cn/article_6429307123_17f3770f30010033nv.html?from=baby',
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page into the file.
    r.raise_for_status()
    soup = pq(r.content)
    paras = [pq(node).text() for node in soup('#artibody p>font')]
    # The context manager closes the file even if a write fails part-way.
    with open(xx_name, 'w', encoding='utf-8') as file:
        for para in paras:
            # Only paragraphs containing a digit are kept; per the original
            # author, those are the ones carrying real content in this article.
            if not re.search(r'\d', para):
                continue
            # The original author treats '识字' as an erratum to strip out.
            file.write(para.replace('识字', '') + '\n')

def get_lines(xx_name):
    """Return all lines of a UTF-8 text file, line endings included.

    Args:
        xx_name: Path of the file to read.

    Returns:
        A list of strings, one per line, each still ending in its newline
        (except possibly the last line of the file).

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(xx_name, encoding='utf-8') as file:
        return file.readlines()

def gen_by_nianji(xx_name):
    """Split the saved word list by grade and semester and export each part.

    Reads ``xx_name`` with get_lines, locates the twelve section header lines
    (grade 1 first semester through grade 6 second semester), extracts the
    words of each section with get_words, and passes every group to to_pinyin
    together with its grade/semester label.

    Args:
        xx_name: Path of the text file produced by gen_yuwen_txt.

    Raises:
        ValueError: If any expected header line is absent from the file
            (raised by ``list.index``).
    """
    # (exact header line opening the section, label handed to to_pinyin),
    # listed in the order the sections appear in the file.
    sections = (
        ('一年级上册生字: 100个\n', '一年级上册'),
        ('一年级下册生字:250个\n', '一年级下册'),
        ('二年级上册生字:350个\n', '二年级上册'),
        ('二年级下册生字:300个\n', '二年级下册'),
        ('三年级上册生字300个\n', '三年级上册'),
        ('三年级下册生字300个\n', '三年级下册'),
        ('四年级上册生字200个\n', '四年级上册'),
        ('四年级下册生字200个\n', '四年级下册'),
        ('五年级上册生字150个\n', '五年级上册'),
        ('五年级下册生字150个\n', '五年级下册'),
        ('六年级上册生字80个\n', '六年级上册'),
        ('六年级下册生字80个\n', '六年级下册'),
    )
    lines = get_lines(xx_name)
    # Resolve every header position up front, before any output is produced.
    starts = [lines.index(header) for header, _ in sections]
    # A section ends where the next one starts; the last runs to end of file.
    stops = starts[1:] + [None]
    word_groups = [
        get_words(lines[begin:end]) for begin, end in zip(starts, stops)
    ]
    # Export section by section, in grade order.
    for (_, label), words in zip(sections, word_groups):
        to_pinyin(words, label)

def get_words(lines):
    """Extract the Chinese text from every line that starts with a digit.

    Args:
        lines: Iterable of text lines.

    Returns:
        A list with one entry per digit-led line. Each entry is the ``str()``
        of the list of Chinese character runs (U+4E00 to U+9FA5) found in that
        line, e.g. ``"['大小', '多少']"``.
    """
    leading_digit = re.compile(r'^\d')
    chinese_run = re.compile(r'[\u4e00-\u9fa5]+')
    return [
        # Stored as the string form of the match list, not the list itself.
        str(chinese_run.findall(entry))
        for entry in lines
        if leading_digit.match(entry)
    ]

def to_pinyin(paragraphs,file_name):
    """De-duplicate the given words and write them to ``<file_name>.docx``.

    Args:
        paragraphs: Word entries, passed unchanged to duplicate_removal.
        file_name: Output name without extension; '.docx' is appended.
    """
    unique_words = list(duplicate_removal(paragraphs))
    docx_name = file_name + '.docx'
    to_doc(unique_words, docx_name)

if __name__ == '__main__':
    # Intermediate text file: written by the download step, read by the split step.
    xx_name = 'G:\\dzmfile\\pythonwork\\small_routine\\others\\in\\xx.txt'
    # Run the pipeline in order: fetch the article, then export per grade.
    for step in (gen_yuwen_txt, gen_by_nianji):
        step(xx_name)
相关标签: scrapy

上一篇: scrapy的安装

下一篇: Scrapy