
Python Crawler: Scraping Web Page Data into MongoDB and Files


I. Create a New Project
1. Install the Python packages and MongoDB

List installed packages:  pip list
Install the packages:     pip install Scrapy
                          pip install beautifulsoup4
                          pip install requests
                          pip install pymongo
MongoDB download link: https://pan.baidu.com/s/1Amr_oxGc3QL0LSLA6lJ6QA  extraction code: bj1g
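
If you prefer to keep the dependencies in one place, they can also be installed from a requirements file (a minimal sketch; the file name and the unpinned versions are just one possible choice):

requirements.txt:
Scrapy
beautifulsoup4
requests
pymongo

Install everything at once: pip install -r requirements.txt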

2. Test the database

Start the MongoDB server (run from the MongoDB bin directory):
.\mongod --storageEngine=mmapv1

In a new cmd window, open the mongo shell and try a few commands:
mongo
show dbs
db.demo.save({code:'E01', name:'Jack'})
db.demo.find()
db.demo.find().pretty()
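
The same round trip can be done from Python with pymongo (a minimal sketch; the host, port and the demo collection mirror the shell commands above):

import pymongo

# Connect to the local MongoDB server started above (default port 27017).
client = pymongo.MongoClient('127.0.0.1', 27017)
db = client['test']  # the mongo shell's default database

# Insert one document and read it back, mirroring save() and find().
db.demo.insert_one({'code': 'E01', 'name': 'Jack'})
for doc in db.demo.find():
    print(doc)

client.close()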

3. Open cmd (in a path without Chinese characters) and create the project

scrapy startproject DemoProject
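
The command generates the standard Scrapy project skeleton; the files edited in the next steps (the spider, items.py, settings.py and pipelines.py) all live under the inner DemoProject package:

DemoProject/
    scrapy.cfg
    DemoProject/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py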

4. Create the spider
Spider file (under DemoProject/spiders/):

import scrapy
from ..items import TedItem


class TedSpider(scrapy.Spider):
    name = 'ted'
    start_urls = ['https://www.ted.com/talks']

    def parse(self, response):
        # Each talk card sits in a div.col under the #browse-results container.
        ru = response.xpath('//*[@id="browse-results"]/div[1]/div[@class="col"]')
        for element in ru:
            ti = TedItem()
            # Talk title and its relative link, taken from the card's second h4.
            ti['talk'] = element.xpath('./div/div/div/div[2]/h4[2]/a/text()').extract_first()
            ti['link'] = element.xpath('./div/div/div/div[2]/h4[2]/a/@href').extract_first()
            yield ti
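
Before running the full crawl, the XPath expressions can be checked interactively with scrapy shell (the selectors below are the same ones used in parse()):

scrapy shell https://www.ted.com/talks
>>> response.xpath('//*[@id="browse-results"]/div[1]/div[@class="col"]')
>>> response.xpath('//*[@id="browse-results"]//h4[2]/a/text()').getall()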

5. Define the item
Item file (DemoProject/items.py):

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class DemoprojectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class TedItem(scrapy.Item):
    # Fields filled in by TedSpider.parse().
    talk = scrapy.Field()
    link = scrapy.Field()

6. Database configuration
settings file (DemoProject/settings.py):

ITEM_PIPELINES = {
    'DemoProject.pipelines.JsonPipeline': 1,
    'DemoProject.pipelines.MongoPipeline': 2,
}
# MongoDB
MONGO_URI = '127.0.0.1'
MONGO_DATABASE = 'teddb'
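
Depending on how the target site is configured, a few optional settings may also be needed in settings.py; these are not part of the original configuration and are only a hedged suggestion, since Scrapy obeys robots.txt and sends its own User-Agent by default:

# Optional additions (assumptions; adjust to the target site's policy):
ROBOTSTXT_OBEY = False   # Scrapy honours robots.txt unless this is disabled
DOWNLOAD_DELAY = 1       # throttle to roughly one request per second
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'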

7. Build the pipelines
pipelines file (DemoProject/pipelines.py):

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 
 
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json
import pymongo


class DemoprojectPipeline:
    def process_item(self, item, spider):
        return item


class JsonPipeline:
    """Writes each item as one JSON object per line (JSON Lines) to ted.json."""

    def open_spider(self, spider):
        self.file = open('ted.json', 'w', encoding='UTF-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item))
        print(line)
        self.file.write(line + '\n')
        return item


class MongoPipeline:
    collection_name = 'ted'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings added in step 6.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # One document per scraped talk, stored in the 'ted' collection.
        self.db[self.collection_name].insert_one(dict(item))
        return item
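
Re-running the crawl inserts the same talks again. A small variant of the pipeline keeps the collection deduplicated (a sketch, not part of the original code; it assumes the link field is unique per talk):

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        # Assumed variant: a unique index on 'link' so repeated crawls do not duplicate documents.
        self.db[self.collection_name].create_index('link', unique=True)

    def process_item(self, item, spider):
        data = dict(item)
        # Update the existing document for this link, or insert it if it is missing.
        self.db[self.collection_name].update_one(
            {'link': data['link']}, {'$set': data}, upsert=True)
        return item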

8. Run the crawl (from the project's root directory)

scrapy crawl ted
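
As a quick sanity check, Scrapy's built-in feed export can also dump the items to a file without any pipeline (the output file name here is just an example):

scrapy crawl ted -o ted_export.json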

9. Inspect the data (in the mongo shell)

show dbs
use teddb
show collections
db.ted.find().pretty()

10. Download the images
10.1 Create the folder that will store the downloaded files

'D:\\images'
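
The folder can also be created from Python so the script below does not fail when it is missing (a small sketch; the paths are the same ones hard-coded in the downloaders):

import os

# Create the target folders for the image and video downloads if they do not exist yet.
os.makedirs('D:\\images', exist_ok=True)
os.makedirs('D:\\videos', exist_ok=True)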

10.2 Write the image spider

from bs4 import BeautifulSoup
import requests
from contextlib import closing
import os
import sys


class ImageSpider(object):
    def __init__(self):
        self.site = 'https://www.ted.com'
        self.url = 'https://www.ted.com/talks'
        self.links = []

    def get_image_links(self):
        # Fetch the talks page and collect the src of every <img> inside #browse-results.
        request = requests.get(url=self.url)
        html_text = request.text
        bs = BeautifulSoup(html_text, features='html.parser')
        results_div = bs.find('div', id='browse-results')
        results_div_img = results_div.find_all('img')
        for item in results_div_img:
            self.links.append(item.get('src'))

    def download_image(self, path, image_url, filename):
        image_path = os.path.join(path, filename)
        request_headers = {'Accept': '*/*',
                           'Accept-Encoding': 'gzip, deflate, br',
                           'Accept-Language': 'zh-CN,zh;q=0.9',
                           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
        size = 0
        # Stream the image so large files are written chunk by chunk.
        with closing(requests.get(image_url, headers=request_headers, stream=True)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])  # assumes the server sends Content-Length
            if response.status_code == 200:
                sys.stdout.write(filename + ' downloading...\n')
                sys.stdout.write('File Size: %0.2f MB\n' % (content_size / chunk_size / 1024))

                with open(image_path, 'wb') as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        size += len(data)
                        file.flush()
                        sys.stdout.write('In Progress: %.2f%%' % float(size / content_size * 100) + '\r')
                        sys.stdout.flush()

    def start_download(self, links):
        for link in links:
            # The image URLs look like .../photo.jpg?quality=...; keep only the file name.
            temp = link.split('/')[-1]
            filename = temp.split('?')[0]
            self.download_image('D:\\images', link, filename)


if __name__ == '__main__':
    imageSpider = ImageSpider()
    imageSpider.get_image_links()
    print(imageSpider.links)
    imageSpider.start_download(imageSpider.links)

10.3 Run the spider (open cmd in the folder that contains the script)

python .\<script name>
python .\spider_image.py

11. Download the video
11.1 Create the folder that will store the video segments

d:\\videos

11.2 Write the video downloader

import requests
import threading
import datetime
import os

# Number of segments downloaded so far, shared by all worker threads.
count = 0


def downloader(start, end, url, resources):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

    # Each thread downloads its own slice [start:end) of the segment list.
    for resource in resources[start: end]:
        global count

        request = requests.get(resource.replace("\n", ""),
                               headers=headers,
                               stream=True)
        # The last '='-separated part of the segment URL is used as the local file name.
        segment_name = resource.split('=')[-1]
        with open(os.path.join("d:\\videos", segment_name.replace("\n", "")), "wb") as code:
            code.write(request.content)
        count = count + 1
        print("In Progress: %.2f%%" % (count / len(resources) * 100))
 
 
def download_video(url, num_thread=100):
    # The playlist file index.m3u8 must already be saved in the current directory;
    # every line that is not an #EXT comment is treated as one segment URL.
    file = open('index.m3u8', 'r', encoding='UTF-8')
    text_list = file.readlines()
    resource_list = []
    for text in text_list:
        if text.find('#EX') == -1:
            resource_list.append(text)

    file.close()
    file_size = len(resource_list)

    # Split the segment list into num_thread roughly equal slices;
    # the last thread picks up whatever remains.
    part = file_size // num_thread
    for n in range(num_thread):
        start = part * n
        if n == num_thread - 1:
            end = file_size
        else:
            end = start + part

        thread = threading.Thread(target=downloader, kwargs={'start': start, 'end': end, 'url': url, 'resources': resource_list})
        thread.daemon = True
        thread.start()

    # Wait for every worker thread to finish.
    currentThread = threading.current_thread()
    for t in threading.enumerate():
        if t is currentThread:
            continue
        t.join()
 
 
def build_merge_cmd():
    # Re-read the playlist and build a Windows command that concatenates all
    # downloaded .ts segments into ted.mp4, then deletes the segments.
    f = open('index.m3u8', 'r', encoding='UTF-8')
    text_list = f.readlines()
    files = []
    for i in text_list:
        if i.find('#EX') == -1:
            files.append(i)
    f.close()
    tmp = []
    for file in files[0:1024]:
        segment_name = file.split('=')[-1]
        tmp.append(segment_name.replace("\n", ""))

    shell_str = '+'.join(tmp)
    shell_str = 'copy /b ' + shell_str + ' ted.mp4' + '\n' + 'del *.ts'
    return shell_str


def generate_merge_cmd(cmd_string):
    # Write the merge command into a batch file next to the downloaded segments.
    f = open("merge.bat", 'w')
    f.write(cmd_string)
    f.close()
 
 
if __name__ == '__main__':
    url = ""  # unused here; the segment URLs are read from the local index.m3u8
    start = datetime.datetime.now().replace(microsecond=0)
    download_video(url)
    end = datetime.datetime.now().replace(microsecond=0)
    print(end - start)  # total download time
    cmd = build_merge_cmd()
    generate_merge_cmd(cmd)

11.3 Run the downloader (open cmd in the folder that contains the script and index.m3u8)

python .\<script name>
python .\spider_video.py

11.4 Merge the files
Double-click the generated merge.bat file.
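
For reference, the generated merge.bat looks roughly like this (the segment names are illustrative; the real ones depend on the entries in index.m3u8):

copy /b seg-1.ts+seg-2.ts+seg-3.ts ted.mp4
del *.ts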

Tags: Python, web crawler