Python Crawler: Scraping Web Page Information into MongoDB and Files
I. Create a New Project
1. Install the packages and the MongoDB database
List installed packages: pip list
Install the packages:
pip install Scrapy
pip install beautifulsoup4
pip install requests
pip install pymongo
MongoDB download link: https://pan.baidu.com/s/1Amr_oxGc3QL0LSLA6lJ6QA (extraction code: bj1g)
2. Test the database
Start the MongoDB server:
.\mongod --storageEngine=mmapv1
In a new cmd window, open the mongo shell and run a quick test:
mongo
show dbs
db.demo.save({code:'E01', name:'Jack'})
db.demo.find()
db.demo.find().pretty()
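Optionally, the pymongo package installed above can be used to confirm the server is reachable from Python as well. A minimal sketch, assuming the server started with the command above is still running locally:

import pymongo

client = pymongo.MongoClient('127.0.0.1')    # default port 27017
print(client.list_database_names())          # connectivity check; lists the existing databases
client.close()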
3. Open cmd (in a path containing no Chinese characters) and create the project
scrapy startproject DemoProject
4. Create the spider
Spider file (under the project's spiders/ directory):
import scrapy
from ..items import TedItem


class TedSpider(scrapy.Spider):
    name = 'ted'
    start_urls = ['https://www.ted.com/talks']

    def parse(self, response):
        # Each talk card lives under the #browse-results container
        ru = response.xpath('//*[@id="browse-results"]/div[1]/div[@class="col"]')
        for element in ru:
            ti = TedItem()
            ti['talk'] = element.xpath('./div/div/div/div[2]/h4[2]/a/text()').extract_first()
            ti['link'] = element.xpath('./div/div/div/div[2]/h4[2]/a/@href').extract_first()
            yield ti
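Before running the full crawl, the XPath selectors can be checked interactively in scrapy shell (a quick sketch; TED's page layout may have changed since this was written, so adjust the expressions if they return nothing):

scrapy shell "https://www.ted.com/talks"
>>> cards = response.xpath('//*[@id="browse-results"]/div[1]/div[@class="col"]')
>>> len(cards)
>>> cards[0].xpath('./div/div/div/div[2]/h4[2]/a/text()').extract_first()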
5. Define the item
Items file (items.py):
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class DemoprojectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class TedItem(scrapy.Item):
    talk = scrapy.Field()
    link = scrapy.Field()
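TedItem behaves like a dict, which is what the pipelines below rely on when they call dict(item); a small illustration (not part of the project code, values are made up):

from DemoProject.items import TedItem

ti = TedItem(talk='Example talk', link='/talks/example')   # made-up values
print(ti['talk'])   # field access works like a dict
print(dict(ti))     # {'talk': 'Example talk', 'link': '/talks/example'}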
6. Database configuration
settings.py file:
ITEM_PIPELINES = {
    'DemoProject.pipelines.JsonPipeline': 1,
    'DemoProject.pipelines.MongoPipeline': 2,
}

# MongoDB
MONGO_URI = '127.0.0.1'
MONGO_DATABASE = 'teddb'
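MONGO_URI here is just a host name; pymongo's MongoClient also accepts a full connection string, so an equivalent setting (assuming MongoDB runs on the default port 27017) would be:

MONGO_URI = 'mongodb://127.0.0.1:27017'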
7. Build the pipelines
pipelines.py file:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json
import pymongo


class DemoprojectPipeline:
    def process_item(self, item, spider):
        return item


class JsonPipeline:
    """Writes each item as one JSON line to ted.json."""

    def open_spider(self, spider):
        self.file = open('ted.json', 'w', encoding='UTF-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        print(json.dumps(dict(item)))
        line = json.dumps(dict(item)) + '\n'
        self.file.write(line)
        return item


class MongoPipeline:
    """Inserts each item into the 'ted' collection of the configured database."""

    collection_name = 'ted'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection settings from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item
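Once a crawl has finished, the stored documents can also be inspected from Python with pymongo instead of the mongo shell; a minimal sketch, assuming MongoDB is running locally on the default port:

import pymongo

client = pymongo.MongoClient('127.0.0.1')
db = client['teddb']
for doc in db['ted'].find().limit(5):   # print the first few stored talks
    print(doc['talk'], doc['link'])
client.close()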
8. Run the crawl (from the project directory)
scrapy crawl ted
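As an aside, Scrapy's built-in feed exports can also write the items straight to a file without a custom pipeline, which is handy for a quick check (ted_feed.json is just an example output name):

scrapy crawl ted -o ted_feed.json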
9. Inspect the data (in the mongo shell)
show dbs
use teddb
show collections
db.ted.find().pretty()
10. Download the images
10.1 Create a folder for the downloaded files
'D:\\images'
10.2 Write the image spider (spider_image.py):
from bs4 import BeautifulSoup
import requests
from contextlib import closing
import os
import sys


class ImageSpider(object):
    def __init__(self):
        self.site = 'https://www.ted.com'
        self.url = 'https://www.ted.com/talks'
        self.links = []

    def get_image_links(self):
        # Collect the src of every <img> inside the #browse-results container
        request = requests.get(url=self.url)
        html_text = request.text
        bs = BeautifulSoup(html_text, features='html.parser')
        results_div = bs.find('div', id='browse-results')
        results_div_img = results_div.find_all('img')
        # print(results_div_img)
        for item in results_div_img:
            self.links.append(item.get('src'))

    def download_image(self, path, image_url, filename):
        image_path = os.path.join(path, filename)
        request_headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
        size = 0
        with closing(requests.get(image_url, headers=request_headers, stream=True)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            if response.status_code == 200:
                sys.stdout.write(filename + ' downloading...\n')
                sys.stdout.write('File Size: %0.2f MB\n' % (content_size / chunk_size / 1024))
                with open(image_path, 'wb') as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        size += len(data)
                        file.flush()
                        sys.stdout.write('In Progress: %.2f%%' % float(size / content_size * 100) + '\r')
                        sys.stdout.flush()

    def start_download(self, links):
        for link in links:
            # Take the last path segment and strip the query string to get a file name
            temp = link.split('/')[-1]
            filename = temp.split('?')[0]
            self.download_image('D:\\images', link, filename)


if __name__ == '__main__':
    imageSpider = ImageSpider()
    imageSpider.get_image_links()
    print(imageSpider.links)
    imageSpider.start_download(imageSpider.links)
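The filename logic above splits on '/' and '?' to derive a local file name. A slightly more defensive variant using only the standard library (a sketch, not part of the original script) handles edge cases such as URLs without a final path segment:

from urllib.parse import urlparse
import os

def filename_from_url(url):
    # urlparse separates the path from the query, so '.../photo.jpg?quality=89' yields 'photo.jpg'
    return os.path.basename(urlparse(url).path) or 'unnamed.jpg'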
10.3 Run the spider (open cmd in the folder containing the file)
python .\<filename>
python .\spider_image.py
11. Download a video
11.1 Create a folder for the downloaded files
d:\\videos
11.2 Write the video downloader (spider_video.py); it reads the segment URLs from an index.m3u8 playlist saved in the current directory:
import requests
import threading
import datetime

count = 0


def downloader(start, end, url, resources):
    """Download the slice resources[start:end] of .ts segments into d:\\videos."""
    global count
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
    for resource in resources[start: end]:
        request = requests.get(resource.replace("\n", ""),
                               headers=headers,
                               stream=True)
        # The text after the last '=' in the segment URL is used as the local file name
        name = resource.split('=')[-1]
        with open("d:\\videos/" + name.replace("\n", ""), "wb") as code:
            code.write(request.content)
        count = count + 1
        print("In Progress: %.2f%%" % (count / len(resources) * 100))


def download_video(url, num_thread=100):
    # Read the segment URLs (every line that is not an #EXT... tag) from the playlist
    file = open('index.m3u8', 'r', encoding='UTF-8')
    text_list = file.readlines()
    resource_list = []
    for text in text_list:
        if text.find('#EX') == -1:
            resource_list.append(text)
    file.close()
    # Split the segment list evenly across the worker threads
    file_size = len(resource_list)
    part = file_size // num_thread
    for n in range(num_thread):
        start = part * n
        if n == num_thread - 1:
            end = file_size
        else:
            end = start + part
        thread = threading.Thread(target=downloader, kwargs={'start': start, 'end': end, 'url': url, 'resources': resource_list})
        thread.daemon = True
        thread.start()
    # Wait for all worker threads to finish
    currentThread = threading.current_thread()
    for t in threading.enumerate():
        if t is currentThread:
            continue
        t.join()


def build_merge_cmd():
    # Build a Windows 'copy /b' command that concatenates the .ts segments into ted.mp4
    f = open('index.m3u8', 'r', encoding='UTF-8')
    text_list = f.readlines()
    files = []
    for i in text_list:
        if i.find('#EX') == -1:
            files.append(i)
    f.close()
    tmp = []
    for file in files[0:1024]:
        name = file.split('=')[-1]
        tmp.append(name.replace("\n", ""))
    shell_str = '+'.join(tmp)
    shell_str = 'copy /b ' + shell_str + ' ted.mp4' + '\n' + 'del *.ts'
    return shell_str


def generate_merge_cmd(cmdString):
    f = open("merge.bat", 'w')
    f.write(cmdString)
    f.close()


if __name__ == '__main__':
    url = ""
    start = datetime.datetime.now().replace(microsecond=0)
    download_video(url)
    end = datetime.datetime.now().replace(microsecond=0)
    print(end - start)
    cmd = build_merge_cmd()
    generate_merge_cmd(cmd)
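The script assumes index.m3u8 has already been saved into the working directory. If only the playlist URL is known, it could first be fetched with requests; a minimal sketch, where playlist_url is a placeholder you would fill in yourself:

import requests

playlist_url = ''  # hypothetical: the full URL of the video's index.m3u8
resp = requests.get(playlist_url, headers={'User-Agent': 'Mozilla/5.0'})
with open('index.m3u8', 'w', encoding='UTF-8') as f:
    f.write(resp.text)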
11.3 Run the downloader (open cmd in the folder containing the file)
python .\<filename>
python .\spider_video.py
11.4 Merge the files
Double-click the generated merge.bat file; it concatenates the .ts segments into ted.mp4 and deletes them.