nodejs通过phantomjs实现下载网页_node.js
程序员文章站
2022-05-04 07:54:39
...
功能其实很见简单,通过 phantomjs.exe 采集 url 加载的资源,通过子进程的方式,启动nodejs 加载所有的资源,对于css的资源,匹配css内容,下载里面的url资源
当然功能还是很简单的,在响应式设计和异步加载的情况下,还是有很多资源没有能够下载,需要根据实际情况处理下
首先当然是下载 nodejs 和 phantomjs
下面是 phantomjs.exe 执行的 down.js
var page = require('webpage').create(), system = require('system'); var spawn = require("child_process").spawn if (system.args.length === 1) { console.log('Usage: netsniff.js'); phantom.exit(1); } else { var urls = []; page.address = system.args[1]; page.onResourceReceived = function (res) { if (res.stage === 'start') { urls.push(res.url); } }; page.open(page.address, function (status) { var har; if (status !== 'success') { console.log('FAIL to load the address'); phantom.exit(1); } else { console.log('down resource ' + urls.length + ' urls.'); var child = spawn("node", ["--harmony", "downHtml.js", urls.join(',')]) child.stdout.on("data", function (data) { console.log(data); }) child.stderr.on("data", function (data) { console.log(data); }) child.on("exit", function (code) { phantom.exit(); }) } }); }
下面是对应的node运行的 downHtml.js
"use strict"; var fs = require('fs'); var http = require('http'); var path = require('path'); var r_url = require('url'); var dirCache = {};//缓存减少判断 function makedir (pathStr, callback) { if (dirCache[pathStr] == 1) { callback(); } else { fs.exists(pathStr, function (exists) { if (exists == true) { dirCache[pathStr] == 1; callback(); } else { makedir(path.dirname(pathStr), function () { fs.mkdir(pathStr, function () { dirCache[pathStr] == 1; callback(); }) }); } }) } }; var reg = /[:,]\s*url\(['"]?.*?(\1)\)/g var reg2 = /\((['"]?)(.*?)(\1)\)/ var isDownMap = {}; var downImgFromCss = function (URL) { http.get(URL, function(res) { //console.log(path.resolve(process.cwd(), 'index.min.css')) //res.pipe(fs.createWriteStream(path.resolve(process.cwd(), 'index.min.css'))); var body = ""; res.setEncoding('utf8'); res.on('data', function (chunk) { body += chunk; }); res.on('end', function () { var match = body.match(reg); for (var i = 0, len = match.length; i
down.js downHtml.js 放在同一个文件夹下 通过下列 cmd 运行
D:\phantomjs-2.0.0-windows\bin\phantomjs.exe down.js http://www.youku.com/
以上所述就是本文的全部内容了,希望大家能够喜欢。