原理
由服务端代替浏览器发起请求:同源策略只限制浏览器,服务端之间的请求不受其约束,因此不存在跨域问题
实现
let http = require('http');
let fs = require('fs')
// Target of the outgoing request. Only the host is set, so http.request uses
// its defaults for everything else (method, port, path).
const opts = {
  host: 'news.baidu.com',
};

/**
 * Local crawl server listening on port 3000.
 *
 * For every incoming request it fetches the page described by `opts`,
 * extracts every `<li class="bold-item" ...>...</li>` element from the HTML,
 * appends the extracted elements to ./crawl.txt (one per line) and answers
 * the caller with '结束'.
 *
 * Failures are reported to the caller instead of crashing the process:
 *   - 502 when the upstream request or response stream errors,
 *   - 500 when the crawl file cannot be written.
 */
http.createServer((req, res) => {
  // Guards against answering twice if several failure paths fire.
  let responded = false;

  const reply = (statusCode, body) => {
    if (responded) {
      return;
    }
    responded = true;
    res.statusCode = statusCode;
    res.setHeader('Content-Type', 'text/html;charset=utf8');
    res.end(body);
  };

  const client = http.request(opts, (upstream) => {
    const chunks = [];

    upstream.on('data', (chunk) => {
      chunks.push(chunk);
    });

    upstream.on('error', (err) => {
      reply(502, `upstream response failed: ${err.message}`);
    });

    upstream.on('end', () => {
      // Concatenate the raw buffers before decoding so multi-byte characters
      // split across chunk boundaries are decoded correctly.
      const html = Buffer.concat(chunks).toString();

      // match() yields null (not an empty array) when nothing matches.
      const items = html.match(/<li class="bold-item"(?:[\s\S]*?)<\/li>/img) || [];

      try {
        if (items.length > 0) {
          // appendFileSync needs a string or Buffer, so serialise the matches
          // explicitly rather than passing the array itself.
          fs.appendFileSync('./crawl.txt', `${items.join('\n')}\n`);
        }
      } catch (err) {
        reply(500, `failed to write crawl file: ${err.message}`);
        return;
      }

      reply(200, '结束');
    });
  });

  // Without this handler a DNS/connection failure would surface as an
  // unhandled 'error' event and terminate the whole server.
  client.on('error', (err) => {
    reply(502, `upstream request failed: ${err.message}`);
  });

  client.end();
}).listen(3000);
复制代码
用 curl 向本地服务发起请求:服务端收到请求后开始爬取,把爬取到的内容追加写入 crawl.txt 文件,完成后向客户端返回“结束”。
➜ July curl -v localhost:3000
* Rebuilt URL to: localhost:3000/
* Trying 127.0.0.1...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 3000 (#0)
> GET / HTTP/1.1
> Host: localhost:3000
> User-Agent: curl/7.54.0
> Accept: */*
>
< HTTP/1.1 200 OK
< Content-Type: text/html;charset=utf8
< Date: Wed, 22 Aug 2018 08:46:08 GMT
< Connection: keep-alive
< Content-Length: 9
<
* Connection #0 to host localhost left intact
结束%
复制代码
本地会生成存储爬取内容的文件。