房价在手,天下我有 --反手就撸一个爬虫(终)
接上篇,科科,好,我们继续
我们在这里先把json数据入库吧~
首先,在database/schema里定义好数据类型。
const mongoose = require('mongoose')
// Fields of one entry in an area's `house` array.
// The key `adress` (sic) is kept as spelled because it is part of the stored schema.
const houseShape = {
  name: String,
  huxing: String,
  favorPos: String,
  aroundPrice: Number,
  adress: String,
  area: String
}

// Fields of one entry in the top-level `areaDetail` array, including its nested house list.
const areaDetailShape = {
  link: String,
  text: String,
  _id: String,
  house: [houseShape]
}

// Data schema of a detailHouse document: link, text, a string _id and the nested area details.
const detailHouseSchema = new mongoose.Schema({
  link: String,
  text: String,
  _id: String,
  areaDetail: [areaDetailShape]
})

// Register the schema under the model name 'detailHouse'.
mongoose.model('detailHouse', detailHouseSchema)
然后我们需要到中间件里去建立连接数据库和执行插入的动作。
middleWares/database.js
import mongoose from 'mongoose'
import config from '../config'
import fs from 'fs'
import { resolve } from 'path'
// Turn a path fragment into an absolute path, resolved against this file's directory.
const r = path => resolve(__dirname, path)
// Absolute path of the directory that holds the schema definition files.
const models = r('../database/schema')

/**
 * Load the locally stored, already crawled JSON files whose contents get
 * inserted into the database.
 * NOTE(review): these are bare module specifiers rather than relative paths —
 * confirm the project's module resolution makes them resolvable.
 */
const areaJson = require('database/json/AreaDetail.json')
const areaHouseJson = require('database/json/AreaHouse.json')
const detailHouseJson = require('database/json/detailHouse.json')

/**
 * Require each schema file in turn.
 * Only directory entries whose name does not start with '.' and ends in "js"
 * are loaded.
 */
const schemaFilePattern = /^[^\.].*js$/
for (const file of fs.readdirSync(models)) {
  if (file.search(schemaFilePattern) === -1) continue
  require(resolve(models, file))
}
/**
 * Insert `docs` into the collection of the model registered as `modelName`,
 * but only when that collection holds no document yet.
 *
 * @param {string} modelName - name the model was registered under
 * @param {Array<object>} docs - documents to insert when the collection is empty
 * @returns {Promise<void>} settles once the check (and the insert, if any) is done
 */
const seedIfEmpty = async (modelName, docs) => {
  const Model = mongoose.model(modelName)
  // One document is enough to know the collection is not empty; no need to
  // load every record just to look at the length.
  const existing = await Model.findOne({}).exec()
  if (existing) return
  await Model.insertMany(docs)
}

/**
 * Connect mongoose to `config.db` and, once the connection is open, store the
 * crawled JSON data in the `area`, `areaHouse` and `detailHouse` collections
 * when they are still empty.
 *
 * @param app - application instance handed to every middleware initialiser (not used here)
 */
export const database = app => {
  mongoose.set('debug', true)
  mongoose.connect(config.db)

  // Try to connect again whenever the connection drops.
  mongoose.connection.on('disconnected', () => {
    mongoose.connect(config.db)
  })
  mongoose.connection.on('error', err => {
    console.log(err)
  })
  mongoose.connection.on('open', async () => {
    console.log('connected to MongoDb', config.db)

    // Main districts, their house prices, and the per-sub-district house prices.
    const seeds = [
      ['area', areaJson],
      ['areaHouse', areaHouseJson],
      ['detailHouse', detailHouseJson]
    ]
    for (const [modelName, docs] of seeds) {
      // Await every insert and catch per collection: nothing may escape this
      // event handler as an unhandled rejection, and one failing collection
      // must not prevent the remaining ones from being seeded.
      try {
        await seedIfEmpty(modelName, docs)
      } catch (err) {
        console.log(err)
      }
    }
  })
}
成功的话,如下图~ bling~~~
走到这里,我们要停下来对后端的路由做一个提取和封装。首先,我这项目页面量不大,如果单纯地用koa-router原生去写是没有问题的,但是如果是实际的项目,路由很多,这个时候再那么写,代码的可读性就很差了。
Decorator可以动态地给一个对象添加额外的职责。虽然利用子类继承也可以实现这样的功能,但是Decorator提供了一个更灵活的方式。因为继承会为类型引入静态特质,使得这种扩展方式缺乏灵活性;并且随着子类的增多(扩展功能的增多),各种子类的组合(扩展功能的组合)会导致子类数量的膨胀。
那么我们要在decorator/router.js里定义一些公用的方法,其中还添加了打印日志的功能,在调试的时候也是美滋滋的一匹。
先去middlewares/routers/router.js里调用我们用修饰器封装好的方法和Route。
import Route from '../decorator/router'
import { resolve } from 'path'
// Turn a path fragment into an absolute path, resolved against this file's directory.
const r = path => resolve(__dirname, path)

/**
 * Build a Route for the application and the routes directory and initialise it.
 *
 * @param app - application instance passed through to Route
 */
export const router = app => {
  // Route separation: the route definitions live in their own directory.
  const routesDir = r('../routes')
  new Route(app, routesDir).init()
}
现在去封装Route
decorator/router.js
import Router from 'koa-router'
import { resolve } from 'path'
import _ from 'lodash'
import { glob } from 'glob' //用正则去匹配文件
// Registry filled by the `route` decorator: route config object -> handler(s) of the decorated method.
export let routesMap = new Map()
// Property key under which the `controller` decorator stores a class's path prefix.
export const symbolPrefix = Symbol('prefix')
// Return the path unchanged when it already starts with '/', otherwise prepend one.
export const normalizePath = path => {
  if (path.startsWith('/')) return path
  return `/${path}`
}
// Despite the name this is a conversion, not a test: arrays are returned as-is,
// any other value is wrapped in a one-element array.
export const isArray = c => (Array.isArray(c) ? c : [c])
export default class Route{
constructor(app,apipath){
this.app = app
this.router = new Router()
this.apipath = apipath
}
init(){
/**
* 这里利用传进来的apipath去引入后缀为js的文件
*/
glob
.sync(resolve(this.apipath,'./*.js'))
.forEach(require);
for(let [ conf , controller ] of routesMap){
/*
*思路就是把每一个路由文件的controller拎出来
* 然后跟它的路由做一个一一匹配
* */
const controllers = isArray(controller)
let prefixPath = conf.target[symbolPrefix]
if(prefixPath) prefixPath = normalizePath(prefixPath)
const routerPath = prefixPath conf.path
this.router[conf.method](routerPath,...controllers) //function (name, path, middlewares)
}
this.app.use(this.router.routes()) // 添加路由中间件
this.app.use(this.router.allowedMethods()) // 对请求进行一些限制处理
}
}
/**
 * Class decorator factory that records `path` as the route prefix of the
 * decorated class. The prefix is stored on the class prototype under
 * `symbolPrefix`, which is where Route#init looks it up through `conf.target`.
 *
 * @param {string} path - prefix shared by every route of the controller
 * @returns {Function} class decorator. It deliberately returns nothing: a class
 *   decorator's return value is taken as a replacement for the class, so
 *   returning the assigned path string would be wrong for any non-empty prefix.
 */
export const controller = path => target => {
  target.prototype[symbolPrefix] = path
}
/**
 * Method decorator factory that defines a simple route.
 *
 * Applying the returned decorator normalises `conf.path` in place and stores
 * an entry in `routesMap`: the key is a new object made of the decorator
 * target plus the fields of `conf`, the value is whatever `target[key]` holds
 * at that moment.
 *
 * @param {object} conf - route config; `path` is read and rewritten here
 * @returns {Function} method decorator taking (target, key, desc)
 */
export const route = conf => (target, key, desc) => {
  conf.path = normalizePath(conf.path)
  const routeConf = Object.assign({ target }, conf)
  const handlers = target[key]
  routesMap.set(routeConf, handlers)
}
/**
 * Define a GET route.
 *
 * @param {string} path - route path handed to `route`
 * @returns whatever `route` returns for the config { method: 'get', path }
 */
export const get = path => {
  const conf = { method: 'get', path }
  return route(conf)
}
/**
 * Define a POST route.
 *
 * @param {string} path - route path handed to `route`
 * @returns whatever `route` returns for the config { method: 'post', path }
 */
export const post = path => {
  const conf = { method: 'post', path }
  return route(conf)
}
/**
 * Request logging.
 */
// Counter that hands out one id per logged request.
let reqID = 0

/**
 * Put `middleware` in front of the handler(s) of a decorated method.
 *
 * @param {Array} args - the (target, key, descriptor) arguments a method decorator receives
 * @param {Function} middleware - middleware to run before the existing handler(s)
 * @returns the descriptor, unchanged
 */
const decorate = (args, middleware) => {
  const [target, key, descriptor] = args
  // Make sure target[key] is a handler list, then prepend the middleware to it.
  target[key] = isArray(target[key])
  target[key].unshift(middleware)
  return descriptor
}

/**
 * Turn a middleware into a method decorator that prepends it to the decorated
 * method's handler list.
 *
 * @param {Function} middleware
 * @returns {Function} method decorator
 */
export const convert = middleware => (...args) => decorate(args, middleware)

/**
 * Method decorator that times each request with console.time/console.timeEnd
 * under the label "<request id> <method> <url>".
 */
export const log = convert(async (ctx, next) => {
  // Take a fresh id for every request: with a constant id, two overlapping
  // requests for the same method and url would share a single timer label.
  const currentReqID = reqID++
  const label = `${currentReqID} ${ctx.method} ${ctx.url}`
  console.time(label)
  try {
    await next()
  } finally {
    // Close the timer even when a downstream handler throws.
    console.timeEnd(label)
  }
})
然后再来看看我们接口定义的文件,代码感觉简洁的一匹。
routes/crawler.js
import { controller, get , log} from '../decorator/router'
import mongoose from 'mongoose'
const areaDataBase = mongoose.model('area')
const areaHouseDataBase = mongoose.model('areaHouse')
const detailHouse = mongoose.model('detailHouse')
@controller('')
export class Crawler{
/**
* 获取杭州城区下的房子信息
*/
@get('/getDetail')
@log
async detailHouse (ctx,next){
let query = ctx.query
let { _id } = query;
if (!_id) return (ctx.body = '_id is required')
let area = await detailHouse
.findById(_id)
.exec()
ctx.body = {
code:0,
area
}
}
/**
* 获取杭州城区下的房子信息
*/
@get('/getAreaHouse')
@log
async areaHouse (ctx,next){
let areaHouse = await areaHouseDataBase
.find({})
.exec()
ctx.body = {
code:0,
areaHouse
}
}
/**
* 获取杭州城区单条的名称
*/
@get('/getArea/:_id')
@log
async getArea (ctx,next){
const { params } = ctx
const { _id } = params
if (!_id) return (ctx.body = '_id is required')
let area = await areaDataBase
.findById(_id)
.exec()
ctx.body = area
}
/**
* 获取杭州城区的名称
*/
@get('/getArea')
@log
async Area (ctx,next){
let area = await areaDataBase
.find({})
.exec()
ctx.body = {
code:0,
area
}
}
}
走到这里,后端的代码基本上全部完成了,我们走完了数据的爬取-->入数据库-->接口的定义。
剩下的就是简单的前端的接口调用啦~ 我这里就不具体展示出代码啦~接口调用完成,基本上就能完成我们的目标样子啦~
真心的话要放在最后,这是小弟第一次从后到前撸的项目,对于node,mongo,数据库如何建表研究的还很肤浅,小弟在这里班门弄斧啦~真心希望和我一样喜欢倒腾的小伙伴可以自己也上手玩玩~真的能学到不少知识~
本来还想放上源码的,碍于注释都没有添加得很全,容老夫慢慢把注释补全了再贴出来~
2018/07/31
上一篇: python爬虫之链家郑州二手房爬取
下一篇: 我和shiro有个故事03