GO语言利用K近邻算法实现小说鉴黄
程序员文章站
2022-11-15 12:10:59
usage:
go run knn.go --file="data.txt"
关键是向量点的选择和阈值的判定
样本数据来自国家新闻出版总署发布通知公布...
usage:
go run knn.go --file="data.txt"
关键是向量点的选择和阈值的判定
样本数据来自国家新闻出版总署发布通知公布的《40部淫秽色情网络小说名单》
// Command knn scores plain-text novels by how often a fixed set of Hanzi
// occurs in them and flags a file when the score reaches a threshold.
//
// Usage:
//
//	go run knn.go --file="data.txt"
//
// Without --file, every *.txt file under data_dir is scored instead.
package main

import (
	"bufio"
	"flag"
	"fmt"
	"io"
	"log"
	"math"
	"os"
	"path/filepath"
)

var (
	// debug enables per-label diagnostic output in classify.
	debug = false

	// data_dir is the directory scanned when no --file flag is given.
	data_dir = "./moyan"

	// limen is the score threshold at or above which a file is flagged.
	limen = 0.1159203888322267

	// labels holds the code points of the Hanzi used as feature dimensions.
	labels = []rune{
		0x817f, 0x80f8, 0x4e73, 0x81c0,
		0x5c41, 0x80a1, 0x88f8, 0x6deb,
	}

	// file is the value of the --file flag; empty means "walk data_dir".
	file string
)

// Inclusive range of code points that load counts as Hanzi.
const (
	min_hanzi rune = 0x3400
	max_hanzi rune = 0x9fbb
)

// errhandle logs err and terminates the program when err is non-nil.
func errhandle(err error) {
	if err != nil {
		log.Fatal(err)
	}
}

// load reads the named file rune by rune and returns how many times each
// code point in [min_hanzi, max_hanzi] occurs. Runes outside that range are
// ignored. On a read or open error the returned map is nil.
func load(name string) (map[rune]int, error) {
	f, err := os.Open(name)
	if err != nil {
		return nil, fmt.Errorf("loading %q: %w", name, err)
	}
	defer f.Close()

	counts := make(map[rune]int)
	rd := bufio.NewReader(f)
	for {
		r, _, err := rd.ReadRune()
		if err == io.EOF {
			return counts, nil
		}
		if err != nil {
			return nil, fmt.Errorf("loading %q: %w", name, err)
		}
		if r >= min_hanzi && r <= max_hanzi {
			counts[r]++
		}
	}
}

// classify turns a character-count map into a feature vector and its
// Euclidean norm.
//
// idv has one entry per element of labels: that label's count divided by the
// number of distinct characters in m (len(m), not the total character count;
// limen was chosen against this normalisation, so it is kept). dis is the
// square root of the sum of the squared entries.
//
// An empty or nil m yields an all-zero vector and dis == 0; dividing by
// len(m) there would produce NaN, which check cannot classify.
func classify(m map[rune]int) (idv []float64, dis float64) {
	distinct := float64(len(m))
	idv = make([]float64, 0, len(labels))
	for i, ch := range labels {
		var freq float64
		if distinct > 0 {
			freq = float64(m[ch]) / distinct
		}
		if debug {
			fmt.Println(i, m[ch], string(ch), freq)
		}
		idv = append(idv, freq)
	}

	var sum float64
	for _, v := range idv {
		sum += math.Pow(v, 2)
	}
	return idv, math.Sqrt(sum)
}

// verdict maps a score to the message check prints for it.
func verdict(dis float64) string {
	switch {
	case dis == 1.0:
		// Must precede the threshold test: 1.0 >= limen, so placed after it
		// this case could never be selected.
		return "你在作弊吗"
	case dis >= limen:
		return "涉黄"
	case dis == 0:
		return "检查一下文件字符编码是不是utf8格式吧"
	default:
		return "正常"
	}
}

// check prints the path, its score and the verdict for that score.
func check(fp string, dis float64) {
	fmt.Println(fp, dis, verdict(dis))
}

// walkfunc is the filepath.Walk callback: it scores every regular entry whose
// extension is ".txt" and returns the first error encountered, which stops
// the walk.
func walkfunc(fp string, info os.FileInfo, err error) error {
	if err != nil {
		// info may be nil when Walk reports an error for this path.
		return err
	}
	if info.IsDir() || filepath.Ext(fp) != ".txt" {
		return nil
	}
	m, err := load(fp)
	if err != nil {
		return err
	}
	_, dis := classify(m)
	check(fp, dis)
	return nil
}

func main() {
	// Create the data directory up front so a first run leaves a place to
	// drop files into.
	if _, err := os.Stat(data_dir); err != nil {
		errhandle(os.Mkdir(data_dir, os.ModePerm))
	}

	flag.StringVar(&file, "file", "", "file read in,if you don't give the file read in,"+
		"it will create a data dictionary,just pust your files in it")
	flag.Parse()

	if file == "" {
		errhandle(filepath.Walk(data_dir, walkfunc))
		return
	}

	m, err := load(file)
	errhandle(err)
	_, dis := classify(m)
	check(file, dis)
}
以上所述就是本文的全部内容了,希望大家能够喜欢。
上一篇: 黄花菜凉菜怎么做呢
下一篇: 豆浆和红薯的好处你知道多少?