golang使用正则表达式解析网页
程序员文章站
2022-11-15 11:55:31
废话少说,直接奉上代码:
复制代码 代码如下:
package main
import (
"fmt"
"time"
"io/ioutil"
"net/htt...
废话少说,直接奉上代码:
复制代码 代码如下:
package main
import (
"fmt"
"time"
"io/ioutil"
"net/http"
"regexp"
"strings"
)
// main polls every host in the pool, one request per host, in an endless
// loop, pausing one millisecond between requests.
func main() {
	ipPool := []string{
		"172.16.1.128",
		"172.16.1.129",
		"172.16.1.131",
		"172.16.1.132",
		"172.16.1.133",
		"172.16.1.134",
		"172.16.1.135",
		"172.16.1.136",
		"172.16.1.137",
		"172.16.1.138",
		"172.16.1.190",
	}

	for {
		for range ipPool {
			// NOTE(review): the URL literal is empty in the published
			// listing, so the pool entry is never used — presumably the
			// per-host address was stripped when the article was published.
			// Build the real URL from the current pool entry here.
			url := ""
			get_url(url)
			time.Sleep(1 * time.Millisecond)
		}
	}
}
// Patterns used by tableToCSV. They are compiled once at package
// initialisation rather than on every request, and MustCompile makes an
// invalid pattern fail loudly instead of leaving a nil *Regexp behind.
var (
	// htmlTagRe matches one HTML tag, from "<" to the nearest ">".
	htmlTagRe = regexp.MustCompile(`<[\S\s]+?>`)
	// docHeadRe matches from "<!doc" up to and including the first "<table".
	docHeadRe = regexp.MustCompile(`<!doc[\S\s]+?<table`)
	// docTailRe matches from "</table>" up to and including "</html>".
	docTailRe = regexp.MustCompile(`</table>[\S\s]+?</html>`)
	// tableRowRe captures the content of one <tr>...</tr> row in group 2.
	tableRowRe = regexp.MustCompile(`(<tr>)([\S\s<>"\d]+?)(</tr>)`)
	// spaceRunRe matches a run of one or more spaces.
	spaceRunRe = regexp.MustCompile(`[ ]+`)
	// recordRe splits one flattened row into fields; group 2 is a
	// colon-separated six-part address (xx:xx:xx:xx:xx:xx).
	//
	// NOTE(review): the published pattern ended with one extra ")" and could
	// not compile; it is dropped here. The literal "} } " pieces and the
	// lower-case "v" are reproduced as published — confirm them against real
	// page output.
	recordRe = regexp.MustCompile(`(\w*)(\w{2}:\w{2}:\w{2}:\w{2}:\w{2}:\w{2})([a-zA-Z ]*)(} } )([v\d\.]* )(}( })?)?`)
)

// get_url downloads the page at url, reduces the HTML table it contains to
// comma-separated text and prints the result to standard output.
//
// A failed request or a failed body read is reported on standard output and
// ends the call early; nothing is returned to the caller.
func get_url(url string) {
	fmt.Println("----------", url, "----------------")

	resp, err := http.Get(url)
	if err != nil {
		// resp is nil when Get fails, so return before touching resp.Body.
		fmt.Println("http get error.", err)
		return
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("http read error", err)
		return
	}

	fmt.Println(tableToCSV(string(body)))
}

// tableToCSV converts the HTML in src to text with one table row per line and
// commas inserted between the fields recognised by recordRe.
func tableToCSV(src string) string {
	// Lower-case every HTML tag so the patterns below can match them.
	src = htmlTagRe.ReplaceAllStringFunc(src, strings.ToLower)

	// Keep only the table: cut what precedes "<table" and what follows
	// "</table>".
	src = docHeadRe.ReplaceAllString(src, "<table")
	src = docTailRe.ReplaceAllString(src, "</table>")

	// Unwrap each row and terminate it with "]", a temporary row marker.
	src = tableRowRe.ReplaceAllString(src, "${2}]")

	// Strip the remaining tags.
	src = htmlTagRe.ReplaceAllString(src, "")

	// Flatten the text: drop newlines, collapse runs of spaces, then turn
	// each row marker into a real line break.
	src = strings.Replace(src, "\n", "", -1)
	src = spaceRunRe.ReplaceAllString(src, " ")
	src = strings.Replace(src, "]", "\n", -1)

	// Insert commas between the recognised fields of each row.
	src = recordRe.ReplaceAllString(src, "$1,$2,$3,$4,$5,$6,")

	// Replace the Chinese column headings with machine-friendly names.
	// NOTE(review): "mac" is lower-case in the published listing; confirm
	// the heading's actual case on the page.
	src = strings.Replace(src, "虚拟机名称 虚拟机mac 虚拟机状态 心跳时间 引擎版本 病毒库日期 扫描样本数", "vm_name,vm_mac,vm_state,vm_heart,vm_eg,vm_av_db,vm_count", -1)

	return src
}
以上就是本文给大家分享的代码了,希望大家能够喜欢。
下一篇: 从Node.js 转到 Go平台