欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

golang使用正则表达式解析网页

程序员文章站 2022-11-15 11:55:31
废话少说,直接奉上代码: 复制代码 代码如下: package main import ( "fmt" "time" "io/ioutil" "net/htt...

废话少说,直接奉上代码:

复制代码 代码如下:

package main
import (
"fmt"
"time"
"io/ioutil"
"net/http"
"regexp"
"strings"
)
func main() {
    ip_pool := []string{
                "172.16.1.128",
                "172.16.1.129",
                "172.16.1.131",
                "172.16.1.132",
                "172.16.1.133",
                "172.16.1.134",
                "172.16.1.135",
                "172.16.1.136",
                "172.16.1.137",
                "172.16.1.138",
                "172.16.1.190",
            }
    for {
        for i:=0;i<len(ip_pool);i++  {
            url := ""
            //fmt.println("-----------------",ip_pool[i],"---------")
             get_url(url)
            time.sleep(1*time.millisecond)
        }
//time.sleep(time.second * 60)
    }
}
// Patterns used by tableToCSV. They are compiled once at package scope
// because get_url is called repeatedly from a polling loop.
var (
	// Any <...> tag; [\s\S] makes the match span line breaks.
	tagRe = regexp.MustCompile(`<[\s\S]+?>`)

	// Everything from the doctype up to and including the first "<table".
	beforeTableRe = regexp.MustCompile(`<!doc[\s\S]+?<table`)

	// Everything from "</table>" up to and including "</html>".
	afterTableRe = regexp.MustCompile(`</table>[\s\S]+?</html>`)

	// One table row; group 2 is the row's inner markup.
	rowRe = regexp.MustCompile(`(<tr>)([\s\S<>"\d]+?)(</tr>)`)

	// A run of one or more spaces.
	spacesRe = regexp.MustCompile(`[ ]+`)

	// A record anchored on a colon-separated MAC address.
	// NOTE(review): the published pattern had an unbalanced trailing ")" and
	// did not compile; it is dropped here so the expression is valid. The
	// literal "}" characters look like remnants of a garbled original —
	// verify this pattern against real page output.
	recordRe = regexp.MustCompile(`(\w*)(\w{2}:\w{2}:\w{2}:\w{2}:\w{2}:\w{2})([a-zA-Z ]*)(} } )([v\d\.]* )(}( })?)?`)
)

// get_url fetches url with an HTTP GET, reduces the HTML table in the
// response to comma-separated text and prints the result to standard output.
//
// A banner line containing url is always printed first. If the request or
// reading the body fails, a short message is printed and the function
// returns without printing any page content.
func get_url(url string) {
	fmt.Println("----------", url, "----------------")
	resp, err := http.Get(url)
	if err != nil {
		// resp is nil when Get fails, so return before touching resp.Body.
		fmt.Println("http get error.")
		return
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		// Do not parse a partially read page.
		fmt.Println("http read error")
		return
	}
	fmt.Println(tableToCSV(string(body)))
}

// tableToCSV strips src down to the text of its HTML table, one table row per
// output line, and inserts commas between the fields matched by recordRe.
func tableToCSV(src string) string {
	// Lower-case every tag so the literal tag names below match.
	src = tagRe.ReplaceAllStringFunc(src, strings.ToLower)

	// Keep only the table: drop what precedes "<table" and what follows
	// "</table>".
	src = beforeTableRe.ReplaceAllString(src, "<table")
	src = afterTableRe.ReplaceAllString(src, "</table>")

	// Replace each <tr>...</tr> by its inner markup followed by "]", a
	// temporary end-of-row marker that survives the clean-up steps below.
	src = rowRe.ReplaceAllString(src, "$2]")

	// Remove the remaining tags and the page's own line breaks, then
	// collapse runs of spaces into a single space.
	src = tagRe.ReplaceAllString(src, "")
	src = strings.Replace(src, "\n", "", -1)
	src = spacesRe.ReplaceAllString(src, " ")

	// Turn the row markers into real line breaks.
	src = strings.Replace(src, "]", "\n", -1)

	// Separate the fields of each record with commas.
	src = recordRe.ReplaceAllString(src, "$1,$2,$3,$4,$5,$6,")

	// Swap the Chinese column header row for machine-friendly field names.
	return strings.Replace(src, "虚拟机名称 虚拟机mac 虚拟机状态 心跳时间 引擎版本 病毒库日期 扫描样本数", "vm_name,vm_mac,vm_state,vm_heart,vm_eg,vm_av_db,vm_count", -1)
}

以上就是本文给大家分享的代码了,希望大家能够喜欢。