使用状态分析HTML语法
程序员文章站
2024-03-19 18:05:04
...
采用状态机的方式:逐个读取字符,并根据当前字符把解析器映射(切换)到相应的状态,从而解析HTML文档的语法,识别标签、文本、属性名和属性值等。
列出了主要思路和主要代码。
// Per-character step of the tag state machine. While the document-level state is
// Read_StartTag, the current character `ch` advances `tagStatus`; tag name and
// attributes are accumulated in tagName / attrName / attrValue / attrs, and a node
// is emitted once Read_Tag_End is reached.
//
// The code is split into three consecutive phases that all run for the SAME
// character. A state assigned in an earlier phase is deliberately re-examined by
// the later phases (fall-through), so the order of the phases matters.
if (DocStatus.Read_StartTag.equals(docStatus)) {

    // ---- Phase 1: "transition" states that decide which accumulating state comes next.
    if (TagStatus.Read_Tag_Start.equals(tagStatus)) {
        // The character itself is not inspected here (presumably the '<' that
        // opened the tag -- confirm against the caller). Reset per-tag buffers.
        tagStatus = TagStatus.Read_Tag_Type;
        tagName = "";
        attrs.clear();
    }
    else if (TagStatus.Read_Tag_Type.equals(tagStatus)) {
        // First character after the tag start decides the tag kind. For '!', '/'
        // and letters the state becomes Read_Tag_Name, and phase 2 then appends
        // this same character to tagName (so close tags are named "/xxx").
        if (ch == '!') {
            tagType = "declare";
            tagStatus = TagStatus.Read_Tag_Name;
        }
        else if (ch == '/') {
            tagType = "close";
            tagStatus = TagStatus.Read_Tag_Name;
        }
        else if (ch == '>') {
            // tagType is left untouched here, so it keeps whatever the previous tag set.
            tagStatus = TagStatus.Read_Tag_End;
        }
        else if (Character.isLetter(ch)) {
            tagType = "open";
            tagStatus = TagStatus.Read_Tag_Name;
        }
    }
    else if (TagStatus.Read_Continue.equals(tagStatus)) {
        // Between the tag name / attributes: skip whitespace and look for what follows.
        if (Character.isWhitespace(ch)) {
            tagStatus = TagStatus.Read_Continue;
        }
        else if (ch == '/') {
            tagStatus = TagStatus.Read_Tag_WillEnd;
        }
        else if (ch == '>') {
            // FIX: a '>' after whitespace (e.g. "<div >") ends the tag. Previously it
            // fell into the attribute-name path and added an attribute with an empty name.
            tagStatus = TagStatus.Read_Tag_End;
        }
        else {
            // Any other character starts a new attribute name; phase 2 appends it.
            tagStatus = TagStatus.Read_Tag_AttrName;
            attrName = "";
            attrValue = "";
        }
    }
    else if (TagStatus.Read_Tag_AttrName_White.equals(tagStatus)) {
        // NOTE(review): nothing in this fragment ever enters this state, and the
        // letter branch jumps to Read_Tag_Name (which would append to tagName);
        // Read_Tag_AttrName looks like the intended target -- confirm before relying on it.
        if (Character.isWhitespace(ch)) {
            tagStatus = TagStatus.Read_Tag_AttrName_White;
        }
        else if (ch == '=') {
            tagStatus = TagStatus.Read_Tag_WillAttrValue;
        }
        else if (Character.isLetter(ch)) {
            tagStatus = TagStatus.Read_Tag_Name;
        }
    }
    else if (TagStatus.Read_Tag_AttrValue_White.equals(tagStatus)) {
        // NOTE(review): this state is likewise never entered by the visible code.
        if (Character.isWhitespace(ch)) {
            tagStatus = TagStatus.Read_Tag_AttrValue_White;
        }
        else if (Character.isLetter(ch)) {
            tagStatus = TagStatus.Read_Tag_AttrValue;
        }
    }
    else if (TagStatus.Read_Tag_WillAttrValue.equals(tagStatus)) {
        // After '=': skip whitespace; the first other character begins the value and
        // is handled (quote detection or append) by the Read_Tag_AttrValue branch below.
        if (Character.isWhitespace(ch)) {
            tagStatus = TagStatus.Read_Tag_WillAttrValue;
        }
        else {
            tagStatus = TagStatus.Read_Tag_AttrValue;
        }
    }

    // ---- Phase 2: accumulating states (also sees states just assigned in phase 1).
    if (TagStatus.Read_Tag_Name.equals(tagStatus)) {
        if (Character.isWhitespace(ch)) {
            tagStatus = TagStatus.Read_Continue;
        }
        else if (ch == '>') {
            tagStatus = TagStatus.Read_Tag_End;
        }
        else if (ch == '/' && "open".equals(tagType)) {
            // FIX: '/' directly after an open tag's name (e.g. "<br/>") starts the
            // self-closing sequence instead of becoming part of the name. The leading
            // '/' of a close tag is unaffected because tagType is "close" there.
            tagStatus = TagStatus.Read_Tag_WillEnd;
        }
        else {
            tagName += ch;
        }
    }
    else if (TagStatus.Read_Tag_AttrName.equals(tagStatus)) {
        // Whichever character ends the name, the attribute is recorded with a null
        // value; the value is filled in later if one follows.
        if (Character.isWhitespace(ch)) {
            tagStatus = TagStatus.Read_Continue;
            attrs.add(new AbstractMap.SimpleEntry<>(attrName, null));
            attrName = "";
            attrValue = "";
        }
        else if (ch == '=') {
            tagStatus = TagStatus.Read_Tag_WillAttrValue;
            attrs.add(new AbstractMap.SimpleEntry<>(attrName, null));
        }
        else if (ch == '>') {
            attrs.add(new AbstractMap.SimpleEntry<>(attrName, null));
            tagStatus = TagStatus.Read_Tag_End;
        }
        else {
            attrName += ch;
        }
    }
    else if (TagStatus.Read_Tag_AttrValue.equals(tagStatus)) {
        if (Character.isWhitespace(ch)) {
            // Value finished: store it on the last entry in attrs.
            tagStatus = TagStatus.Read_Continue;
            AbstractMap.SimpleEntry<String, String> e = attrs.getLast();
            e.setValue(attrValue);
        }
        else if (ch == '>') {
            tagStatus = TagStatus.Read_Tag_End;
            AbstractMap.SimpleEntry<String, String> e = attrs.getLast();
            e.setValue(attrValue);
        }
        else if (ch == '\'') {
            tagStatus = TagStatus.Read_Tag_AttrValue_Quote;
        }
        else if (ch == '"') {
            tagStatus = TagStatus.Read_Tag_AttrValue_DoubleQuote;
        }
        else {
            attrValue += ch;
        }
    }
    else if (TagStatus.Read_Tag_AttrValue_Quote.equals(tagStatus)) {
        // Inside '...': everything up to the closing quote belongs to the value; the
        // quote characters themselves are not stored.
        if (ch == '\'') {
            tagStatus = TagStatus.Read_Tag_AttrValue;
        }
        else {
            attrValue += ch;
        }
    }
    else if (TagStatus.Read_Tag_AttrValue_DoubleQuote.equals(tagStatus)) {
        // Same as above for "...".
        if (ch == '"') {
            tagStatus = TagStatus.Read_Tag_AttrValue;
        }
        else {
            attrValue += ch;
        }
    }
    else if (TagStatus.Read_Tag_WillEnd.equals(tagStatus)) {
        if (ch == '>') {
            tagType = "standard";
            tagStatus = TagStatus.Read_Tag_End;
        }
        else if (ch != '/') {
            // FIX: the '/' that put us into this state in phase 1 is seen again here on
            // the same character. Previously it immediately reset the state to
            // Read_Continue, so "<br />" could never be recognised as "standard".
            tagStatus = TagStatus.Read_Continue;
        }
    }

    // ---- Phase 3: the tag is complete; build the node and update the stacks.
    if (TagStatus.Read_Tag_End.equals(tagStatus)) {
        AbstractNode node = null;
        if (tagType.equals("declare")) {
            // Declaration: attached to the current parent, never becomes a parent itself.
            node = new DeclareNode(tagName);
            node.addAttrs(attrs);
            root.peek().addNode(node);
            stack.pop();
            docStatus = DocStatus.Read_Any;
            stack.push(docStatus);
        }
        else if (tagType.equals("open")) {
            node = new ElementNode(tagName);
            node.addAttrs(attrs);
            root.peek().addNode(node);
            root.push(node);
            // FIX: compare case-insensitively without String.toUpperCase(), whose result
            // depends on the default locale (e.g. Turkish dotted/dotless i).
            if (tagName.equalsIgnoreCase("META")) {
                // META is the only element treated as childless here: undo the push.
                root.pop();
                stack.pop();
                docStatus = DocStatus.Read_Any;
                stack.push(docStatus);
            }
            else if (tagName.equalsIgnoreCase("SCRIPT")) {
                // Switch to script-reading mode and start with an empty text buffer.
                docStatus = DocStatus.Read_Script;
                stack.push(docStatus);
                tagStatus = TagStatus.Read_Script_Start;
                sb.setLength(0);
            }
            else {
                docStatus = DocStatus.Read_Any;
                stack.push(docStatus);
            }
        }
        else if (tagType.equals("standard")) {
            // Self-closing element: attached to the parent but not pushed on root.
            node = new ElementNode(tagName);
            node.addAttrs(attrs);
            root.peek().addNode(node);
            stack.pop();
            docStatus = DocStatus.Read_Any;
            stack.push(docStatus);
        }
        else if (tagType.equals("close")) {
            stack.pop();
            AbstractNode p = root.peek();
            // tagName still carries the leading '/', hence the "/" + name comparison.
            // A close tag that does not match the current element leaves root untouched.
            if (tagName.equals("/" + p.getName())) {
                root.pop();
                stack.pop();
            }
            docStatus = DocStatus.Read_Any;
            stack.push(docStatus);
        }
    }
}

// Every character is buffered; the buffer is discarded whenever the document is
// (back) in the Read_Any state.
sb.append(ch);
if (DocStatus.Read_Any.equals(docStatus)) {
    //System.out.print(sb.toString());
    sb.setLength(0);
}
对一段简单的HTML文档进行测试,输出结果如下:
推荐阅读
-
使用状态分析HTML语法
-
MySQL使用profile分析SQL执行状态 博客分类: 数据库
-
在laravel中使用Symfony的Crawler组件分析HTML
-
在laravel中使用Symfony的Crawler组件分析HTML
-
使用Scala从头实现一个简单的语法分析器组合字库
-
使用 Emmet 生成 HTML 的语法详解_html/css_WEB-ITnose
-
Python获取基金网站网页内容、使用BeautifulSoup库分析html操作示例
-
Python获取基金网站网页内容、使用BeautifulSoup库分析html操作示例
-
Vue.js绑定HTML class数组语法错误的原因分析
-
HTML5 标签语法变化和使用概念