欢迎您访问程序员文章站,本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

使用状态分析HTML语法​

程序员文章站 2024-03-19 18:05:04
...

采用基于字符匹配、把每个字符映射到状态转换的方式,解析HTML文档的语法,识别标签、文字、属性名、属性值等。

列出了主要思路和主要代码。

if (DocStatus.Read_StartTag.equals(docStatus)) {
	if (TagStatus.Read_Tag_Start.equals(tagStatus)) {
		tagStatus = TagStatus.Read_Tag_Type;
		tagName = "";
		attrs.clear();
	}
	else if (TagStatus.Read_Tag_Type.equals(tagStatus)) {
		if (ch == '!') {
			tagType = "declare";
			tagStatus = TagStatus.Read_Tag_Name;
		}
		else if (ch == '/') {
			tagType = "close";
			tagStatus = TagStatus.Read_Tag_Name;
		}
		else if (ch == '>') {
			tagStatus = TagStatus.Read_Tag_End;
		}
		else if (Character.isLetter(ch)) {
			tagType = "open";
			tagStatus = TagStatus.Read_Tag_Name;
		}
			
	}
	else if (TagStatus.Read_Continue.equals(tagStatus)) {
		if (Character.isWhitespace(ch))
			tagStatus = TagStatus.Read_Continue;
		else if (ch == '/')
			tagStatus = TagStatus.Read_Tag_WillEnd;
		else {
			tagStatus = TagStatus.Read_Tag_AttrName;
			attrName = "";
			attrValue = "";
		}
	}
	else if (TagStatus.Read_Tag_AttrName_White.equals(tagStatus)) {
		if (Character.isWhitespace(ch))
			tagStatus = TagStatus.Read_Tag_AttrName_White;
		else if (ch == '=')
			tagStatus = TagStatus.Read_Tag_WillAttrValue;
		else if (Character.isLetter(ch))
			tagStatus = TagStatus.Read_Tag_Name;
	}
	else if (TagStatus.Read_Tag_AttrValue_White.equals(tagStatus)) {
		if (Character.isWhitespace(ch))
			tagStatus = TagStatus.Read_Tag_AttrValue_White;
		else if (Character.isLetter(ch))
			tagStatus = TagStatus.Read_Tag_AttrValue;
	}
	else if (TagStatus.Read_Tag_WillAttrValue.equals(tagStatus)) {
		if (Character.isWhitespace(ch)) 
			tagStatus = TagStatus.Read_Tag_WillAttrValue;
		else
			tagStatus = TagStatus.Read_Tag_AttrValue;
	}
	
	if (TagStatus.Read_Tag_Name.equals(tagStatus)) {
		if (Character.isWhitespace(ch))
			tagStatus = TagStatus.Read_Continue;
		else if (ch == '>')
			tagStatus = TagStatus.Read_Tag_End;
		else 
			tagName += ch;
	}
	else if (TagStatus.Read_Tag_AttrName.equals(tagStatus)) {
		if (Character.isWhitespace(ch)) {
			tagStatus = TagStatus.Read_Continue;
			attrs.add(new AbstractMap.SimpleEntry<>(attrName, null));
			attrName = "";
			attrValue = "";
		}
		else if (ch == '=') {
			tagStatus = TagStatus.Read_Tag_WillAttrValue;
			attrs.add(new AbstractMap.SimpleEntry<>(attrName, null));
		}
		else if (ch == '>') {
			attrs.add(new AbstractMap.SimpleEntry<>(attrName, null));
			tagStatus = TagStatus.Read_Tag_End;
		}
		else 
			attrName += ch;
	}
	else if (TagStatus.Read_Tag_AttrValue.equals(tagStatus)) {
		if (Character.isWhitespace(ch)) {
			tagStatus = TagStatus.Read_Continue;
			AbstractMap.SimpleEntry<String, String> e = attrs.getLast();
			e.setValue(attrValue);
		}
		else if (ch == '>') {
			tagStatus = TagStatus.Read_Tag_End;
			AbstractMap.SimpleEntry<String, String> e = attrs.getLast();
			e.setValue(attrValue);
		}
		else if (ch == '\'')
			tagStatus = TagStatus.Read_Tag_AttrValue_Quote;
		else if (ch == '"') 
			tagStatus = TagStatus.Read_Tag_AttrValue_DoubleQuote;
		else 
			attrValue += ch;
	}
	else if (TagStatus.Read_Tag_AttrValue_Quote.equals(tagStatus)) {
		if (ch == '\'')
			tagStatus = TagStatus.Read_Tag_AttrValue;
		else
			attrValue += ch;
	}
	else if (TagStatus.Read_Tag_AttrValue_DoubleQuote.equals(tagStatus)) {
		if (ch == '"')
			tagStatus = TagStatus.Read_Tag_AttrValue;
		else 
			attrValue += ch;
	}
	else if (TagStatus.Read_Tag_WillEnd.equals(tagStatus)) {
		if (ch == '>') {
			tagType = "standard";
			tagStatus = TagStatus.Read_Tag_End;
		}
		else
			tagStatus = TagStatus.Read_Continue;
	}
	
	if (TagStatus.Read_Tag_End.equals(tagStatus)) {
		AbstractNode node = null;
		if (tagType.equals("declare")) {
			node = new DeclareNode(tagName);
			node.addAttrs(attrs);
			
			root.peek().addNode(node);
			stack.pop();
			docStatus = DocStatus.Read_Any;
			stack.push(docStatus);

		}
		else if (tagType.equals("open")) {
			node = new ElementNode(tagName);
			node.addAttrs(attrs);
			root.peek().addNode(node);
			root.push(node);
			
			if (tagName.toUpperCase().equals("META")) {
				root.pop();
				stack.pop();
				docStatus = DocStatus.Read_Any;
				stack.push(docStatus);
			}
			else if (tagName.toUpperCase().equals("SCRIPT")) {
				docStatus = DocStatus.Read_Script;
				stack.push(docStatus);
				tagStatus = TagStatus.Read_Script_Start;
				sb.setLength(0);
			}
			else {
				docStatus = DocStatus.Read_Any;
				stack.push(docStatus);
			}
		}
		else if (tagType.equals("standard")) {
			node = new ElementNode(tagName);
			node.addAttrs(attrs);
			root.peek().addNode(node);
			stack.pop();
			docStatus = DocStatus.Read_Any;
			stack.push(docStatus);
		}
		else if (tagType.equals("close")) {
			stack.pop();
			AbstractNode p = root.peek();
			if (tagName.equals("/"+p.getName())) {
				root.pop();
				stack.pop();
			}
			docStatus = DocStatus.Read_Any;
			stack.push(docStatus);
				
		}
	}
}
sb.append(ch);
if (DocStatus.Read_Any.equals(docStatus)) {
	//System.out.print(sb.toString());
	sb.setLength(0);
}

对一段简单的HTML文档进行测试,解析结果显示如下:

使用状态分析HTML语法​
状态转换方式的代码对HTML语法解析结果