C#实现将HTML转换成纯文本的方法
程序员文章站
2023-11-20 21:40:22
本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下:
使用方法:
复制代码 代码如下:htmltotext convert = new...
本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下:
使用方法:
复制代码 代码如下:
// Create a converter and write the plain-text form of the HTML held in the
// first text box into the second one.
// NOTE(review): assumes textbox1 and textbox2 are TextBox controls exposing a
// Text property (WinForms or WebForms) — confirm against the hosting form.
htmltotext convert = new htmltotext();
textbox2.Text = convert.convert(textbox1.Text);
C#代码如下:
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// Needs the System, System.Collections.Generic, System.Text and System.Web
/// namespaces in scope. Parsing state (<c>_text</c>, <c>_html</c>, <c>_pos</c>)
/// lives in instance fields that every call to <c>convert</c> resets, so one
/// instance can be reused sequentially but should not be shared between
/// concurrent callers.
/// </remarks>
class htmltotext
{
    // Static data tables
    // Maps a lower-case tag name (closing tags carry a leading '/') to the
    // text written to the output when that tag is encountered.
    protected static Dictionary<string, string> _tags;
    // Lower-case names of elements whose entire content is discarded.
    protected static HashSet<string> _ignoretags;

    // Instance variables
    protected textbuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static htmltotext()
    {
        _tags = new Dictionary<string, string>();
        _tags.Add("address", "\n");
        _tags.Add("blockquote", "\n");
        _tags.Add("div", "\n");
        _tags.Add("dl", "\n");
        _tags.Add("fieldset", "\n");
        _tags.Add("form", "\n");
        _tags.Add("h1", "\n");
        _tags.Add("/h1", "\n");
        _tags.Add("h2", "\n");
        _tags.Add("/h2", "\n");
        _tags.Add("h3", "\n");
        _tags.Add("/h3", "\n");
        _tags.Add("h4", "\n");
        _tags.Add("/h4", "\n");
        _tags.Add("h5", "\n");
        _tags.Add("/h5", "\n");
        _tags.Add("h6", "\n");
        _tags.Add("/h6", "\n");
        _tags.Add("p", "\n");
        _tags.Add("/p", "\n");
        _tags.Add("table", "\n");
        _tags.Add("/table", "\n");
        _tags.Add("ul", "\n");
        _tags.Add("/ul", "\n");
        _tags.Add("ol", "\n");
        _tags.Add("/ol", "\n");
        _tags.Add("/li", "\n");
        _tags.Add("br", "\n");
        _tags.Add("/td", "\t");
        _tags.Add("/tr", "\n");
        _tags.Add("/pre", "\n");

        _ignoretags = new HashSet<string>();
        _ignoretags.Add("script");
        _ignoretags.Add("noscript");
        _ignoretags.Add("style");
        _ignoretags.Add("object");
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted; null is treated as empty input</param>
    /// <returns>Resulting plain text with HTML entities decoded</returns>
    public string convert(string html)
    {
        // Initialize state variables
        _text = new textbuilder();
        _html = html ?? string.Empty;
        _pos = 0;

        // Process input
        while (!endoftext)
        {
            if (peek() == '<')
            {
                // HTML tag
                bool selfclosing;
                string tag = parsetag(out selfclosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.preformatted = true;
                    eatwhitespacetonextline();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                    _text.write(value);

                if (_ignoretags.Contains(tag))
                    eatinnercontent(tag);
            }
            else if (char.IsWhiteSpace(peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.write(_text.preformatted ? peek() : ' ');
                moveahead();
            }
            else
            {
                // Other text
                _text.write(peek());
                moveahead();
            }
        }

        // Entities are decoded last, on the already tag-free text
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the current tag and returns the
    /// lower-cased tag name (with a leading '/' for closing tags). A whole
    /// HTML comment is consumed as one unit and reported as "!--".
    /// </summary>
    /// <param name="selfclosing">Set to true when a '/' was seen outside a
    /// quoted value after the tag name</param>
    /// <returns>The tag name, or an empty string when the current character
    /// is not '&lt;'</returns>
    protected string parsetag(out bool selfclosing)
    {
        string tag = string.Empty;
        selfclosing = false;

        if (peek() == '<')
        {
            // A comment may legally contain '>', so it cannot be scanned like
            // an ordinary tag. Searching from _pos + 2 also lets the
            // degenerate forms "<!-->" and "<!--->" terminate correctly.
            if (string.CompareOrdinal(_html, _pos, "<!--", 0, 4) == 0)
            {
                int commentend = _html.IndexOf("-->", _pos + 2, StringComparison.Ordinal);
                _pos = (commentend < 0) ? _html.Length : commentend + 3;
                return "!--";
            }

            moveahead();

            // Parse tag name
            eatwhitespace();
            int start = _pos;
            if (peek() == '/')
                moveahead();
            while (!endoftext && !char.IsWhiteSpace(peek()) &&
                peek() != '/' && peek() != '>')
                moveahead();
            // Invariant lower-casing keeps table lookups stable under
            // cultures such as Turkish, where "I" does not lower-case to "i"
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!endoftext && peek() != '>')
            {
                if (peek() == '"' || peek() == '\'')
                    eatquotedvalue();
                else
                {
                    if (peek() == '/')
                        selfclosing = true;
                    moveahead();
                }
            }
            moveahead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the element whose start tag has just
    /// been parsed, up to and including its matching end tag (or the end of
    /// the input when no end tag exists).
    /// </summary>
    /// <param name="tag">Lower-case name of the element being skipped</param>
    protected void eatinnercontent(string tag)
    {
        string endtag = "/" + tag;

        // script and style hold raw text: a '<' inside them is not markup,
        // so look for the literal end tag instead of parsing their content.
        if (tag == "script" || tag == "style")
        {
            int close = _html.IndexOf("<" + endtag, _pos, StringComparison.OrdinalIgnoreCase);
            if (close < 0)
            {
                _pos = _html.Length;
                return;
            }
            _pos = close;
            bool ignored;
            parsetag(out ignored);  // consume the end tag itself
            return;
        }

        // Other elements may nest inside themselves; count open elements of
        // the same name so only the matching end tag stops the skip.
        int depth = 1;
        while (!endoftext)
        {
            if (peek() == '<')
            {
                bool selfclosing;
                string name = parsetag(out selfclosing);
                if (name == endtag)
                {
                    depth--;
                    if (depth == 0)
                        return;
                }
                else if (name == tag && !selfclosing)
                {
                    depth++;
                }
            }
            else moveahead();
        }
    }

    // Returns true if the current position is at the end of the string
    protected bool endoftext
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ((char)0 once the end of the input has been reached)
    protected char peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void moveahead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace character
    protected void eatwhitespace()
    {
        while (char.IsWhiteSpace(peek()))
            moveahead();
    }

    // Moves the current position to the next non-whitespace character or
    // the start of the next line, whichever comes first
    protected void eatwhitespacetonextline()
    {
        while (char.IsWhiteSpace(peek()))
        {
            char c = peek();
            moveahead();
            if (c == '\n')
                break;
        }
    }

    // Moves the current position past a quoted value. The value ends at the
    // matching quote or at a line break, whichever comes first.
    protected void eatquotedvalue()
    {
        char c = peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            moveahead();
            // Find end of value
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
                _pos = _html.Length;
            else
                moveahead();    // Closing quote
        }
    }

    /// <summary>
    /// A StringBuilder wrapper that helps eliminate excess whitespace.
    /// </summary>
    protected class textbuilder
    {
        private StringBuilder _text;
        private StringBuilder _currline;
        private int _emptylines;
        private bool _preformatted;

        // Construction
        public textbuilder()
        {
            _text = new StringBuilder();
            _currline = new StringBuilder();
            _emptylines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded. If this
        /// property is set to true, they are passed through unchanged.
        /// </summary>
        public bool preformatted
        {
            get { return _preformatted; }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to preformatted mode
                    if (_currline.Length > 0)
                        flushcurrline();
                    _emptylines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void clear()
        {
            _text.Length = 0;
            _currline.Length = 0;
            _emptylines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write</param>
        public void write(string s)
        {
            foreach (char c in s)
                write(c);
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else
            {
                if (c == '\r')
                {
                    // Ignore carriage returns. We'll process
                    // '\n' if it comes next
                }
                else if (c == '\n')
                {
                    // Flush current line
                    flushcurrline();
                }
                else if (char.IsWhiteSpace(c))
                {
                    // Collapse runs of whitespace into a single space
                    int len = _currline.Length;
                    if (len == 0 || !char.IsWhiteSpace(_currline[len - 1]))
                        _currline.Append(' ');
                }
                else
                {
                    // Add character to current line
                    _currline.Append(c);
                }
            }
        }

        // Appends the current line to the output buffer. At most one
        // consecutive empty line is kept, and none at the very start.
        protected void flushcurrline()
        {
            // Get current line
            string line = _currline.ToString().Trim();

            // A trimmed line is blank exactly when it has no characters left
            if (line.Length == 0)
            {
                // An empty line
                _emptylines++;
                if (_emptylines < 2 && _text.Length > 0)
                    _text.AppendLine(line);
            }
            else
            {
                // A non-empty line
                _emptylines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currline.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string, flushing any pending line.
        /// </summary>
        public override string ToString()
        {
            if (_currline.Length > 0)
                flushcurrline();
            return _text.ToString();
        }
    }
}
希望本文所述对大家的C#程序设计有所帮助。