欢迎您访问程序员文章站，本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

C#实现将HTML转换成纯文本的方法

程序员文章站 2023-11-20 21:40:22
本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下: 使用方法: 复制代码 代码如下:htmltotext convert = new...

本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下:

使用方法:

复制代码 代码如下:
// Create a converter and copy the plain-text rendering of the HTML held in
// textbox1 into textbox2.
// NOTE(review): assumes textbox1/textbox2 are text controls exposing the
// standard .NET `Text` property - confirm against the hosting form.
htmltotext convert = new htmltotext();
textbox2.Text = convert.convert(textbox1.Text);

C#代码如下:

/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// The converter is a small hand-written scanner: it walks the input one
/// character at a time, drops tags, turns block-level tags into line breaks
/// (see <c>_tags</c>), skips the content of non-visible elements (see
/// <c>_ignoretags</c>) and finally decodes HTML entities.
/// Parsing state lives in instance fields that are reset by every call to
/// <c>convert</c>, so a single instance must not be used by several threads
/// at the same time.
/// </remarks>
class htmltotext
{
  // static data tables
  // maps a (lower-case) tag name to the text written in its place
  protected static Dictionary<string, string> _tags;
  // elements whose whole content is dropped from the output
  protected static HashSet<string> _ignoretags;
  // ignored elements whose content is raw text rather than markup
  private static readonly HashSet<string> _rawtexttags;
  // instance variables
  protected textbuilder _text;
  protected string _html;
  protected int _pos;
  // static constructor (one time only)
  static htmltotext()
  {
    _tags = new Dictionary<string, string>
    {
      { "address", "\n" },
      { "blockquote", "\n" },
      { "div", "\n" },
      { "dl", "\n" },
      { "fieldset", "\n" },
      { "form", "\n" },
      { "h1", "\n" },
      { "/h1", "\n" },
      { "h2", "\n" },
      { "/h2", "\n" },
      { "h3", "\n" },
      { "/h3", "\n" },
      { "h4", "\n" },
      { "/h4", "\n" },
      { "h5", "\n" },
      { "/h5", "\n" },
      { "h6", "\n" },
      { "/h6", "\n" },
      { "p", "\n" },
      { "/p", "\n" },
      { "table", "\n" },
      { "/table", "\n" },
      { "ul", "\n" },
      { "/ul", "\n" },
      { "ol", "\n" },
      { "/ol", "\n" },
      { "/li", "\n" },
      { "br", "\n" },
      { "/td", "\t" },
      { "/tr", "\n" },
      { "/pre", "\n" }
    };
    _ignoretags = new HashSet<string> { "script", "noscript", "style", "object" };
    _rawtexttags = new HashSet<string> { "script", "style" };
  }
  /// <summary>
  /// Converts the given HTML to plain text and returns the result.
  /// </summary>
  /// <param name="html">HTML to be converted; <c>null</c> is treated as
  /// an empty document</param>
  /// <returns>Resulting plain text with HTML entities decoded</returns>
  public string convert(string html)
  {
    // initialize state variables
    _text = new textbuilder();
    _html = html ?? string.Empty;
    _pos = 0;
    // process input
    while (!endoftext)
    {
      char current = peek();
      if (current == '<')
      {
        // html tag
        bool selfclosing;
        string tag = parsetag(out selfclosing);
        // handle special tag cases
        if (tag == "body")
        {
          // discard content before <body>
          _text.clear();
        }
        else if (tag == "/body")
        {
          // discard content after </body>
          _pos = _html.Length;
        }
        else if (tag == "pre")
        {
          // enter preformatted mode
          _text.preformatted = true;
          eatwhitespacetonextline();
        }
        else if (tag == "/pre")
        {
          // exit preformatted mode
          _text.preformatted = false;
        }
        string value;
        if (_tags.TryGetValue(tag, out value))
          _text.write(value);
        // a self-closing element has no content and no end tag to look for
        if (!selfclosing && _ignoretags.Contains(tag))
          eatinnercontent(tag);
      }
      else if (char.IsWhiteSpace(current))
      {
        // whitespace (treat all as space unless preformatted)
        _text.write(_text.preformatted ? current : ' ');
        moveahead();
      }
      else
      {
        // other text
        _text.write(current);
        moveahead();
      }
    }
    // return result; entities are decoded last so that an encoded "&lt;"
    // is never mistaken for the start of a tag
    return HttpUtility.HtmlDecode(_text.ToString());
  }
  // eats all characters that are part of the current tag (or comment)
  // and returns the lower-cased tag name; closing tags keep their
  // leading '/', comments are reported as "!--".
  // selfclosing is set when '/' is the last non-blank character before '>'
  protected string parsetag(out bool selfclosing)
  {
    string tag = string.Empty;
    selfclosing = false;
    if (peek() != '<')
      return tag;
    // comments may contain quotes and '>' characters, so they must not be
    // scanned like a tag: skip straight to the terminating "-->"
    if (string.CompareOrdinal(_html, _pos, "<!--", 0, 4) == 0)
    {
      // start at _pos + 2 so the degenerate forms "<!-->" and "<!--->"
      // are recognised as well
      int end = _html.IndexOf("-->", _pos + 2, StringComparison.Ordinal);
      _pos = (end < 0) ? _html.Length : end + 3;
      return "!--";
    }
    moveahead();
    // parse tag name
    eatwhitespace();
    int start = _pos;
    if (peek() == '/')
      moveahead();
    while (!endoftext && !char.IsWhiteSpace(peek()) &&
      peek() != '/' && peek() != '>')
      moveahead();
    // invariant casing: culture-sensitive lowering breaks tag lookup under
    // e.g. Turkish, where "DIV" would not become "div"
    tag = _html.Substring(start, _pos - start).ToLowerInvariant();
    // parse rest of tag
    while (!endoftext && peek() != '>')
    {
      char c = peek();
      if (c == '"' || c == '\'')
      {
        eatquotedvalue();
        selfclosing = false;
      }
      else
      {
        // only a '/' directly in front of '>' marks a self-closing tag;
        // any later attribute text cancels an earlier '/'
        if (c == '/')
          selfclosing = true;
        else if (!char.IsWhiteSpace(c))
          selfclosing = false;
        moveahead();
      }
    }
    moveahead();
    return tag;
  }
  // consumes inner content from the current tag up to and including the
  // matching end tag (or to the end of the input if there is none)
  protected void eatinnercontent(string tag)
  {
    string endtag = "/" + tag;
    bool selfclosing;
    if (_rawtexttags.Contains(tag))
    {
      // script/style content is raw text, not markup: look for the literal
      // end tag instead of parsing things like "a<b" as tags
      string marker = "<" + endtag;
      int search = _pos;
      while (true)
      {
        int close = _html.IndexOf(marker, search, StringComparison.OrdinalIgnoreCase);
        if (close < 0)
        {
          _pos = _html.Length;
          return;
        }
        int after = close + marker.Length;
        // make sure the name really ends here ("</scripts" is no match)
        if (after >= _html.Length || _html[after] == '>' ||
          _html[after] == '/' || char.IsWhiteSpace(_html[after]))
        {
          _pos = close;
          parsetag(out selfclosing);
          return;
        }
        search = after;
      }
    }
    // count nested elements of the same name so that the end tag which
    // closes the outermost one terminates the scan
    int depth = 1;
    while (!endoftext)
    {
      if (peek() == '<')
      {
        // consume a tag
        string inner = parsetag(out selfclosing);
        if (inner == endtag)
        {
          if (--depth == 0)
            return;
        }
        else if (inner == tag && !selfclosing)
        {
          depth++;
        }
      }
      else moveahead();
    }
  }
  // returns true if the current position is at the end of
  // the string
  protected bool endoftext
  {
    get { return (_pos >= _html.Length); }
  }
  // safely returns the character at the current position
  // ('\0' once the end of the input has been reached)
  protected char peek()
  {
    return (_pos < _html.Length) ? _html[_pos] : (char)0;
  }
  // safely advances the current position to the next character
  protected void moveahead()
  {
    _pos = Math.Min(_pos + 1, _html.Length);
  }
  // moves the current position to the next non-whitespace
  // character.
  protected void eatwhitespace()
  {
    while (char.IsWhiteSpace(peek()))
      moveahead();
  }
  // moves the current position to the next non-whitespace
  // character or the start of the next line, whichever
  // comes first
  protected void eatwhitespacetonextline()
  {
    while (char.IsWhiteSpace(peek()))
    {
      char c = peek();
      moveahead();
      if (c == '\n')
        break;
    }
  }
  // moves the current position past a quoted value; the value also ends
  // at a line break, which limits the damage done by an unclosed quote
  protected void eatquotedvalue()
  {
    char quote = peek();
    if (quote == '"' || quote == '\'')
    {
      // opening quote
      moveahead();
      // find end of value
      _pos = _html.IndexOfAny(new char[] { quote, '\r', '\n' }, _pos);
      if (_pos < 0)
        _pos = _html.Length;
      else
        moveahead();  // closing quote
    }
  }
  /// <summary>
  /// A StringBuilder wrapper that helps eliminate excess whitespace.
  /// </summary>
  protected class textbuilder
  {
    private StringBuilder _text;      // completed output lines
    private StringBuilder _currline;  // line currently being assembled
    private int _emptylines;          // consecutive empty lines seen so far
    private bool _preformatted;
    // construction
    public textbuilder()
    {
      _text = new StringBuilder();
      _currline = new StringBuilder();
      _emptylines = 0;
      _preformatted = false;
    }
    /// <summary>
    /// Normally, extra whitespace characters are discarded.
    /// If this property is set to true, they are passed
    /// through unchanged.
    /// </summary>
    public bool preformatted
    {
      get
      {
        return _preformatted;
      }
      set
      {
        if (value)
        {
          // clear line buffer if changing to
          // preformatted mode
          if (_currline.Length > 0)
            flushcurrline();
          _emptylines = 0;
        }
        _preformatted = value;
      }
    }
    /// <summary>
    /// Clears all current text.
    /// </summary>
    public void clear()
    {
      _text.Length = 0;
      _currline.Length = 0;
      _emptylines = 0;
    }
    /// <summary>
    /// Writes the given string to the output buffer.
    /// </summary>
    /// <param name="s">Text to write</param>
    public void write(string s)
    {
      foreach (char c in s)
        write(c);
    }
    /// <summary>
    /// Writes the given character to the output buffer.
    /// </summary>
    /// <param name="c">Character to write</param>
    public void write(char c)
    {
      if (_preformatted)
      {
        // write preformatted character
        _text.Append(c);
      }
      else
      {
        if (c == '\r')
        {
          // ignore carriage returns. we'll process
          // '\n' if it comes next
        }
        else if (c == '\n')
        {
          // flush current line
          flushcurrline();
        }
        else if (char.IsWhiteSpace(c))
        {
          // write single space character
          int len = _currline.Length;
          if (len == 0 || !char.IsWhiteSpace(_currline[len - 1]))
            _currline.Append(' ');
        }
        else
        {
          // add character to current line
          _currline.Append(c);
        }
      }
    }
    // appends the current line to output buffer; at most one empty line
    // in a row is kept, and none at the very start of the output
    protected void flushcurrline()
    {
      // get current line without leading/trailing whitespace
      string line = _currline.ToString().Trim();
      if (line.Length == 0)
      {
        // an empty line
        _emptylines++;
        if (_emptylines < 2 && _text.Length > 0)
          _text.AppendLine(line);
      }
      else
      {
        // a non-empty line
        _emptylines = 0;
        _text.AppendLine(line);
      }
      // reset current line
      _currline.Length = 0;
    }
    /// <summary>
    /// Returns the current output as a string, flushing any
    /// pending line first.
    /// </summary>
    public override string ToString()
    {
      if (_currline.Length > 0)
        flushcurrline();
      return _text.ToString();
    }
  }
}

希望本文所述对大家的C#程序设计有所帮助。