using System;
using System.Collections.Generic;
using System.Text;
using System.Web;

namespace MsgViewer.Helpers
{
    /// <summary>
    /// Converts HTML to plain text.
    /// </summary>
    /// <remarks>
    /// An instance keeps its parsing state in fields while <see cref="Convert"/> runs,
    /// so a single instance must not be used for overlapping conversions.
    /// </remarks>
    public class HtmlToText
    {
        #region Internal class TextBuilder
        /// <summary>
        /// A StringBuilder wrapper that helps eliminate excess whitespace.
        /// </summary>
        internal class TextBuilder
        {
            // Characters of the line currently being assembled (non-preformatted mode only)
            private readonly StringBuilder _currLine;

            // Everything that has already been committed to the output
            private readonly StringBuilder _text;

            // Number of consecutive empty lines seen so far
            private int _emptyLines;

            private bool _preformatted;

            /// <summary>
            /// Creates an empty builder in normal (whitespace collapsing) mode.
            /// </summary>
            public TextBuilder()
            {
                _text = new StringBuilder();
                _currLine = new StringBuilder();
                _emptyLines = 0;
                _preformatted = false;
            }

            /// <summary>
            /// Normally, extra whitespace characters are discarded.
            /// If this property is set to true, they are passed
            /// through unchanged.
            /// </summary>
            public bool Preformatted
            {
                get { return _preformatted; }
                set
                {
                    if (value)
                    {
                        // Commit the pending line before switching, because
                        // preformatted characters bypass the line buffer
                        if (_currLine.Length > 0)
                            FlushCurrLine();

                        _emptyLines = 0;
                    }

                    _preformatted = value;
                }
            }

            /// <summary>
            /// Clears all current text.
            /// </summary>
            public void Clear()
            {
                _text.Length = 0;
                _currLine.Length = 0;
                _emptyLines = 0;
            }

            /// <summary>
            /// Writes the given string to the output buffer, one character at a time.
            /// </summary>
            /// <param name="s">Text to write; null is treated as an empty string</param>
            public void Write(string s)
            {
                if (s == null)
                    return;

                foreach (var c in s)
                    Write(c);
            }

            /// <summary>
            /// Writes the given character to the output buffer.
            /// </summary>
            /// <param name="c">Character to write</param>
            public void Write(char c)
            {
                if (_preformatted)
                {
                    // Preformatted characters go straight to the output
                    _text.Append(c);
                    return;
                }

                switch (c)
                {
                    case '\r':
                        // Carriage returns are dropped; '\n' alone ends a line
                        break;

                    case '\n':
                        FlushCurrLine();
                        break;

                    default:
                        if (Char.IsWhiteSpace(c))
                        {
                            // Collapse any run of whitespace into a single space
                            var len = _currLine.Length;
                            if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                                _currLine.Append(' ');
                        }
                        else
                        {
                            _currLine.Append(c);
                        }
                        break;
                }
            }

            /// <summary>
            /// Appends the current line to the output buffer and resets the line buffer.
            /// Consecutive empty lines are reduced to one, and empty lines at the very
            /// start of the output are dropped.
            /// </summary>
            protected void FlushCurrLine()
            {
                // After Trim() the line is empty exactly when it held only whitespace
                var line = _currLine.ToString().Trim();

                if (line.Length == 0)
                {
                    _emptyLines++;
                    if (_emptyLines < 2 && _text.Length > 0)
                        _text.AppendLine(line);
                }
                else
                {
                    _emptyLines = 0;
                    _text.AppendLine(line);
                }

                _currLine.Length = 0;
            }

            /// <summary>
            /// Returns the current output as a string, flushing any pending line first.
            /// </summary>
            public override string ToString()
            {
                if (_currLine.Length > 0)
                    FlushCurrLine();

                return _text.ToString();
            }
        }
        #endregion

        #region Fields
        // Text written to the output when the given (lower-case) tag is encountered.
        // Built once: the table is never modified after initialization.
        private static readonly Dictionary<string, string> _tags = new Dictionary<string, string>
        {
            {"address", "\n"},
            {"blockquote", "\n"},
            {"div", "\n"},
            {"dl", "\n"},
            {"fieldset", "\n"},
            {"form", "\n"},
            {"h1", "\n"},
            {"/h1", "\n"},
            {"h2", "\n"},
            {"/h2", "\n"},
            {"h3", "\n"},
            {"/h3", "\n"},
            {"h4", "\n"},
            {"/h4", "\n"},
            {"h5", "\n"},
            {"/h5", "\n"},
            {"h6", "\n"},
            {"/h6", "\n"},
            {"p", "\n"},
            {"/p", "\n"},
            {"table", "\n"},
            {"/table", "\n"},
            {"ul", "\n"},
            {"/ul", "\n"},
            {"ol", "\n"},
            {"/ol", "\n"},
            {"/li", "\n"},
            {"br", "\n"},
            {"/td", "\t"},
            {"/tr", "\n"},
            {"/pre", "\n"}
        };

        // Elements whose entire content is discarded
        private static readonly HashSet<string> _ignoreTags =
            new HashSet<string> {"script", "noscript", "style", "object"};

        // Ignored elements whose content is not markup: it is skipped by searching
        // for the literal end tag instead of being tokenised as tags
        private static readonly HashSet<string> _rawTextTags =
            new HashSet<string> {"script", "style"};

        private string _html = String.Empty;
        private int _pos;
        private TextBuilder _text;
        #endregion

        #region Constructor
        /// <summary>
        /// Creates a new converter.
        /// </summary>
        public HtmlToText()
        {
        }
        #endregion

        #region EndOfText
        /// <summary>
        /// Returns true when we are at the end of the text
        /// </summary>
        protected bool EndOfText
        {
            get { return _pos >= _html.Length; }
        }
        #endregion

        #region Convert
        /// <summary>
        /// Converts the given HTML to plain text and returns the result.
        /// </summary>
        /// <param name="html">HTML to be converted; null is treated as an empty string</param>
        /// <returns>Resulting plain text, with HTML entities decoded</returns>
        public string Convert(string html)
        {
            // Initialize state variables
            _text = new TextBuilder();
            _html = html ?? String.Empty;
            _pos = 0;

            // Process input
            while (!EndOfText)
            {
                var current = Peek();

                if (current == '<')
                {
                    // HTML tag
                    bool selfClosing;
                    var tag = ParseTag(out selfClosing);

                    // Handle special tag cases
                    switch (tag)
                    {
                        case "body":
                            // Discard content before <body>
                            _text.Clear();
                            break;

                        case "/body":
                            // Discard content after </body>
                            _pos = _html.Length;
                            break;

                        case "pre":
                            // Enter preformatted mode
                            _text.Preformatted = true;
                            EatWhitespaceToNextLine();
                            break;

                        case "/pre":
                            // Exit preformatted mode
                            _text.Preformatted = false;
                            break;
                    }

                    string value;
                    if (_tags.TryGetValue(tag, out value))
                        _text.Write(value);

                    if (_ignoreTags.Contains(tag))
                        EatInnerContent(tag);
                }
                else if (Char.IsWhiteSpace(current))
                {
                    // Whitespace (treat all as space unless preformatted)
                    _text.Write(_text.Preformatted ? current : ' ');
                    MoveAhead();
                }
                else
                {
                    // Other text
                    _text.Write(current);
                    MoveAhead();
                }
            }

            // Entities are decoded last so that an encoded '<' is never mistaken for a tag
            return HttpUtility.HtmlDecode(_text.ToString());
        }
        #endregion

        #region ParseTag
        /// <summary>
        /// Eats all characters that are part of the current tag and returns information about that tag
        /// </summary>
        /// <param name="selfClosing">
        /// Set to true when the last non-whitespace character before the closing '&gt;' is '/'
        /// </param>
        /// <returns>
        /// The lower-case tag name (end tags keep their leading '/'), "!--" for an HTML
        /// comment, or an empty string when the current character is not '&lt;'
        /// </returns>
        private string ParseTag(out bool selfClosing)
        {
            selfClosing = false;

            if (Peek() != '<')
                return String.Empty;

            MoveAhead();

            // A comment ends at "-->", not at the first '>'. The search starts at the
            // first dash so the short forms "<!-->" and "<!--->" are closed as well.
            if (String.CompareOrdinal(_html, _pos, "!--", 0, 3) == 0)
            {
                var commentEnd = _html.IndexOf("-->", _pos + 1, StringComparison.Ordinal);
                _pos = commentEnd < 0 ? _html.Length : commentEnd + 3;
                return "!--";
            }

            // Parse tag name
            EatWhitespace();
            var start = _pos;

            if (Peek() == '/')
                MoveAhead();

            while (!EndOfText && !Char.IsWhiteSpace(Peek()) && Peek() != '/' && Peek() != '>')
                MoveAhead();

            // Invariant casing: tag lookups must not depend on the current culture
            var tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                var c = Peek();

                if (c == '"' || c == '\'')
                {
                    EatQuotedValue();
                    selfClosing = false;
                }
                else
                {
                    // Only a '/' directly before '>' (ignoring whitespace) marks a
                    // self-closing tag; a '/' inside an unquoted value does not
                    if (c == '/')
                        selfClosing = true;
                    else if (!Char.IsWhiteSpace(c))
                        selfClosing = false;

                    MoveAhead();
                }
            }

            // Skip the closing '>'
            MoveAhead();
            return tag;
        }
        #endregion

        #region EatInnerContent
        /// <summary>
        /// Consumes inner content from the current tag, up to and including its end tag.
        /// When no matching end tag exists the rest of the text is consumed.
        /// </summary>
        /// <param name="tag">Lower-case name of the element that was just opened</param>
        private void EatInnerContent(string tag)
        {
            var endTag = "/" + tag;

            if (_rawTextTags.Contains(tag))
            {
                EatRawText(endTag);
                return;
            }

            // Count nested elements of the same name so that the end tag of an
            // inner element is not mistaken for the end of the outer one
            var depth = 1;

            while (!EndOfText)
            {
                if (Peek() != '<')
                {
                    MoveAhead();
                    continue;
                }

                bool selfClosing;
                var parsed = ParseTag(out selfClosing);

                if (parsed == endTag)
                {
                    depth--;
                    if (depth == 0)
                        return;
                }
                else if (parsed == tag && !selfClosing)
                {
                    depth++;
                }
            }
        }

        /// <summary>
        /// Skips content that is not markup (script or style) by searching for the end
        /// tag itself, so that characters such as '&lt;' in the content are not parsed as tags.
        /// </summary>
        /// <param name="endTag">End tag name including the leading '/', for example "/script"</param>
        private void EatRawText(string endTag)
        {
            var marker = "<" + endTag;
            var searchFrom = _pos;

            while (true)
            {
                var index = _html.IndexOf(marker, searchFrom, StringComparison.OrdinalIgnoreCase);
                if (index < 0)
                {
                    // No end tag: everything that is left belongs to the element
                    _pos = _html.Length;
                    return;
                }

                // Accept the match only when the tag name really ends here,
                // so "</scriptx" is not taken for "</script"
                var after = index + marker.Length;
                if (after >= _html.Length || _html[after] == '>' || _html[after] == '/' ||
                    Char.IsWhiteSpace(_html[after]))
                {
                    _pos = index;

                    bool ignored;
                    ParseTag(out ignored);
                    return;
                }

                searchFrom = after;
            }
        }
        #endregion

        #region Helpers
        /// <summary>
        /// Safely returns the character at the current position
        /// </summary>
        /// <returns>The current character, or (char) 0 at the end of the text</returns>
        private char Peek()
        {
            return (_pos < _html.Length) ? _html[_pos] : (char) 0;
        }

        /// <summary>
        /// Safely advances the current position to the next character
        /// </summary>
        private void MoveAhead()
        {
            _pos = Math.Min(_pos + 1, _html.Length);
        }

        /// <summary>
        /// Moves the current position to the next non-whitespace character
        /// </summary>
        private void EatWhitespace()
        {
            while (Char.IsWhiteSpace(Peek()))
                MoveAhead();
        }

        /// <summary>
        /// Moves the current position to the next non-whitespace
        /// character or the start of the next line, whichever
        /// comes first
        /// </summary>
        private void EatWhitespaceToNextLine()
        {
            while (Char.IsWhiteSpace(Peek()))
            {
                var c = Peek();
                MoveAhead();

                if (c == '\n')
                    break;
            }
        }

        /// <summary>
        /// Moves the current position past a quoted value. A line break also ends the
        /// value, which limits the damage done by a missing closing quote.
        /// </summary>
        private void EatQuotedValue()
        {
            var quote = Peek();
            if (quote != '"' && quote != '\'')
                return;

            // Opening quote
            MoveAhead();

            // Find end of value
            _pos = _html.IndexOfAny(new[] {quote, '\r', '\n'}, _pos);

            if (_pos < 0)
                _pos = _html.Length;
            else
                MoveAhead(); // Closing quote
        }
        #endregion
    }
}