using System;
using System.Collections.Generic;
using System.Text;
using System.Web;
namespace MsgViewer.Helpers
{
/// <summary>
/// Converts HTML to plain text.
/// </summary>
public class HtmlToText
{
#region Internal class TextBuilder
/// <summary>
/// A StringBuilder-based writer that helps eliminate excess whitespace.
/// Outside preformatted mode, runs of whitespace collapse to a single space,
/// lines are trimmed, and consecutive blank lines are reduced to one.
/// </summary>
internal class TextBuilder
{
    // Characters collected for the line currently being assembled
    private readonly StringBuilder _pending;
    // Finished lines, plus any characters written while preformatted
    private readonly StringBuilder _output;
    // Number of blank lines flushed in a row since the last non-blank line
    private int _blankRun;
    // True while characters bypass whitespace handling
    private bool _raw;

    /// <summary>
    /// Creates an empty builder in normal (non-preformatted) mode.
    /// </summary>
    public TextBuilder()
    {
        _output = new StringBuilder();
        _pending = new StringBuilder();
        _blankRun = 0;
        _raw = false;
    }

    /// <summary>
    /// Normally, extra whitespace characters are discarded.
    /// If this property is set to true, they are passed
    /// through unchanged.
    /// </summary>
    public bool Preformatted
    {
        get { return _raw; }
        set
        {
            if (value)
            {
                // Entering preformatted mode: push out whatever is buffered
                // so it lands in the output before any raw characters
                if (_pending.Length > 0)
                {
                    FlushCurrLine();
                }

                _blankRun = 0;
            }

            _raw = value;
        }
    }

    /// <summary>
    /// Clears all current text.
    /// </summary>
    public void Clear()
    {
        _blankRun = 0;
        _pending.Length = 0;
        _output.Length = 0;
    }

    /// <summary>
    /// Writes the given string to the output buffer, one character at a time.
    /// </summary>
    /// <param name="s">Text to write</param>
    public void Write(string s)
    {
        for (var i = 0; i < s.Length; i++)
        {
            Write(s[i]);
        }
    }

    /// <summary>
    /// Writes the given character to the output buffer.
    /// </summary>
    /// <param name="c">Character to write</param>
    public void Write(char c)
    {
        // Preformatted characters go straight to the output
        if (_raw)
        {
            _output.Append(c);
            return;
        }

        // Carriage returns are dropped; a line feed ends the current line
        if (c == '\r')
        {
            return;
        }

        if (c == '\n')
        {
            FlushCurrLine();
            return;
        }

        // Ordinary characters are buffered as-is
        if (!Char.IsWhiteSpace(c))
        {
            _pending.Append(c);
            return;
        }

        // Any other whitespace becomes a single space, and only when the
        // buffered line does not already end in whitespace
        var count = _pending.Length;
        if (count == 0 || !Char.IsWhiteSpace(_pending[count - 1]))
        {
            _pending.Append(' ');
        }
    }

    /// <summary>
    /// Appends the current line to the output buffer and resets the line buffer.
    /// </summary>
    protected void FlushCurrLine()
    {
        // After trimming, the line is blank exactly when nothing is left
        var line = _pending.ToString().Trim();
        var isBlank = line.Length == 0;

        if (!isBlank)
        {
            _blankRun = 0;
            _output.AppendLine(line);
        }
        else
        {
            // Keep at most one blank line in a row, and none at the very start
            _blankRun++;
            if (_blankRun < 2 && _output.Length > 0)
            {
                _output.AppendLine(line);
            }
        }

        _pending.Length = 0;
    }

    /// <summary>
    /// Returns the current output as a string, flushing any buffered line first.
    /// </summary>
    public override string ToString()
    {
        if (_pending.Length > 0)
        {
            FlushCurrLine();
        }

        return _output.ToString();
    }
}
#endregion
#region Fields
// Text written to the output when a tag is encountered, keyed by lower-case
// tag name (closing tags carry a leading '/').
// The table is built once for the type instead of being reassigned by every
// instance constructor: it never changes, so rebuilding it was wasted work and
// swapped the shared table out from under any instance that was mid-conversion.
private static readonly Dictionary<string, string> _tags = new Dictionary<string, string>
{
    {"address", "\n"},
    {"blockquote", "\n"},
    {"div", "\n"},
    {"dl", "\n"},
    {"fieldset", "\n"},
    {"form", "\n"},
    {"h1", "\n"},
    {"/h1", "\n"},
    {"h2", "\n"},
    {"/h2", "\n"},
    {"h3", "\n"},
    {"/h3", "\n"},
    {"h4", "\n"},
    {"/h4", "\n"},
    {"h5", "\n"},
    {"/h5", "\n"},
    {"h6", "\n"},
    {"/h6", "\n"},
    {"p", "\n"},
    {"/p", "\n"},
    {"table", "\n"},
    {"/table", "\n"},
    {"ul", "\n"},
    {"/ul", "\n"},
    {"ol", "\n"},
    {"/ol", "\n"},
    {"/li", "\n"},
    {"br", "\n"},
    {"/td", "\t"},
    {"/tr", "\n"},
    {"/pre", "\n"}
};
// Tags whose inner content is skipped entirely during conversion
private static readonly HashSet<string> _ignoreTags = new HashSet<string>
{
    "script", "noscript", "style", "object"
};
// HTML currently being converted
private string _html;
// Current read position within _html
private int _pos;
// Accumulates the plain-text output
private TextBuilder _text;
#endregion
#region Constructor
/// <summary>
/// Creates a new converter. The tag lookup tables are static and initialized
/// once for the type, so no per-instance setup is required.
/// </summary>
public HtmlToText()
{
}
#endregion
#region EndOfText
/// <summary>
/// Returns true when the current position has reached the end of the text
/// </summary>
protected bool EndOfText
{
    get
    {
        // At the end as soon as the position is no longer inside the input
        return !(_pos < _html.Length);
    }
}
#endregion
#region Convert
/// <summary>
/// Converts the given HTML to plain text and returns the result.
/// </summary>
/// <param name="html">HTML to be converted; null is treated as empty input</param>
/// <returns>Resulting plain text, with HTML entities decoded</returns>
public string Convert(string html)
{
    // Initialize state variables. A null input is treated as empty; without
    // this the first EndOfText check fails with a NullReferenceException
    _text = new TextBuilder();
    _html = html ?? String.Empty;
    _pos = 0;

    // Process input
    while (!EndOfText)
    {
        var current = Peek();

        if (current == '<')
        {
            // HTML tag (selfClosing is required by ParseTag but not needed here)
            bool selfClosing;
            var tag = ParseTag(out selfClosing);

            // Handle special tag cases
            switch (tag)
            {
                case "body":
                    // Discard content before the body
                    _text.Clear();
                    break;

                case "/body":
                    // Discard content after the body by jumping to the end
                    _pos = _html.Length;
                    break;

                case "pre":
                    // Enter preformatted mode, skipping the remainder of the
                    // line the tag sits on
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                    break;

                case "/pre":
                    // Exit preformatted mode
                    _text.Preformatted = false;
                    break;
            }

            // Emit the text (line break or tab) associated with this tag, if any
            string value;
            if (_tags.TryGetValue(tag, out value))
                _text.Write(value);

            // Skip everything inside tags whose content is not text
            if (_ignoreTags.Contains(tag))
                EatInnerContent(tag);
        }
        else if (Char.IsWhiteSpace(current))
        {
            // Whitespace: kept verbatim when preformatted, otherwise
            // treated as a plain space
            _text.Write(_text.Preformatted ? current : ' ');
            MoveAhead();
        }
        else
        {
            // Other text
            _text.Write(current);
            MoveAhead();
        }
    }

    // Entities are decoded once, on the finished text
    return HttpUtility.HtmlDecode(_text.ToString());
}
#endregion
#region ParseTag
/// <summary>
/// Eats all characters that are part of the current tag and returns information about that tag
/// </summary>
/// <param name="selfClosing">
/// Set to true when a '/' is found after the tag name and outside a quoted
/// value, before the end of the tag
/// </param>
/// <returns>
/// The lower-cased tag name, including a leading '/' for a closing tag, or an
/// empty string when the current position is not at the start of a tag
/// </returns>
private string ParseTag(out bool selfClosing)
{
    var tag = String.Empty;
    selfClosing = false;

    if (Peek() != '<') return tag;
    MoveAhead();

    // Parse tag name; a leading '/' is kept as part of the name
    EatWhitespace();
    var start = _pos;
    if (Peek() == '/')
        MoveAhead();
    while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
           Peek() != '/' && Peek() != '>')
        MoveAhead();

    // Tag names are matched against lower-case ASCII keys, so the lowering
    // must not depend on the current culture (under tr-TR, for example,
    // "DIV".ToLower() yields "dıv" and would never match "div")
    tag = _html.Substring(start, _pos - start).ToLowerInvariant();

    // Parse rest of tag
    while (!EndOfText && Peek() != '>')
    {
        if (Peek() == '"' || Peek() == '\'')
            // Skip quoted attribute values so their contents are not scanned
            EatQuotedValue();
        else
        {
            if (Peek() == '/')
                selfClosing = true;
            MoveAhead();
        }
    }

    // Step past the closing '>' (no effect when the input ended first)
    MoveAhead();
    return tag;
}
#endregion
#region EatInnerContent
/// <summary>
/// Consumes inner content from the current tag, up to and including its
/// matching closing tag (or to the end of the text when there is none)
/// </summary>
/// <param name="tag">
/// Lower-case name of the element whose opening tag has just been consumed
/// </param>
private void EatInnerContent(string tag)
{
    var endTag = "/" + tag;

    // The content of script and style elements is raw text: an opening tag
    // of the same name appearing inside it does not start a nested element
    var rawText = tag == "script" || tag == "style";

    // Number of elements named `tag` that are currently open, counting the
    // one whose opening tag the caller already consumed. Only same-named
    // elements affect the count; the previous implementation recursed for
    // every nested tag, so each one demanded an extra closing tag and the
    // rest of the document could be swallowed.
    var depth = 1;

    while (!EndOfText)
    {
        if (Peek() != '<')
        {
            MoveAhead();
            continue;
        }

        // Consume a tag
        bool selfClosing;
        var name = ParseTag(out selfClosing);

        if (name == endTag)
        {
            depth--;
            if (depth == 0)
                return;
        }
        else if (!rawText && !selfClosing && name == tag)
        {
            // A nested element of the same name needs its own closing tag
            depth++;
        }
    }
}
#endregion
#region Helpers
/// <summary>
/// Safely returns the character at the current position
/// </summary>
/// <returns>The current character, or the NUL character once past the end of the input</returns>
private char Peek()
{
    if (_pos < _html.Length)
    {
        return _html[_pos];
    }

    return (char) 0;
}
/// <summary>
/// Safely advances the current position to the next character,
/// never moving beyond the end of the input
/// </summary>
private void MoveAhead()
{
    var next = _pos + 1;
    var limit = _html.Length;
    _pos = next < limit ? next : limit;
}
/// <summary>
/// Moves the current position to the next non-whitespace character
/// </summary>
private void EatWhitespace()
{
    for (var current = Peek(); Char.IsWhiteSpace(current); current = Peek())
    {
        MoveAhead();
    }
}
/// <summary>
/// Moves the current position to the next non-whitespace
/// character or the start of the next line, whichever
/// comes first
/// </summary>
private void EatWhitespaceToNextLine()
{
    while (true)
    {
        var current = Peek();
        if (!Char.IsWhiteSpace(current))
        {
            return;
        }

        MoveAhead();

        // A consumed line feed means the next line has been reached
        if (current == '\n')
        {
            return;
        }
    }
}
/// <summary>
/// Moves the current position past a quoted value. Does nothing when the
/// current character is not a single or double quote.
/// </summary>
private void EatQuotedValue()
{
    var quote = Peek();
    if (quote == '"' || quote == '\'')
    {
        // Step over the opening quote
        MoveAhead();

        // The value ends at the matching quote or at a line break,
        // whichever appears first
        var stop = _html.IndexOfAny(new[] {quote, '\r', '\n'}, _pos);

        // No terminator: consume the rest of the input; otherwise
        // continue just after the terminating character
        _pos = stop < 0 ? _html.Length : stop + 1;
    }
}
#endregion
}
}