using System;
using System.Text.RegularExpressions;
namespace MsgReader.Mime.Decode
{
/// <summary>
/// Utility class for dealing with encoded-word strings (RFC 2047).
/// <para>
/// Encoded-word strings consist of ASCII characters only, but can carry text
/// from other character sets. This is done by specifying the character set,
/// an encoding that maps from ASCII to the correct bytes, and the actual
/// encoded text.
/// </para>
/// <para>
/// The format is best summarized by this BNF:
/// <c>"=?" character_set "?" encoding "?" encoded-text "?="</c>
/// </para>
/// </summary>
/// <example>
/// <c>=?ISO-8859-1?Q?=2D?=</c><br/>
/// Here ISO-8859-1 is the character set.<br/>
/// Q is the encoding method (quoted-printable). B is also supported (Base 64).<br/>
/// The encoded text is the <c>=2D</c> part, which is decoded to a hyphen (<c>-</c>).
/// </example>
internal static class EncodedWord
{
    #region Fields
    // Pattern that fits the encoded-word BNF.
    //
    // Notice that RFC 2231 redefines the BNF to
    //   encoded-word := "=?" charset ["*" language] "?" encoded-text "?="
    // but no usage of this BNF has been spotted yet. It is mentioned here to
    // ease debugging if such a case is discovered.
    //
    // The RFC says that NO WHITESPACE is allowed inside an encoded-word, but there
    // are examples in the wild where whitespace is present, and therefore the
    // content part of this pattern allows for it.
    //
    // \w            Matches any word character including underscore.
    // \S            Matches any non-whitespace character.
    // +? and *?     Non-greedy equivalents of + and *.
    // (?<NAME>REGEX) is a named group with name NAME and regular expression REGEX.
    private const string EncodedWordPattern =
        @"\=\?(?<Charset>\S+?)\?(?<Encoding>\w)\?(?<Content>.*?)\?\=";

    // Matches a single encoded-word and exposes its Charset, Encoding and Content groups.
    private static readonly Regex EncodedWordRegex = new Regex(EncodedWordPattern);

    // Matches an encoded-word followed by whitespace that is itself followed by another
    // encoded-word. The second word is only asserted through a lookahead and therefore not
    // consumed, so it can act as the "first" word of the next match. That lets a single
    // replace pass collapse a chain such as "=?UTF-8?Q?a?= =?UTF-8?Q?b?= =?UTF-8?Q?c?=".
    // (.NET allows the group names of the embedded pattern to occur more than once.)
    private static readonly Regex WhitespaceBetweenEncodedWordsRegex =
        new Regex("(?<first>" + EncodedWordPattern + @")\s+(?=" + EncodedWordPattern + ")");
    #endregion

    #region Decode
    /// <summary>
    /// Decodes text that is encoded with the encoded-word encoding.
    /// <para>
    /// This method decodes every encoded-word found in the string.
    /// All parts which are not encoded are left untouched.
    /// </para>
    /// <para>
    /// From RFC 2047:<br/>
    /// Generally, an "encoded-word" is a sequence of printable ASCII
    /// characters that begins with "=?", ends with "?=", and has two "?"s in
    /// between. It specifies a character set and an encoding method, and
    /// also includes the original text encoded as graphic ASCII characters,
    /// according to the rules for that encoding method.
    /// </para>
    /// </summary>
    /// <example>
    /// <c>=?ISO-8859-1?q?this=20is=20some=20text?= other text here</c>
    /// </example>
    /// <remarks>See RFC 2047 section 2 "Syntax of encoded-words" for more details.</remarks>
    /// <param name="encodedWords">Source text. May contain content which is not encoded.</param>
    /// <returns>Decoded text</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="encodedWords"/> is <see langword="null"/></exception>
    /// <exception cref="ArgumentException">If an encoded-word uses an encoding other than "B" or "Q"</exception>
    public static string Decode(string encodedWords)
    {
        if (encodedWords == null)
            throw new ArgumentNullException(nameof(encodedWords));

        // Any amount of linear-space-white between 'encoded-word's,
        // even if it includes a CRLF followed by one or more SPACEs,
        // is ignored for the purposes of display.
        // http://tools.ietf.org/html/rfc2047#page-12
        var unfolded = WhitespaceBetweenEncodedWordsRegex.Replace(encodedWords, "${first}");

        // Replace every encoded-word at the position where it was matched. Text that has
        // already been decoded is never scanned again, so a decoded payload which happens
        // to look like an encoded-word is not decoded a second time.
        return EncodedWordRegex.Replace(unfolded, DecodeSingleWord);
    }

    /// <summary>
    /// Decodes one matched encoded-word into its textual representation.
    /// </summary>
    /// <param name="match">A successful match of the encoded-word pattern</param>
    /// <returns>The decoded text of the encoded-word</returns>
    /// <exception cref="ArgumentException">If the encoding of the encoded-word is neither "B" nor "Q"</exception>
    private static string DecodeSingleWord(Match match)
    {
        var encodedText = match.Groups["Content"].Value;
        var encoding = match.Groups["Encoding"].Value;
        var charset = match.Groups["Charset"].Value;

        // Get the encoding which corresponds to the character set
        var charsetEncoding = EncodingFinder.FindEncoding(charset);

        // The encoding may also be written in lowercase
        switch (encoding.ToUpperInvariant())
        {
            // RFC:
            //  The "B" encoding is identical to the "BASE64"
            //  encoding defined by RFC 2045.
            //  http://tools.ietf.org/html/rfc2045#section-6.8
            case "B":
                return Base64.Decode(encodedText, charsetEncoding);

            // RFC:
            //  The "Q" encoding is similar to the "Quoted-Printable" content-
            //  transfer-encoding defined in RFC 2045.
            //  There are more details to this. Please check
            //  http://tools.ietf.org/html/rfc2047#section-4.2
            case "Q":
                return QuotedPrintable.DecodeEncodedWord(encodedText, charsetEncoding);

            default:
                throw new ArgumentException("The encoding " + encoding + " was not recognized");
        }
    }
    #endregion
}
}