You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
133 lines
6.2 KiB
133 lines
6.2 KiB
using System;
|
|
using System.Text.RegularExpressions;
|
|
|
|
namespace MsgReader.Mime.Decode
|
|
{
|
|
/// <summary>
|
|
/// Utility class for dealing with encoded word strings<br/>
|
|
/// <br/>
|
|
/// EncodedWord encoded strings are only in ASCII, but can embed information
|
|
/// about characters in other character sets.<br/>
|
|
/// <br/>
|
|
/// It is done by specifying the character set, an encoding that maps from ASCII to
|
|
/// the correct bytes and the actual encoded string.<br/>
|
|
/// <br/>
|
|
/// It is specified in a format that is best summarized by a BNF:<br/>
|
|
/// <c>"=?" character_set "?" encoding "?" encoded-text "?="</c><br/>
|
|
/// </summary>
|
|
/// <example>
|
|
/// <c>=?ISO-8859-1?Q?=2D?=</c>
|
|
/// Here <c>ISO-8859-1</c> is the character set.<br/>
|
|
/// <c>Q</c> is the encoding method (quoted-printable). <c>B</c> is also supported (Base 64).<br/>
|
|
/// The encoded text is the <c>=2D</c> part which is decoded to a space.
|
|
/// </example>
|
|
internal static class EncodedWord
|
|
{
|
|
#region Decode
|
|
/// <summary>
|
|
/// Decode text that is encoded with the <see cref="EncodedWord"/> encoding.<br/>
|
|
///<br/>
|
|
/// This method will decode any encoded-word found in the string.<br/>
|
|
/// All parts which is not encoded will not be touched.<br/>
|
|
/// <br/>
|
|
/// From <a href="http://tools.ietf.org/html/rfc2047">RFC 2047</a>:<br/>
|
|
/// <code>
|
|
/// Generally, an "encoded-word" is a sequence of printable ASCII
|
|
/// characters that begins with "=?", ends with "?=", and has two "?"s in
|
|
/// between. It specifies a character set and an encoding method, and
|
|
/// also includes the original text encoded as graphic ASCII characters,
|
|
/// according to the rules for that encoding method.
|
|
/// </code>
|
|
/// Example:<br/>
|
|
/// <c>=?ISO-8859-1?q?this=20is=20some=20text?= other text here</c>
|
|
/// </summary>
|
|
/// <remarks>See <a href="http://tools.ietf.org/html/rfc2047#section-2">RFC 2047 section 2</a> "Syntax of encoded-words" for more details</remarks>
|
|
/// <param name="encodedWords">Source text. May be content which is not encoded.</param>
|
|
/// <returns>Decoded text</returns>
|
|
/// <exception cref="ArgumentNullException">If <paramref name="encodedWords"/> is <see langword="null"/></exception>
|
|
public static string Decode(string encodedWords)
|
|
{
|
|
if (encodedWords == null)
|
|
throw new ArgumentNullException(nameof(encodedWords));
|
|
|
|
// Notice that RFC2231 redefines the BNF to
|
|
// encoded-word := "=?" charset ["*" language] "?" encoded-text "?="
|
|
// but no usage of this BNF have been spotted yet. It is here to
|
|
// ease debugging if such a case is discovered.
|
|
|
|
// This is the regex that should fit the BNF
|
|
// RFC Says that NO WHITESPACE is allowed in this encoding, but there are examples
|
|
// where whitespace is there, and therefore this regex allows for such.
|
|
const string encodedWordRegex = @"\=\?(?<Charset>\S+?)\?(?<Encoding>\w)\?(?<Content>.*?)\?\=";
|
|
// \w Matches any word character including underscore. Equivalent to "[A-Za-z0-9_]".
|
|
// \S Matches any nonwhite space character. Equivalent to "[^ \f\n\r\t\v]".
|
|
// +? non-greedy equivalent to +
|
|
// (?<NAME>REGEX) is a named group with name NAME and regular expression REGEX
|
|
|
|
// Any amount of linear-space-white between 'encoded-word's,
|
|
// even if it includes a CRLF followed by one or more SPACEs,
|
|
// is ignored for the purposes of display.
|
|
// http://tools.ietf.org/html/rfc2047#page-12
|
|
// Define a regular expression that captures two encoded words with some whitespace between them
|
|
const string replaceRegex = @"(?<first>" + encodedWordRegex + @")\s+(?<second>" + encodedWordRegex + ")";
|
|
|
|
// Then, find an occurrence of such an expression, but remove the whitespace in between when found
|
|
// Need to be done twice for encodings such as "=?UTF-8?Q?a?= =?UTF-8?Q?b?= =?UTF-8?Q?c?="
|
|
// to be replaced correctly
|
|
encodedWords = Regex.Replace(encodedWords, replaceRegex, "${first}${second}");
|
|
encodedWords = Regex.Replace(encodedWords, replaceRegex, "${first}${second}");
|
|
|
|
var decodedWords = encodedWords;
|
|
|
|
var matches = Regex.Matches(encodedWords, encodedWordRegex);
|
|
foreach (Match match in matches)
|
|
{
|
|
// If this match was not a success, we should not use it
|
|
if (!match.Success) continue;
|
|
|
|
var fullMatchValue = match.Value;
|
|
|
|
var encodedText = match.Groups["Content"].Value;
|
|
var encoding = match.Groups["Encoding"].Value;
|
|
var charset = match.Groups["Charset"].Value;
|
|
|
|
// Get the encoding which corresponds to the character set
|
|
var charsetEncoding = EncodingFinder.FindEncoding(charset);
|
|
|
|
// Store decoded text here when done
|
|
string decodedText;
|
|
|
|
// Encoding may also be written in lowercase
|
|
switch (encoding.ToUpperInvariant())
|
|
{
|
|
// RFC:
|
|
// The "B" encoding is identical to the "BASE64"
|
|
// encoding defined by RFC 2045.
|
|
// http://tools.ietf.org/html/rfc2045#section-6.8
|
|
case "B":
|
|
decodedText = Base64.Decode(encodedText, charsetEncoding);
|
|
break;
|
|
|
|
// RFC:
|
|
// The "Q" encoding is similar to the "Quoted-Printable" content-
|
|
// transfer-encoding defined in RFC 2045.
|
|
// There are more details to this. Please check
|
|
// http://tools.ietf.org/html/rfc2047#section-4.2
|
|
//
|
|
case "Q":
|
|
decodedText = QuotedPrintable.DecodeEncodedWord(encodedText, charsetEncoding);
|
|
break;
|
|
|
|
default:
|
|
throw new ArgumentException("The encoding " + encoding + " was not recognized");
|
|
}
|
|
|
|
// Replace our encoded value with our decoded value
|
|
decodedWords = decodedWords.Replace(fullMatchValue, decodedText);
|
|
}
|
|
|
|
return decodedWords;
|
|
}
|
|
#endregion
|
|
}
|
|
} |