using System;
using System.Text.RegularExpressions;

namespace MsgReader.Mime.Decode
{
    /// <summary>
    /// Utility class for dealing with encoded word strings.<br/>
    /// <br/>
    /// EncodedWord encoded strings are only in ASCII, but can embed information
    /// about characters in other character sets.<br/>
    /// <br/>
    /// It is done by specifying the character set, an encoding that maps from ASCII to
    /// the correct bytes and the actual encoded string.<br/>
    /// <br/>
    /// It is specified in a format that is best summarized by a BNF:<br/>
    /// <c>"=?" character_set "?" encoding "?" encoded-text "?="</c><br/>
    /// </summary>
    /// <example>
    /// <c>=?ISO-8859-1?Q?=2D?=</c><br/>
    /// Here <c>ISO-8859-1</c> is the character set.<br/>
    /// <c>Q</c> is the encoding method (quoted-printable). <c>B</c> is also supported (Base 64).<br/>
    /// The encoded text is the <c>=2D</c> part, which is decoded to a hyphen ("-").
    /// </example>
    internal static class EncodedWord
    {
        #region Patterns
        // Notice that RFC2231 redefines the BNF to
        //   encoded-word := "=?" charset ["*" language] "?" encoded-text "?="
        // but no usage of this BNF have been spotted yet. It is here to
        // ease debugging if such a case is discovered.
        //
        // This is the regex that should fit the BNF.
        // The RFC says that NO WHITESPACE is allowed in this encoding, but there are examples
        // where whitespace is there, and therefore this regex allows for such.
        //
        //   \w            matches any word character including underscore
        //   \S            matches any non-whitespace character
        //   +? and *?     are the non-greedy equivalents of + and *
        //   (?<NAME>REGEX) is a named group with name NAME and regular expression REGEX
        private const string EncodedWordPattern =
            @"\=\?(?<Charset>\S+?)\?(?<Encoding>\w)\?(?<Content>.*?)\?\=";

        // Any amount of linear-space-white between 'encoded-word's,
        // even if it includes a CRLF followed by one or more SPACEs,
        // is ignored for the purposes of display.
        // http://tools.ietf.org/html/rfc2047#page-12
        //
        // This pattern captures two encoded words with some whitespace between them,
        // so that the whitespace can be dropped by substituting "${first}${second}".
        private const string AdjacentEncodedWordsPattern =
            @"(?<first>" + EncodedWordPattern + @")\s+(?<second>" + EncodedWordPattern + ")";

        /// <summary>Matches a single encoded-word.</summary>
        private static readonly Regex EncodedWordRegex = new Regex(EncodedWordPattern);

        /// <summary>Matches two encoded-words separated only by whitespace.</summary>
        private static readonly Regex AdjacentEncodedWordsRegex = new Regex(AdjacentEncodedWordsPattern);
        #endregion

        #region Decode
        /// <summary>
        /// Decode text that is encoded with the <see cref="EncodedWord"/> encoding.<br/>
        /// <br/>
        /// This method will decode any encoded-word found in the string.<br/>
        /// All parts which are not encoded will not be touched.<br/>
        /// <br/>
        /// From <a href="http://tools.ietf.org/html/rfc2047">RFC 2047</a>:<br/>
        /// <code>
        /// Generally, an "encoded-word" is a sequence of printable ASCII
        /// characters that begins with "=?", ends with "?=", and has two "?"s in
        /// between. It specifies a character set and an encoding method, and
        /// also includes the original text encoded as graphic ASCII characters,
        /// according to the rules for that encoding method.
        /// </code>
        /// Example:<br/>
        /// <c>=?ISO-8859-1?q?this=20is=20some=20text?= other text here</c>
        /// </summary>
        /// <remarks>See RFC 2047 section 2 "Syntax of encoded-words" for more details</remarks>
        /// <param name="encodedWords">Source text. May be content which is not encoded.</param>
        /// <returns>Decoded text</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="encodedWords"/> is <see langword="null"/></exception>
        /// <exception cref="ArgumentException">If an encoded-word uses an encoding other than "B" or "Q"</exception>
        public static string Decode(string encodedWords)
        {
            if (encodedWords == null)
                throw new ArgumentNullException(nameof(encodedWords));

            // Remove the whitespace between neighbouring encoded words.
            // Needs to be done twice for input such as "=?UTF-8?Q?a?= =?UTF-8?Q?b?= =?UTF-8?Q?c?="
            // to be joined correctly, since matches within one pass cannot overlap.
            encodedWords = AdjacentEncodedWordsRegex.Replace(encodedWords, "${first}${second}");
            encodedWords = AdjacentEncodedWordsRegex.Replace(encodedWords, "${first}${second}");

            // Decode every encoded word in a single pass. Each match is substituted exactly
            // where it was found, so already decoded output is never scanned or replaced again
            // (which could happen when decoded text itself looks like an encoded word).
            return EncodedWordRegex.Replace(encodedWords, match => DecodeMatch(match) ?? string.Empty);
        }

        /// <summary>
        /// Decodes one encoded-word match into plain text.
        /// </summary>
        /// <param name="match">A successful match of the encoded-word pattern</param>
        /// <returns>The decoded text of the encoded-word</returns>
        /// <exception cref="ArgumentException">If the encoding is neither "B" nor "Q"</exception>
        private static string DecodeMatch(Match match)
        {
            var encodedText = match.Groups["Content"].Value;
            var encoding = match.Groups["Encoding"].Value;
            var charset = match.Groups["Charset"].Value;

            // Get the encoding which corresponds to the character set
            var charsetEncoding = EncodingFinder.FindEncoding(charset);

            // Encoding may also be written in lowercase
            switch (encoding.ToUpperInvariant())
            {
                // RFC:
                // The "B" encoding is identical to the "BASE64"
                // encoding defined by RFC 2045.
                // http://tools.ietf.org/html/rfc2045#section-6.8
                case "B":
                    return Base64.Decode(encodedText, charsetEncoding);

                // RFC:
                // The "Q" encoding is similar to the "Quoted-Printable" content-
                // transfer-encoding defined in RFC 2045.
                // There are more details to this. Please check
                // http://tools.ietf.org/html/rfc2047#section-4.2
                case "Q":
                    return QuotedPrintable.DecodeEncodedWord(encodedText, charsetEncoding);

                default:
                    throw new ArgumentException("The encoding " + encoding + " was not recognized");
            }
        }
        #endregion
    }
}