using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

namespace MsgReader.Mime.Decode
{
    /// <summary>
    /// Used for decoding Quoted-Printable text.<br/>
    /// This is a robust implementation of a Quoted-Printable decoder defined in RFC 2045 and RFC 2047.<br/>
    /// Every measure has been taken to conform to the RFC.
    /// </summary>
    internal static class QuotedPrintable
    {
        #region Fields
        /// <summary>
        /// Matches every control character that is removed before decoding: 00-1F and 7F hexadecimal,
        /// except \x09 (\t), \x0A (\n) and \x0D (\r).
        /// </summary>
        private static readonly Regex IllegalControlCharacters =
            new Regex("[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]");
        #endregion

        #region DecodeEncodedWord
        /// <summary>
        /// Decodes a Quoted-Printable string according to RFC 2047.<br/>
        /// RFC 2047 is used for decoding Encoded-Word encoded strings.
        /// </summary>
        /// <param name="toDecode">Quoted-Printable encoded string</param>
        /// <param name="encoding">Specifies which encoding the returned string will be in</param>
        /// <returns>A decoded string in the correct encoding</returns>
        /// <exception cref="ArgumentNullException">
        /// If <paramref name="toDecode"/> or <paramref name="encoding"/> is <see langword="null"/>
        /// </exception>
        public static string DecodeEncodedWord(string toDecode, Encoding encoding)
        {
            if (toDecode == null)
                throw new ArgumentNullException(nameof(toDecode));

            if (encoding == null)
                throw new ArgumentNullException(nameof(encoding));

            // Decode the Quoted-Printable string to bytes and interpret them in the requested encoding
            return encoding.GetString(Rfc2047QuotedPrintableDecode(toDecode, true));
        }
        #endregion

        #region DecodeContentTransferEncoding
        /// <summary>
        /// Decodes a Quoted-Printable string according to RFC 2045.<br/>
        /// RFC 2045 specifies the decoding of a body encoded with Content-Transfer-Encoding of quoted-printable.
        /// </summary>
        /// <param name="toDecode">Quoted-Printable encoded string</param>
        /// <returns>A decoded byte array that the Quoted-Printable encoded string described</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="toDecode"/> is <see langword="null"/></exception>
        public static byte[] DecodeContentTransferEncoding(string toDecode)
        {
            if (toDecode == null)
                throw new ArgumentNullException(nameof(toDecode));

            // Decode the Quoted-Printable string and return it
            return Rfc2047QuotedPrintableDecode(toDecode, false);
        }
        #endregion

        #region Rfc2047QuotedPrintableDecode
        /// <summary>
        /// This is the actual decoder.
        /// </summary>
        /// <param name="toDecode">The string to be decoded from Quoted-Printable</param>
        /// <param name="encodedWordVariant">
        /// If <see langword="true"/>, specifies that RFC 2047 quoted printable decoding is used.<br/>
        /// This is for quoted-printable encoded words<br/>
        /// <br/>
        /// If <see langword="false"/>, specifies that RFC 2045 quoted printable decoding is used.<br/>
        /// This is for quoted-printable Content-Transfer-Encoding
        /// </param>
        /// <returns>A decoded byte array that was described by <paramref name="toDecode"/></returns>
        /// <exception cref="ArgumentNullException">If <paramref name="toDecode"/> is <see langword="null"/></exception>
        /// <remarks>See RFC 2047 section 4.2 for RFC details</remarks>
        private static byte[] Rfc2047QuotedPrintableDecode(string toDecode, bool encodedWordVariant)
        {
            if (toDecode == null)
                throw new ArgumentNullException(nameof(toDecode));

            // Create a byte array builder which is roughly equivalent to a StringBuilder
            using (var byteArrayBuilder = new MemoryStream())
            {
                // Remove illegal control characters
                toDecode = RemoveIllegalControlCharacters(toDecode);

                // Run through the whole string that needs to be decoded
                for (var i = 0; i < toDecode.Length; i++)
                {
                    var currentChar = toDecode[i];

                    if (currentChar == '=')
                    {
                        // Check that there are at least two characters behind the equal sign
                        if (toDecode.Length - i < 3)
                        {
                            // We are at the end of the toDecode string, but something is missing.
                            // Handle it the way RFC 2045 states
                            WriteAllBytesToStream(byteArrayBuilder,
                                DecodeEqualSignNotLongEnough(toDecode.Substring(i)));

                            // Since it was the last part, we should stop parsing
                            break;
                        }

                        // Decode the Quoted-Printable part
                        var quotedPrintablePart = toDecode.Substring(i, 3);
                        WriteAllBytesToStream(byteArrayBuilder, DecodeEqualSign(quotedPrintablePart));

                        // We now consumed two extra characters. Go forward two extra characters
                        i += 2;
                    }
                    else if (currentChar == '_' && encodedWordVariant)
                    {
                        // The RFC specifies that the "_" always represents hexadecimal 20 even if the
                        // SPACE character occupies a different code position in the character set in use.
                        byteArrayBuilder.WriteByte(0x20);
                    }
                    else
                    {
                        // This is not encoded at all. This is a literal which should just be included
                        // into the output.
                        // NOTE(review): for a character above 0xFF this cast keeps only the low 8 bits
                        // (assuming the project is not compiled with overflow checking - confirm).
                        byteArrayBuilder.WriteByte((byte) currentChar);
                    }
                }

                return byteArrayBuilder.ToArray();
            }
        }
        #endregion

        #region WriteAllBytesToStream
        /// <summary>
        /// Writes all bytes in a byte array to a stream
        /// </summary>
        /// <param name="stream">The stream to write to</param>
        /// <param name="toWrite">The bytes to write to the <paramref name="stream"/></param>
        private static void WriteAllBytesToStream(Stream stream, byte[] toWrite)
        {
            stream.Write(toWrite, 0, toWrite.Length);
        }
        #endregion

        #region RemoveIllegalControlCharacters
        /// <summary>
        /// RFC 2045 states about robustness:<br/>
        /// <code>
        /// Control characters other than TAB, or CR and LF as parts of CRLF pairs,
        /// must not appear. The same is true for octets with decimal values greater
        /// than 126. If found in incoming quoted-printable data by a decoder, a
        /// robust implementation might exclude them from the decoded data and warn
        /// the user that illegal characters were discovered.
        /// </code>
        /// Control characters are defined in RFC 2396 as<br/>
        /// <c>control = US-ASCII coded characters 00-1F and 7F hexadecimal</c>
        /// </summary>
        /// <param name="input">String to be stripped from illegal control characters</param>
        /// <returns>A string with no illegal control characters</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="input"/> is <see langword="null"/></exception>
        private static string RemoveIllegalControlCharacters(string input)
        {
            if (input == null)
                throw new ArgumentNullException(nameof(input));

            // First we remove any \r or \n which is not part of a \r\n pair
            // NOTE(review): because this happens before decoding, an input such as "=\nAB"
            // becomes "=AB" and is subsequently decoded as the hex escape 0xAB.
            input = RemoveCarriageReturnAndNewLinewIfNotInPair(input);

            // Here only legal \r\n is left over
            // We now simply keep them, and the \t which is also allowed
            // \x0A = \n
            // \x0D = \r
            // \x09 = \t
            return IllegalControlCharacters.Replace(input, "");
        }
        #endregion

        #region RemoveCarriageReturnAndNewLinewIfNotInPair
        /// <summary>
        /// This method will remove any \r and \n which is not paired as \r\n
        /// </summary>
        /// <param name="input">String to remove lonely \r and \n's from</param>
        /// <returns>A string without lonely \r and \n's</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="input"/> is <see langword="null"/></exception>
        private static string RemoveCarriageReturnAndNewLinewIfNotInPair(string input)
        {
            if (input == null)
                throw new ArgumentNullException(nameof(input));

            // Use this for building up the new string. This is used for performance instead
            // of altering the input string each time an illegal token is found
            var newString = new StringBuilder(input.Length);

            for (var i = 0; i < input.Length; i++)
            {
                var current = input[i];

                // There is a lonely \r if it is the last character in the input or if there
                // is no \n following it
                var lonelyCarriageReturn =
                    current == '\r' && (i + 1 >= input.Length || input[i + 1] != '\n');

                // There is a lonely \n if \n is the first character or if there
                // is no \r in front of it
                var lonelyNewLine =
                    current == '\n' && (i - 1 < 0 || input[i - 1] != '\r');

                // Illegal tokens are simply not added to the new string
                if (lonelyCarriageReturn || lonelyNewLine)
                    continue;

                newString.Append(current);
            }

            return newString.ToString();
        }
        #endregion

        #region DecodeEqualSignNotLongEnough
        /// <summary>
        /// RFC 2045 says that a robust implementation should handle:<br/>
        /// <code>
        /// An "=" cannot be the ultimate or penultimate character in an encoded
        /// object. This could be handled as in case (2) above.
        /// </code>
        /// Case (2) is:<br/>
        /// <code>
        /// An "=" followed by a character that is neither a
        /// hexadecimal digit (including "abcdef") nor the CR character of a CRLF pair
        /// is illegal. This case can be the result of US-ASCII text having been
        /// included in a quoted-printable part of a message without itself having
        /// been subjected to quoted-printable encoding. A reasonable approach by a
        /// robust implementation might be to include the "=" character and the
        /// following character in the decoded data without any transformation and, if
        /// possible, indicate to the user that proper decoding was not possible at
        /// this point in the data.
        /// </code>
        /// </summary>
        /// <param name="decode">
        /// The string to decode which cannot have length above or equal to 3
        /// and must start with an equal sign.
        /// </param>
        /// <returns>A decoded byte array</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="decode"/> is <see langword="null"/></exception>
        /// <exception cref="ArgumentException">
        /// Thrown if the <paramref name="decode"/> parameter is empty, has length above 2
        /// or does not start with an equal sign.
        /// </exception>
        private static byte[] DecodeEqualSignNotLongEnough(string decode)
        {
            if (decode == null)
                throw new ArgumentNullException(nameof(decode));

            // We can only decode wrong length equal signs
            if (decode.Length >= 3)
                throw new ArgumentException(@"decode must have length lower than 3", nameof(decode));

            if (decode.Length <= 0)
                throw new ArgumentException(@"decode must have length lower at least 1", nameof(decode));

            // First char must be =
            if (decode[0] != '=')
                throw new ArgumentException(@"First part of decode must be an equal sign", nameof(decode));

            // We will now believe that the string sent to us was actually not encoded
            // Therefore it must be in US-ASCII and we will return the bytes it corresponds to
            return Encoding.ASCII.GetBytes(decode);
        }
        #endregion

        #region DecodeEqualSign
        /// <summary>
        /// This helper method will decode a string of the form "=XX" where X is any character.<br/>
        /// This method will never fail, unless an argument of length not equal to three is passed
        /// or the argument does not start with an equal sign.
        /// </summary>
        /// <param name="decode">The length 3 character string that needs to be decoded</param>
        /// <returns>
        /// An empty array for a soft line break ("=\r\n"), a single byte when both characters after
        /// the equal sign are hexadecimal digits, otherwise the US-ASCII bytes of
        /// <paramref name="decode"/> itself.
        /// </returns>
        /// <exception cref="ArgumentNullException">If <paramref name="decode"/> is <see langword="null"/></exception>
        /// <exception cref="ArgumentException">
        /// Thrown if the <paramref name="decode"/> parameter does not have length 3 or does not
        /// start with an equal sign.
        /// </exception>
        private static byte[] DecodeEqualSign(string decode)
        {
            if (decode == null)
                throw new ArgumentNullException(nameof(decode));

            // We can only decode the string if it has length 3 - other calls to this function are invalid
            if (decode.Length != 3)
                throw new ArgumentException(@"decode must have length 3", nameof(decode));

            // First char must be =
            if (decode[0] != '=')
                throw new ArgumentException(@"decode must start with an equal sign", nameof(decode));

            // There are two cases where an equal sign might appear
            // It might be a
            //   - hex-string like =3D, denoting the character with hex value 3D
            //   - it might be the last character on the line before a CRLF
            //     pair, denoting a soft linebreak, which simply
            //     splits the text up, because of the 76 chars per line restriction
            if (decode[1] == '\r' && decode[2] == '\n')
            {
                // Soft break detected
                // We want to return string.Empty which is equivalent to a zero-length byte array
                return new byte[0];
            }

            // Hex string detected. Conversion needed.
            // Lowercase digits (like =3d) are accepted, which is a robustness requirement from RFC 2045.
            //
            // The digits are validated explicitly instead of relying on Convert.ToByte(string, 16):
            // that method accepts a leading '+' (so "=+5" would wrongly become byte 0x05) and throws
            // ArgumentException rather than FormatException on a leading '-' (so "=-1" would abort
            // the whole decode).
            var highNibble = HexDigitValue(decode[1]);
            var lowNibble = HexDigitValue(decode[2]);

            if (highNibble < 0 || lowNibble < 0)
            {
                // RFC 2045 says about robust implementation:
                // An "=" followed by a character that is neither a
                // hexadecimal digit (including "abcdef") nor the CR
                // character of a CRLF pair is illegal. This case can be
                // the result of US-ASCII text having been included in a
                // quoted-printable part of a message without itself
                // having been subjected to quoted-printable encoding. A
                // reasonable approach by a robust implementation might be
                // to include the "=" character and the following
                // character in the decoded data without any
                // transformation and, if possible, indicate to the user
                // that proper decoding was not possible at this point in
                // the data.

                // So we choose to believe this is actually an un-encoded string
                // Therefore it must be in US-ASCII and we will return the bytes it corresponds to
                return Encoding.ASCII.GetBytes(decode);
            }

            // Simply return our one byte byte array
            return new[] {(byte) ((highNibble << 4) | lowNibble)};
        }

        /// <summary>
        /// Returns the numeric value of a single hexadecimal digit.
        /// </summary>
        /// <param name="digit">The character to interpret</param>
        /// <returns>0-15 for 0-9, A-F or a-f; -1 for any other character</returns>
        private static int HexDigitValue(char digit)
        {
            if (digit >= '0' && digit <= '9')
                return digit - '0';

            if (digit >= 'A' && digit <= 'F')
                return digit - 'A' + 10;

            if (digit >= 'a' && digit <= 'f')
                return digit - 'a' + 10;

            return -1;
        }
        #endregion
    }
}