You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

344 lines
16 KiB

using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace MsgReader.Mime.Decode
{
/// <summary>
/// Used for decoding Quoted-Printable text.<br/>
/// This is a robust implementation of a Quoted-Printable decoder defined in <a href="http://tools.ietf.org/html/rfc2045">RFC 2045</a> and <a href="http://tools.ietf.org/html/rfc2047">RFC 2047</a>.<br/>
/// Every measure has been taken to conform to the RFC.
/// </summary>
internal static class QuotedPrintable
{
    #region Fields
    /// <summary>
    /// Matches every control character that is removed from the input before decoding:
    /// 00-1F and 7F hexadecimal, except TAB (09), LF (0A) and CR (0D).
    /// Kept in a field so the expression is built once instead of on every call.
    /// </summary>
    private static readonly Regex IllegalControlCharacters = new Regex("[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]");
    #endregion

    #region DecodeEncodedWord
    /// <summary>
    /// Decodes a Quoted-Printable string according to <a href="http://tools.ietf.org/html/rfc2047">RFC 2047</a>.<br/>
    /// RFC 2047 is used for decoding Encoded-Word encoded strings, where "_" stands for a space.
    /// </summary>
    /// <param name="toDecode">Quoted-Printable encoded string</param>
    /// <param name="encoding">The encoding used to turn the decoded bytes into the returned string</param>
    /// <returns>The decoded bytes interpreted as text in <paramref name="encoding"/></returns>
    /// <exception cref="ArgumentNullException">If <paramref name="toDecode"/> or <paramref name="encoding"/> is <see langword="null"/></exception>
    public static string DecodeEncodedWord(string toDecode, Encoding encoding)
    {
        if (toDecode == null)
            throw new ArgumentNullException(nameof(toDecode));
        if (encoding == null)
            throw new ArgumentNullException(nameof(encoding));

        // Decode to raw bytes first, then let the requested encoding turn them into text
        return encoding.GetString(Rfc2047QuotedPrintableDecode(toDecode, true));
    }
    #endregion

    #region DecodeContentTransferEncoding
    /// <summary>
    /// Decodes a Quoted-Printable string according to <a href="http://tools.ietf.org/html/rfc2045">RFC 2045</a>.<br/>
    /// RFC 2045 specifies the decoding of a body encoded with Content-Transfer-Encoding of quoted-printable.
    /// </summary>
    /// <param name="toDecode">Quoted-Printable encoded string</param>
    /// <returns>The byte array that the Quoted-Printable encoded string described</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="toDecode"/> is <see langword="null"/></exception>
    public static byte[] DecodeContentTransferEncoding(string toDecode)
    {
        if (toDecode == null)
            throw new ArgumentNullException(nameof(toDecode));

        // Body decoding: "_" is an ordinary character here, so the encoded-word variant is off
        return Rfc2047QuotedPrintableDecode(toDecode, false);
    }
    #endregion

    #region Rfc2047QuotedPrintableDecode
    /// <summary>
    /// This is the actual decoder shared by both public entry points.
    /// </summary>
    /// <param name="toDecode">The string to be decoded from Quoted-Printable</param>
    /// <param name="encodedWordVariant">
    /// If <see langword="true"/>, RFC 2047 quoted printable decoding is used (encoded words, "_" becomes a space).<br/>
    /// If <see langword="false"/>, RFC 2045 quoted printable decoding is used (Content-Transfer-Encoding).
    /// </param>
    /// <returns>A decoded byte array that was described by <paramref name="toDecode"/></returns>
    /// <exception cref="ArgumentNullException">If <paramref name="toDecode"/> is <see langword="null"/></exception>
    /// <remarks>See <a href="http://tools.ietf.org/html/rfc2047#section-4.2">RFC 2047 section 4.2</a> for RFC details</remarks>
    private static byte[] Rfc2047QuotedPrintableDecode(string toDecode, bool encodedWordVariant)
    {
        if (toDecode == null)
            throw new ArgumentNullException(nameof(toDecode));

        // Strip the illegal control characters up front so the loop below only sees
        // TAB, CRLF pairs and non-control characters
        toDecode = RemoveIllegalControlCharacters(toDecode);

        // The output can never be longer than the input, so the input length is a safe initial capacity
        using (var byteArrayBuilder = new MemoryStream(toDecode.Length))
        {
            for (var i = 0; i < toDecode.Length; i++)
            {
                var currentChar = toDecode[i];

                if (currentChar == '=')
                {
                    // A complete escape needs two more characters after the equal sign
                    if (toDecode.Length - i < 3)
                    {
                        // Truncated escape at the very end of the input: emit it untouched and stop
                        WriteAllBytesToStream(byteArrayBuilder, DecodeEqualSignNotLongEnough(toDecode.Substring(i)));
                        break;
                    }

                    WriteAllBytesToStream(byteArrayBuilder, DecodeEqualSign(toDecode.Substring(i, 3)));

                    // The escape consumed two characters in addition to the equal sign itself
                    i += 2;
                }
                else if (currentChar == '_' && encodedWordVariant)
                {
                    // The RFC specifies that the "_" always represents hexadecimal 20 even if the
                    // SPACE character occupies a different code position in the character set in use.
                    byteArrayBuilder.WriteByte(0x20);
                }
                else
                {
                    // A literal that goes straight into the output.
                    // NOTE(review): the narrowing keeps only the low 8 bits, so characters above U+00FF
                    // are truncated; "unchecked" makes that explicit and keeps a checked build from throwing.
                    byteArrayBuilder.WriteByte(unchecked((byte) currentChar));
                }
            }

            return byteArrayBuilder.ToArray();
        }
    }
    #endregion

    #region WriteAllBytesToStream
    /// <summary>
    /// Writes all bytes in a byte array to a stream
    /// </summary>
    /// <param name="stream">The stream to write to</param>
    /// <param name="toWrite">The bytes to write to the <paramref name="stream"/></param>
    private static void WriteAllBytesToStream(Stream stream, byte[] toWrite)
    {
        stream.Write(toWrite, 0, toWrite.Length);
    }
    #endregion

    #region RemoveIllegalControlCharacters
    /// <summary>
    /// RFC 2045 states about robustness:<br/>
    /// <code>
    /// Control characters other than TAB, or CR and LF as parts of CRLF pairs,
    /// must not appear. The same is true for octets with decimal values greater
    /// than 126. If found in incoming quoted-printable data by a decoder, a
    /// robust implementation might exclude them from the decoded data and warn
    /// the user that illegal characters were discovered.
    /// </code>
    /// Control characters are defined in RFC 2396 as<br/>
    /// <c>control = US-ASCII coded characters 00-1F and 7F hexadecimal</c>
    /// </summary>
    /// <param name="input">String to be stripped from illegal control characters</param>
    /// <returns>A string with no illegal control characters</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="input"/> is <see langword="null"/></exception>
    private static string RemoveIllegalControlCharacters(string input)
    {
        if (input == null)
            throw new ArgumentNullException(nameof(input));

        // First drop every \r or \n that is not part of a \r\n pair
        input = RemoveCarriageReturnAndNewLinewIfNotInPair(input);

        // Only legal \r\n pairs are left now. The pattern skips \x09 (\t), \x0A (\n) and \x0D (\r),
        // so those survive while every other control character is removed
        return IllegalControlCharacters.Replace(input, "");
    }
    #endregion

    #region RemoveCarriageReturnAndNewLinewIfNotInPair
    /// <summary>
    /// This method will remove any \r and \n which is not paired as \r\n
    /// </summary>
    /// <param name="input">String to remove lonely \r and \n's from</param>
    /// <returns>A string without lonely \r and \n's</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="input"/> is <see langword="null"/></exception>
    private static string RemoveCarriageReturnAndNewLinewIfNotInPair(string input)
    {
        if (input == null)
            throw new ArgumentNullException(nameof(input));

        // Build the result in one pass instead of repeatedly altering the input string
        var newString = new StringBuilder(input.Length);

        for (var i = 0; i < input.Length; i++)
        {
            var current = input[i];

            // A \r is lonely when it is the last character or is not followed by \n
            var lonelyCarriageReturn = current == '\r' && (i + 1 >= input.Length || input[i + 1] != '\n');

            // A \n is lonely when it is the first character or is not preceded by \r
            var lonelyNewLine = current == '\n' && (i == 0 || input[i - 1] != '\r');

            // Lonely tokens are illegal and skipped; everything else is copied as is
            if (!lonelyCarriageReturn && !lonelyNewLine)
                newString.Append(current);
        }

        return newString.ToString();
    }
    #endregion

    #region DecodeEqualSignNotLongEnough
    /// <summary>
    /// RFC 2045 says that a robust implementation should handle:<br/>
    /// <code>
    /// An "=" cannot be the ultimate or penultimate character in an encoded
    /// object. This could be handled as in case (2) above.
    /// </code>
    /// Case (2) is:<br/>
    /// <code>
    /// An "=" followed by a character that is neither a
    /// hexadecimal digit (including "abcdef") nor the CR character of a CRLF pair
    /// is illegal. This case can be the result of US-ASCII text having been
    /// included in a quoted-printable part of a message without itself having
    /// been subjected to quoted-printable encoding. A reasonable approach by a
    /// robust implementation might be to include the "=" character and the
    /// following character in the decoded data without any transformation and, if
    /// possible, indicate to the user that proper decoding was not possible at
    /// this point in the data.
    /// </code>
    /// </summary>
    /// <param name="decode">
    /// The string to decode which cannot have length above or equal to 3
    /// and must start with an equal sign.
    /// </param>
    /// <returns>The US-ASCII bytes of <paramref name="decode"/>, without any transformation</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="decode"/> is <see langword="null"/></exception>
    /// <exception cref="ArgumentException">Thrown if the <paramref name="decode"/> parameter is empty, has length above 2 or does not start with an equal sign.</exception>
    private static byte[] DecodeEqualSignNotLongEnough(string decode)
    {
        if (decode == null)
            throw new ArgumentNullException(nameof(decode));

        // Only truncated escapes (one or two characters) are handled here
        if (decode.Length >= 3)
            throw new ArgumentException(@"decode must have length lower than 3", nameof(decode));

        if (decode.Length <= 0)
            throw new ArgumentException(@"decode must have length of at least 1", nameof(decode));

        // First char must be =
        if (decode[0] != '=')
            throw new ArgumentException(@"First part of decode must be an equal sign", nameof(decode));

        // Treat the text as if it was never encoded: it is taken to be US-ASCII and returned as such
        return Encoding.ASCII.GetBytes(decode);
    }
    #endregion

    #region DecodeEqualSign
    /// <summary>
    /// This helper method will decode a string of the form "=XX" where X is any character.<br/>
    /// "=" followed by CRLF is a soft line break and decodes to nothing, "=" followed by two
    /// hexadecimal digits (upper or lower case) decodes to that byte, and anything else is
    /// returned untouched as US-ASCII bytes.<br/>
    /// This method will never fail, unless an argument of length not equal to three is passed
    /// or the argument does not start with an equal sign.
    /// </summary>
    /// <param name="decode">The length 3 string that needs to be decoded</param>
    /// <returns>A decoded byte array</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="decode"/> is <see langword="null"/></exception>
    /// <exception cref="ArgumentException">Thrown if the <paramref name="decode"/> parameter does not have length 3 or does not start with an equal sign.</exception>
    private static byte[] DecodeEqualSign(string decode)
    {
        if (decode == null)
            throw new ArgumentNullException(nameof(decode));

        // We can only decode the string if it has length 3 - any other call is invalid
        if (decode.Length != 3)
            throw new ArgumentException(@"decode must have length 3", nameof(decode));

        // First char must be =
        if (decode[0] != '=')
            throw new ArgumentException(@"decode must start with an equal sign", nameof(decode));

        // "=" directly in front of a CRLF pair is a soft line break, which only exists because of
        // the 76 characters per line restriction and contributes nothing to the decoded data
        if (decode[1] == '\r' && decode[2] == '\n')
            return new byte[0];

        // Both characters are validated as hex digits by hand. Convert.ToByte(string, 16) is not
        // used because it accepts a leading "+" and throws ArgumentException (not FormatException)
        // on a leading "-", so text such as "=-1" would abort the whole decoding.
        // Lower case digits like =3d are accepted, which is a robustness requirement from RFC 2045.
        var highNibble = HexDigitValue(decode[1]);
        var lowNibble = HexDigitValue(decode[2]);
        if (highNibble >= 0 && lowNibble >= 0)
            return new[] {(byte) ((highNibble << 4) | lowNibble)};

        // Not a valid escape (example: =JU). RFC 2045 suggests that a robust implementation includes
        // the "=" and the following characters in the decoded data without any transformation, so we
        // believe this is actually un-encoded US-ASCII text and return the bytes it corresponds to
        return Encoding.ASCII.GetBytes(decode);
    }
    #endregion

    #region HexDigitValue
    /// <summary>
    /// Returns the numeric value of a single hexadecimal digit
    /// </summary>
    /// <param name="digit">The character to interpret</param>
    /// <returns>A value from 0 to 15, or -1 if <paramref name="digit"/> is not one of 0-9, A-F or a-f</returns>
    private static int HexDigitValue(char digit)
    {
        if (digit >= '0' && digit <= '9')
            return digit - '0';
        if (digit >= 'A' && digit <= 'F')
            return digit - 'A' + 10;
        if (digit >= 'a' && digit <= 'f')
            return digit - 'a' + 10;
        return -1;
    }
    #endregion
}
}