You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
344 lines
16 KiB
344 lines
16 KiB
using System;
|
|
using System.IO;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
|
|
namespace MsgReader.Mime.Decode
|
|
{
|
|
/// <summary>
|
|
/// Used for decoding Quoted-Printable text.<br/>
|
|
/// This is a robust implementation of a Quoted-Printable decoder defined in <a href="http://tools.ietf.org/html/rfc2045">RFC 2045</a> and <a href="http://tools.ietf.org/html/rfc2047">RFC 2047</a>.<br/>
|
|
/// Every measurement has been taken to conform to the RFC.
|
|
/// </summary>
|
|
internal static class QuotedPrintable
|
|
{
|
|
#region DecodeEncodedWord
|
|
/// <summary>
|
|
/// Decodes a Quoted-Printable string according to <a href="http://tools.ietf.org/html/rfc2047">RFC 2047</a>.<br/>
|
|
/// RFC 2047 is used for decoding Encoded-Word encoded strings.
|
|
/// </summary>
|
|
/// <param name="toDecode">Quoted-Printable encoded string</param>
|
|
/// <param name="encoding">Specifies which encoding the returned string will be in</param>
|
|
/// <returns>A decoded string in the correct encoding</returns>
|
|
/// <exception cref="ArgumentNullException">If <paramref name="toDecode"/> or <paramref name="encoding"/> is <see langword="null"/></exception>
|
|
public static string DecodeEncodedWord(string toDecode, Encoding encoding)
|
|
{
|
|
if (toDecode == null)
|
|
throw new ArgumentNullException(nameof(toDecode));
|
|
|
|
if (encoding == null)
|
|
throw new ArgumentNullException(nameof(encoding));
|
|
|
|
// Decode the QuotedPrintable string and return it
|
|
return encoding.GetString(Rfc2047QuotedPrintableDecode(toDecode, true));
|
|
}
|
|
#endregion
|
|
|
|
#region DecodeContentTransferEncoding
|
|
/// <summary>
|
|
/// Decodes a Quoted-Printable string according to <a href="http://tools.ietf.org/html/rfc2045">RFC 2045</a>.<br/>
|
|
/// RFC 2045 specifies the decoding of a body encoded with Content-Transfer-Encoding of quoted-printable.
|
|
/// </summary>
|
|
/// <param name="toDecode">Quoted-Printable encoded string</param>
|
|
/// <returns>A decoded byte array that the Quoted-Printable encoded string described</returns>
|
|
/// <exception cref="ArgumentNullException">If <paramref name="toDecode"/> is <see langword="null"/></exception>
|
|
public static byte[] DecodeContentTransferEncoding(string toDecode)
|
|
{
|
|
if (toDecode == null)
|
|
throw new ArgumentNullException(nameof(toDecode));
|
|
|
|
// Decode the QuotedPrintable string and return it
|
|
return Rfc2047QuotedPrintableDecode(toDecode, false);
|
|
}
|
|
#endregion
|
|
|
|
#region Rfc2047QuotedPrintableDecode
|
|
/// <summary>
|
|
/// This is the actual decoder.
|
|
/// </summary>
|
|
/// <param name="toDecode">The string to be decoded from Quoted-Printable</param>
|
|
/// <param name="encodedWordVariant">
|
|
/// If <see langword="true"/>, specifies that RFC 2047 quoted printable decoding is used.<br/>
|
|
/// This is for quoted-printable encoded words<br/>
|
|
/// <br/>
|
|
/// If <see langword="false"/>, specifies that RFC 2045 quoted printable decoding is used.<br/>
|
|
/// This is for quoted-printable Content-Transfer-Encoding
|
|
/// </param>
|
|
/// <returns>A decoded byte array that was described by <paramref name="toDecode"/></returns>
|
|
/// <exception cref="ArgumentNullException">If <paramref name="toDecode"/> is <see langword="null"/></exception>
|
|
/// <remarks>See <a href="http://tools.ietf.org/html/rfc2047#section-4.2">RFC 2047 section 4.2</a> for RFC details</remarks>
|
|
private static byte[] Rfc2047QuotedPrintableDecode(string toDecode, bool encodedWordVariant)
|
|
{
|
|
if (toDecode == null)
|
|
throw new ArgumentNullException(nameof(toDecode));
|
|
|
|
// Create a byte array builder which is roughly equivalent to a StringBuilder
|
|
using (var byteArrayBuilder = new MemoryStream())
|
|
{
|
|
// Remove illegal control characters
|
|
toDecode = RemoveIllegalControlCharacters(toDecode);
|
|
|
|
// Run through the whole string that needs to be decoded
|
|
for (var i = 0; i < toDecode.Length; i++)
|
|
{
|
|
var currentChar = toDecode[i];
|
|
if (currentChar == '=')
|
|
{
|
|
// Check that there is at least two characters behind the equal sign
|
|
if (toDecode.Length - i < 3)
|
|
{
|
|
// We are at the end of the toDecode string, but something is missing. Handle it the way RFC 2045 states
|
|
WriteAllBytesToStream(byteArrayBuilder, DecodeEqualSignNotLongEnough(toDecode.Substring(i)));
|
|
|
|
// Since it was the last part, we should stop parsing anymore
|
|
break;
|
|
}
|
|
|
|
// Decode the Quoted-Printable part
|
|
var quotedPrintablePart = toDecode.Substring(i, 3);
|
|
WriteAllBytesToStream(byteArrayBuilder, DecodeEqualSign(quotedPrintablePart));
|
|
|
|
// We now consumed two extra characters. Go forward two extra characters
|
|
i += 2;
|
|
}
|
|
else
|
|
{
|
|
// This character is not quoted printable hex encoded.
|
|
|
|
// Could it be the _ character, which represents space
|
|
// and are we using the encoded word variant of QuotedPrintable
|
|
if (currentChar == '_' && encodedWordVariant)
|
|
{
|
|
// The RFC specifies that the "_" always represents hexadecimal 20 even if the
|
|
// SPACE character occupies a different code position in the character set in use.
|
|
byteArrayBuilder.WriteByte(0x20);
|
|
}
|
|
else
|
|
{
|
|
// This is not encoded at all. This is a literal which should just be included into the output.
|
|
byteArrayBuilder.WriteByte((byte) currentChar);
|
|
}
|
|
}
|
|
}
|
|
|
|
return byteArrayBuilder.ToArray();
|
|
}
|
|
}
|
|
#endregion
|
|
|
|
#region WriteAllBytesToStream
|
|
/// <summary>
|
|
/// Writes all bytes in a byte array to a stream
|
|
/// </summary>
|
|
/// <param name="stream">The stream to write to</param>
|
|
/// <param name="toWrite">The bytes to write to the <paramref name="stream"/></param>
|
|
private static void WriteAllBytesToStream(Stream stream, byte[] toWrite)
|
|
{
|
|
stream.Write(toWrite, 0, toWrite.Length);
|
|
}
|
|
#endregion
|
|
|
|
#region RemoveIllegalControlCharacters
|
|
/// <summary>
|
|
/// RFC 2045 states about robustness:<br/>
|
|
/// <code>
|
|
/// Control characters other than TAB, or CR and LF as parts of CRLF pairs,
|
|
/// must not appear. The same is true for octets with decimal values greater
|
|
/// than 126. If found in incoming quoted-printable data by a decoder, a
|
|
/// robust implementation might exclude them from the decoded data and warn
|
|
/// the user that illegal characters were discovered.
|
|
/// </code>
|
|
/// Control characters are defined in RFC 2396 as<br/>
|
|
/// <c>control = US-ASCII coded characters 00-1F and 7F hexadecimal</c>
|
|
/// </summary>
|
|
/// <param name="input">String to be stripped from illegal control characters</param>
|
|
/// <returns>A string with no illegal control characters</returns>
|
|
/// <exception cref="ArgumentNullException">If <paramref name="input"/> is <see langword="null"/></exception>
|
|
private static string RemoveIllegalControlCharacters(string input)
|
|
{
|
|
if (input == null)
|
|
throw new ArgumentNullException(nameof(input));
|
|
|
|
// First we remove any \r or \n which is not part of a \r\n pair
|
|
input = RemoveCarriageReturnAndNewLinewIfNotInPair(input);
|
|
|
|
// Here only legal \r\n is left over
|
|
// We now simply keep them, and the \t which is also allowed
|
|
// \x0A = \n
|
|
// \x0D = \r
|
|
// \x09 = \t)
|
|
return Regex.Replace(input, "[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", "");
|
|
}
|
|
#endregion
|
|
|
|
#region RemoveCarriageReturnAndNewLinewIfNotInPair
|
|
/// <summary>
|
|
/// This method will remove any \r and \n which is not paired as \r\n
|
|
/// </summary>
|
|
/// <param name="input">String to remove lonely \r and \n's from</param>
|
|
/// <returns>A string without lonely \r and \n's</returns>
|
|
/// <exception cref="ArgumentNullException">If <paramref name="input"/> is <see langword="null"/></exception>
|
|
private static string RemoveCarriageReturnAndNewLinewIfNotInPair(string input)
|
|
{
|
|
if (input == null)
|
|
throw new ArgumentNullException(nameof(input));
|
|
|
|
// Use this for building up the new string. This is used for performance instead
|
|
// of altering the input string each time a illegal token is found
|
|
var newString = new StringBuilder(input.Length);
|
|
|
|
for (var i = 0; i < input.Length; i++)
|
|
{
|
|
// There is a character after it
|
|
// Check for lonely \r
|
|
// There is a lonely \r if it is the last character in the input or if there
|
|
// is no \n following it
|
|
if (input[i] == '\r' && (i + 1 >= input.Length || input[i + 1] != '\n'))
|
|
{
|
|
// Illegal token \r found. Do not add it to the new string
|
|
|
|
// Check for lonely \n
|
|
// There is a lonely \n if \n is the first character or if there
|
|
// is no \r in front of it
|
|
}
|
|
else if (input[i] == '\n' && (i - 1 < 0 || input[i - 1] != '\r'))
|
|
{
|
|
// Illegal token \n found. Do not add it to the new string
|
|
}
|
|
else
|
|
{
|
|
// No illegal tokens found. Simply insert the character we are at
|
|
// in our new string
|
|
newString.Append(input[i]);
|
|
}
|
|
}
|
|
|
|
return newString.ToString();
|
|
}
|
|
#endregion
|
|
|
|
#region DecodeEqualSignNotLongEnough
|
|
/// <summary>
|
|
/// RFC 2045 says that a robust implementation should handle:<br/>
|
|
/// <code>
|
|
/// An "=" cannot be the ultimate or penultimate character in an encoded
|
|
/// object. This could be handled as in case (2) above.
|
|
/// </code>
|
|
/// Case (2) is:<br/>
|
|
/// <code>
|
|
/// An "=" followed by a character that is neither a
|
|
/// hexadecimal digit (including "abcdef") nor the CR character of a CRLF pair
|
|
/// is illegal. This case can be the result of US-ASCII text having been
|
|
/// included in a quoted-printable part of a message without itself having
|
|
/// been subjected to quoted-printable encoding. A reasonable approach by a
|
|
/// robust implementation might be to include the "=" character and the
|
|
/// following character in the decoded data without any transformation and, if
|
|
/// possible, indicate to the user that proper decoding was not possible at
|
|
/// this point in the data.
|
|
/// </code>
|
|
/// </summary>
|
|
/// <param name="decode">
|
|
/// The string to decode which cannot have length above or equal to 3
|
|
/// and must start with an equal sign.
|
|
/// </param>
|
|
/// <returns>A decoded byte array</returns>
|
|
/// <exception cref="ArgumentNullException">If <paramref name="decode"/> is <see langword="null"/></exception>
|
|
/// <exception cref="ArgumentException">Thrown if a the <paramref name="decode"/> parameter has length above 2 or does not start with an equal sign.</exception>
|
|
private static byte[] DecodeEqualSignNotLongEnough(string decode)
|
|
{
|
|
if (decode == null)
|
|
throw new ArgumentNullException(nameof(decode));
|
|
|
|
// We can only decode wrong length equal signs
|
|
if (decode.Length >= 3)
|
|
throw new ArgumentException(@"decode must have length lower than 3", nameof(decode));
|
|
|
|
if (decode.Length <= 0)
|
|
throw new ArgumentException(@"decode must have length lower at least 1", nameof(decode));
|
|
|
|
// First char must be =
|
|
if (decode[0] != '=')
|
|
throw new ArgumentException(@"First part of decode must be an equal sign", nameof(decode));
|
|
|
|
// We will now believe that the string sent to us, was actually not encoded
|
|
// Therefore it must be in US-ASCII and we will return the bytes it corrosponds to
|
|
return Encoding.ASCII.GetBytes(decode);
|
|
}
|
|
#endregion
|
|
|
|
#region DecodeEqualSign
|
|
/// <summary>
|
|
/// This helper method will decode a string of the form "=XX" where X is any character.<br/>
|
|
/// This method will never fail, unless an argument of length not equal to three is passed.
|
|
/// </summary>
|
|
/// <param name="decode">The length 3 character that needs to be decoded</param>
|
|
/// <returns>A decoded byte array</returns>
|
|
/// <exception cref="ArgumentNullException">If <paramref name="decode"/> is <see langword="null"/></exception>
|
|
/// <exception cref="ArgumentException">Thrown if a the <paramref name="decode"/> parameter does not have length 3 or does not start with an equal sign.</exception>
|
|
private static byte[] DecodeEqualSign(string decode)
|
|
{
|
|
if (decode == null)
|
|
throw new ArgumentNullException(nameof(decode));
|
|
|
|
// We can only decode the string if it has length 3 - other calls to this function is invalid
|
|
if (decode.Length != 3)
|
|
throw new ArgumentException(@"decode must have length 3", nameof(decode));
|
|
|
|
// First char must be =
|
|
if (decode[0] != '=')
|
|
throw new ArgumentException(@"decode must start with an equal sign", nameof(decode));
|
|
|
|
// There are two cases where an equal sign might appear
|
|
// It might be a
|
|
// - hex-string like =3D, denoting the character with hex value 3D
|
|
// - it might be the last character on the line before a CRLF
|
|
// pair, denoting a soft linebreak, which simply
|
|
// splits the text up, because of the 76 chars per line restriction
|
|
if (decode.Contains("\r\n"))
|
|
{
|
|
// Soft break detected
|
|
// We want to return string.Empty which is equivalent to a zero-length byte array
|
|
return new byte[0];
|
|
}
|
|
|
|
// Hex string detected. Convertion needed.
|
|
// It might be that the string located after the equal sign is not hex characters
|
|
// An example: =JU
|
|
// In that case we would like to catch the FormatException and do something else
|
|
try
|
|
{
|
|
// The number part of the string is the last two digits. Here we simply remove the equal sign
|
|
var numberString = decode.Substring(1);
|
|
|
|
// Now we create a byte array with the converted number encoded in the string as a hex value (base 16)
|
|
// This will also handle illegal encodings like =3d where the hex digits are not uppercase,
|
|
// which is a robustness requirement from RFC 2045.
|
|
var oneByte = new[] {Convert.ToByte(numberString, 16)};
|
|
|
|
// Simply return our one byte byte array
|
|
return oneByte;
|
|
}
|
|
catch (FormatException)
|
|
{
|
|
// RFC 2045 says about robust implementation:
|
|
// An "=" followed by a character that is neither a
|
|
// hexadecimal digit (including "abcdef") nor the CR
|
|
// character of a CRLF pair is illegal. This case can be
|
|
// the result of US-ASCII text having been included in a
|
|
// quoted-printable part of a message without itself
|
|
// having been subjected to quoted-printable encoding. A
|
|
// reasonable approach by a robust implementation might be
|
|
// to include the "=" character and the following
|
|
// character in the decoded data without any
|
|
// transformation and, if possible, indicate to the user
|
|
// that proper decoding was not possible at this point in
|
|
// the data.
|
|
|
|
// So we choose to believe this is actually an un-encoded string
|
|
// Therefore it must be in US-ASCII and we will return the bytes it corrosponds to
|
|
return Encoding.ASCII.GetBytes(decode);
|
|
}
|
|
}
|
|
#endregion
|
|
}
|
|
} |