/*
 * 2006 - 2018 Ted Spence, http://tedspence.com
 * License: http://www.apache.org/licenses/LICENSE-2.0
 * Home page: https://github.com/tspence/csharp-csv-reader
 */
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Data;
using System.Reflection;
using System.ComponentModel;
using System.Text;
#if HAS_ASYNC
using System.Threading;
#endif

// These suggestions from Resharper apply because we don't want it to recommend fixing things needed for Net20:
// ReSharper disable LoopCanBeConvertedToQuery
// ReSharper disable ConvertIfStatementToNullCoalescingAssignment
// ReSharper disable ReplaceSubstringWithRangeIndexer
// ReSharper disable InvertIf
// ReSharper disable ConvertIfStatementToNullCoalescingExpression
namespace CSVNET
{
    /// <summary>
    /// Keeps track of which columns are excluded from CSV serialization / deserialization
    /// </summary>
    public class ExcludedColumnHelper
    {
        /// <summary>
        /// Note that Dot Net Framework 2.0 does not support HashSet, but it does support Dictionary.
        /// The dictionary values are ignored; only key membership matters.
        /// </summary>
        private readonly Dictionary<string, int> _excluded;
        private readonly CSVSettings _settings;

        /// <summary>
        /// Construct a helper object to track which columns are excluded from serialization
        /// </summary>
        /// <param name="settings">The settings whose ExcludedColumns list should be tracked</param>
        public ExcludedColumnHelper(CSVSettings settings)
        {
            if (settings?.ExcludedColumns == null || settings.ExcludedColumns.Length == 0)
            {
                // A null dictionary means "nothing is excluded" - see IsExcluded
                _excluded = null;
            }
            else
            {
                _settings = settings;
                _excluded = new Dictionary<string, int>();
                foreach (var name in _settings.ExcludedColumns)
                {
                    var excludedColumnName = name;
                    if (!_settings.HeadersCaseSensitive)
                    {
                        // Normalize case once at construction so lookups are case-insensitive
                        excludedColumnName = excludedColumnName.ToUpperInvariant();
                    }
                    _excluded.Add(excludedColumnName, 1);
                }
            }
        }

        /// <summary>
        /// True if this column should be excluded
        /// </summary>
        /// <param name="name">The column name to check</param>
        /// <returns>True if the column is excluded from serialization / deserialization</returns>
        public bool IsExcluded(string name)
        {
            if (_excluded == null) return false;
            var excludedColumnName = name;
            if (!_settings.HeadersCaseSensitive)
            {
                // Must match the normalization applied in the constructor
                excludedColumnName = excludedColumnName.ToUpperInvariant();
            }
            return _excluded.ContainsKey(excludedColumnName);
        }
    }

    /// <summary>
    /// A helper object to deserialize a class based on CSV strings
    /// </summary>
    /// <typeparam name="T">The class being deserialized; must have a public parameterless constructor</typeparam>
    public class DeserializationHelper<T> where T : class, new()
    {
        private readonly int _numColumns;
        private readonly Type[] _columnTypes;
        private readonly TypeConverter[] _converters;
        private readonly PropertyInfo[] _properties;
        private readonly FieldInfo[] _fields;
        private readonly MethodInfo[] _methods;

        /// <summary>
        /// Construct a new deserialization helper for a specific class containing all the information necessary
        /// for optimized deserialization
        /// </summary>
        /// <param name="settings">The CSV settings to use (Default: TSV)</param>
        /// <param name="headers">The header row naming the class members each column maps to</param>
        public DeserializationHelper(CSVSettings settings, string[] headers)
        {
            var settings1 = settings;
            if (settings1 == null)
            {
                settings1 = CSVSettings.TSV;
            }
            if (headers == null) throw new Exception("CSV must have headers to be deserialized");
            var return_type = typeof(T);
            _numColumns = headers.Length;

            // Set binding flags correctly
            var bindings = BindingFlags.Public | BindingFlags.Instance;
            if (!settings1.HeadersCaseSensitive)
            {
                bindings |= BindingFlags.IgnoreCase;
            }

            // Set up the list of excluded columns
            var excluded = new ExcludedColumnHelper(settings1);

            // Determine how to handle each column in the file - check properties, fields, and methods
            _columnTypes = new Type[_numColumns];
            _converters = new TypeConverter[_numColumns];
            _properties = new PropertyInfo[_numColumns];
            _fields = new FieldInfo[_numColumns];
            _methods = new MethodInfo[_numColumns];
            for (var i = 0; i < _numColumns; i++)
            {
                // Is this column excluded?
                if (excluded.IsExcluded(headers[i])) continue;

                // Check if this is a property
                _properties[i] = return_type.GetProperty(headers[i], bindings);
                if (_properties[i] != null && !_properties[i].CanWrite)
                {
                    if (settings1.IgnoreReadOnlyProperties && settings1.IgnoreHeaderErrors)
                    {
                        _properties[i] = null;
                        continue;
                    }
                    throw new Exception($"The column header '{headers[i]}' matches a read-only property. To ignore this exception, enable IgnoreReadOnlyProperties and IgnoreHeaderErrors.");
                }

                // If we failed to get a property handler, let's try a field handler
                if (_properties[i] == null)
                {
                    _fields[i] = return_type.GetField(headers[i], bindings);

                    // If we failed to get a field handler, let's try a method
                    if (_fields[i] == null)
                    {
                        // Methods must be treated differently - we have to ensure that the method has a single parameter
                        var mi = return_type.GetMethod(headers[i], bindings);
                        if (mi != null)
                        {
                            if (mi.GetParameters().Length == 1)
                            {
                                _methods[i] = mi;
                                _columnTypes[i] = mi.GetParameters()[0].ParameterType;
                            }
                            else if (!settings1.IgnoreHeaderErrors)
                            {
                                throw new Exception(
                                    $"The column header '{headers[i]}' matched a method with more than one parameter.");
                            }
                        }
                        else if (!settings1.IgnoreHeaderErrors)
                        {
                            throw new Exception(
                                $"The column header '{headers[i]}' was not found in the class '{return_type.FullName}'.");
                        }
                    }
                    else
                    {
                        _columnTypes[i] = _fields[i].FieldType;
                    }
                }
                else
                {
                    _columnTypes[i] = _properties[i].PropertyType;
                }

                // Precompute a converter for this column's type so each row parse is cheap
                if (_columnTypes[i] != null)
                {
                    _converters[i] = TypeDescriptor.GetConverter(_columnTypes[i]);
                    if (_converters[i] == null && !settings1.IgnoreHeaderErrors)
                    {
                        throw new Exception(
                            $"The column {headers[i]} (type {_columnTypes[i]}) does not have a type converter.");
                    }
                }
            }
        }

        /// <summary>
        /// Deserialize a single row using precomputed converters
        /// </summary>
        /// <param name="line">The parsed column values for this row</param>
        /// <param name="row_num">The row number within the file, used in error messages</param>
        /// <param name="settings">The CSV settings to use</param>
        /// <returns>The deserialized object, or null if the row was empty and settings permit skipping it</returns>
        public T Deserialize(string[] line, int row_num, CSVSettings settings)
        {
            // If this line is completely empty, do our settings permit us to ignore the empty line?
            // BUGFIX: the original expression was `a || b && c`; since && binds tighter than ||,
            // a zero-column line returned null even when IgnoreEmptyLineForDeserialization was false,
            // silently bypassing the column-count check below. Parentheses now apply the setting to both forms.
            if ((line.Length == 0 || (line.Length == 1 && line[0] == string.Empty)) && settings.IgnoreEmptyLineForDeserialization)
            {
                return null;
            }

            // Does this line match the length of the first line?  Does the caller want us to complain?
            if (line.Length != _numColumns && !settings.IgnoreHeaderErrors)
            {
                throw new Exception($"Line #{row_num} contains {line.Length} columns; expected {_numColumns}");
            }

            // Construct a new object and execute each column on it
            var obj = new T();
            for (var i = 0; i < Math.Min(line.Length, _numColumns); i++)
            {
                if (_converters[i] == null) continue;

                // Attempt to convert this to the specified type
                object value = null;
                if (settings.AllowNull && (line[i] == null || line[i] == settings.NullToken))
                {
                    value = null;
                }
                else if (_converters[i].IsValid(line[i]))
                {
                    value = _converters[i].ConvertFromString(line[i]);
                }
                else if (!settings.IgnoreHeaderErrors)
                {
                    throw new Exception(
                        $"The value '{line[i]}' cannot be converted to the type {_columnTypes[i]}.");
                }

                // Can we set this value to the object as a property?
                if (_properties[i] != null)
                {
                    _properties[i].SetValue(obj, value, null);
                }
                else if (_fields[i] != null)
                {
                    _fields[i].SetValue(obj, value);
                }
                else if (_methods[i] != null)
                {
                    _methods[i].Invoke(obj, new object[] { value });
                }
            }
            return obj;
        }
    }

    /// <summary>
    /// A reader that reads from a stream and emits CSV records
    /// </summary>
#if HAS_ASYNC_IENUM
    public class CSVReader : IAsyncEnumerable<string[]>, IEnumerable<string[]>, IDisposable
#else
    public class CSVReader : IEnumerable<string[]>, IDisposable
#endif
    {
        private readonly CSVSettings _settings;
        private readonly StreamReader _stream;

        /// <summary>
        /// The settings currently in use by this reader
        /// </summary>
        public CSVSettings Settings { get { return _settings; } }

        /// <summary>
        /// If the first row in the file is a header row, this will be populated
        /// </summary>
        public string[] Headers { get; private set; }

        /// <summary>
        /// Convenience function to read from a string
        /// </summary>
        /// <param name="source">The string to read</param>
        /// <param name="settings">The CSV settings to use for this reader (Default: CSV)</param>
        /// <returns>A reader over the in-memory contents of the string</returns>
        public static CSVReader FromString(string source, CSVSettings settings = null)
        {
            if (settings == null)
            {
                settings = CSVSettings.CSV;
            }
            var byteArray = settings.Encoding.GetBytes(source);
            var stream = new MemoryStream(byteArray);
            return new CSVReader(stream, settings);
        }

        /// <summary>
        /// Convenience function to read from a file on disk
        /// </summary>
        /// <param name="filename">The file to read</param>
        /// <param name="settings">The CSV settings to use for this reader (Default: CSV)</param>
        /// <param name="encoding">The string encoding to use for the reader (Default: UTF8)</param>
        /// <returns>A reader over the contents of the file</returns>
        public static CSVReader FromFile(string filename, CSVSettings settings = null, Encoding encoding = null)
        {
            if (encoding == null)
            {
                encoding = Encoding.UTF8;
            }
            var sr = new StreamReader(filename, encoding);
            return new CSVReader(sr, settings);
        }

        /// <summary>
        /// Construct a new CSV reader off a streamed source
        /// </summary>
        /// <param name="source">The stream source. Note that when disposed, the CSV Reader will dispose the stream reader.</param>
        /// <param name="settings">The CSV settings to use for this reader (Default: CSV)</param>
        public CSVReader(StreamReader source, CSVSettings settings = null)
        {
            _stream = source;
            _settings = settings;
            if (_settings == null)
            {
                _settings = CSVSettings.CSV;
            }

            // Do we need to parse headers?
            if (_settings.HeaderRowIncluded)
            {
                var line = source.ReadLine();
                if (_settings.AllowSepLine)
                {
                    var newDelimiter = CSV.ParseSepLine(line);
                    if (newDelimiter != null)
                    {
                        // We don't want to change the original settings, since they may be a singleton
                        _settings = _settings.CloneWithNewDelimiter(newDelimiter.Value);
                        line = source.ReadLine();
                    }
                }
                Headers = CSV.ParseLine(line, _settings);
            }
            else
            {
                Headers = _settings.AssumedHeaders;
            }
        }

        /// <summary>
        /// Construct a new CSV reader off a streamed source
        /// </summary>
        /// <param name="source">The stream source. Note that when disposed, the CSV Reader will dispose the stream reader.</param>
        /// <param name="settings">The CSV settings to use for this reader (Default: CSV)</param>
        public CSVReader(Stream source, CSVSettings settings = null)
        {
            _settings = settings;
            if (_settings == null)
            {
                _settings = CSVSettings.CSV;
            }
            _stream = new StreamReader(source, _settings.Encoding);

            // Do we need to parse headers?
            if (_settings.HeaderRowIncluded)
            {
                var line = _stream.ReadLine();
                if (_settings.AllowSepLine)
                {
                    var newDelimiter = CSV.ParseSepLine(line);
                    if (newDelimiter != null)
                    {
                        // We don't want to change the original settings, since they may be a singleton
                        _settings = _settings.CloneWithNewDelimiter(newDelimiter.Value);
                        line = _stream.ReadLine();
                    }
                }
                Headers = CSV.ParseLine(line, _settings);
            }
            else
            {
                Headers = _settings.AssumedHeaders;
            }
        }

        /// <summary>
        /// Iterate through all lines in this CSV file
        /// </summary>
        /// <returns>An array of all data columns in the line</returns>
        public IEnumerable<string[]> Lines()
        {
            return CSV.ParseStream(_stream, _settings);
        }

        /// <summary>
        /// Iterate through all lines in this CSV file
        /// </summary>
        /// <returns>An enumerator over the parsed rows of the file</returns>
        public IEnumerator<string[]> GetEnumerator()
        {
            return CSV.ParseStream(_stream, _settings).GetEnumerator();
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }

#if HAS_ASYNC_IENUM
        /// <summary>
        /// Iterate through all lines in this CSV file using async
        /// </summary>
        /// <returns>An array of all data columns in the line</returns>
        public IAsyncEnumerable<string[]> LinesAsync()
        {
            return CSV.ParseStreamAsync(_stream, _settings);
        }

        /// <summary>
        /// Iterate through all lines in this CSV file using async
        /// </summary>
        /// <returns>An array of all data columns in the line</returns>
        public IAsyncEnumerator<string[]> GetAsyncEnumerator(CancellationToken cancellationToken = new CancellationToken())
        {
            return CSV.ParseStreamAsync(_stream, _settings).GetAsyncEnumerator(cancellationToken);
        }

        /// <summary>
        /// Deserialize the CSV reader into a generic list
        /// </summary>
        /// <typeparam name="T">The type of data to deserialize</typeparam>
        /// <returns>A streaming collection of records from the CSV source</returns>
        /// <exception cref="Exception">If the CSV source cannot be parsed into the type, throws exceptions</exception>
        public async IAsyncEnumerable<T> DeserializeAsync<T>() where T : class, new()
        {
            var helper = new DeserializationHelper<T>(_settings, Headers);

            // Alright, let's retrieve CSV lines and parse each one!
            var row_num = 0;
            await foreach (var line in this)
            {
                row_num++;
                var obj = helper.Deserialize(line, row_num, _settings);
                if (obj != null)
                {
                    yield return obj;
                }
            }
        }
#endif

        /// <summary>
        /// Read this file into a data table in memory
        /// </summary>
        /// <returns>A DataTable of string columns containing every row in the file</returns>
        public DataTable ReadAsDataTable()
        {
            var dt = new DataTable();
            string[] firstLine = null;

            // File does not contain column names - read the first data row to size the table
            // and synthesize placeholder names ("Column0", "Column1", ...)
            if (Headers == null)
            {
                var rawLine = _stream.ReadLine();
                firstLine = CSV.ParseLine(rawLine, _settings);
                var list = new List<string>();
                for (var i = 0; i < firstLine.Length; i++)
                {
                    list.Add($"Column{i}");
                }
                this.Headers = list.ToArray();
            }

            // Add headers
            var numColumns = Headers.Length;
            foreach (var t in Headers)
            {
                dt.Columns.Add(new DataColumn(t, typeof(string)));
            }

            // If we had to read the first line to get dimensions, add it
            var row_num = 1;
            if (firstLine != null)
            {
                dt.Rows.Add(firstLine);
                row_num++;
            }

            // Start reading through the file
            foreach (var line in CSV.ParseStream(_stream, _settings))
            {
                // Does this line match the length of the first line?
                if (line.Length != numColumns)
                {
                    if (!_settings.IgnoreDimensionErrors)
                    {
                        throw new Exception($"Line #{row_num} contains {line.Length} columns; expected {numColumns}");
                    }
                    else
                    {
                        // Add as best we can - construct a new line and make it fit
                        var list = new List<string>();
                        list.AddRange(line);
                        while (list.Count < numColumns)
                        {
                            list.Add("");
                        }
                        dt.Rows.Add(list.GetRange(0, numColumns).ToArray());
                    }
                }
                else
                {
                    dt.Rows.Add(line);
                }

                // Keep track of where we are in the file
                row_num++;
            }

            // Here's your data table
            return dt;
        }

        /// <summary>
        /// Deserialize the CSV reader into a generic list
        /// </summary>
        /// <typeparam name="T">The type to deserialize</typeparam>
        /// <returns>A streaming collection of objects as they are read from the source</returns>
        /// <exception cref="Exception">If the CSV formatting does not match the object, throw errors</exception>
        public IEnumerable<T> Deserialize<T>() where T : class, new()
        {
            var helper = new DeserializationHelper<T>(_settings, Headers);

            // Alright, let's retrieve CSV lines and parse each one!
            var row_num = 0;
            foreach (var line in this)
            {
                row_num++;
                var obj = helper.Deserialize(line, row_num, _settings);
                if (obj != null)
                {
                    yield return obj;
                }
            }
        }

        /// <summary>
        /// Close our resources - specifically, the stream reader
        /// </summary>
        public void Dispose()
        {
            _stream.Dispose();
        }

        /// <summary>
        /// Take a CSV file and chop it into multiple chunks of a specified maximum size.
        /// </summary>
        /// <param name="filename">The input filename to chop</param>
        /// <param name="out_folder">The folder where the chopped CSV will be saved</param>
        /// <param name="maxLinesPerFile">The maximum number of lines to put into each file</param>
        /// <param name="settings">The CSV settings to use when chopping this file into chunks (Default: CSV)</param>
        /// <returns>Number of files chopped</returns>
        public static int ChopFile(string filename, string out_folder, int maxLinesPerFile, CSVSettings settings = null)
        {
            // Default settings
            if (settings == null) settings = CSVSettings.CSV;

            // Let's begin parsing
            var file_id = 1;
            var line_count = 0;
            var file_prefix = Path.GetFileNameWithoutExtension(filename);
            var ext = Path.GetExtension(filename);
            CSVWriter cw = null;
            StreamWriter sw = null;

            // Read in lines from the file
            using (var sr = new StreamReader(filename))
            {
                using (var cr = new CSVReader(sr, settings))
                {
                    // Okay, let's do the real work
                    foreach (var line in cr.Lines())
                    {
                        // Do we need to create a file for writing?
                        if (cw == null)
                        {
                            var fn = Path.Combine(out_folder, file_prefix + file_id.ToString() + ext);
                            var fs = new FileStream(fn, FileMode.CreateNew);
                            sw = new StreamWriter(fs, settings.Encoding);
                            cw = new CSVWriter(sw, settings);
                            if (settings.HeaderRowIncluded)
                            {
                                cw.WriteLine(cr.Headers);
                            }
                        }

                        // Write one line
                        cw.WriteLine(line);

                        // Count lines - close the file if done
                        line_count++;
                        if (line_count >= maxLinesPerFile)
                        {
                            cw.Dispose();
                            cw = null;
                            file_id++;
                            line_count = 0;
                        }
                    }
                }
            }

            // Ensure the final CSVWriter is closed properly
            if (cw != null)
            {
                cw.Dispose();
                cw = null;
            }
            return file_id;
        }
    }
}