using System; using System.Collections.Generic; using System.Text; // These suggestions from Resharper apply because we don't want it to recommend fixing things needed for Net20: // ReSharper disable LoopCanBeConvertedToQuery // ReSharper disable ConvertIfStatementToNullCoalescingAssignment // ReSharper disable ReplaceSubstringWithRangeIndexer // ReSharper disable InvertIf // ReSharper disable ConvertIfStatementToSwitchExpression // ReSharper disable ConvertIfStatementToSwitchStatement // ReSharper disable ReturnTypeCanBeEnumerable.Global namespace CSVNET { /// /// The current state of CSV processing, given the text that has been seen so far /// public enum CSVState { /// /// We have reached the end of the CSV and everything is done /// Done, /// /// We don't need more text at the moment /// CanKeepGoing, /// /// The CSV reached the end, but there was a missing (unpaired) text qualifier. /// For example: /// `1,2,3,"test` /// MissingTrailingQualifier } /// /// This state machine handles all functions of CSV processing except for the I/O, which can come in a variety /// of forms, either from a stream or an in-memory collection. /// /// Since some CSV files have a single row of data that comprises multiple lines, this state machine may or may /// not produce one row of data for each chunk of text received. /// public class CSVStateMachine { private readonly CSVSettings _settings; private string _line; private readonly List _list; private readonly StringBuilder _work; private int _position; private char _delimiter; private bool _allowSepLine; private bool _inTextQualifier; /// /// Whether the state machine has concluded or can continue processing /// public CSVState State { get; private set; } /// /// Returns true if we need more text /// /// public bool NeedsMoreText() { return String.IsNullOrEmpty(_line) || _position + _settings.LineSeparator.Length >= _line.Length; } /// /// Constructs a new state machine to begin processing CSV text /// public CSVStateMachine(CSVSettings settings) { _line = ""; _list = new List(); _work = new StringBuilder(); _settings = settings ?? CSVSettings.CSV; _position = -1; // The presence of a "sep=" line may affect these values _delimiter = _settings.FieldDelimiter; _allowSepLine = _settings.AllowSepLine; // We are ready for work State = CSVState.CanKeepGoing; } /// /// Parse a new chunk of text retrieved via some other means than a stream. /// /// Call this function when you are retrieving your own text and when each chunk may or may not /// include line separators, and your stream does not consume line separators on its own. /// /// The new data to process /// Set this value to true /// If this parsing operation produces a valid row, this will be non-null public string[] ParseChunk(string chunk, bool reachedEnd) { // Detect end of stream if (reachedEnd && string.IsNullOrEmpty(chunk) && _position == -1 && string.IsNullOrEmpty(_line)) { State = CSVState.Done; return null; } // If we're at the end of the line, remember to backtrack one because we increment immediately if (_position == _line.Length) { _position -= 1; } // Add this chunk to the current processing logic _line += chunk; // Check for the presence of a "sep=" line once at the beginning of a stream if (_allowSepLine) { var newDelimiter = CSV.ParseSepLine(_line); _allowSepLine = false; if (newDelimiter != null) { _delimiter = newDelimiter.Value; return null; } } // Process one character at a time from the current line while (_position < _line.Length || !reachedEnd) { _position++; // Have we reached the end of the stream? if (_position >= _line.Length) { if (reachedEnd) { // If we reached the end while still in a text qualifier, the CSV is broken if (_inTextQualifier) { State = CSVState.MissingTrailingQualifier; return null; } // We always add the final work item here because trailing empty strings are valid State = CSVState.Done; _list.Add(_work.ToString()); _line = string.Empty; _position = -1; return _list.ToArray(); } return null; } var c = _line[_position]; // If we are resuming after starting a text qualifier, can we find the end? if (_inTextQualifier) { var p2 = -1; while (p2 < 0) { p2 = _line.IndexOf(_settings.TextQualifier, _position + 1); if (p2 < 0) { if (reachedEnd) { State = CSVState.MissingTrailingQualifier; } // Backtrack one character so we can move forward when the next chunk loads _position--; return null; } // Append the text between the qualifiers _work.Append(_line.Substring(_position + 1, p2 - _position - 1)); _position = p2; // If the user put in a doubled-up qualifier, e.g. `""`, insert a single one and continue if (p2 + 1 < _line.Length && _line[p2 + 1] == _settings.TextQualifier) { _work.Append(_settings.TextQualifier); _position++; p2 = -1; } } // We're done parsing this text qualifier _inTextQualifier = false; } // Is this the start of a text qualified field? else if (c == _settings.TextQualifier && _work.Length == 0) { _inTextQualifier = true; _position--; } // Are we at a line separator? Let's do a quick test first else if (c == _settings.LineSeparator[0]) { // If we don't have enough characters left to test the line separator properly, ask for more var notEnoughChars = _position + _settings.LineSeparator.Length > _line.Length; if (notEnoughChars && !reachedEnd) { // Backtrack one character so we can pick up the line separator completely next time _position--; return null; } // If we have reached the end, but this isn't a complete line separator, it's just text if (notEnoughChars) { _work.Append(c); } // OK, we have enough characters, see if this is a line separator else if (string.Equals(_line.Substring(_position, _settings.LineSeparator.Length), _settings.LineSeparator)) { _line = _line.Substring(_position + _settings.LineSeparator.Length); _position = -1; _list.Add(_work.ToString()); var row = _list.ToArray(); _list.Clear(); _work.Length = 0; return row; } // It's not a line separator, it's just a normal character else { _work.Append(c); } } // Does this start a new field? else if (c == _delimiter) { // Is this a null token, and do we permit null tokens? var s = _work.ToString(); if (_settings.AllowNull && string.Equals(s, _settings.NullToken, StringComparison.Ordinal)) { _list.Add(null); } else { _list.Add(s); } _work.Length = 0; // Test for special case: when the user has written a casual comma, space, and text qualifier, skip the space // Checks if the second parameter of the if statement will pass through successfully // e.g. `"bob", "mary", "bill"` if (_position + 2 <= _line.Length - 1) { if (_line[_position + 1].Equals(' ') && _line[_position + 2].Equals(_settings.TextQualifier)) { _position++; } } } // Regular character else { _work.Append(c); } } State = CSVState.Done; return null; } } }