You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
269 lines
10 KiB
269 lines
10 KiB
using System;
|
|
using System.Collections.Generic;
|
|
using System.Text;
|
|
|
|
// The following ReSharper inspections are disabled because we don't want it to recommend rewriting constructs that are needed for Net20:
|
|
// ReSharper disable LoopCanBeConvertedToQuery
|
|
// ReSharper disable ConvertIfStatementToNullCoalescingAssignment
|
|
// ReSharper disable ReplaceSubstringWithRangeIndexer
|
|
// ReSharper disable InvertIf
|
|
// ReSharper disable ConvertIfStatementToSwitchExpression
|
|
// ReSharper disable ConvertIfStatementToSwitchStatement
|
|
// ReSharper disable ReturnTypeCanBeEnumerable.Global
|
|
|
|
namespace CSVNET
|
|
{
|
|
/// <summary>
/// Describes where CSV processing stands after the text that has been consumed so far
/// </summary>
public enum CSVState
{
    /// <summary>
    /// The end of the CSV has been reached and processing is complete
    /// </summary>
    Done = 0,

    /// <summary>
    /// No additional text is required at the moment
    /// </summary>
    CanKeepGoing = 1,

    /// <summary>
    /// The end of the CSV was reached while a text qualifier was still open (it was never paired).
    /// For example:
    /// <c>1,2,3,"test</c>
    /// </summary>
    MissingTrailingQualifier = 2
}
|
|
|
|
/// <summary>
/// This state machine handles all functions of CSV processing except for the I/O, which can come in a variety
/// of forms, either from a stream or an in-memory collection.
///
/// Since some CSV files have a single row of data that comprises multiple lines, this state machine may or may
/// not produce one row of data for each chunk of text received.  Each call to <see cref="ParseChunk"/> returns
/// at most one row; any text following that row stays buffered for the next call.
/// </summary>
public class CSVStateMachine
{
    private readonly CSVSettings _settings;
    private readonly List<string> _list;
    private readonly StringBuilder _work;

    // Text that has been received but not yet turned into a completed row
    private string _line;

    // Index within _line of the most recently examined character; -1 means "nothing examined yet"
    private int _position;

    private char _delimiter;
    private bool _allowSepLine;
    private bool _inTextQualifier;

    // Set when ParseChunk had to stop before the end of the buffered text because only
    // additional text can resolve what it is looking at (e.g. an unclosed text qualifier)
    private bool _awaitingText;

    /// <summary>
    /// Whether the state machine has concluded or can continue processing
    /// </summary>
    public CSVState State { get; private set; }

    /// <summary>
    /// Returns true if we need more text
    /// </summary>
    /// <returns>
    /// True when nothing is buffered, when the previous call to <see cref="ParseChunk"/> stalled waiting for
    /// text it had not yet been given, or when fewer characters than one line separator remain unexamined.
    /// </returns>
    public bool NeedsMoreText()
    {
        if (_awaitingText || String.IsNullOrEmpty(_line))
        {
            return true;
        }

        return _position + _settings.LineSeparator.Length >= _line.Length;
    }

    /// <summary>
    /// Constructs a new state machine to begin processing CSV text
    /// </summary>
    /// <param name="settings">The settings to parse with; when null, <c>CSVSettings.CSV</c> is used</param>
    public CSVStateMachine(CSVSettings settings)
    {
        _settings = settings ?? CSVSettings.CSV;
        _list = new List<string>();
        _work = new StringBuilder();
        _line = "";
        _position = -1;
        _awaitingText = false;

        // The presence of a "sep=" line may affect these values
        _delimiter = _settings.FieldDelimiter;
        _allowSepLine = _settings.AllowSepLine;

        // We are ready for work
        State = CSVState.CanKeepGoing;
    }

    /// <summary>
    /// Parse a new chunk of text retrieved via some other means than a stream.
    ///
    /// Call this function when you are retrieving your own text and when each chunk may or may not
    /// include line separators, and your stream does not consume line separators on its own.
    /// </summary>
    /// <param name="chunk">The new data to process; may be null or empty to continue with buffered text</param>
    /// <param name="reachedEnd">Set this value to true when no further text will follow this chunk</param>
    /// <returns>
    /// If this parsing operation produces a valid row, this will be non-null.  A null result means no row
    /// could be completed; inspect <see cref="State"/> and <see cref="NeedsMoreText"/> to decide what to do next.
    /// </returns>
    public string[] ParseChunk(string chunk, bool reachedEnd)
    {
        // Whatever stalled the previous call is re-evaluated against the text we have now
        _awaitingText = false;

        // Detect end of stream
        if (reachedEnd && string.IsNullOrEmpty(chunk) && _position == -1 && string.IsNullOrEmpty(_line))
        {
            State = CSVState.Done;
            return null;
        }

        // If we're at the end of the line, remember to backtrack one because we increment immediately
        if (_position == _line.Length)
        {
            _position -= 1;
        }

        // Add this chunk to the current processing logic
        _line += chunk;

        // Check for the presence of a "sep=" line once at the beginning of a stream
        if (_allowSepLine)
        {
            var newDelimiter = CSV.ParseSepLine(_line);
            _allowSepLine = false;
            if (newDelimiter != null)
            {
                // NOTE(review): the buffered text is not consumed on this path - confirm that
                // CSV.ParseSepLine and the callers account for the sep line remaining in the buffer.
                _delimiter = newDelimiter.Value;
                return null;
            }
        }

        // Process one character at a time from the current line
        while (_position < _line.Length || !reachedEnd)
        {
            _position++;

            // Have we run out of buffered text?
            if (_position >= _line.Length)
            {
                if (!reachedEnd)
                {
                    return null;
                }

                // If we reached the end while still in a text qualifier, the CSV is broken
                if (_inTextQualifier)
                {
                    State = CSVState.MissingTrailingQualifier;
                    return null;
                }

                // We always add the final work item here because trailing empty strings are valid
                State = CSVState.Done;
                CompleteField();
                _line = string.Empty;
                _position = -1;
                return CompleteRow();
            }

            var c = _line[_position];

            // If we are inside a text qualifier, can we find the end?
            if (_inTextQualifier)
            {
                if (!TryReadQualifiedText(reachedEnd))
                {
                    return null;
                }
            }
            // Is this the start of a text qualified field?
            else if (c == _settings.TextQualifier && _work.Length == 0)
            {
                // Step back so the next iteration lands on the opening qualifier again,
                // this time with _inTextQualifier set
                _inTextQualifier = true;
                _position--;
            }
            // Are we at a line separator? Let's do a quick test first
            else if (c == _settings.LineSeparator[0])
            {
                var separator = _settings.LineSeparator;

                // If we don't have enough characters left to test the line separator properly, ask for more
                var notEnoughChars = _position + separator.Length > _line.Length;
                if (notEnoughChars && !reachedEnd)
                {
                    // Backtrack one character so we can pick up the line separator completely next time
                    _awaitingText = true;
                    _position--;
                    return null;
                }

                if (notEnoughChars)
                {
                    // We have reached the end, but this isn't a complete line separator, it's just text
                    _work.Append(c);
                }
                else if (string.CompareOrdinal(_line, _position, separator, 0, separator.Length) == 0)
                {
                    // A full line separator: drop the consumed text and hand back the finished row
                    _line = _line.Substring(_position + separator.Length);
                    _position = -1;
                    CompleteField();
                    return CompleteRow();
                }
                else
                {
                    // It's not a line separator, it's just a normal character
                    _work.Append(c);
                }
            }
            // Does this start a new field?
            else if (c == _delimiter)
            {
                CompleteField();

                // Special case: when the user has written a casual comma, space, and text qualifier, skip the space
                // e.g. `"bob", "mary", "bill"`
                // The bounds test guarantees both look-ahead characters exist in the buffer.
                if (_position + 2 < _line.Length
                    && _line[_position + 1] == ' '
                    && _line[_position + 2] == _settings.TextQualifier)
                {
                    _position++;
                }
            }
            // Regular character
            else
            {
                _work.Append(c);
            }
        }

        State = CSVState.Done;
        return null;
    }

    /// <summary>
    /// Consumes qualified text starting after the qualifier at the current position, collapsing doubled-up
    /// qualifiers into a single literal one, until the closing qualifier is found.
    /// </summary>
    /// <param name="reachedEnd">True when no further text will arrive</param>
    /// <returns>
    /// True when the closing qualifier was consumed.  False when parsing must stop: either more text is
    /// required, or the text has ended and <see cref="State"/> was set to MissingTrailingQualifier.
    /// </returns>
    private bool TryReadQualifiedText(bool reachedEnd)
    {
        var qualifier = _settings.TextQualifier;
        while (true)
        {
            var close = _line.IndexOf(qualifier, _position + 1);
            if (close < 0)
            {
                if (reachedEnd)
                {
                    State = CSVState.MissingTrailingQualifier;
                }
                else
                {
                    _awaitingText = true;
                }

                // Backtrack one character so we can move forward when the next chunk loads
                _position--;
                return false;
            }

            // A qualifier that is the very last buffered character might be the first half of a doubled-up
            // qualifier whose second half arrives with the next chunk, so we cannot classify it yet
            if (close == _line.Length - 1 && !reachedEnd)
            {
                _awaitingText = true;
                _position--;
                return false;
            }

            // Append the text between the qualifiers
            _work.Append(_line, _position + 1, close - _position - 1);
            _position = close;

            // If the user put in a doubled-up qualifier, e.g. `""`, insert a single one and continue
            if (close + 1 < _line.Length && _line[close + 1] == qualifier)
            {
                _work.Append(qualifier);
                _position++;
                continue;
            }

            // We're done parsing this text qualifier
            _inTextQualifier = false;
            return true;
        }
    }

    /// <summary>
    /// Moves the text accumulated for the current field into the current row, storing null instead when
    /// null tokens are permitted and the text is exactly the null token, and resets the accumulator.
    /// </summary>
    private void CompleteField()
    {
        var text = _work.ToString();
        _work.Length = 0;

        if (_settings.AllowNull && string.Equals(text, _settings.NullToken, StringComparison.Ordinal))
        {
            _list.Add(null);
        }
        else
        {
            _list.Add(text);
        }
    }

    /// <summary>
    /// Returns the fields collected for the current row and clears the collection for the next row.
    /// </summary>
    private string[] CompleteRow()
    {
        var row = _list.ToArray();
        _list.Clear();
        return row;
    }
}
|
|
} |