Skip to content

Commit

Permalink
Updated delimiter detection algorithm.
Browse files Browse the repository at this point in the history
- Strips escaped text based on mode.
- Only looks for delimiters that appear on every line.
- Uses CultureInfo.TextInfo.ListSeparator if it's on every line.
  • Loading branch information
JoshClose committed May 10, 2022
1 parent 81550d7 commit f3d8038
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 15 deletions.
5 changes: 5 additions & 0 deletions src/CsvHelper/Configuration/IParserConfiguration.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ namespace CsvHelper.Configuration
/// </summary>
public interface IParserConfiguration
{
/// <summary>
/// Gets the culture info used to read and write CSV files.
/// </summary>
CultureInfo CultureInfo { get; }

/// <summary>
/// Cache fields that are created when parsing.
/// Default is false.
Expand Down
5 changes: 0 additions & 5 deletions src/CsvHelper/Configuration/IReaderConfiguration.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,6 @@ public interface IReaderConfiguration : IParserConfiguration
/// </summary>
ReadingExceptionOccurred ReadingExceptionOccurred { get; }

/// <summary>
/// Gets the culture info used to read and write CSV files.
/// </summary>
CultureInfo CultureInfo { get; }

/// <summary>
/// Prepares the header field for matching against a member name.
/// The header field and the member name are both run through this function.
Expand Down
63 changes: 54 additions & 9 deletions src/CsvHelper/CsvParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,22 @@ private void DetectDelimiter()
{
var text = new string(buffer, 0, charsRead);

if (mode == CsvMode.RFC4180)
{
// Remove text in between pairs of quotes.
text = Regex.Replace(text, $"({quote}.*?{quote})", string.Empty);
}
else if (mode == CsvMode.Escape)
{
// Remove escaped characters.
text = Regex.Replace(text, $"({escape}.)", string.Empty);
}

var lineDelimiterCounts = new List<Dictionary<string, int>>();
while (text.Length > 0)
{
// Since all escaped text has been removed, we can reliably read line by line.
var index = text.IndexOf(newLine);

var line = index > -1 ? text.Substring(0, index + newLine.Length) : text;

var delimiterCounts = new Dictionary<string, int>();
Expand All @@ -294,17 +306,50 @@ private void DetectDelimiter()
delimiterCounts[delimiter] = Regex.Matches(line, pattern).Count;
}

var maxCount = delimiterCounts.OrderByDescending(c => c.Value).First();
if (maxCount.Value > 0)
{
delimiter = maxCount.Key;
delimiterFirstChar = delimiter[0];
configuration.Validate();
lineDelimiterCounts.Add(delimiterCounts);

break;
text = index > -1 ? text.Substring(index + newLine.Length) : string.Empty;
}

if (lineDelimiterCounts.Count > 1)
{
// The last line isn't complete and can't be used to reliably detect a delimiter.
lineDelimiterCounts.Remove(lineDelimiterCounts.Last());
}

// Rank only the delimiters that appear on every line.
var delimiters =
(
from counts in lineDelimiterCounts
from count in counts
group count by count.Key into g
where g.All(x => x.Value > 0)
let sum = g.Sum(x => x.Value)
orderby sum descending
select new
{
Delimiter = g.Key,
Count = sum
}
).ToList();

text = index > -1 ? text.Substring(index + newLine.Length) : string.Empty;
string? newDelimiter = null;
if (delimiters.Any(x => x.Delimiter == configuration.CultureInfo.TextInfo.ListSeparator))
{
// The culture's separator is on every line. Assume this is the delimiter.
newDelimiter = configuration.CultureInfo.TextInfo.ListSeparator;
}
else
{
// Choose the highest ranked delimiter.
newDelimiter = delimiters.Select(x => x.Delimiter).FirstOrDefault();
}

if (newDelimiter != null)
{
delimiter = newDelimiter;
delimiterFirstChar = newDelimiter[0];
configuration.Validate();
}
}

Expand Down
40 changes: 39 additions & 1 deletion tests/CsvHelper.Tests/Parsing/DetectDelimiterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ public void Parse_TextHasRegularCharDelimiter_DetectsDelimiter()
}

[Fact]
public void Parse_MultipleLines_DetectsDelimiterFromFirstLineOnly()
public void Parse_MultipleLines_DetectsDelimiterThatIsOnEveryLine()
{
var s = new StringBuilder();
s.Append("Id;Name\r\n");
Expand Down Expand Up @@ -241,5 +241,43 @@ public void Parse_NoDelimiter_DoesNotDetect()
Assert.Equal("`", parser.Delimiter);
}
}

[Fact]
public void Parse_CulturesSeparatorOccursLessButIsOnEveryLine_CulturesSeparatorIsDetected()
{
    // Both ';' and ',' are present on every line. ';' occurs more often, yet the
    // test expects ',' (the culture's separator, per the test name) to be chosen.
    var csv = new StringBuilder()
        .Append("1;2,3;4\r\n")
        .Append("5;6,7;8\r\n")
        .ToString();
    var configuration = new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        DetectDelimiter = true,
    };

    using (var textReader = new StringReader(csv))
    using (var csvParser = new CsvParser(textReader, configuration))
    {
        csvParser.Read();

        Assert.Equal(",", csvParser.Delimiter);
    }
}

[Fact]
public void Parse_CulturesSeparatorOccursLessAndIsNotOnEveryLine_CulturesSeparatorIsDetected()
{
    // ',' is absent from the first line, so ';' is the only candidate present on
    // every line and is the delimiter this test expects.
    // NOTE(review): the "CulturesSeparatorIsDetected" suffix in the method name
    // contradicts the assertion below, which expects ';' rather than ','.
    var csv = new StringBuilder()
        .Append("1;2;3;4\r\n")
        .Append("5;6,7;8\r\n")
        .ToString();
    var configuration = new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        DetectDelimiter = true,
    };

    using (var textReader = new StringReader(csv))
    using (var csvParser = new CsvParser(textReader, configuration))
    {
        csvParser.Read();

        Assert.Equal(";", csvParser.Delimiter);
    }
}
}
}

0 comments on commit f3d8038

Please sign in to comment.