Skip to content

Commit

Permalink
do not skip characters when IgnoreErrors is true (malformed characters are now appended instead of dropped)
Browse files Browse the repository at this point in the history
support line feed (LF) as CSV row line break (if not quoted)
performance improvement
add more test cases (including cases from lumen-works)
enrich the IDataResolver interface
  • Loading branch information
fengzhenqiong committed Dec 19, 2020
1 parent 78004cd commit 1c1ca12
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 24 deletions.
50 changes: 36 additions & 14 deletions Sky.Data.Csv/CsvReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ public class CsvReader<T> : IEnumerable<T>, IDisposable
private readonly CsvReaderSettings mCsvSettings;
private readonly String mFilePath;

private readonly IDataResolver<T> dataResolver;
private readonly Dictionary<String, List<String>> cachedRows = new Dictionary<String, List<String>>();
private Boolean fileHeaderAlreadySkipped = false;
private readonly IDataResolver<T> mDataResolver;
private readonly Dictionary<String, List<String>> mCachedRows = new Dictionary<String, List<String>>(1024);
private Boolean mFileHeaderAlreadySkipped = false;

private void ThrowException(String rowText, Int32 rowIndex, Int32 chPos)
{
Expand Down Expand Up @@ -96,9 +96,10 @@ private List<String> ParseOneRow(String oneRowText)
}
else if (nextChar == '\"')
mCsvTextBuilder.Append(oneRowText[charPos = charPos + 1]);
//Code should not hit this point, it indicates an error
else if (!this.mCsvSettings.IgnoreErrors)
ThrowException(oneRowText, this.LineIndex, charPos);
else
mCsvTextBuilder.Append(theChar);
}
}
#endregion
Expand Down Expand Up @@ -131,7 +132,7 @@ protected static void CheckFilePath(String filePath)
}
protected CsvReader(Stream stream, CsvReaderSettings settings, IDataResolver<T> dataResolver)
{
this.dataResolver = dataResolver;
this.mDataResolver = dataResolver;
this.mCsvSettings = settings = settings ?? new CsvReaderSettings();
this.mCsvSettings.UseCache = settings.UseCache || settings.SkipDuplicates;
EnsureParameters(stream, settings, dataResolver);
Expand Down Expand Up @@ -167,9 +168,11 @@ public List<String> ReadRow()
var commentHint = this.mCsvSettings.CommentHint;
while (true)
{
if (!this.EnsureBuffer()) return null;
if (this.mBufferPosition >= this.mBufferCharCount)
if (!this.EnsureBuffer()) return null;

#region Read one real CSV record line
var quoted = false;
mCsvTextBuilder.Length = 0;
while (this.mBufferPosition < this.mBufferCharCount)
{
Expand All @@ -186,7 +189,26 @@ public List<String> ReadRow()
//for macintosh csv format, it uses \r as line break
break;
}
else mCsvTextBuilder.Append(firstChar);
//else mCsvTextBuilder.Append(firstChar);
else if (!quoted && firstChar == '\n') break;
else
{
mCsvTextBuilder.Append(firstChar);
if (firstChar == '\"')
{
if (!quoted) quoted = true;
else
{
if (this.mBufferPosition >= this.mBufferCharCount)
if (!this.EnsureBuffer()) break;

if (this.mBuffer[this.mBufferPosition] == '\"')
mCsvTextBuilder.Append(this.mBuffer[this.mBufferPosition++]);
else
quoted = false;
}
}
}

//if there is no line break, we should read to the end of file.
if (this.mBufferPosition >= this.mBufferCharCount)
Expand All @@ -205,22 +227,22 @@ public List<String> ReadRow()
continue;

++this.RowIndex; //header is counted for row numbers
if (!fileHeaderAlreadySkipped && this.mCsvSettings.HasHeader)
if (!mFileHeaderAlreadySkipped && this.mCsvSettings.HasHeader)
{
fileHeaderAlreadySkipped = true;
mFileHeaderAlreadySkipped = true;
continue;
}

//if a row is in cache, it's already read, process skip duplicates
if (this.mCsvSettings.SkipDuplicates && cachedRows.ContainsKey(oneRowText))
if (this.mCsvSettings.SkipDuplicates && mCachedRows.ContainsKey(oneRowText))
continue;

//if use cache and the row is already read, use the existing value
if (this.mCsvSettings.UseCache && cachedRows.ContainsKey(oneRowText))
return cachedRows[oneRowText];
if (this.mCsvSettings.UseCache && mCachedRows.ContainsKey(oneRowText))
return mCachedRows[oneRowText];
var temporaryData = this.ParseOneRow(oneRowText);
//if use cache and the row is not read, add it to cache
if (this.mCsvSettings.UseCache) cachedRows[oneRowText] = temporaryData;
if (this.mCsvSettings.UseCache) mCachedRows[oneRowText] = temporaryData;
#endregion

return temporaryData;
Expand Down Expand Up @@ -306,7 +328,7 @@ public IEnumerator<T> GetEnumerator()
{
for (var row = this.ReadRow(); row != null; row = this.ReadRow())
{
yield return this.dataResolver.Deserialize(row);
yield return this.mDataResolver.Deserialize(row);
};
}
#endregion
Expand Down
19 changes: 15 additions & 4 deletions Sky.Data.Csv/CsvResolver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ public interface IDataResolver<TData>
/// </summary>
/// <param name="data">The list of String values.</param>
/// <returns>The deserialized object.</returns>
TData Deserialize(IEnumerable<String> data);
/// <summary>
/// Deserialize the specified list of String values to an object.
/// </summary>
/// <param name="data">The list of String values.</param>
/// <returns>The deserialized object.</returns>
TData Deserialize(List<String> data);
}

Expand All @@ -38,14 +44,19 @@ public interface IDataResolver<TData>
/// <typeparam name="TData">The generic type of which objects will be serialized and deserialized.</typeparam>
public abstract class AbstractDataResolver<TData> : IDataResolver<TData>
{
    /// <summary>
    /// Deserialize the specified list of String values to an object.
    /// </summary>
    /// <param name="data">The list of String values.</param>
    /// <returns>The deserialized object.</returns>
    public abstract TData Deserialize(List<String> data);

    /// <summary>
    /// Deserialize the specified sequence of String values to an object.
    /// </summary>
    /// <param name="data">The sequence of String values.</param>
    /// <returns>The deserialized object.</returns>
    /// <exception cref="ArgumentNullException">
    /// Thrown by the List constructor when <paramref name="data"/> is null.
    /// </exception>
    public virtual TData Deserialize(IEnumerable<String> data)
    {
        // Use "as" rather than a hard cast: a hard cast throws InvalidCastException
        // for arrays and other non-List sequences, so the copy fallback could never
        // run. With "as", an existing List<String> is passed through without copying
        // and every other sequence is copied into a new list.
        return Deserialize(data as List<String> ?? new List<String>(data));
    }

    /// <summary>
    /// Serialize the specified object to a list of String values.
    /// </summary>
    /// <param name="data">The object to serialize.</param>
    /// <returns>The list of String values.</returns>
    public abstract List<String> Serialize(TData data);

    /// <summary>
    /// Deserialize the specified String values to an object.
    /// </summary>
    /// <param name="data">The String values, passed individually or as an array.</param>
    /// <returns>The deserialized object.</returns>
    /// <exception cref="ArgumentNullException">
    /// Thrown by the List constructor when <paramref name="data"/> is null.
    /// </exception>
    public virtual TData Deserialize(params String[] data)
    {
        // The argument is a List<String>, so this binds to the abstract overload.
        return Deserialize(new List<String>(data));
    }
}

/// <summary>
Expand Down
12 changes: 6 additions & 6 deletions Sky.Data.Csv/CsvWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ public class CsvWriter<T> : IDisposable
{
private readonly StreamWriter mWriter;
private readonly CsvWriterSettings mCsvSettings;
private readonly Char[] needQuoteChars;
private readonly Char[] mNeedQuoteChars;

private readonly IDataResolver<T> dataResolver;
private readonly IDataResolver<T> mDataResolver;

private static void EnsureParameters(Stream stream, CsvWriterSettings settings, IDataResolver<T> dataResolver)
{
Expand Down Expand Up @@ -51,11 +51,11 @@ protected static void CheckFilePath(String filePath, CsvWriterSettings settings)
}
protected CsvWriter(Stream stream, CsvWriterSettings settings, IDataResolver<T> dataResolver)
{
this.dataResolver = dataResolver;
this.mDataResolver = dataResolver;
this.mCsvSettings = settings = settings ?? new CsvWriterSettings();
EnsureParameters(stream, settings, dataResolver);
settings.BufferSize = Math.Min(4096 * 1024, Math.Max(settings.BufferSize, 4096));
needQuoteChars = new Char[] { '\n', '\"', settings.Seperator };
mNeedQuoteChars = new Char[] { '\n', '\"', settings.Seperator };
this.mWriter = new StreamWriter(stream, settings.Encoding, settings.BufferSize);
if (stream is FileStream)
{
Expand Down Expand Up @@ -146,7 +146,7 @@ public CsvWriter<T> WriteRows(IEnumerable<T> data)
/// <returns>The current CsvWriter instance.</returns>
public CsvWriter<T> WriteRow(T data)
{
    // Serialize the object through the configured resolver (the field is named
    // mDataResolver after the rename in this commit), then delegate to the
    // String-based WriteRow overload.
    return this.WriteRow(this.mDataResolver.Serialize(data));
}
/// <summary>
/// Write a list of String values as a CSV record to the current CSV file.
Expand All @@ -162,7 +162,7 @@ public CsvWriter<T> WriteRow(IEnumerable<String> data)
{
var valueString = originalCellValueString ?? String.Empty;

if (Array.Exists(needQuoteChars, c => valueString.IndexOf(c) >= 0))
if (Array.Exists(mNeedQuoteChars, c => valueString.IndexOf(c) >= 0))
{
valueString = String.Format("\"{0}\"",
valueString.Replace("\"", "\"\"").Replace("\r\n", "\r"));
Expand Down
Binary file added TestData.Csv/csv-lumentest.rar
Binary file not shown.
10 changes: 10 additions & 0 deletions TestData.Csv/csv-ms-dos-complex.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
·,~,`,!,&,@,#,$,%,^,*,(,),

_,+,-,=,[,],{,},;,:,a'b,,,","";",abc
REM this is a comment line
#4er,>,.,?,/,\,|, ,s,a,d,f,g
"""""",',;;,#$%^,(*&^,}{{IU,""":""P
OY$E",i,"./, ';l; k; ajldfp","k hk h
k hkj h
kh s",,.' ;'adl 'f l'a;l 'la'l' al'df,s
one more line

0 comments on commit 1c1ca12

Please sign in to comment.