Skip to content

Commit

Permalink
improve reader performance
Browse files Browse the repository at this point in the history
support skip duplicate records in reader
fix a potential out of index bug
  • Loading branch information
fengzhenqiong committed Dec 16, 2020
1 parent 75886de commit 78004cd
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 30 deletions.
69 changes: 40 additions & 29 deletions Sky.Data.Csv/CsvReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public class CsvReader<T> : IEnumerable<T>, IDisposable
private readonly Char[] mBuffer;
private Int32 mBufferPosition = 0, mBufferCharCount = 0;
private readonly StreamReader mReader;
private readonly StringBuilder mCellValueBuilder = new StringBuilder(128);
private readonly StringBuilder mCsvTextBuilder = new StringBuilder(256);
private readonly CsvReaderSettings mCsvSettings;
private readonly String mFilePath;

Expand Down Expand Up @@ -53,16 +53,13 @@ private List<String> ParseOneRow(String oneRowText)
++this.RecordIndex;

var textLen = oneRowText.Length;
var sepChar = this.mCsvSettings.Seperator;

if (textLen == 0) return new List<String>();
if (this.mCsvSettings.UseCache && cachedRows.ContainsKey(oneRowText))
return cachedRows[oneRowText];

var recordInfo = new List<String>(16);
var sepChar = this.mCsvSettings.Seperator;
for (Int32 charPos = 0; charPos < textLen; ++charPos)
{
mCellValueBuilder.Length = 0;
mCsvTextBuilder.Length = 0;
var firstChar = oneRowText[charPos];

#region Non-Quoted CSV Cell Value Processor
Expand All @@ -71,47 +68,44 @@ private List<String> ParseOneRow(String oneRowText)
for (var c = firstChar; c != sepChar; c = oneRowText[charPos])
{
if (c == '\"' && !this.mCsvSettings.IgnoreErrors)
ThrowException(oneRowText, this.RowIndex, charPos);
ThrowException(oneRowText, this.LineIndex, charPos);

mCellValueBuilder.Append(c);
mCsvTextBuilder.Append(c);
if (++charPos >= textLen) break;
}
recordInfo.Add(mCellValueBuilder.ToString());
recordInfo.Add(mCsvTextBuilder.ToString());
}
#endregion
#region Quoted CSV Cell Value Processor
else //This is a quoted cell value
{
if (textLen < charPos + 1 && !this.mCsvSettings.IgnoreErrors)
ThrowException(oneRowText, this.RowIndex, charPos);
ThrowException(oneRowText, this.LineIndex, charPos);

for (++charPos; charPos < textLen; ++charPos)
{
Char theChar = oneRowText[charPos], nextChar;

if (theChar != '\"')
mCellValueBuilder.Append(theChar);
mCsvTextBuilder.Append(theChar);
else if ((textLen <= charPos + 1) || (nextChar = oneRowText[charPos + 1]) == sepChar)
{
++charPos;
recordInfo.Add(mCellValueBuilder.ToString());
recordInfo.Add(mCsvTextBuilder.ToString());
break;
}
else if (nextChar == '\"')
mCellValueBuilder.Append(oneRowText[charPos = charPos + 1]);
mCsvTextBuilder.Append(oneRowText[charPos = charPos + 1]);
//Code should not hit this point, it indicates an error
else if (!this.mCsvSettings.IgnoreErrors)
ThrowException(oneRowText, this.RowIndex, charPos);
ThrowException(oneRowText, this.LineIndex, charPos);
}
}
#endregion
}
if (oneRowText[textLen - 1] == sepChar)
recordInfo.Add(String.Empty);

if (this.mCsvSettings.UseCache && !cachedRows.ContainsKey(oneRowText))
cachedRows[oneRowText] = recordInfo;

return recordInfo;
}

Expand Down Expand Up @@ -139,6 +133,7 @@ protected CsvReader(Stream stream, CsvReaderSettings settings, IDataResolver<T>
{
this.dataResolver = dataResolver;
this.mCsvSettings = settings = settings ?? new CsvReaderSettings();
this.mCsvSettings.UseCache = settings.UseCache || settings.SkipDuplicates;
EnsureParameters(stream, settings, dataResolver);
settings.BufferSize = Math.Min(4096 * 1024, Math.Max(settings.BufferSize, 4096));
this.mReader = new StreamReader(stream, settings.Encoding, false, settings.BufferSize);
Expand Down Expand Up @@ -169,35 +164,39 @@ protected CsvReader(String filePath, CsvReaderSettings settings, IDataResolver<T
/// <returns>A list of String values read.</returns>
public List<String> ReadRow()
{
if (!this.EnsureBuffer())
return null;

var oneRowBuilder = new StringBuilder();
var oneRowText = String.Empty;
var commentHint = this.mCsvSettings.CommentHint;

while (true)
{
oneRowBuilder.Length = 0;
if (!this.EnsureBuffer()) return null;

#region Read one real CSV record line
mCsvTextBuilder.Length = 0;
while (this.mBufferPosition < this.mBufferCharCount)
{
var firstChar = this.mBuffer[this.mBufferPosition++];

if (firstChar == '\r')
{
//for macintosh csv format, it uses \r as line break
if (this.EnsureBuffer() && this.mBuffer[this.mBufferPosition] == '\n')
if (this.mBufferPosition >= this.mBufferCharCount)
if (!this.EnsureBuffer()) break;

if (this.mBuffer[this.mBufferPosition] == '\n')
++this.mBufferPosition;

//for macintosh csv format, it uses \r as line break
break;
}
else oneRowBuilder.Append(firstChar);
else mCsvTextBuilder.Append(firstChar);

//if there is no line break, we should read to the end of file.
if (this.mBufferPosition >= this.mBufferCharCount)
if (!this.EnsureBuffer()) break;
}
#endregion

#region Processing header/empty lines/cache
++this.LineIndex;
oneRowText = oneRowBuilder.ToString();
var oneRowText = mCsvTextBuilder.ToString();

//the first non-skipped row will be treated as the header or the first record
if (this.mCsvSettings.SkipEmptyLines && oneRowText.Length == 0)
Expand All @@ -208,10 +207,22 @@ public List<String> ReadRow()
++this.RowIndex; //header is counted for row numbers
if (!fileHeaderAlreadySkipped && this.mCsvSettings.HasHeader)
{
fileHeaderAlreadySkipped = true; continue;
fileHeaderAlreadySkipped = true;
continue;
}

//if a row is in cache, it's already read, process skip duplicates
if (this.mCsvSettings.SkipDuplicates && cachedRows.ContainsKey(oneRowText))
continue;

//if use cache and the row is already read, use the existing value
if (this.mCsvSettings.UseCache && cachedRows.ContainsKey(oneRowText))
return cachedRows[oneRowText];
var temporaryData = this.ParseOneRow(oneRowText);
//if use cache and the row is not read, add it to cache
if (this.mCsvSettings.UseCache) cachedRows[oneRowText] = temporaryData;
#endregion

return temporaryData;
}
}
Expand Down
9 changes: 8 additions & 1 deletion Sky.Data.Csv/CsvSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ public class CsvReaderSettings : CsvSettings
{
/// <summary>
/// Whether or not to use cache when reading a CSV file, default is false.
/// This value is useful when there are many duplicate records in the CSV file.
/// This value is useful when most records are duplicates in a CSV file.
/// Please note that if most records are unique in a large CSV file, setting this option to true will take a huge amount of memory.
/// If SkipDuplicates is true, this value will also be treated as true.
/// </summary>
public Boolean UseCache { get; set; }
/// <summary>
Expand All @@ -56,6 +58,11 @@ public class CsvReaderSettings : CsvSettings
/// </summary>
public String CommentHint { get; set; }
/// <summary>
/// Whether or not to skip duplicate records, default is false.
/// If this option is set to true, duplicate records will be skipped and ignored when reading and writing CSV files.
/// </summary>
public Boolean SkipDuplicates { get; set; }
/// <summary>
/// Whether the CSV file has a header, default is false.
/// If true, the first non-skipped line will be regarded as the header of the CSV file and ignored.
/// Refer to CommentHint and SkipEmptyLines for more details.
Expand Down

0 comments on commit 78004cd

Please sign in to comment.