Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

RDMP-143 Reorder Excel Attacher Processes #1752

Merged
merged 9 commits into from
Mar 1, 2024
40 changes: 5 additions & 35 deletions Rdmp.Core/DataLoad/Modules/Attachers/ExcelAttacher.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) The University of Dundee 2018-2019
// Copyright (c) The University of Dundee 2018-2024
// This file is part of the Research Data Management Platform (RDMP).
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
Expand All @@ -9,7 +9,6 @@
using System.Data;
using System.IO;
using System.Linq;
using System.Text;
using Rdmp.Core.Curation.Data;
using Rdmp.Core.DataFlowPipeline;
using Rdmp.Core.DataFlowPipeline.Requirements;
Expand Down Expand Up @@ -84,51 +83,22 @@ protected override void OpenFile(FileInfo fileToLoad, IDataLoadEventListener lis
$"About to start processing {fileToLoad.FullName}"));

var columnTranslation = ConvertColumnOffsetToInt();
_dataTable = _hostedSource.GetChunk(listener, cancellationToken, RowOffset, columnTranslation);

string[] replacementHeadersSplit = { };
if (!string.IsNullOrEmpty(ForceReplacementHeaders))
{
//split headers by , (and trim leading/trailing whitespace).
var replacementHeadersSplit = ForceReplacementHeaders.Split(',')
replacementHeadersSplit = ForceReplacementHeaders.Split(',')
.Select(h => string.IsNullOrWhiteSpace(h) ? h : h.Trim()).ToArray();

listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information,
$"Force headers will make the following header changes:{GenerateASCIIArtOfSubstitutions(replacementHeadersSplit, _dataTable.Columns)}"));

if (replacementHeadersSplit.Length != _dataTable.Columns.Count)
listener.OnNotify(this,
new NotifyEventArgs(ProgressEventType.Error,
$"ForceReplacementHeaders was set but it had {replacementHeadersSplit.Length} column header names while the file had {_dataTable.Columns.Count} (there must be the same number of replacement headers as headers in the excel file)"));
else
for (var i = 0; i < replacementHeadersSplit.Length; i++)
_dataTable.Columns[i].ColumnName =
replacementHeadersSplit[i]; //rename the columns to match the forced replacements
}

_dataTable = _hostedSource.GetChunk(listener, cancellationToken, RowOffset, columnTranslation, replacementHeadersSplit);

//all data should now be exhausted
if (_hostedSource.GetChunk(listener, cancellationToken, RowOffset, columnTranslation) != null)
throw new Exception(
"Hosted source served more than 1 chunk, expected all the data to be read from the Excel file in one go");
}

/// <summary>
/// Produces a textual listing of the header renames, one entry per column position, each written as
/// <c>[position]original&gt;&gt;&gt;replacement</c> and preceded by <see cref="Environment.NewLine"/>.
/// When one side has fewer entries than the other, the missing value is rendered as <c>???</c>.
/// </summary>
/// <param name="replacementHeadersSplit">Replacement header names, by position</param>
/// <param name="columns">Columns whose current names are the originals, by position</param>
/// <returns>The concatenated listing (an empty string when both inputs are empty)</returns>
private static string GenerateASCIIArtOfSubstitutions(string[] replacementHeadersSplit,
    DataColumnCollection columns)
{
    // marker shown for whichever side runs out of entries first
    const string Unknown = "???";

    var rows = Math.Max(replacementHeadersSplit.Length, columns.Count);
    var art = new StringBuilder("");

    var position = 0;
    while (position < rows)
    {
        var to = Unknown;
        if (position < replacementHeadersSplit.Length)
        {
            to = replacementHeadersSplit[position];
        }

        var from = Unknown;
        if (position < columns.Count)
        {
            from = columns[position].ColumnName;
        }

        art.Append($"{Environment.NewLine}[{position}]{from}>>>{to}");
        position++;
    }

    return art.ToString();
}

protected override int IterativelyBatchLoadDataIntoDataTable(DataTable loadTarget, int maxBatchSize,
GracefulCancellationToken cancellationToken)
{
Expand Down
49 changes: 42 additions & 7 deletions Rdmp.Core/DataLoad/Modules/DataFlowSources/ExcelDataFlowSource.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) The University of Dundee 2018-2019
// Copyright (c) The University of Dundee 2018-2024
// This file is part of the Research Data Management Platform (RDMP).
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
Expand All @@ -11,6 +11,7 @@
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using ExcelNumberFormat;
using FAnsi.Discovery;
Expand Down Expand Up @@ -54,9 +55,9 @@ public class ExcelDataFlowSource : IPluginDataFlowSource<DataTable>, IPipelineRe
private DataTable dataReadFromFile;
private bool haveDispatchedDataTable;

public DataTable GetChunk(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken, int rowOffset = 0, int columnOffset = 0)
public DataTable GetChunk(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken, int rowOffset = 0, int columnOffset = 0, string[] replacementHeadersSplit = null)
{
dataReadFromFile ??= GetAllData(listener, cancellationToken,rowOffset,columnOffset);
dataReadFromFile ??= GetAllData(listener, cancellationToken, rowOffset, columnOffset, replacementHeadersSplit);

if (haveDispatchedDataTable)
return null;
Expand All @@ -66,7 +67,7 @@ public DataTable GetChunk(IDataLoadEventListener listener, GracefulCancellationT
return dataReadFromFile;
}

private DataTable GetAllData(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken, int rowOffset=0, int columnOffset=0)
private DataTable GetAllData(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken, int rowOffset = 0, int columnOffset = 0, string[] replacementHeadersSplit = null)
{
var sw = new Stopwatch();
sw.Start();
Expand All @@ -88,7 +89,7 @@ private DataTable GetAllData(IDataLoadEventListener listener, GracefulCancellati
(string.IsNullOrWhiteSpace(WorkSheetName) ? wb.GetSheetAt(0) : wb.GetSheet(WorkSheetName)) ??
throw new FlatFileLoadException(
$"The Excel sheet '{WorkSheetName}' was not found in workbook '{_fileToLoad.File.Name}'");
toReturn = GetAllData(worksheet, listener, rowOffset,columnOffset);
toReturn = GetAllData(worksheet, listener, rowOffset, columnOffset, replacementHeadersSplit);

//set the table name to the file name
toReturn.TableName =
Expand Down Expand Up @@ -122,8 +123,9 @@ private DataTable GetAllData(IDataLoadEventListener listener, GracefulCancellati
/// <param name="listener"></param>
/// <param name="rowOffset"></param>
/// <param name="columnOffset"></param>
/// <param name="replacementHeadersSplit"></param>
/// <returns></returns>
public DataTable GetAllData(ISheet worksheet, IDataLoadEventListener listener, int rowOffset=0, int columnOffset =0)
public DataTable GetAllData(ISheet worksheet, IDataLoadEventListener listener, int rowOffset = 0, int columnOffset = 0, string[] replacementHeadersSplit = null)
{
var toReturn = new DataTable();
toReturn.BeginLoadData();
Expand All @@ -137,7 +139,7 @@ public DataTable GetAllData(ISheet worksheet, IDataLoadEventListener listener, i
{
var row = (IRow)rowEnumerator.Current;
if (rowOffset - 1 > row.RowNum) continue;// .RowNumber is 0 indexed

//if all the cells in the current row are blank skip it (eliminates top of file whitespace)
if (row.Cells.All(c => string.IsNullOrWhiteSpace(c.ToString())))
continue;
Expand All @@ -149,6 +151,13 @@ public DataTable GetAllData(ISheet worksheet, IDataLoadEventListener listener, i
listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information,
$"Excel sheet {worksheet.SheetName} contains {nColumns}"));


if (replacementHeadersSplit is not null && replacementHeadersSplit.Any() && replacementHeadersSplit.Length != nColumns)
listener.OnNotify(this,
new NotifyEventArgs(ProgressEventType.Error,
$"ForceReplacementHeaders was set but it had {replacementHeadersSplit.Length} column header names while the file had {nColumns} (there must be the same number of replacement headers as headers in the excel file)"));

string[] originalHeaders = new string[nColumns];
for (var i = 0; i < nColumns; i++)
{
//if the cell header is blank
Expand All @@ -163,11 +172,19 @@ public DataTable GetAllData(ISheet worksheet, IDataLoadEventListener listener, i
{
h = cell.NumericCellValue.ToString();
}
if (replacementHeadersSplit is not null && replacementHeadersSplit.Any() && replacementHeadersSplit.Length == nColumns)
{
originalHeaders[i] = h;
h = replacementHeadersSplit[i];
}
if (string.IsNullOrWhiteSpace(h))
continue;

nonBlankColumns.Add(cell.ColumnIndex, toReturn.Columns.Add(h));
}
if(replacementHeadersSplit is not null && replacementHeadersSplit.Any())
listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information,
$"Force headers will make the following header changes:{GenerateASCIIArtOfSubstitutions(originalHeaders, replacementHeadersSplit)}"));

continue;
}
Expand Down Expand Up @@ -205,6 +222,24 @@ public DataTable GetAllData(ISheet worksheet, IDataLoadEventListener listener, i
return toReturn;
}

/// <summary>
/// Builds a multi-line description of the header substitutions, one line per column index in the form
/// <c>[index]original&gt;&gt;&gt;replacement</c>. Every line is preceded by <see cref="Environment.NewLine"/>.
/// Where one array is shorter than the other, the missing side is shown as <c>???</c>.
/// </summary>
/// <param name="headers">The original header names, by position</param>
/// <param name="replacements">The replacement header names, by position</param>
/// <returns>The concatenated substitution lines (an empty string when both arrays are empty)</returns>
private static string GenerateASCIIArtOfSubstitutions(string[] headers,
    string[] replacements)
{
    // placeholder used for the side that has no entry at a given index
    const string Missing = "???";

    var replacementCount = replacements.Length;
    var headerCount = headers.Length;
    var lineCount = replacementCount > headerCount ? replacementCount : headerCount;

    var lines = Enumerable.Range(0, lineCount).Select(index =>
    {
        var to = index < replacementCount ? replacements[index] : Missing;
        var from = index < headerCount ? headers[index] : Missing;

        return $"{Environment.NewLine}[{index}]{from}>>>{to}";
    });

    return string.Concat(lines);
}

/// <summary>
/// Returns the C# value that best represents the contents of the cell.
/// </summary>
Expand Down