-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature/rdmp 73 cohort holdouts (#1653)
* add top to cohort * working holdout flow * interim * basic ui flow * now filtering * working flow * working auto-holdout * fix test * add query * improved holdout * tidy up code * add description * revert test db * add holdout description * add todo * fixups from codeql * Minor syntax fix * Fix possible null deref * fix todo url * Tidy, typo fix * Remove disused field --------- Co-authored-by: James A Sutherland <[email protected]> Co-authored-by: James A Sutherland <>
- Loading branch information
Showing
13 changed files
with
1,002 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
67 changes: 67 additions & 0 deletions
67
Rdmp.Core/CohortCommitting/Pipeline/CohortHoldoutLookupRequest.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
// Copyright (c) The University of Dundee 2018-2019 | ||
// This file is part of the Research Data Management Platform (RDMP). | ||
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. | ||
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. | ||
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>. | ||
|
||
using System; | ||
using System.Data; | ||
using System.Globalization; | ||
using System.Security.Permissions; | ||
using NPOI.SS.Formula.Functions; | ||
using Rdmp.Core.Curation.Data.Cohort; | ||
using Rdmp.Core.Curation.Data.Pipelines; | ||
using Rdmp.Core.DataFlowPipeline; | ||
using Rdmp.Core.DataFlowPipeline.Requirements; | ||
using Rdmp.Core.MapsDirectlyToDatabaseTable; | ||
using Rdmp.Core.ReusableLibraryCode.Checks; | ||
|
||
namespace Rdmp.Core.CohortCommitting.Pipeline; | ||
|
||
/// <summary>
/// All details required to create a holdout set from a cohort
/// </summary>
public sealed class CohortHoldoutLookupRequest : PipelineUseCase, ICanBeSummarised, ICohortHoldoutLookupRequest
{
    /// <summary>
    /// Format expected for the <c>minDate</c> / <c>maxDate</c> constructor arguments (day/month/year).
    /// .NET custom format specifiers are case-sensitive: <c>dd</c> is day-of-month and <c>yyyy</c> is the year,
    /// whereas upper-case <c>D</c> and <c>Y</c> are not specifiers and would be matched as literal characters.
    /// </summary>
    private const string DateFormat = "dd/MM/yyyy";

    /// <summary>
    /// The cohort identification configuration the holdout is created from
    /// </summary>
    public CohortIdentificationConfiguration CIC { get; set; }

    /// <summary>
    /// Size of the holdout; interpreted together with <see cref="IsPercent"/>
    /// </summary>
    public int Count { get; set; }

    /// <summary>
    /// Flag supplied alongside <see cref="Count"/> indicating that the size is expressed as a percentage
    /// </summary>
    public bool IsPercent { get; set; }

    /// <summary>
    /// Free text description of the holdout
    /// </summary>
    public string Description { get; set; }

    /// <summary>
    /// Not populated by the constructor; left for callers to set
    /// </summary>
    public string WhereQuery { get; set; }

    /// <summary>
    /// Name of the holdout, also used by <see cref="GetSummary"/>
    /// </summary>
    public string Name { get; set; }

    /// <summary>
    /// Lower date bound; stays at <c>default(DateTime)</c> when no parseable value was supplied
    /// </summary>
    public DateTime MinDate { get; set; }

    /// <summary>
    /// Upper date bound; stays at <c>default(DateTime)</c> when no parseable value was supplied
    /// </summary>
    public DateTime MaxDate { get; set; }

    /// <summary>
    /// Name of the column that <see cref="MinDate"/> and <see cref="MaxDate"/> relate to
    /// </summary>
    public string DateColumnName { get; set; }

    /// <summary>
    /// Creates a new holdout request and registers itself as a pipeline initialization object.
    /// </summary>
    /// <param name="cic">Cohort identification configuration to take the holdout from</param>
    /// <param name="name">Name of the holdout</param>
    /// <param name="count">Size of the holdout (see <paramref name="isPercent"/>)</param>
    /// <param name="isPercent">True when <paramref name="count"/> is a percentage</param>
    /// <param name="description">Optional description</param>
    /// <param name="minDate">Optional lower date bound in <c>dd/MM/yyyy</c> format; ignored when null or not in that format</param>
    /// <param name="maxDate">Optional upper date bound in <c>dd/MM/yyyy</c> format; ignored when null or not in that format</param>
    /// <param name="dateColumnName">Optional name of the date column the bounds apply to</param>
    public CohortHoldoutLookupRequest(CohortIdentificationConfiguration cic, string name, int count, bool isPercent, string description = "", string minDate = null, string maxDate = null, string dateColumnName = null)
    {
        CIC = cic;
        Name = name;
        Count = count;
        IsPercent = isPercent;
        Description = description;

        // TryParseExact returns false (and leaves the property at its default) for null or malformed input
        if (DateTime.TryParseExact(minDate, DateFormat, CultureInfo.InvariantCulture, DateTimeStyles.None, out var parsedMinDate))
            MinDate = parsedMinDate;
        if (DateTime.TryParseExact(maxDate, DateFormat, CultureInfo.InvariantCulture, DateTimeStyles.None, out var parsedMaxDate))
            MaxDate = parsedMaxDate;

        DateColumnName = dateColumnName;
        AddInitializationObject(this);
    }

    /// <summary>
    /// Returns a one line summary of the request. Both arguments are ignored.
    /// </summary>
    public string GetSummary(bool includeName, bool includeId) => $"Cohort Holdout: {Name}";

    /// <summary>
    /// Builds the pipeline context: the flow object is a <see cref="CohortIdentificationConfiguration"/>, the
    /// destination must be an <see cref="ICohortPipelineDestination"/> and the source an
    /// <see cref="IDataFlowSource{T}"/> of <see cref="CohortIdentificationConfiguration"/>.
    /// </summary>
    protected override IDataFlowPipelineContext GenerateContextImpl() =>
        new DataFlowPipelineContext<CohortIdentificationConfiguration>
        {
            MustHaveDestination = typeof(ICohortPipelineDestination),
            MustHaveSource = typeof(IDataFlowSource<CohortIdentificationConfiguration>)
        };

    /// <summary>
    /// Checking is not implemented for holdout requests.
    /// </summary>
    /// <exception cref="NotImplementedException">Always thrown</exception>
    public void Check(ICheckNotifier notifier)
    {
        throw new NotImplementedException();
    }
}
19 changes: 19 additions & 0 deletions
19
Rdmp.Core/CohortCommitting/Pipeline/ICohortHoldoutLookupRequest.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// Copyright (c) The University of Dundee 2018-2023 | ||
// This file is part of the Research Data Management Platform (RDMP). | ||
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. | ||
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. | ||
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>. | ||
using Rdmp.Core.Curation.Data.Pipelines; | ||
using Rdmp.Core.ReusableLibraryCode.Checks; | ||
|
||
namespace Rdmp.Core.CohortCommitting.Pipeline; | ||
|
||
|
||
/// <summary>
/// A checkable pipeline use case for creating a cohort holdout. Declares no members of its own;
/// see CohortHoldoutLookupRequest for the implementation.
/// </summary>
public interface ICohortHoldoutLookupRequest : ICheckable, IPipelineUseCase
{
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
205 changes: 205 additions & 0 deletions
205
Rdmp.Core/CommandExecution/AtomicCommands/ExecuteCommandCreateHoldoutLookup.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
// Copyright (c) The University of Dundee 2018-2023 | ||
// This file is part of the Research Data Management Platform (RDMP). | ||
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. | ||
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. | ||
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>. | ||
|
||
using FAnsi.Discovery; | ||
using Rdmp.Core.CohortCommitting.Pipeline; | ||
using Rdmp.Core.CommandExecution.AtomicCommands.CatalogueCreationCommands; | ||
using Rdmp.Core.Curation.Data.Cohort; | ||
using Rdmp.Core.Curation.Data.Pipelines; | ||
using Rdmp.Core.DataExport.Data; | ||
using Rdmp.Core.DataViewing; | ||
using Rdmp.Core.Icons.IconProvision; | ||
using Rdmp.Core.ReusableLibraryCode.DataAccess; | ||
using Rdmp.Core.ReusableLibraryCode.Icons.IconProvision; | ||
using SixLabors.ImageSharp; | ||
using SixLabors.ImageSharp.PixelFormats; | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Data; | ||
using System.Data.Common; | ||
using System.IO; | ||
using System.Linq; | ||
using System.Text; | ||
|
||
namespace Rdmp.Core.CommandExecution.AtomicCommands; | ||
|
||
/// <summary>
/// Creates a holdout from a <see cref="CohortIdentificationConfiguration"/>: runs the cohort query, optionally
/// restricts the rows to a date window, takes a random sample (by row count or percentage), writes the sample to
/// a CSV file and imports that file as a new catalogue.
/// </summary>
public class ExecuteCommandCreateHoldoutLookup : BasicCommandExecution
{
    /// <summary>
    /// Name fragment used to locate the pipeline that imports the generated CSV file
    /// </summary>
    private const string ImportPipelineName = "BULK INSERT: CSV Import File (automated column-type detection)";

    /// <summary>
    /// Characters that force a CSV field to be quoted
    /// </summary>
    private static readonly char[] CsvSpecialCharacters = { ',', '"', '\r', '\n' };

    private readonly CohortIdentificationConfiguration _cic;
    private readonly IBasicActivateItems _activator;

    /// <summary>
    /// Prepares the command for the given cohort identification configuration
    /// </summary>
    /// <param name="activator">Activator used for user interaction and repository access</param>
    /// <param name="cic">The cohort identification configuration to take the holdout from</param>
    public ExecuteCommandCreateHoldoutLookup(IBasicActivateItems activator,
        CohortIdentificationConfiguration cic) : base(activator)
    {
        _cic = cic;
        _activator = activator;
    }

    public override string GetCommandName() => "Create Holdout";

    /// <summary>
    /// Describes in a user friendly way the activity of picking an <see cref="ExternalCohortTable"/>
    /// </summary>
    /// <returns></returns>
    private static DialogArgs GetChooseCohortDialogArgs() =>
        new()
        {
            WindowTitle = "Choose where to save cohort",
            TaskDescription =
                "Select the Cohort Database in which to store the identifiers. If you have multiple methods of anonymising cohorts or manage different types of identifiers (e.g. CHI lists, ECHI lists and/or BarcodeIDs) then you must pick the Cohort Database that matches your cohort identifier type/anonymisation protocol.",
            EntryLabel = "Select Cohort Database",
            AllowAutoSelect = true
        };

    /// <summary>
    /// Runs <paramref name="sql"/> on <paramref name="server"/> and returns the result set.
    /// Any failure is reported through GlobalError and whatever was loaded so far (possibly nothing) is returned.
    /// </summary>
    private DataTable LoadDataTable(DiscoveredServer server, string sql)
    {
        var dt = new DataTable();

        try
        {
            // The using declarations close and dispose the connection, command and adapter on exit
            using var con = server.GetConnection();
            con.Open();
            using var cmd = server.GetCommand(sql, con);
            cmd.CommandTimeout = 10000;
            using var adapter = server.GetDataAdapter(cmd);
            dt.BeginLoadData();
            adapter.Fill(dt);
            dt.EndLoadData();
        }
        catch (Exception e)
        {
            GlobalError("Unable to access datatable",e);
        }

        return dt;
    }

    /// <summary>
    /// Decides whether a row's date value lies inside the requested window.
    /// </summary>
    /// <param name="value">Raw cell value from the date column</param>
    /// <param name="earliest">Inclusive lower bound, or null for no lower bound</param>
    /// <param name="latest">Inclusive upper bound, or null for no upper bound</param>
    /// <returns>
    /// False when the value is outside the window, or when it is missing / cannot be read as a date
    /// (such a row cannot be shown to satisfy the window, so it is left out rather than aborting the command).
    /// </returns>
    private static bool IsWithinDateWindow(object value, DateTime? earliest, DateTime? latest)
    {
        DateTime date;
        if (value is DateTime typed)
        {
            // Use real date values directly instead of round-tripping them through culture-specific text
            date = typed;
        }
        else if (!DateTime.TryParse(value?.ToString(), out date))
        {
            return false;
        }

        if (latest.HasValue && date > latest.Value)
            return false;

        return !earliest.HasValue || date >= earliest.Value;
    }

    /// <summary>
    /// Works out how many rows the holdout should contain.
    /// </summary>
    /// <param name="availableRows">Number of rows that are eligible for the holdout</param>
    /// <param name="count">Requested size: a row count, or a percentage when <paramref name="isPercent"/> is true</param>
    /// <param name="isPercent">True when <paramref name="count"/> is a percentage (values above 100 are treated as 100)</param>
    private static int GetHoldoutSize(int availableRows, int count, bool isPercent)
    {
        if (!isPercent)
            return count;

        var percent = Math.Min(count, 100);

        // double keeps the product exact for any int inputs, so whole-number results are never rounded up by accident
        return (int)Math.Ceiling(availableRows * (double)percent / 100);
    }

    /// <summary>
    /// Quotes a CSV field when it contains a comma, double quote or line break, doubling embedded quotes.
    /// Null is written as an empty field.
    /// </summary>
    private static string EscapeCsvField(string field)
    {
        if (string.IsNullOrEmpty(field))
            return string.Empty;

        return field.IndexOfAny(CsvSpecialCharacters) < 0
            ? field
            : $"\"{field.Replace("\"", "\"\"")}\"";
    }

    /// <summary>
    /// Prompts for the cohort database and holdout settings, samples the cohort and imports the sample as a new catalogue.
    /// Returns without creating anything if the user cancels a prompt, the cohort yields no rows, or the holdout would be empty.
    /// </summary>
    public override void Execute()
    {
        base.Execute();

        SelectOne(GetChooseCohortDialogArgs(),
            BasicActivator.RepositoryLocator.DataExportRepository,
            out ExternalCohortTable ect);
        if (ect is null)
            return;

        var holdoutRequest = BasicActivator.GetCohortHoldoutLookupRequest(ect, null, _cic);
        if (holdoutRequest is null)
            return;

        var cohortConfiguration = new ViewCohortIdentificationConfigurationSqlCollection(_cic);
        var sql = cohortConfiguration.GetSql();
        var server = DataAccessPortal
            .ExpectServer(cohortConfiguration.GetDataAccessPoint(), DataAccessContext.InternalDataProcessing, false);
        server.TestConnection();

        var cohortTable = LoadDataTable(server, sql);
        if (cohortTable.Rows.Count == 0)
        {
            Show("Unable to Access Cohort");
            return;
        }

        // Materialise once: the names are needed for the date check, the CSV header and the identifier prompt
        var columnNames = cohortTable.Columns.Cast<DataColumn>()
            .Select(static column => column.ColumnName)
            .ToArray();

        // A date bound only applies when the named column exists and the bound was actually supplied
        var dateColumn = holdoutRequest.DateColumnName;
        var dateColumnExists = columnNames.Contains(dateColumn);
        var latest = dateColumnExists && holdoutRequest.MaxDate.Date != DateTime.MinValue
            ? holdoutRequest.MaxDate
            : (DateTime?)null;
        var earliest = dateColumnExists && holdoutRequest.MinDate.Date != DateTime.MinValue
            ? holdoutRequest.MinDate
            : (DateTime?)null;

        var eligibleRows = cohortTable.Rows.Cast<DataRow>();
        if (earliest.HasValue || latest.HasValue)
        {
            eligibleRows = eligibleRows.Where(row => IsWithinDateWindow(row[dateColumn], earliest, latest));
        }

        // Shuffle by ordering on a random key (one key is drawn per row)
        Random rnd = new();
        var shuffledRows = eligibleRows.OrderBy(_ => rnd.Next()).ToArray();

        var holdoutSize = GetHoldoutSize(shuffledRows.Length, holdoutRequest.Count, holdoutRequest.IsPercent);
        var holdoutRows = shuffledRows.Take(holdoutSize).ToArray();
        if (holdoutRows.Length == 0)
        {
            Show("Holdout would be empty with current configuration. Will not create holdout.");
            return;
        }

        BasicActivator.SelectObject("Select an Extraction Identifier", columnNames, out var extractionIdentifier);
        if (extractionIdentifier == null)
            return;

        var db = SelectDatabase(true, "Select a Database to store the new Holdout.");
        if (db == null)
            return;

        StringBuilder sb = new();
        sb.AppendLine(string.Join(",", columnNames.Select(EscapeCsvField)));
        foreach (var row in holdoutRows)
        {
            sb.AppendLine(string.Join(",", row.ItemArray.Select(static field => EscapeCsvField(field?.ToString()))));
        }

        // Only written once every prompt has been answered, so cancelling leaves no stray file behind
        var csvPath = $"{holdoutRequest.Name}.csv";
        File.WriteAllText(csvPath, sb.ToString());
        var fi = new FileInfo(csvPath);

        // Newest matching pipeline wins; may be null if no pipeline with that name exists
        var pipe = _activator.RepositoryLocator.CatalogueRepository.GetAllObjects<Pipeline>().OrderByDescending(static p => p.ID)
            .FirstOrDefault(static p => p.Name.Contains(ImportPipelineName));

        var importCommand = new ExecuteCommandCreateNewCatalogueByImportingFile(_activator, fi, extractionIdentifier, db, pipe, null,holdoutRequest.Description);
        importCommand.Execute();
    }

    public override Image<Rgba32> GetImage(IIconProvider iconProvider) => iconProvider.GetImage(RDMPConcept.CohortAggregate,OverlayKind.Link);
}
Oops, something went wrong.