Skip to content

Commit

Permalink
Feature/rdmp 73 cohort holdouts (#1653)
Browse files Browse the repository at this point in the history
* add top to cohort

* working holdout flow

* interim

* basic ui  flow

* now filtering

* working flow

* working auto-holdout

* fix test

* add query

* improved holdout

* tidy up code

* add description

* revert test db

* add holdout description

* add todo

* fixups from codeql

* Minor syntax fix

* Fix possible null deref

* fix todo url

* Tidy, typo fix

* Remove disused field

---------

Co-authored-by: James A Sutherland <[email protected]>
Co-authored-by: James A Sutherland <>
  • Loading branch information
JFriel and jas88 authored Nov 9, 2023
1 parent 9fdc460 commit adb2a51
Show file tree
Hide file tree
Showing 13 changed files with 1,002 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -832,6 +832,11 @@ public void StartSession(string sessionName, IEnumerable<IMapsDirectlyToDatabase

public override IPipelineRunner GetPipelineRunner(DialogArgs args, IPipelineUseCase useCase, IPipeline pipeline)
{

if(useCase is not null && pipeline is not null)
{
return new PipelineRunner(useCase, pipeline);
}
var configureAndExecuteDialog = new ConfigureAndExecutePipelineUI(args, useCase, this)
{
Dock = DockStyle.Fill
Expand All @@ -856,9 +861,27 @@ public override CohortCreationRequest GetCohortCreationRequest(ExternalCohortTab
return ui.ShowDialog() == DialogResult.OK ? ui.Result : null;
}

/// <summary>
/// Shows the modal "create holdout" dialog for <paramref name="cic"/> and returns the request the user built.
/// </summary>
/// <param name="externalCohortTable">Cohort table passed through to the dialog.</param>
/// <param name="project">Not used by this implementation.</param>
/// <param name="cic">Configuration the holdout is created from; its description (when set) seeds the dialog's description.</param>
/// <returns>The dialog's result when the user confirms with OK, otherwise <c>null</c>.</returns>
public override CohortHoldoutLookupRequest GetCohortHoldoutLookupRequest(ExternalCohortTable externalCohortTable, IProject project, CohortIdentificationConfiguration cic)
{
    // if on wrong Thread
    if (_mainDockPanel?.InvokeRequired ?? false)
        return _mainDockPanel.Invoke(() =>
            GetCohortHoldoutLookupRequest(externalCohortTable, project, cic));

    // A form shown with ShowDialog is only hidden when closed, so it has to be disposed explicitly.
    // The return expression below is evaluated before the using declaration disposes the dialog.
    using var ui = new Rdmp.UI.CohortUI.CreateHoldoutLookup.CreateHoldoutLookupUI(this, externalCohortTable, cic);

    if (!string.IsNullOrWhiteSpace(cic.Description))
        ui.CohortDescription = $"{cic.Description} ({Environment.UserName} - {DateTime.Now})";

    return ui.ShowDialog() == DialogResult.OK ? ui.Result : null;
}

public override ICatalogue CreateAndConfigureCatalogue(ITableInfo tableInfo,
ColumnInfo[] extractionIdentifierColumns, string initialDescription, IProject projectSpecific, string folder)
{
if(extractionIdentifierColumns is not null)
{
return base.CreateAndConfigureCatalogue(tableInfo, extractionIdentifierColumns, initialDescription, projectSpecific, folder);
}
// if on wrong Thread
if (_mainDockPanel?.InvokeRequired ?? false)
return _mainDockPanel.Invoke(() => CreateAndConfigureCatalogue(tableInfo, extractionIdentifierColumns,
Expand Down
67 changes: 67 additions & 0 deletions Rdmp.Core/CohortCommitting/Pipeline/CohortHoldoutLookupRequest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright (c) The University of Dundee 2018-2019
// This file is part of the Research Data Management Platform (RDMP).
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.

using System;
using System.Data;
using System.Globalization;
using System.Security.Permissions;
using NPOI.SS.Formula.Functions;
using Rdmp.Core.Curation.Data.Cohort;
using Rdmp.Core.Curation.Data.Pipelines;
using Rdmp.Core.DataFlowPipeline;
using Rdmp.Core.DataFlowPipeline.Requirements;
using Rdmp.Core.MapsDirectlyToDatabaseTable;
using Rdmp.Core.ReusableLibraryCode.Checks;

namespace Rdmp.Core.CohortCommitting.Pipeline;

/// <summary>
/// All details required to create a holdout set from a cohort
/// </summary>
public sealed class CohortHoldoutLookupRequest : PipelineUseCase, ICanBeSummarised, ICohortHoldoutLookupRequest
{
    /// <summary>
    /// Format expected for the <c>minDate</c>/<c>maxDate</c> constructor arguments (day/month/4-digit year).
    /// Custom format specifiers are case sensitive: "DD" and "YYYY" are not specifiers and would be matched as literal text.
    /// </summary>
    private const string DateFormat = "dd/MM/yyyy";

    /// <summary>The cohort identification configuration the holdout is taken from.</summary>
    public CohortIdentificationConfiguration CIC { get; set; }

    /// <summary>Number of rows to hold out, or a percentage of rows when <see cref="IsPercent"/> is true.</summary>
    public int Count { get; set; }

    /// <summary>When true, <see cref="Count"/> is a percentage rather than an absolute row count.</summary>
    public bool IsPercent { get; set; }

    /// <summary>Free text description of the holdout.</summary>
    public string Description { get; set; }

    /// <summary>Not assigned by the constructor.</summary>
    public string WhereQuery { get; set; }

    /// <summary>Name of the holdout; used in <see cref="GetSummary"/>.</summary>
    public string Name { get; set; }

    /// <summary>Lower date bound; stays <see cref="DateTime.MinValue"/> when no valid date was supplied.</summary>
    public DateTime MinDate { get; set; }

    /// <summary>Upper date bound; stays <see cref="DateTime.MinValue"/> when no valid date was supplied.</summary>
    public DateTime MaxDate { get; set; }

    /// <summary>Name of the column that <see cref="MinDate"/>/<see cref="MaxDate"/> apply to.</summary>
    public string DateColumnName { get; set; }

    /// <summary>
    /// Creates the request and registers it as an initialization object of the use case.
    /// </summary>
    /// <param name="cic">Configuration the holdout is taken from.</param>
    /// <param name="name">Name of the holdout.</param>
    /// <param name="count">Row count, or percentage when <paramref name="isPercent"/> is true.</param>
    /// <param name="isPercent">Whether <paramref name="count"/> is a percentage.</param>
    /// <param name="description">Optional description.</param>
    /// <param name="minDate">Optional lower bound in dd/MM/yyyy format; null or unparseable values leave <see cref="MinDate"/> at its default.</param>
    /// <param name="maxDate">Optional upper bound in dd/MM/yyyy format; null or unparseable values leave <see cref="MaxDate"/> at its default.</param>
    /// <param name="dateColumnName">Optional name of the date column the bounds apply to.</param>
    public CohortHoldoutLookupRequest(CohortIdentificationConfiguration cic, string name, int count, bool isPercent, string description = "", string minDate = null, string maxDate = null, string dateColumnName = null)
    {
        CIC = cic;
        Name = name;
        Count = count;
        IsPercent = isPercent;
        Description = description;
        MinDate = ParseDateOrDefault(minDate);
        MaxDate = ParseDateOrDefault(maxDate);
        DateColumnName = dateColumnName;
        AddInitializationObject(this);
    }

    /// <summary>
    /// Parses <paramref name="text"/> using <see cref="DateFormat"/>; returns <c>default(DateTime)</c> when it is null or does not match.
    /// </summary>
    private static DateTime ParseDateOrDefault(string text) =>
        DateTime.TryParseExact(text, DateFormat, CultureInfo.InvariantCulture, DateTimeStyles.None, out var parsed)
            ? parsed
            : default;

    /// <summary>Returns a one line summary containing the holdout name; both arguments are ignored.</summary>
    public string GetSummary(bool includeName, bool includeId) => $"Cohort Holdout: {Name}";


    /// <summary>
    /// Builds the pipeline context: the source must be an <see cref="IDataFlowSource{T}"/> of
    /// <see cref="CohortIdentificationConfiguration"/> and the destination an <see cref="ICohortPipelineDestination"/>.
    /// </summary>
    protected override IDataFlowPipelineContext GenerateContextImpl() =>
        new DataFlowPipelineContext<CohortIdentificationConfiguration>
        {
            MustHaveDestination = typeof(ICohortPipelineDestination),
            MustHaveSource = typeof(IDataFlowSource<CohortIdentificationConfiguration>)
        };

    /// <summary>Not implemented; always throws.</summary>
    /// <exception cref="NotImplementedException">Always.</exception>
    public void Check(ICheckNotifier notifier)
    {
        throw new NotImplementedException();
    }
}
19 changes: 19 additions & 0 deletions Rdmp.Core/CohortCommitting/Pipeline/ICohortHoldoutLookupRequest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) The University of Dundee 2018-2023
// This file is part of the Research Data Management Platform (RDMP).
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.
using Rdmp.Core.Curation.Data.Pipelines;
using Rdmp.Core.ReusableLibraryCode.Checks;

namespace Rdmp.Core.CohortCommitting.Pipeline;


/// <summary>
/// Member-less interface combining <see cref="ICheckable"/> and <see cref="IPipelineUseCase"/>;
/// implemented by <see cref="CohortHoldoutLookupRequest"/>, which carries the actual holdout settings.
/// </summary>
public interface ICohortHoldoutLookupRequest : ICheckable, IPipelineUseCase
{

}

2 changes: 2 additions & 0 deletions Rdmp.Core/CommandExecution/AtomicCommandFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,8 @@ public IEnumerable<IAtomicCommand> CreateCommands(object o)

yield return new ExecuteCommandFreezeCohortIdentificationConfiguration(_activator, cic, !cic.Frozen)
{ Weight = -50.5f };
yield return new ExecuteCommandCreateHoldoutLookup(_activator, cic)
{ Weight = -50.5f };

var clone = new ExecuteCommandCloneCohortIdentificationConfiguration(_activator)
{ Weight = -50.4f, OverrideCommandName = "Clone" }.SetTarget(cic);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ public class ExecuteCommandCreateNewCatalogueByImportingFile : CatalogueCreation
{
private readonly DiscoveredDatabase _targetDatabase;
private IPipeline _pipeline;
private readonly string _extractionIdentifier;
private readonly string _initialDescription;

public FileInfo File { get; private set; }

Expand Down Expand Up @@ -62,13 +64,17 @@ public ExecuteCommandCreateNewCatalogueByImportingFile(IBasicActivateItems activ
"Pipeline for reading the source file, applying any transforms and writing to the database")]
Pipeline pipeline,
[DemandsInitialization(Desc_ProjectSpecificParameter)]
Project projectSpecific) : base(activator, projectSpecific, null)
Project projectSpecific,
string initialDescription=null) : base(activator, projectSpecific, null)

{
File = file;
_targetDatabase = targetDatabase;
_pipeline = pipeline;
_extractionIdentifier = extractionIdentifier;
UseTripleDotSuffix = true;
CheckFile();
_initialDescription = initialDescription;
}


Expand Down Expand Up @@ -150,16 +156,21 @@ private void OnPipelineCompleted(object sender, PipelineEngineEventArgs args, Di

var importer = new TableInfoImporter(BasicActivator.RepositoryLocator.CatalogueRepository, tbl);
importer.DoImport(out var ti, out _);

var cata = BasicActivator.CreateAndConfigureCatalogue(ti, null,
var extractionIdentifiers = _extractionIdentifier is null ? null : ti.ColumnInfos.Where(t => t.Name == _extractionIdentifier).ToArray();
var cata = BasicActivator.CreateAndConfigureCatalogue(ti, extractionIdentifiers,
$"Import of file '{File.FullName}' by {Environment.UserName} on {DateTime.Now}", ProjectSpecific,
TargetFolder);

if (cata != null)
if (cata == null) return;

if(_initialDescription is not null)
{
Publish(cata);
Emphasise(cata);
cata.Description = _initialDescription;
cata.SaveToDatabase();
}

Publish(cata);
Emphasise(cata);
}

public override Image<Rgba32> GetImage(IIconProvider iconProvider) =>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
// Copyright (c) The University of Dundee 2018-2023
// This file is part of the Research Data Management Platform (RDMP).
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.

using FAnsi.Discovery;
using Rdmp.Core.CohortCommitting.Pipeline;
using Rdmp.Core.CommandExecution.AtomicCommands.CatalogueCreationCommands;
using Rdmp.Core.Curation.Data.Cohort;
using Rdmp.Core.Curation.Data.Pipelines;
using Rdmp.Core.DataExport.Data;
using Rdmp.Core.DataViewing;
using Rdmp.Core.Icons.IconProvision;
using Rdmp.Core.ReusableLibraryCode.DataAccess;
using Rdmp.Core.ReusableLibraryCode.Icons.IconProvision;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.PixelFormats;
using System;
using System.Collections.Generic;
using System.Data;
using System.Data.Common;
using System.IO;
using System.Linq;
using System.Text;

namespace Rdmp.Core.CommandExecution.AtomicCommands;

/// <summary>
/// Creates a holdout from a <see cref="CohortIdentificationConfiguration"/>: runs the configuration's SQL,
/// shuffles the rows, optionally filters them by a date range, takes a fixed number or percentage of rows,
/// writes them to a CSV file and imports that file as a new catalogue.
/// </summary>
public class ExecuteCommandCreateHoldoutLookup : BasicCommandExecution
{
    /// <summary>Name of the temporary column holding the random sort key used to shuffle rows.</summary>
    private const string HoldoutShuffle = "_HoldoutShuffle";

    private readonly CohortIdentificationConfiguration _cic;
    private readonly IBasicActivateItems _activator;
    private DiscoveredServer _server;
    private DataTable _dataTable;


    public ExecuteCommandCreateHoldoutLookup(IBasicActivateItems activator,
        CohortIdentificationConfiguration cic) : base(activator)
    {
        _cic = cic;
        _activator = activator;
    }

    public override string GetCommandName() => "Create Holdout";

    /// <summary>
    /// Describes in a user friendly way the activity of picking an <see cref="ExternalCohortTable"/>
    /// </summary>
    /// <returns></returns>
    private static DialogArgs GetChooseCohortDialogArgs() =>
        new()
        {
            WindowTitle = "Choose where to save cohort",
            TaskDescription =
                "Select the Cohort Database in which to store the identifiers. If you have multiple methods of anonymising cohorts or manage different types of identifiers (e.g. CHI lists, ECHI lists and/or BarcodeIDs) then you must pick the Cohort Database that matches your cohort identifier type/anonymisation protocol.",
            EntryLabel = "Select Cohort Database",
            AllowAutoSelect = true
        };

    /// <summary>
    /// Runs <paramref name="sql"/> on <paramref name="server"/> and returns the result.
    /// Any exception is passed to GlobalError and the table filled so far (possibly empty) is returned.
    /// </summary>
    private DataTable LoadDataTable(DiscoveredServer server, string sql)
    {
        var dt = new DataTable();

        try
        {
            // using declarations close/dispose the connection, command and adapter on every exit path
            using var con = server.GetConnection();
            con.Open();
            using var cmd = server.GetCommand(sql, con);
            cmd.CommandTimeout = 10000;
            using var adapter = server.GetDataAdapter(cmd);
            dt.BeginLoadData();
            adapter.Fill(dt);
            dt.EndLoadData();
        }
        catch (Exception e)
        {
            GlobalError("Unable to access datatable", e);
        }

        return dt;
    }

    /// <summary>
    /// Reads a cell as a date: uses the value directly when it is already a <see cref="DateTime"/>,
    /// otherwise tries to parse its string form. Returns false for null, DBNull or unparseable values.
    /// </summary>
    private static bool TryGetDate(object value, out DateTime date)
    {
        if (value is DateTime typed)
        {
            date = typed;
            return true;
        }

        return DateTime.TryParse(value?.ToString(), out date);
    }

    /// <summary>
    /// Converts a percentage (capped at 100) of <paramref name="totalRows"/> into a row count, rounding up.
    /// Integer arithmetic is used because single precision rounding can overshoot
    /// (e.g. 30f / 100 * 50 evaluates slightly above 15 and would be rounded up to 16).
    /// </summary>
    private static int PercentageToRowCount(int totalRows, int percent)
    {
        if (percent <= 0 || totalRows <= 0)
            return 0;

        var capped = Math.Min(percent, 100);
        return (int)(((long)totalRows * capped + 99) / 100);
    }

    /// <summary>
    /// Prompts for the cohort table and holdout settings, builds the holdout rows, then prompts for the
    /// extraction identifier and target database before writing the CSV and running the import command.
    /// Returns without creating anything if the user cancels a prompt or no rows qualify.
    /// </summary>
    public override void Execute()
    {
        base.Execute();

        SelectOne(GetChooseCohortDialogArgs(),
            BasicActivator.RepositoryLocator.DataExportRepository,
            out ExternalCohortTable ect);
        if (ect is null)
            return;

        var holdoutRequest = BasicActivator.GetCohortHoldoutLookupRequest(ect, null, _cic);
        if (holdoutRequest is null)
            return;

        var cohortConfiguration = new ViewCohortIdentificationConfigurationSqlCollection(_cic);
        var sql = cohortConfiguration.GetSql();
        _server = DataAccessPortal
            .ExpectServer(cohortConfiguration.GetDataAccessPoint(), DataAccessContext.InternalDataProcessing, false);
        _server.TestConnection();
        _dataTable = LoadDataTable(_server, sql);
        if (_dataTable.Rows.Count == 0)
        {
            Show("Unable to Access Cohort");
            return;
        }

        // Snapshot the real column names now, before the temporary shuffle column is added,
        // so later checks and prompts never see the shuffle column.
        var columnNames = _dataTable.Columns.Cast<DataColumn>()
            .Select(static column => column.ColumnName)
            .ToArray();

        StringBuilder sb = new();
        sb.AppendLine(string.Join(",", columnNames));

        // Give every row a random key; sorting by it below shuffles the rows.
        _dataTable.Columns.Add(HoldoutShuffle);
        Random rnd = new();
        foreach (DataRow row in _dataTable.Rows)
        {
            row[HoldoutShuffle] = rnd.Next();
        }

        var beforeDate = holdoutRequest.MaxDate;
        var afterDate = holdoutRequest.MinDate;
        var dateColumn = holdoutRequest.DateColumnName;

        // A bound still at DateTime.MinValue means "not supplied".
        var dateColumnExists = columnNames.Contains(dateColumn);
        var hasMaxDate = dateColumnExists && beforeDate.Date != DateTime.MinValue;
        var hasMinDate = dateColumnExists && afterDate.Date != DateTime.MinValue;

        if (hasMinDate || hasMaxDate)
        {
            // Iterate over a snapshot so deleting rows can never invalidate the enumeration.
            foreach (var row in _dataTable.Rows.Cast<DataRow>().ToArray())
            {
                // Rows without a usable date cannot be shown to be inside the range, so they are excluded too.
                var outOfRange = !TryGetDate(row[dateColumn], out var rowDate)
                                 || (hasMaxDate && rowDate > beforeDate)
                                 || (hasMinDate && rowDate < afterDate);
                if (outOfRange)
                    row.Delete();
            }
        }

        _dataTable.DefaultView.Sort = HoldoutShuffle;
        _dataTable = _dataTable.DefaultView.ToTable();
        _dataTable.Columns.Remove(HoldoutShuffle);

        var rowCount = holdoutRequest.IsPercent
            ? PercentageToRowCount(_dataTable.Rows.Count, holdoutRequest.Count)
            : holdoutRequest.Count;

        var dataRows = _dataTable.Rows.Cast<DataRow>().Take(rowCount).ToArray();
        if (dataRows.Length == 0)
        {
            Show("Holdout would be empty with current configuration. Will not create holdout.");
            return;
        }

        // NOTE(review): fields are joined without CSV quoting, so values containing commas,
        // quotes or line breaks would corrupt the file - confirm identifiers can never contain them.
        foreach (var row in dataRows)
        {
            sb.AppendLine(string.Join(",", row.ItemArray.Select(static field => field?.ToString())));
        }

        BasicActivator.SelectObject("Select an Extraction Identifier", columnNames, out var extractionIdentifier);
        if (extractionIdentifier == null)
            return;

        var db = SelectDatabase(true, "Select a Database to store the new Holdout.");
        if (db == null) return;

        // Newest pipeline (highest ID) whose name contains the bulk CSV import pipeline name; null when none exists.
        var pipe = _activator.RepositoryLocator.CatalogueRepository.GetAllObjects<Pipeline>().OrderByDescending(static p => p.ID)
            .FirstOrDefault(static p => p.Name.Contains("BULK INSERT: CSV Import File (automated column-type detection)"));

        // Write the file only once every prompt has been confirmed, so cancelling leaves no stray CSV behind.
        var fi = new FileInfo($"{holdoutRequest.Name}.csv");
        File.WriteAllText(fi.FullName, sb.ToString());

        var importCommand = new ExecuteCommandCreateNewCatalogueByImportingFile(_activator, fi, extractionIdentifier, db, pipe, null, holdoutRequest.Description);
        importCommand.Execute();
    }

    public override Image<Rgba32> GetImage(IIconProvider iconProvider) => iconProvider.GetImage(RDMPConcept.CohortAggregate, OverlayKind.Link);
}
Loading

0 comments on commit adb2a51

Please sign in to comment.