Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/rdmp 73 cohort holdouts #1653

Merged
merged 30 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
8d694ca
add top to cohort
JFriel Oct 13, 2023
dc27f96
working holdout flow
JFriel Oct 16, 2023
16a4470
interim
JFriel Oct 16, 2023
8baaee2
basic ui flow
JFriel Oct 16, 2023
6917804
now filtering
JFriel Oct 16, 2023
0b5c983
working flow
JFriel Oct 16, 2023
3e7ba3a
working auto-holdout
JFriel Oct 17, 2023
49ed3b2
fix test
JFriel Oct 17, 2023
5948f79
Merge branch 'develop' of https://github.com/HicServices/RDMP into fe…
JFriel Oct 18, 2023
d318a3d
add query
JFriel Oct 18, 2023
8e22249
improved holdout
JFriel Oct 20, 2023
a327171
tidy up code
JFriel Oct 20, 2023
bbea3ee
add description
JFriel Oct 20, 2023
0286883
revert test db
JFriel Oct 20, 2023
50b86c7
add holdout description
JFriel Oct 20, 2023
8d5ada5
add todo
JFriel Oct 20, 2023
25fa9be
Merge branch 'develop' into feature/RDMP-73-cohort-holdouts
jas88 Oct 26, 2023
ac74e48
fixups from codeql
JFriel Oct 27, 2023
2c9da25
Minor syntax fix
jas88 Oct 28, 2023
1fd6047
Fix possible null deref
Oct 28, 2023
9deee83
Merge branch 'develop' into feature/RDMP-73-cohort-holdouts
JFriel Oct 30, 2023
e61c076
Merge branch 'develop' of https://github.com/HicServices/RDMP into fe…
JFriel Oct 30, 2023
0b4eea7
Merge branch 'develop' into feature/RDMP-73-cohort-holdouts
JFriel Nov 1, 2023
e8caf21
fix todo url
JFriel Nov 2, 2023
6ee38cd
Merge branch 'feature/RDMP-73-cohort-holdouts' of https://github.com/…
JFriel Nov 2, 2023
f838887
Merge branch 'develop' into feature/RDMP-73-cohort-holdouts
JFriel Nov 6, 2023
95c189d
Merge branch 'develop' into feature/RDMP-73-cohort-holdouts
jas88 Nov 6, 2023
4a7cefa
Tidy, typo fix
jas88 Nov 9, 2023
f3415ed
Merge branch 'develop' into feature/RDMP-73-cohort-holdouts
jas88 Nov 9, 2023
b58a1c6
Remove disused field
Nov 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -832,6 +832,11 @@ public void StartSession(string sessionName, IEnumerable<IMapsDirectlyToDatabase

public override IPipelineRunner GetPipelineRunner(DialogArgs args, IPipelineUseCase useCase, IPipeline pipeline)
{

if(useCase is not null && pipeline is not null)
{
return new PipelineRunner(useCase, pipeline);
}
var configureAndExecuteDialog = new ConfigureAndExecutePipelineUI(args, useCase, this)
{
Dock = DockStyle.Fill
Expand All @@ -856,9 +861,27 @@ public override CohortCreationRequest GetCohortCreationRequest(ExternalCohortTab
return ui.ShowDialog() == DialogResult.OK ? ui.Result : null;
}

/// <summary>
/// Shows the modal holdout-creation dialog for <paramref name="cic"/> and returns the request built by the user.
/// </summary>
/// <param name="externalCohortTable">Cohort table handed to the dialog.</param>
/// <param name="project">Only forwarded when the call is re-run on the dock panel's thread; not otherwise read here.</param>
/// <param name="cic">Configuration the holdout is based on; a non-blank Description pre-populates the dialog's description.</param>
/// <returns>The dialog's Result when it closes with <see cref="DialogResult.OK"/>; otherwise null.</returns>
public override CohortHoldoutLookupRequest GetCohortHoldoutLookupRequest(ExternalCohortTable externalCohortTable, IProject project, CohortIdentificationConfiguration cic)
{
    // InvokeRequired means we are not on the dock panel's thread: re-run this method there
    if (_mainDockPanel?.InvokeRequired ?? false)
        return _mainDockPanel.Invoke(() =>
            GetCohortHoldoutLookupRequest(externalCohortTable, project, cic));

    // A form shown with ShowDialog is hidden rather than closed, so it must be disposed explicitly.
    // The using declaration disposes it after the return expression has been evaluated.
    using var ui = new Rdmp.UI.CohortUI.CreateHoldoutLookup.CreateHoldoutLookupUI(this, externalCohortTable, cic);

    if (!string.IsNullOrWhiteSpace(cic.Description))
        ui.CohortDescription = $"{cic.Description} ({Environment.UserName} - {DateTime.Now})";

    return ui.ShowDialog() == DialogResult.OK ? ui.Result : null;
}

public override ICatalogue CreateAndConfigureCatalogue(ITableInfo tableInfo,
ColumnInfo[] extractionIdentifierColumns, string initialDescription, IProject projectSpecific, string folder)
{
if(extractionIdentifierColumns is not null)
{
return base.CreateAndConfigureCatalogue(tableInfo, extractionIdentifierColumns, initialDescription, projectSpecific, folder);
}
// if on wrong Thread
if (_mainDockPanel?.InvokeRequired ?? false)
return _mainDockPanel.Invoke(() => CreateAndConfigureCatalogue(tableInfo, extractionIdentifierColumns,
Expand Down
67 changes: 67 additions & 0 deletions Rdmp.Core/CohortCommitting/Pipeline/CohortHoldoutLookupRequest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright (c) The University of Dundee 2018-2019
// This file is part of the Research Data Management Platform (RDMP).
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.

using System;
using System.Data;
using System.Globalization;
using System.Security.Permissions;
using NPOI.SS.Formula.Functions;
using Rdmp.Core.Curation.Data.Cohort;
using Rdmp.Core.Curation.Data.Pipelines;
using Rdmp.Core.DataFlowPipeline;
using Rdmp.Core.DataFlowPipeline.Requirements;
using Rdmp.Core.MapsDirectlyToDatabaseTable;
using Rdmp.Core.ReusableLibraryCode.Checks;

namespace Rdmp.Core.CohortCommitting.Pipeline;

/// <summary>
/// All details required to create a holdout set from a cohort: the source
/// <see cref="CohortIdentificationConfiguration"/>, how many rows to hold out (absolute or as a percentage)
/// and an optional date window on a named date column.
/// </summary>
public sealed class CohortHoldoutLookupRequest : PipelineUseCase, ICanBeSummarised, ICohortHoldoutLookupRequest
{
    /// <summary>
    /// Exact format that the minDate/maxDate constructor arguments must match, e.g. "25/12/2023".
    /// .NET custom format specifiers are case sensitive: 'dd' = day, 'MM' = month, 'yyyy' = year.
    /// </summary>
    private const string DateFormat = "dd/MM/yyyy";

    /// <summary>The cohort identification configuration the holdout is drawn from.</summary>
    public CohortIdentificationConfiguration CIC { get; set; }

    /// <summary>How much to hold out; interpreted as a percentage when <see cref="IsPercent"/> is true.</summary>
    public int Count { get; set; }

    /// <summary>True when <see cref="Count"/> is a percentage rather than an absolute number.</summary>
    public bool IsPercent { get; set; }

    /// <summary>Free-text description of the holdout.</summary>
    public string Description { get; set; }

    /// <summary>Not populated by the constructor; callers may set it directly.</summary>
    public string WhereQuery { get; set; }

    /// <summary>Name of the holdout; also used by <see cref="GetSummary"/>.</summary>
    public string Name { get; set; }

    /// <summary>Lower date bound; left at default(<see cref="DateTime"/>) when none was supplied or it did not parse.</summary>
    public DateTime MinDate { get; set; }

    /// <summary>Upper date bound; left at default(<see cref="DateTime"/>) when none was supplied or it did not parse.</summary>
    public DateTime MaxDate { get; set; }

    /// <summary>Name of the column the date bounds refer to.</summary>
    public string DateColumnName { get; set; }

    /// <summary>
    /// Creates the request and registers it as an initialization object of the use case.
    /// </summary>
    /// <param name="cic">Source cohort identification configuration.</param>
    /// <param name="name">Name of the holdout.</param>
    /// <param name="count">Amount to hold out (see <paramref name="isPercent"/>).</param>
    /// <param name="isPercent">Whether <paramref name="count"/> is a percentage.</param>
    /// <param name="description">Optional description.</param>
    /// <param name="minDate">Optional lower bound in dd/MM/yyyy format; ignored if null or not in that format.</param>
    /// <param name="maxDate">Optional upper bound in dd/MM/yyyy format; ignored if null or not in that format.</param>
    /// <param name="dateColumnName">Optional name of the column the bounds apply to.</param>
    public CohortHoldoutLookupRequest(CohortIdentificationConfiguration cic, string name, int count, bool isPercent, string description = "", string minDate = null, string maxDate = null, string dateColumnName = null)
    {
        CIC = cic;
        Name = name;
        Count = count;
        IsPercent = isPercent;
        Description = description;
        MinDate = ParseDateOrDefault(minDate);
        MaxDate = ParseDateOrDefault(maxDate);
        DateColumnName = dateColumnName;
        AddInitializationObject(this);
    }

    /// <summary>
    /// Parses <paramref name="value"/> using <see cref="DateFormat"/> and the invariant culture.
    /// Returns default(<see cref="DateTime"/>) when the value is null or does not match.
    /// </summary>
    private static DateTime ParseDateOrDefault(string value) =>
        DateTime.TryParseExact(value, DateFormat, CultureInfo.InvariantCulture, DateTimeStyles.None, out var parsed)
            ? parsed
            : default;

    /// <inheritdoc/>
    public string GetSummary(bool includeName, bool includeId) => $"Cohort Holdout: {Name}";


    /// <summary>
    /// Describes the pipeline shape for this use case: flows a <see cref="CohortIdentificationConfiguration"/>
    /// from an <see cref="IDataFlowSource{T}"/> into an <see cref="ICohortPipelineDestination"/>.
    /// </summary>
    protected override IDataFlowPipelineContext GenerateContextImpl() =>
        new DataFlowPipelineContext<CohortIdentificationConfiguration>
        {
            MustHaveDestination = typeof(ICohortPipelineDestination),
            MustHaveSource = typeof(IDataFlowSource<CohortIdentificationConfiguration>)
        };

    /// <summary>
    /// Checking is not implemented for holdout requests.
    /// </summary>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public void Check(ICheckNotifier notifier)
    {
        throw new NotImplementedException();
    }
}
19 changes: 19 additions & 0 deletions Rdmp.Core/CohortCommitting/Pipeline/ICohortHoldoutLookupRequest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) The University of Dundee 2018-2023
// This file is part of the Research Data Management Platform (RDMP).
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.
using Rdmp.Core.Curation.Data.Pipelines;
using Rdmp.Core.ReusableLibraryCode.Checks;

namespace Rdmp.Core.CohortCommitting.Pipeline;


/// <summary>
/// Contract for a cohort holdout request: a pipeline use case that can also be checked.
/// Declares no members of its own beyond those inherited from <see cref="ICheckable"/> and
/// <see cref="IPipelineUseCase"/>. See CohortHoldoutLookupRequest for the implementation.
/// </summary>
public interface ICohortHoldoutLookupRequest : ICheckable, IPipelineUseCase
{
}

2 changes: 2 additions & 0 deletions Rdmp.Core/CommandExecution/AtomicCommandFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,8 @@ public IEnumerable<IAtomicCommand> CreateCommands(object o)

yield return new ExecuteCommandFreezeCohortIdentificationConfiguration(_activator, cic, !cic.Frozen)
{ Weight = -50.5f };
yield return new ExecuteCommandCreateHoldoutLookup(_activator, cic)
{ Weight = -50.5f };

var clone = new ExecuteCommandCloneCohortIdentificationConfiguration(_activator)
{ Weight = -50.4f, OverrideCommandName = "Clone" }.SetTarget(cic);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ public class ExecuteCommandCreateNewCatalogueByImportingFile : CatalogueCreation
{
private readonly DiscoveredDatabase _targetDatabase;
private IPipeline _pipeline;
private readonly string _extractionIdentifier;
private readonly string _initialDescription;

public FileInfo File { get; private set; }

Expand Down Expand Up @@ -62,13 +64,17 @@ public ExecuteCommandCreateNewCatalogueByImportingFile(IBasicActivateItems activ
"Pipeline for reading the source file, applying any transforms and writing to the database")]
Pipeline pipeline,
[DemandsInitialization(Desc_ProjectSpecificParameter)]
Project projectSpecific) : base(activator, projectSpecific, null)
Project projectSpecific,
string initialDescription=null) : base(activator, projectSpecific, null)

{
File = file;
_targetDatabase = targetDatabase;
_pipeline = pipeline;
_extractionIdentifier = extractionIdentifier;
UseTripleDotSuffix = true;
CheckFile();
_initialDescription = initialDescription;
}


Expand Down Expand Up @@ -150,16 +156,21 @@ private void OnPipelineCompleted(object sender, PipelineEngineEventArgs args, Di

var importer = new TableInfoImporter(BasicActivator.RepositoryLocator.CatalogueRepository, tbl);
importer.DoImport(out var ti, out _);

var cata = BasicActivator.CreateAndConfigureCatalogue(ti, null,
var extractionIdentifiers = _extractionIdentifier is null ? null : ti.ColumnInfos.Where(t => t.Name == _extractionIdentifier).ToArray();
var cata = BasicActivator.CreateAndConfigureCatalogue(ti, extractionIdentifiers,
$"Import of file '{File.FullName}' by {Environment.UserName} on {DateTime.Now}", ProjectSpecific,
TargetFolder);

if (cata != null)
if (cata == null) return;

if(_initialDescription is not null)
{
Publish(cata);
Emphasise(cata);
cata.Description = _initialDescription;
cata.SaveToDatabase();
}

Publish(cata);
Emphasise(cata);
}

public override Image<Rgba32> GetImage(IIconProvider iconProvider) =>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
// Copyright (c) The University of Dundee 2018-2023
// This file is part of the Research Data Management Platform (RDMP).
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.

using FAnsi.Discovery;
using Rdmp.Core.CohortCommitting.Pipeline;
using Rdmp.Core.CommandExecution.AtomicCommands.CatalogueCreationCommands;
using Rdmp.Core.Curation.Data.Cohort;
using Rdmp.Core.Curation.Data.Pipelines;
using Rdmp.Core.DataExport.Data;
using Rdmp.Core.DataViewing;
using Rdmp.Core.Icons.IconProvision;
using Rdmp.Core.ReusableLibraryCode.DataAccess;
using Rdmp.Core.ReusableLibraryCode.Icons.IconProvision;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.PixelFormats;
using System;
using System.Collections.Generic;
using System.Data;
using System.Data.Common;
using System.IO;
using System.Linq;
using System.Text;

namespace Rdmp.Core.CommandExecution.AtomicCommands;

public class ExecuteCommandCreateHoldoutLookup : BasicCommandExecution
{
private readonly CohortIdentificationConfiguration _cic;
readonly IBasicActivateItems _activator;
private DiscoveredServer _server;
private DataTable _dataTable;


/// <summary>
/// Creates the command for the given cohort identification configuration.
/// </summary>
/// <param name="activator">Activator passed to the base command and kept for later use.</param>
/// <param name="cic">Configuration whose cohort the holdout will be taken from.</param>
public ExecuteCommandCreateHoldoutLookup(IBasicActivateItems activator, CohortIdentificationConfiguration cic)
    : base(activator)
{
    _activator = activator;
    _cic = cic;
}

/// <summary>
/// Returns the fixed name of this command.
/// </summary>
public override string GetCommandName()
{
    return "Create Holdout";
}

/// <summary>
/// Builds the title, explanatory text and entry label used when asking the user to pick the
/// <see cref="ExternalCohortTable"/> the holdout relates to. Auto-selection is enabled.
/// </summary>
/// <returns>A newly created <see cref="DialogArgs"/> on every call.</returns>
private static DialogArgs GetChooseCohortDialogArgs()
{
    var dialogArgs = new DialogArgs
    {
        WindowTitle = "Choose where to save cohort",
        EntryLabel = "Select Cohort Database",
        AllowAutoSelect = true
    };

    dialogArgs.TaskDescription =
        "Select the Cohort Database in which to store the identifiers. If you have multiple methods of anonymising cohorts or manage different types of identifiers (e.g. CHI lists, ECHI lists and/or BarcodeIDs) then you must pick the Cohort Database that matches your cohort identifier type/anonymisation protocol.";

    return dialogArgs;
}

/// <summary>
/// Runs <paramref name="sql"/> on <paramref name="server"/> and loads the complete result set into memory.
/// </summary>
/// <param name="server">Server to open a connection on.</param>
/// <param name="sql">Query whose results are loaded.</param>
/// <returns>
/// The filled table, or an empty table if anything failed. Failures are reported through GlobalError
/// and are never returned as a partially filled table.
/// </returns>
private DataTable LoadDataTable(DiscoveredServer server, string sql)
{
    var dt = new DataTable();

    try
    {
        // The using declarations close and dispose the connection, command and adapter on every exit path
        using var con = server.GetConnection();
        con.Open();
        using var cmd = server.GetCommand(sql, con);
        cmd.CommandTimeout = 10000; // seconds
        using var adapter = server.GetDataAdapter(cmd);
        dt.BeginLoadData();
        adapter.Fill(dt);
        dt.EndLoadData();
    }
    catch (Exception e)
    {
        GlobalError("Unable to access datatable", e);

        // Fill may have failed part way through: do not let callers mistake a truncated result for the cohort
        dt.Dispose();
        return new DataTable();
    }

    return dt;
}

private const string HoldoutShuffle = "_HoldoutShuffle";

/// <summary>
/// Creates a holdout from the cohort described by the configuration this command was built with.
/// Steps: ask for the cohort database and the holdout settings, load the cohort rows, drop rows outside the
/// requested date window, pick a random subset of the requested size, ask for the extraction identifier and
/// target database, write the subset to "{Name}.csv" and hand that file to the import-file command.
/// Returns without doing anything further whenever the user cancels a dialog or the holdout would be empty.
/// </summary>
public override void Execute()
{
    base.Execute();

    SelectOne(GetChooseCohortDialogArgs(),
        BasicActivator.RepositoryLocator.DataExportRepository,
        out ExternalCohortTable ect);
    if (ect is null)
        return;

    var holdoutRequest = BasicActivator.GetCohortHoldoutLookupRequest(ect, null, _cic);
    if (holdoutRequest is null)
        return;

    var cohortConfiguration = new ViewCohortIdentificationConfigurationSqlCollection(_cic);
    var sql = cohortConfiguration.GetSql();
    _server = DataAccessPortal
        .ExpectServer(cohortConfiguration.GetDataAccessPoint(), DataAccessContext.InternalDataProcessing, false);
    _server.TestConnection();
    _dataTable = LoadDataTable(_server, sql);
    if (_dataTable.Rows.Count == 0)
    {
        Show("Unable to Access Cohort");
        return;
    }

    // Materialise the names once so every later use sees the same list
    var columnNames = _dataTable.Columns.Cast<DataColumn>()
        .Select(static column => column.ColumnName)
        .ToArray();

    // A bound only applies when the named date column exists and the bound was actually set
    // (an unset bound is default(DateTime), i.e. DateTime.MinValue)
    var dateColumn = holdoutRequest.DateColumnName;
    var canFilterByDate = columnNames.Contains(dateColumn);
    DateTime? notBefore = canFilterByDate && holdoutRequest.MinDate.Date != DateTime.MinValue
        ? holdoutRequest.MinDate
        : null;
    DateTime? notAfter = canFilterByDate && holdoutRequest.MaxDate.Date != DateTime.MinValue
        ? holdoutRequest.MaxDate
        : null;

    // Filter first, then shuffle by sorting on a random key (OrderBy evaluates the key once per row)
    var rnd = new Random();
    var eligible = _dataTable.Rows.Cast<DataRow>()
        .Where(row => IsWithinDateRange(row, dateColumn, notBefore, notAfter))
        .OrderBy(_ => rnd.Next())
        .ToArray();

    var requested = holdoutRequest.Count;
    if (holdoutRequest.IsPercent)
    {
        // Percentages above 100 are capped; decimal arithmetic keeps the rounding exact for any row count
        requested = (int)Math.Ceiling(eligible.Length * (decimal)Math.Min(requested, 100) / 100);
    }

    // Take yields nothing for a zero or negative count, which is reported as an empty holdout below
    var dataRows = eligible.Take(requested).ToArray();
    if (dataRows.Length == 0)
    {
        Show("Holdout would be empty with current configuration. Will not create holdout.");
        return;
    }

    BasicActivator.SelectObject("Select an Extraction Identifier", columnNames, out var extractionIdentifier);
    if (extractionIdentifier == null)
        return;

    var db = SelectDatabase(true, "Select a Database to store the new Holdout.");
    if (db == null) return;

    // Only write the file once the user has confirmed every choice, so cancelling leaves nothing behind
    var csv = new StringBuilder();
    csv.AppendLine(ToCsvLine(columnNames));
    foreach (var row in dataRows)
    {
        csv.AppendLine(ToCsvLine(row.ItemArray.Select(static field => field?.ToString())));
    }

    var fi = new FileInfo($"{holdoutRequest.Name}.csv");
    File.WriteAllText(fi.FullName, csv.ToString());

    var pipe = _activator.RepositoryLocator.CatalogueRepository.GetAllObjects<Pipeline>().OrderByDescending(static p => p.ID)
        .FirstOrDefault(static p => p.Name.Contains("BULK INSERT: CSV Import File (automated column-type detection)"));

    var importCommand = new ExecuteCommandCreateNewCatalogueByImportingFile(_activator, fi, extractionIdentifier, db, pipe, null, holdoutRequest.Description);
    importCommand.Execute();
}

/// <summary>
/// True when no bound is set, or when the row's date lies inside the set bounds (inclusive).
/// A row whose date is missing or unreadable is treated as outside the window once any bound is set.
/// </summary>
private static bool IsWithinDateRange(DataRow row, string dateColumn, DateTime? notBefore, DateTime? notAfter)
{
    if (notBefore is null && notAfter is null)
        return true;

    if (!TryReadDate(row[dateColumn], out var value))
        return false;

    if (notAfter is not null && value > notAfter.Value)
        return false;

    return notBefore is null || value >= notBefore.Value;
}

/// <summary>
/// Reads a cell as a date: uses the value directly when it already is a <see cref="DateTime"/>,
/// otherwise parses its text with the current culture. Returns false for DBNull, null or unparseable text.
/// </summary>
private static bool TryReadDate(object cell, out DateTime value)
{
    if (cell is DateTime alreadyDate)
    {
        value = alreadyDate;
        return true;
    }

    return DateTime.TryParse(cell?.ToString(), System.Globalization.CultureInfo.CurrentCulture,
        System.Globalization.DateTimeStyles.None, out value);
}

/// <summary>Characters that force a CSV field to be quoted.</summary>
private static readonly char[] CsvSpecialCharacters = { ',', '"', '\r', '\n' };

/// <summary>Joins the fields with commas, quoting each one where necessary.</summary>
private static string ToCsvLine(IEnumerable<string> fields) =>
    string.Join(",", fields.Select(EscapeCsvField));

/// <summary>
/// Wraps a field in double quotes (doubling any embedded quotes) when it contains a comma, quote or line break,
/// so the value cannot be split across columns or rows.
/// NOTE(review): assumes the CSV import pipeline honours quoted fields - confirm.
/// </summary>
private static string EscapeCsvField(string field)
{
    if (string.IsNullOrEmpty(field) || field.IndexOfAny(CsvSpecialCharacters) < 0)
        return field;

    return $"\"{field.Replace("\"", "\"\"")}\"";
}

/// <summary>
/// Returns the cohort-aggregate icon with the link overlay, as supplied by <paramref name="iconProvider"/>.
/// </summary>
public override Image<Rgba32> GetImage(IIconProvider iconProvider)
{
    return iconProvider.GetImage(RDMPConcept.CohortAggregate, OverlayKind.Link);
}
}
Loading