diff --git a/data/microserviceConfigs/default.yaml b/data/microserviceConfigs/default.yaml index de516cb53..cba437e8a 100644 --- a/data/microserviceConfigs/default.yaml +++ b/data/microserviceConfigs/default.yaml @@ -217,4 +217,52 @@ IsIdentifiableOptions: MaxConfirmAttempts: 1 ClassifierType: 'Microservices.IsIdentifiable.Service.TesseractStanfordDicomFileClassifier' DataDirectory: '' + + #Optional. Full connection string to the database storing the whitelist of valid entries + WhitelistConnectionString: + #Optional. The DBMS provider of the whitelist table e.g. MySql + WhitelistDatabaseType: + #Optional. The unqualified name of the whitelist table + WhitelistTableName: + #Optional. The column in WhitelistTableName which contains the whitelist elements + WhitelistColumn: + + #Optional. Path to a CSV file containing a single untitled column of whitelist values + WhitelistCsv: + + #Optional. Generate a report on the proportion of values failing validation (for each column) + #ColumnReport: true + + #Optional. Generate a report listing every unique value failing validation (and the column the value failed in) + #ValuesReport: true + + #Optional. Generate a full failure storage report that persists Failure objects in a manner that they can be retrieved. + #StoreReport: true + + #Optional - If specified reports will be generated in the given folder. If not specified, current directory is used (unless an alternate destination option is picked) + DestinationCsvFolder: + #Optional - If specified, the given separator will be used instead of ,. Includes support for \t for tab and \r\n + DestinationCsvSeparator: + #Optional - If specified all tabs, newlines (\r and \n) and 2+ spaces will be stripped from the values written as output (applies to all output formats) + DestinationNoWhitespace: + + #Optional. Full connection string to the database in which to store the report results + DestinationConnectionString: + #Optional. 
The DBMS provider of DestinationConnectionString e.g. MySql + DestinationDatabaseType: + + #Optional. If specified postcodes will not be reported as failures + IgnorePostcodes: false + #Optional. Comma separated list of columns/tags which should be ignored and not processed + SkipColumns: + #Optional. If set and using a 7 class NER model then DATE and TIME objects will not be considered failures. + IgnoreDatesInText: + #Optional. Set to control the max size of the in-memory store of processed before they get written out to any destinations. Only makes sense for reports that don't perform any aggregation across the data + MaxCacheSize: + #Optional. Filename of additional rules in yaml format. + RulesFile: + #Optional. Directory of additional rules in yaml format. + RulesDirectory: + #Optional. Maximum number of answers to cache per column. + MaxValidationCacheSize: diff --git a/news/795-feature.md b/news/795-feature.md new file mode 100644 index 000000000..f8f5ed686 --- /dev/null +++ b/news/795-feature.md @@ -0,0 +1 @@ +Added support for specifying IsIdentifiable CLI options in the yaml config files instead of command line (command line will always take precedence if both are specified) \ No newline at end of file diff --git a/src/common/Smi.Common/Options/GlobalOptions.cs b/src/common/Smi.Common/Options/GlobalOptions.cs index b699a58e8..1179caf44 100644 --- a/src/common/Smi.Common/Options/GlobalOptions.cs +++ b/src/common/Smi.Common/Options/GlobalOptions.cs @@ -112,6 +112,107 @@ public class IsIdentifiableOptions : ConsumerOptions /// public string DataDirectory { get; set; } + /// <summary> + /// Optional. Full connection string to the database storing the whitelist of valid entries + /// </summary> + public string WhitelistConnectionString { get; set; } + + /// <summary> + /// Optional. The DBMS provider of the whitelist table e.g. MySql + /// </summary> + public DatabaseType? WhitelistDatabaseType { get; set; } + + /// <summary> + /// Optional. 
The unqualified name of the whitelist table + /// </summary> + public string WhitelistTableName { get; set; } + + /// <summary> + /// Optional. The column in WhitelistTableName which contains the whitelist elements + /// </summary> + public string WhitelistColumn { get; set; } + + /// <summary> + /// Optional. Path to a CSV file containing a single untitled column of whitelist values + /// </summary> + public string WhitelistCsv { get; set; } + + /// <summary> + /// Optional. Generate a report on the proportion of values failing validation (for each column) + /// </summary> + public bool? ColumnReport { get; set; } + + /// <summary> + /// Optional. Generate a report listing every unique value failing validation (and the column the value failed in) + /// </summary> + public bool? ValuesReport { get; set; } + + /// <summary> + /// Optional. Generate a full failure storage report that persists Failure objects in a manner that they can be retrieved. + /// </summary> + public bool? StoreReport { get; set; } + + /// <summary> + /// Optional - If specified reports will be generated in the given folder. If not specified, current directory is used (unless an alternate destination option is picked) + /// </summary> + public string DestinationCsvFolder { get; set; } + + /// <summary> + /// Optional - If specified, the given separator will be used instead of ,. Includes support for \t for tab and \r\n. + /// </summary> + public string DestinationCsvSeparator { get; set; } + + /// <summary> + /// Optional - If specified all tabs, newlines (\r and \n) and 2+ spaces will be stripped from the values written as output (applies to all output formats) + /// </summary> + public bool? DestinationNoWhitespace { get; set; } + + /// <summary> + /// Optional. Full connection string to the database in which to store the report results + /// </summary> + public string DestinationConnectionString { get; set; } + + /// <summary> + /// Optional. The DBMS provider of DestinationConnectionString e.g. 
MySql + /// </summary> + public DatabaseType? DestinationDatabaseType { get; set; } + + /// <summary> + /// Optional. If specified postcodes will not be reported as failures + /// </summary> + public bool? 
IgnorePostcodes { get; set; } + + /// <summary> + /// Optional. Comma separated list of columns/tags which should be ignored and not processed + /// </summary> + public string SkipColumns { get; set; } + + /// <summary> + /// Optional. If set and using a 7 class NER model then DATE and TIME objects will not be considered failures. + /// </summary> + public bool? IgnoreDatesInText { get; set; } + + /// <summary> + /// Optional. Set to control the max size of the in-memory store of processed before they get written out to any destinations. Only makes sense for reports that don't perform any aggregation across the data + /// </summary> + public int? MaxCacheSize { get; set; } + + /// <summary> + /// Optional. Filename of additional rules in yaml format. + /// </summary> + public string RulesFile { get; set; } + + /// <summary> + /// Optional. Directory of additional rules in yaml format. + /// </summary> + public string RulesDirectory { get; set; } + + /// <summary> + /// Optional. Maximum number of answers to cache per column. + /// </summary> + public int? MaxValidationCacheSize { get; set; } + + public ProducerOptions IsIdentifiableProducerOptions {get; set;} } diff --git a/src/microservices/Microservices.IsIdentifiable/Options/IsIdentifiableAbstractOptions.cs b/src/microservices/Microservices.IsIdentifiable/Options/IsIdentifiableAbstractOptions.cs index e51fca051..36e8aeef6 100644 --- a/src/microservices/Microservices.IsIdentifiable/Options/IsIdentifiableAbstractOptions.cs +++ b/src/microservices/Microservices.IsIdentifiable/Options/IsIdentifiableAbstractOptions.cs @@ -55,8 +55,10 @@ public abstract class IsIdentifiableAbstractOptions : CliOptions [Option(HelpText = "Optional. If set and using a 7 class NER model then DATE and TIME objects will not be considered failures.")] public bool IgnoreDatesInText { get; set; } - [Option(HelpText = "Optional. Set to control the max size of the in-memory store of processed before the get written out to any destinations. 
Only makes sense for reports that don't perform any aggregation across the data", Default = 10000)] - public int MaxCacheSize { get; set; } + [Option(HelpText = "Optional. Set to control the max size of the in-memory store of processed before they get written out to any destinations. Only makes sense for reports that don't perform any aggregation across the data", Default = MaxCacheSizeDefault)] + public int MaxCacheSize { get; set; } = MaxCacheSizeDefault; + + public const int MaxCacheSizeDefault = 10000; [Option(HelpText = "Optional. Filename of additional rules in yaml format.")] public string RulesFile { get; set; } @@ -64,8 +66,10 @@ public abstract class IsIdentifiableAbstractOptions : CliOptions [Option(HelpText = "Optional. Directory of additional rules in yaml format.")] public string RulesDirectory { get; set; } - [Option(HelpText = "Optional. Maximum number of answers to cache per column.", Default = 1_000_000)] - public int MaxValidationCacheSize { get; set; } = 1_000_000; + [Option(HelpText = "Optional. Maximum number of answers to cache per column.", Default = MaxValidationCacheSizeDefault)] + public int MaxValidationCacheSize { get; set; } = MaxValidationCacheSizeDefault; + + public const int MaxValidationCacheSizeDefault = 1_000_000; /// /// Returns a short string with no spaces or punctuation that describes the target. 
This will be used @@ -81,5 +85,82 @@ public virtual void ValidateOptions() { } + + + /// <summary> + /// Populates class options that have not been specified on the command line directly by using the values (if any) in the + /// default yaml file for smi services + /// </summary> + /// <remarks> + /// A command line value equal to its default (blank string, false, default(DatabaseType), MaxCacheSizeDefault or + /// MaxValidationCacheSizeDefault) cannot be told apart from "not specified" and is replaced by the yaml value when one is set. + /// </remarks> + /// <param name="globalOpts">Fallback values, typically loaded from the yaml config; when null no option is changed</param> + public virtual void FillMissingWithValuesUsing(IsIdentifiableOptions globalOpts) + { + // No yaml options supplied (globalOpts is null): leave the command line values untouched + if (globalOpts == null) + return; + + if (string.IsNullOrWhiteSpace(WhitelistConnectionString)) + WhitelistConnectionString = globalOpts.WhitelistConnectionString; + + if (WhitelistDatabaseType == default(DatabaseType) && globalOpts.WhitelistDatabaseType.HasValue) + WhitelistDatabaseType = globalOpts.WhitelistDatabaseType.Value; + + if (string.IsNullOrWhiteSpace(WhitelistTableName)) + WhitelistTableName = globalOpts.WhitelistTableName; + + if (string.IsNullOrWhiteSpace(WhitelistColumn)) + WhitelistColumn = globalOpts.WhitelistColumn; + + if (string.IsNullOrWhiteSpace(WhitelistCsv)) + WhitelistCsv = globalOpts.WhitelistCsv; + + if (ColumnReport == default(bool) && globalOpts.ColumnReport.HasValue) + ColumnReport = globalOpts.ColumnReport.Value; + + if (ValuesReport == default(bool) && globalOpts.ValuesReport.HasValue) + ValuesReport = globalOpts.ValuesReport.Value; + + if (StoreReport == default(bool) && globalOpts.StoreReport.HasValue) + StoreReport = globalOpts.StoreReport.Value; + + if (string.IsNullOrWhiteSpace(DestinationCsvFolder)) + DestinationCsvFolder = globalOpts.DestinationCsvFolder; + + if 
(string.IsNullOrWhiteSpace(DestinationCsvSeparator)) + DestinationCsvSeparator = globalOpts.DestinationCsvSeparator; + + if (DestinationNoWhitespace == default(bool) && globalOpts.DestinationNoWhitespace.HasValue) + DestinationNoWhitespace = globalOpts.DestinationNoWhitespace.Value; + + if (string.IsNullOrWhiteSpace(DestinationConnectionString)) + DestinationConnectionString = globalOpts.DestinationConnectionString; + + if (DestinationDatabaseType == default(DatabaseType) && globalOpts.DestinationDatabaseType.HasValue) + DestinationDatabaseType = 
globalOpts.DestinationDatabaseType.Value; + + if (IgnorePostcodes == default(bool) && globalOpts.IgnorePostcodes.HasValue) + IgnorePostcodes = globalOpts.IgnorePostcodes.Value; + + if (string.IsNullOrWhiteSpace(SkipColumns)) + SkipColumns = globalOpts.SkipColumns; + + if (IgnoreDatesInText == default(bool) && globalOpts.IgnoreDatesInText.HasValue) + IgnoreDatesInText = globalOpts.IgnoreDatesInText.Value; + + if (MaxCacheSize == MaxCacheSizeDefault && globalOpts.MaxCacheSize.HasValue) + MaxCacheSize = globalOpts.MaxCacheSize.Value; + + if (string.IsNullOrWhiteSpace(RulesFile)) + RulesFile = globalOpts.RulesFile; + + if (string.IsNullOrWhiteSpace(RulesDirectory)) + RulesDirectory = globalOpts.RulesDirectory; + + if (MaxValidationCacheSize == MaxValidationCacheSizeDefault && globalOpts.MaxValidationCacheSize.HasValue) + MaxValidationCacheSize = globalOpts.MaxValidationCacheSize.Value; + } } } diff --git a/src/microservices/Microservices.IsIdentifiable/Program.cs b/src/microservices/Microservices.IsIdentifiable/Program.cs index 035f9116a..5bbcdf5a6 100644 --- a/src/microservices/Microservices.IsIdentifiable/Program.cs +++ b/src/microservices/Microservices.IsIdentifiable/Program.cs @@ -34,6 +34,9 @@ private static int OnParse(GlobalOptions globals, object parsedOpts) { var opts = SmiCliInit.Verify(parsedOpts); + // For any values not specified on the command line - use the yaml values + opts.FillMissingWithValuesUsing(globals.IsIdentifiableOptions); + return opts switch { IsIdentifiableRelationalDatabaseOptions o => Run(o), diff --git a/tests/microservices/Microservices.IsIdentifiable.Tests/ServiceTests/IsIdentifiableAbstractOptionsTests.cs b/tests/microservices/Microservices.IsIdentifiable.Tests/ServiceTests/IsIdentifiableAbstractOptionsTests.cs new file mode 100644 index 000000000..77a7aaffb --- /dev/null +++ b/tests/microservices/Microservices.IsIdentifiable.Tests/ServiceTests/IsIdentifiableAbstractOptionsTests.cs @@ -0,0 +1,156 @@ +using FAnsi; +using 
Microservices.IsIdentifiable.Options; +using NUnit.Framework; +using Smi.Common.Options; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Microservices.IsIdentifiable.Tests.ServiceTests +{ + class IsIdentifiableAbstractOptionsTests + { + [Test] + public void FillMissingWithValuesUsing_NoOptionsAnywhere() + { + var opts = new IsIdentifiableDicomFileOptions(); + var globalOpts = new IsIdentifiableOptions(); + Assert.DoesNotThrow(() => opts.FillMissingWithValuesUsing(globalOpts)); + } + // A yaml value must be copied onto every matching CLI property that has not been set + [Test] + public void FillMissingWithValuesUsing_Override() + { + int propsCounted = 0; + + foreach(var gProp in typeof(IsIdentifiableOptions).GetProperties()) + { + var cliProp = typeof(IsIdentifiableDicomFileOptions).GetProperty(gProp.Name); + + if(cliProp == null) + { + continue; + } + + var opts = new IsIdentifiableDicomFileOptions(); + var globalOpts = new IsIdentifiableOptions(); + + var testVal = GetTestValue(cliProp); + gProp.SetValue(globalOpts, testVal); + + Assert.AreNotEqual(testVal, cliProp.GetValue(opts), $"{gProp.Name}: a fresh CLI options object already holds the test value"); + opts.FillMissingWithValuesUsing(globalOpts); + Assert.AreEqual(testVal, cliProp.GetValue(opts), $"{gProp.Name}: the yaml value was not copied to the CLI options"); + + propsCounted++; + } + + // guard against the reflection loop silently matching no properties 
+ Assert.Greater(propsCounted, 0); + + } + // A value already set on the CLI options must survive a different yaml value + [Test] + public void FillMissingWithValuesUsing_NoOverride() + { + int propsCounted = 0; + + foreach (var gProp in typeof(IsIdentifiableOptions).GetProperties()) + { + var cliProp = typeof(IsIdentifiableDicomFileOptions).GetProperty(gProp.Name); + + if (cliProp == null) + { + continue; + } + + var opts = new IsIdentifiableDicomFileOptions(); + var globalOpts = new IsIdentifiableOptions(); + + var testVal1 = GetTestValue(cliProp); + var testVal2 = GetTestValue2(cliProp); + + if(testVal1 is bool) + { + // A boolean CLI option defaults to false, so "missing" and "false" look the same. + // Instead, make sure that false in the yaml config does not override + // true on the CLI. + testVal1 = false; + testVal2 = true; + } + + // yaml says one value + gProp.SetValue(globalOpts, testVal1); + // cli says a different value + cliProp.SetValue(opts, testVal2); + + // we should not have the yaml file entry + Assert.AreNotEqual(testVal1, cliProp.GetValue(opts), $"{gProp.Name}: CLI value equals the yaml value before filling"); + + // we ask to fill in missing values using the yaml entries + opts.FillMissingWithValuesUsing(globalOpts); + + // but we had an entry on CLI already so that should take precedence + Assert.AreNotEqual(testVal1, cliProp.GetValue(opts), $"{gProp.Name}: the yaml value overwrote the CLI value"); + Assert.AreEqual(testVal2, cliProp.GetValue(opts), $"{gProp.Name}: the CLI value was changed"); + + propsCounted++; + } + + // guard against the reflection loop silently matching no properties + Assert.Greater(propsCounted, 0); + + } + // Returns a sample value matching the property's type (int, string, bool or DatabaseType) + private static object GetTestValue(System.Reflection.PropertyInfo prop) + { + if(prop.PropertyType == typeof(int)) + { + return 5123; + } + + if (prop.PropertyType == typeof(string)) + { + return "troll doll!"; + } + if (prop.PropertyType 
== typeof(bool)) + { + return true; + } + + if (prop.PropertyType == typeof(DatabaseType)) + { + return DatabaseType.MySql; + } + + throw new ArgumentException($"Not sure what value to use in test for PropertyType {prop.PropertyType}. 
This is an error in the test harness coverage not the underlying code."); + } + // Returns a second sample value for the property's type, different from the one GetTestValue returns + private static object GetTestValue2(System.Reflection.PropertyInfo prop) + { + if (prop.PropertyType == typeof(int)) + { + return 66456; + } + + if (prop.PropertyType == typeof(string)) + { + return "rylyly?"; + } + if (prop.PropertyType == typeof(bool)) + { + return false; + } + + if (prop.PropertyType == typeof(DatabaseType)) + { + return DatabaseType.Oracle; + } + + throw new ArgumentException($"Not sure what value to use in test for PropertyType {prop.PropertyType}. This is an error in the test harness coverage not the underlying code."); + + } + } +}