Skip to content

Commit

Permalink
Merge pull request #795 from SMI/reviewer-fixes
Browse files Browse the repository at this point in the history
Added the ability to provide IsIdentifiable arguments in the global y…
  • Loading branch information
rkm authored Jun 15, 2021
2 parents 02d3d71 + 4ebd6bf commit 63fd851
Show file tree
Hide file tree
Showing 6 changed files with 386 additions and 4 deletions.
48 changes: 48 additions & 0 deletions data/microserviceConfigs/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -217,4 +217,52 @@ IsIdentifiableOptions:
MaxConfirmAttempts: 1
ClassifierType: 'Microservices.IsIdentifiable.Service.TesseractStanfordDicomFileClassifier'
DataDirectory: ''

#Optional. Full connection string to the database storing the whitelist of valid entries
WhitelistConnectionString:
#Optional. The DBMS provider of the whitelist table e.g. MySql
WhitelistDatabaseType:
#Optional. The unqualified name of the whitelist table
WhitelistTableName:
#Optional. The column in WhitelistTableName which contains the whitelist elements
WhitelistColumn:

#Optional. Path to a CSV file containing a single untitled column of whitelist values
WhitelistCsv:

#Optional. Generate a report on the proportion of values failing validation (for each column)
#ColumnReport: true

#Optional. Generate a report listing every unique value failing validation (and the column the value failed in)
#ValuesReport: true

#Optional. Generate a full failure storage report that persists Failure objects in a manner that they can be retrieved.
#StoreReport: true

#Optional - If specified reports will be generated in the given folder. If not specified, current directory is used (unless an alternate destination option is picked)
DestinationCsvFolder:
#Optional - If specified, the given separator will be used instead of ,. Includes support for \t for tab and \r\n
DestinationCsvSeparator:
#Optional - If specified all tabs, newlines (\r and \n) and 2+ spaces will be stripped from the values written as output (applies to all output formats)
DestinationNoWhitespace:

#Optional. Full connection string to the database in which to store the report results
DestinationConnectionString:
#Optional. The DBMS provider of DestinationConnectionString e.g. MySql
DestinationDatabaseType:

#Optional. If specified postcodes will not be reported as failures
IgnorePostcodes: false
#Optional. Comma separated list of columns/tags which should be ignored and not processed
SkipColumns:
#Optional. If set and using a 7 class NER model then DATE and TIME objects will not be considered failures.
IgnoreDatesInText:
#Optional. Set to control the max size of the in-memory store of processed items before they get written out to any destinations. Only makes sense for reports that don't perform any aggregation across the data
MaxCacheSize:

#Optional. Filename of additional rules in yaml format.
RulesFile:
#Optional. Directory of additional rules in yaml format.
RulesDirectory:
#Optional. Maximum number of answers to cache per column.
MaxValidationCacheSize:
1 change: 1 addition & 0 deletions news/795-feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added support for specifying IsIdentifiable CLI options in the yaml config files instead of command line (command line will always take precedence if both are specified)
101 changes: 101 additions & 0 deletions src/common/Smi.Common/Options/GlobalOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,107 @@ public class IsIdentifiableOptions : ConsumerOptions
/// </summary>
public string DataDirectory { get; set; }

/// <summary>
/// "Optional. Full connection string to the database storing the whitelist of valid entries"
/// </summary>
public string WhitelistConnectionString { get; set; }

/// <summary>
/// "Optional. The DBMS provider of the whitelist table e.g. MySql"
/// </summary>
public DatabaseType? WhitelistDatabaseType { get; set; }

/// <summary>
/// "Optional. The unqualified name of the whitelist table"
/// </summary>
public string WhitelistTableName { get; set; }

/// <summary>
/// "Optional. The column in WhitelistTableName which contains the whitelist elements"
/// </summary>
public string WhitelistColumn { get; set; }

/// <summary>
/// "Optional. Path to a CSV file containing a single untitled column of whitelist values"
/// </summary>
public string WhitelistCsv { get; set; }

/// <summary>
/// Optional. Generate a report on the proportion of values failing validation (for each column)
/// </summary>
public bool? ColumnReport { get; set; }

/// <summary>
/// Optional. Generate a report listing every unique value failing validation (and the column the value failed in)
/// </summary>
public bool? ValuesReport { get; set; }

/// <summary>
/// Optional. Generate a full failure storage report that persists Failure objects in a manner that they can be retrieved.
/// </summary>
public bool? StoreReport { get; set; }

/// <summary>
/// Optional - If specified reports will be generated in the given folder. If not specified, current directory is used (unless an alternate destination option is picked)
/// </summary>
public string DestinationCsvFolder { get; set; }

/// <summary>
/// Optional - If specified, the given separator will be used instead of ,. Includes support for \t for tab and \r\n.
/// </summary>
public string DestinationCsvSeparator { get; set; }

/// <summary>
/// Optional - If specified all tabs, newlines (\r and \n) and 2+ spaces will be stripped from the values written as output (applies to all output formats)
/// </summary>
public bool? DestinationNoWhitespace { get; set; }

/// <summary>
/// "Optional. Full connection string to the database in which to store the report results"
/// </summary>
public string DestinationConnectionString { get; set; }

/// <summary>
/// "Optional. The DBMS provider of DestinationConnectionString e.g. MySql"
/// </summary>
public DatabaseType? DestinationDatabaseType { get; set; }

/// <summary>
/// "Optional. If specified postcodes will not be reported as failures"
/// </summary>
public bool? IgnorePostcodes { get; set; }

/// <summary>
/// "Optional. Comma separated list of columns/tags which should be ignored and not processed"
/// </summary>
public string SkipColumns { get; set; }

/// <summary>
/// "Optional. If set and using a 7 class NER model then DATE and TIME objects will not be considered failures."
/// </summary>
public bool? IgnoreDatesInText { get; set; }

/// <summary>
/// "Optional. Set to control the max size of the in-memory store of processed items before they get written out to any destinations. Only makes sense for reports that don't perform any aggregation across the data"
/// </summary>
public int? MaxCacheSize { get; set; }

/// <summary>
/// "Optional. Filename of additional rules in yaml format."
/// </summary>
public string RulesFile { get; set; }

/// <summary>
/// "Optional. Directory of additional rules in yaml format."
/// </summary>
public string RulesDirectory { get; set; }

/// <summary>
/// "Optional. Maximum number of answers to cache per column."
/// </summary>
public int? MaxValidationCacheSize { get; set; }


public ProducerOptions IsIdentifiableProducerOptions {get; set;}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,21 @@ public abstract class IsIdentifiableAbstractOptions : CliOptions
[Option(HelpText = "Optional. If set and using a 7 class NER model then DATE and TIME objects will not be considered failures.")]
public bool IgnoreDatesInText { get; set; }

[Option(HelpText = "Optional. Set to control the max size of the in-memory store of processed before the get written out to any destinations. Only makes sense for reports that don't perform any aggregation across the data", Default = 10000)]
public int MaxCacheSize { get; set; }
[Option(HelpText = "Optional. Set to control the max size of the in-memory store of processed before the get written out to any destinations. Only makes sense for reports that don't perform any aggregation across the data", Default = MaxCacheSizeDefault)]
public int MaxCacheSize { get; set; } = MaxCacheSizeDefault;

public const int MaxCacheSizeDefault = 10000;

[Option(HelpText = "Optional. Filename of additional rules in yaml format.")]
public string RulesFile { get; set; }

[Option(HelpText = "Optional. Directory of additional rules in yaml format.")]
public string RulesDirectory { get; set; }

[Option(HelpText = "Optional. Maximum number of answers to cache per column.", Default = 1_000_000)]
public int MaxValidationCacheSize { get; set; } = 1_000_000;
[Option(HelpText = "Optional. Maximum number of answers to cache per column.", Default = MaxValidationCacheSizeDefault)]
public int MaxValidationCacheSize { get; set; } = MaxValidationCacheSizeDefault;

public const int MaxValidationCacheSizeDefault = 1_000_000;

/// <summary>
/// Returns a short string with no spaces or punctuation that describes the target. This will be used
Expand All @@ -81,5 +85,74 @@ public virtual void ValidateOptions()
{

}


/// <summary>
/// Populates class options that have not been specified on the command line directly by using the values (if any)
/// in <paramref name="globalOpts"/> (the options read from the default yaml file for smi services).
/// <para>
/// An option is treated as "not specified" when it is null/whitespace (strings) or still equal to its default
/// value (enums, bools, <see cref="MaxCacheSizeDefault"/> and <see cref="MaxValidationCacheSizeDefault"/>).
/// Consequently a command line value that happens to equal the default is replaced by the yaml value
/// whenever the yaml supplies one.
/// </para>
/// </summary>
/// <param name="globalOpts">The yaml-sourced options to fall back on. When null, no option is changed.</param>
public virtual void FillMissingWithValuesUsing(IsIdentifiableOptions globalOpts)
{
    // Nothing to fall back on: keep the command line values as they are instead of
    // throwing a NullReferenceException on the first property access below.
    if (globalOpts == null)
        return;

    // Whitelist source (database or CSV)
    if (string.IsNullOrWhiteSpace(WhitelistConnectionString))
        WhitelistConnectionString = globalOpts.WhitelistConnectionString;

    if (WhitelistDatabaseType == default(DatabaseType) && globalOpts.WhitelistDatabaseType.HasValue)
        WhitelistDatabaseType = globalOpts.WhitelistDatabaseType.Value;

    if (string.IsNullOrWhiteSpace(WhitelistTableName))
        WhitelistTableName = globalOpts.WhitelistTableName;

    if (string.IsNullOrWhiteSpace(WhitelistColumn))
        WhitelistColumn = globalOpts.WhitelistColumn;

    if (string.IsNullOrWhiteSpace(WhitelistCsv))
        WhitelistCsv = globalOpts.WhitelistCsv;

    // Report selection: only an unset (false) flag can be switched on by the yaml
    if (ColumnReport == default(bool) && globalOpts.ColumnReport.HasValue)
        ColumnReport = globalOpts.ColumnReport.Value;

    if (ValuesReport == default(bool) && globalOpts.ValuesReport.HasValue)
        ValuesReport = globalOpts.ValuesReport.Value;

    if (StoreReport == default(bool) && globalOpts.StoreReport.HasValue)
        StoreReport = globalOpts.StoreReport.Value;

    // Report destinations (CSV folder or database)
    if (string.IsNullOrWhiteSpace(DestinationCsvFolder))
        DestinationCsvFolder = globalOpts.DestinationCsvFolder;

    if (string.IsNullOrWhiteSpace(DestinationCsvSeparator))
        DestinationCsvSeparator = globalOpts.DestinationCsvSeparator;

    if (DestinationNoWhitespace == default(bool) && globalOpts.DestinationNoWhitespace.HasValue)
        DestinationNoWhitespace = globalOpts.DestinationNoWhitespace.Value;

    if (string.IsNullOrWhiteSpace(DestinationConnectionString))
        DestinationConnectionString = globalOpts.DestinationConnectionString;

    if (DestinationDatabaseType == default(DatabaseType) && globalOpts.DestinationDatabaseType.HasValue)
        DestinationDatabaseType = globalOpts.DestinationDatabaseType.Value;

    // Validation behaviour
    if (IgnorePostcodes == default(bool) && globalOpts.IgnorePostcodes.HasValue)
        IgnorePostcodes = globalOpts.IgnorePostcodes.Value;

    if (string.IsNullOrWhiteSpace(SkipColumns))
        SkipColumns = globalOpts.SkipColumns;

    if (IgnoreDatesInText == default(bool) && globalOpts.IgnoreDatesInText.HasValue)
        IgnoreDatesInText = globalOpts.IgnoreDatesInText.Value;

    // Cache sizes are initialised to their *Default constants, so "still at the default" means unspecified
    if (MaxCacheSize == MaxCacheSizeDefault && globalOpts.MaxCacheSize.HasValue)
        MaxCacheSize = globalOpts.MaxCacheSize.Value;

    // Additional rules
    if (string.IsNullOrWhiteSpace(RulesFile))
        RulesFile = globalOpts.RulesFile;

    if (string.IsNullOrWhiteSpace(RulesDirectory))
        RulesDirectory = globalOpts.RulesDirectory;

    if (MaxValidationCacheSize == MaxValidationCacheSizeDefault && globalOpts.MaxValidationCacheSize.HasValue)
        MaxValidationCacheSize = globalOpts.MaxValidationCacheSize.Value;
}
}
}
3 changes: 3 additions & 0 deletions src/microservices/Microservices.IsIdentifiable/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ private static int OnParse(GlobalOptions globals, object parsedOpts)
{
var opts = SmiCliInit.Verify<IsIdentifiableAbstractOptions>(parsedOpts);

// For any values not specified on the command line - use the yaml values
opts.FillMissingWithValuesUsing(globals.IsIdentifiableOptions);

return opts switch
{
IsIdentifiableRelationalDatabaseOptions o => Run(o),
Expand Down
Loading

0 comments on commit 63fd851

Please sign in to comment.