Filter parameters in common-script #548
pkiraly committed Nov 21, 2024
1 parent 30ca597 commit 06725ec
Showing 6 changed files with 58 additions and 21 deletions.
65 changes: 49 additions & 16 deletions common-script
@@ -18,8 +18,15 @@ log() {
}

# define 'untrace' command to disable trace mode
# remove all alias definitions
unalias -a
# shopt sets and unsets shell options.
# -s OPTNAME: enable (set) each OPTNAME
# expand_aliases: aliases are expanded.
# Aliases are not expanded when the shell is not interactive,
# unless the expand_aliases shell option is set with shopt.
shopt -s expand_aliases
# 'set +x' disables trace mode ('set -x' prints commands and their
# arguments as they are executed); the redirect hides the 'set +x'
# line itself from the trace output.
alias untrace='{ set +x; } 2> /dev/null'

# start a named processing step and enable trace mode
@@ -30,26 +37,35 @@ run() {
set -x
}
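
For illustration, a self-contained sketch of the untrace/run pattern (not part of the commit; the body of run() here is an assumption based on the fragment above):

#!/usr/bin/env bash
# Standalone sketch of the trace helpers above (run() body assumed).
shopt -s expand_aliases
alias untrace='{ set +x; } 2> /dev/null'   # redirect hides the 'set +x' line

run() {
  echo "step: $1"
  set -x                                   # trace every command from here on
}

run demo
echo "this command is echoed by the shell before it runs"
untrace
echo "tracing is off again"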

filter_params() {
php scripts/utils/parameter-filter.php $1 ${TYPE_PARAMS}
}
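
filter_params delegates the option filtering to a PHP helper driven by parameter-definition.json (added below): only options listed under "common" or under the named analysis survive. A hypothetical jq-based shell equivalent, for illustration only — the real logic lives in scripts/utils/parameter-filter.php, which is not shown in this diff:

# Hypothetical sketch; assumes '--option value' style tokens in TYPE_PARAMS.
filter_params_sketch() {
  local analysis=$1 allowed out="" skip_value=""
  allowed=$(jq -r --arg a "$analysis" \
    '(.common + (.[$a] // [])) | .[].long' parameter-definition.json)
  for token in ${TYPE_PARAMS}; do
    if [[ "$token" == --* ]]; then
      skip_value=""
      if grep -qx "${token#--}" <<< "$allowed"; then
        out+=" $token"                # option is valid for this analysis
      else
        skip_value="yes"              # drop the option and its value
      fi
    elif [[ -z "$skip_value" ]]; then
      out+=" $token"                  # value of a kept option
    else
      skip_value=""                   # value of a dropped option
    fi
  done
  echo "${out# }"
}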

# ---- processing steps ----

# run validation
do_validate() {
GENERAL_PARAMS="--details --trimId --summary --format csv --defaultRecordType BOOKS"
OUTPUT_PARAMS="--outputDir ${OUTPUT_DIR} --detailsFileName issue-details.csv --summaryFileName issue-summary.csv"
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "validate")
log "log file: ${LOG_DIR}/validate.log"
run validate
./validate ${GENERAL_PARAMS} ${OUTPUT_PARAMS} ${PARAMS} ${MARC_DIR}/$MASK 2> ${LOG_DIR}/validate.log
}
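
A hypothetical before/after of the filter_params call above (values invented; the expected behaviour follows parameter-definition.json):

# TYPE_PARAMS="--marcVersion DNB --emptyLargeCollectors --solrUrl http://localhost:8983/solr/loc"
# filter_params "validate"  ->  "--marcVersion DNB --emptyLargeCollectors"
#                               (solrUrl belongs to 'index', so it is dropped)
# filter_params "index"     ->  "--marcVersion DNB --solrUrl http://localhost:8983/solr/loc"
#                               (emptyLargeCollectors is not an 'index' option)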

do_prepare_solr() {
log "log file: ${LOG_DIR}/solr.log"
run prepare-solr
./prepare-solr $NAME 2> ${LOG_DIR}/solr.log
}

do_index() {
log "log file: ${LOG_DIR}/solr.log"
run index
untrace

PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+//g')
PARAMS=$(filter_params "index")

# HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)
# if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
# PARAMS="${PARAMS} --solrForScoresUrl ${NAME}_validation"
@@ -67,12 +83,14 @@ do_index() {
}

do_postprocess_solr() {
log "log file: ${LOG_DIR}/solr.log"
run postprocess-solr
./postprocess-solr $NAME 2>> ${LOG_DIR}/solr.log
}

do_completeness() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "completeness")
log "log file: ${LOG_DIR}/completeness.log"
run completeness
./completeness --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/completeness.log
}
@@ -86,20 +104,23 @@ do_completeness_sqlite() {
}

do_classifications() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "classifications")
log "log file: ${LOG_DIR}/classifications.log"
run classifications
./classifications --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/classifications.log
Rscript scripts/classifications/classifications-type.R ${OUTPUT_DIR}
}

do_authorities() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "authorities")
log "log file: ${LOG_DIR}/authorities.log"
run authorities
./authorities --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/authorities.log
}

do_tt_completeness() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "tt-completeness")
log "log file: ${LOG_DIR}/tt-completeness.log"
run tt-completeness
./tt-completeness --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ --trimId ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/tt-completeness.log
Rscript scripts/tt-histogram/tt-histogram.R ${OUTPUT_DIR} &>> ${LOG_DIR}/tt-completeness.log
@@ -109,7 +130,8 @@ do_tt_completeness() {
}

do_shelf_ready_completeness() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "shelf-ready-completeness")
log "log file: ${LOG_DIR}/shelf-ready-completeness.log"
run shelf-ready-completeness
./shelf-ready-completeness \
--defaultRecordType BOOKS \
@@ -124,7 +146,8 @@ do_shelf_ready_completeness() {
}

do_bl_classification() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "bk-classification")
log "log file: ${LOG_DIR}/bl-classification.log"
run bk-classification
./bl-classification \
--defaultRecordType BOOKS \
@@ -134,7 +157,8 @@ do_bl_classification() {
}

do_serial_score() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "bk-classification")
log "log file: ${LOG_DIR}/serial-score.log"
run serial-score
./serial-score --defaultRecordType BOOKS \
${PARAMS} \
@@ -150,15 +174,17 @@ do_format() {
}

do_functional_analysis() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "functional-analysis")
log "log file: ${LOG_DIR}/functional-analysis.log"
run functional-analysis
./functional-analysis --defaultRecordType BOOKS \
${PARAMS} \
--outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/functional-analysis.log
}

do_network_analysis() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "network-analysis")
log "log file: ${LOG_DIR}/network-analysis.log"
run network-analysis
./network-analysis --defaultRecordType BOOKS \
${PARAMS} \
@@ -197,6 +223,7 @@ do_network_analysis() {
}

do_pareto() {
log "log file: ${LOG_DIR}/pareto.log"
run pareto
Rscript scripts/pareto/frequency-range.R ${OUTPUT_DIR} &> ${LOG_DIR}/pareto.log
untrace
@@ -214,7 +241,9 @@ do_marc_history() {
else
SELECTOR="008~7-10;008~0-5"
fi
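
An assumed reading of the default selector, based on the MARC 21 bibliographic 008 layout:

# '008~7-10' extracts character positions 07-10 of field 008 (Date 1,
# usually the publication year) and '008~0-5' positions 00-05 (the date
# the record entered the catalogue), which is what marc-history presumably
# charts over time.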
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')

PARAMS=$(filter_params "marc-history")
log "log file: ${LOG_DIR}/marc-history.log"

run marc-history
./formatter --selector "$SELECTOR" --defaultRecordType BOOKS ${PARAMS} --separator "," \
@@ -233,7 +262,8 @@ do_marc_history() {
}

do_record_patterns() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "record-patterns")
log "log file: ${LOG_DIR}/record-patterns.log"

run record-patterns
Rscript scripts/record-patterns/top-fields.R ${OUTPUT_DIR} &>> ${LOG_DIR}/top-fields.log
@@ -320,7 +350,7 @@ EOF
ONLY_INDEX=$(echo ${PARAMS} | grep -c -P -e '--onlyIndex' || true)

if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
log "index"
log "index sqlite3"
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/sqlite/modify-tables.sql &>> ${LOG_DIR}/sqlite.log
if [[ "${SOLR_FOR_SCORES_URL}" != "" ]]; then
echo "index at ${SOLR_FOR_SCORES_URL}"
@@ -441,9 +471,10 @@ do_export_cli_parameters() {

do_shacl4bib() {
# note: the SHACL-specific parameters (--shaclConfigurationFile,
# --shaclOutputType, --shaclOutputFile) are still missing here
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
PARAMS=$(filter_params "shacl4bib")
log "log file: ${LOG_DIR}/shacl4bib.log"

run shacl4bib
echo " ./shacl4bib --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/shacl4bib.log"
./shacl4bib --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/shacl4bib.log
Rscript scripts/shacl4bib/shacl4bib.R ${OUTPUT_DIR}

@@ -571,6 +602,7 @@ config() {
}

fatal() {
echo "fatal()"
colored "1;31" "$1"
exit 1
}
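
A note on the helper used here (colored is defined elsewhere in common-script; the line below is a guess at its effect):

# colored "1;31" "message" presumably prints the message in bold red,
# roughly: printf '\e[1;31m%s\e[0m\n' "message"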
@@ -631,6 +663,7 @@ if [[ "$datatask" = true ]]; then
fi
fi

echo "tasks: ${tasks}"
for task in ${tasks//,/ }; do
case $task in
validate) do_validate ; do_validate_sqlite ;;
1 change: 1 addition & 0 deletions parameter-definition.json
@@ -0,0 +1 @@
{"common":[{"short":"m","long":"marcVersion","hasArg":true,"description":"MARC version ('OCLC' or 'DNB')"},{"short":"h","long":"help","hasArg":false,"description":"display help"},{"short":"n","long":"nolog","hasArg":false,"description":"do not display log messages"},{"short":"l","long":"limit","hasArg":true,"description":"limit the number of records to process"},{"short":"o","long":"offset","hasArg":true,"description":"the first record to process"},{"short":"i","long":"id","hasArg":true,"description":"the MARC identifier (content of 001)"},{"short":"d","long":"defaultRecordType","hasArg":true,"description":"the default record type if the record's type is undetectable"},{"short":"q","long":"fixAlephseq","hasArg":false,"description":"fix the known issues of Alephseq format"},{"short":"a","long":"fixAlma","hasArg":false,"description":"fix the known issues of Alma format"},{"short":"b","long":"fixKbr","hasArg":false,"description":"fix the known issues of Alma format"},{"short":"p","long":"alephseq","hasArg":false,"description":"the source is in Alephseq format"},{"short":"x","long":"marcxml","hasArg":false,"description":"the source is in MARCXML format"},{"short":"y","long":"lineSeparated","hasArg":false,"description":"the source is in line separated MARC format"},{"short":"t","long":"outputDir","hasArg":true,"description":"output directory"},{"short":"r","long":"trimId","hasArg":false,"description":"remove spaces from the end of record IDs"},{"short":"z","long":"ignorableFields","hasArg":true,"description":"ignore fields from the analysis"},{"short":"v","long":"ignorableRecords","hasArg":true,"description":"ignore records from the analysis"},{"short":"f","long":"marcFormat","hasArg":true,"description":"MARC format (like 'ISO' or 'MARCXML')"},{"short":"s","long":"dataSource","hasArg":true,"description":"data source (file of stream)"},{"short":"g","long":"defaultEncoding","hasArg":true,"description":"default character encoding"},{"short":"1","long":"alephseqLineType","hasArg":true,"description":"Alephseq line type"},{"short":"2","long":"picaIdField","hasArg":true,"description":"PICA id field"},{"short":"u","long":"picaSubfieldSeparator","hasArg":true,"description":"PICA subfield separator"},{"short":"j","long":"picaSchemaFile","hasArg":true,"description":"Avram PICA schema file"},{"short":"w","long":"schemaType","hasArg":true,"description":"metadata schema type ('MARC21', 'UNIMARC', or 'PICA')"},{"short":"k","long":"picaRecordType","hasArg":true,"description":"picaRecordType"},{"short":"c","long":"allowableRecords","hasArg":true,"description":"allow records for the analysis"},{"short":"e","long":"groupBy","hasArg":true,"description":"group the results by the value of this data element (e.g. 
the ILN of library)"},{"short":"3","long":"groupListFile","hasArg":true,"description":"the file which contains a list of ILN codes"},{"short":"4","long":"solrForScoresUrl","hasArg":true,"description":"the URL of the Solr server used to store scores"}],"completeness":[{"short":"R","long":"format","hasArg":true,"description":"specify a format"},{"short":"V","long":"advanced","hasArg":false,"description":"advanced mode (not yet implemented)"},{"short":"P","long":"onlyPackages","hasArg":false,"description":"only packages (not yet implemented)"}],"validate":[{"short":"G","long":"summaryFileName","hasArg":true,"description":"the summary file name (provides a summary of issues, such as the number of instance and number of records having the particular issue)"},{"short":"S","long":"summary","hasArg":false,"description":"show summary instead of record level display"},{"short":"H","long":"details","hasArg":false,"description":"show record level display"},{"short":"F","long":"detailsFileName","hasArg":true,"description":"the report file name (default is 'issue-details.csv')"},{"short":"R","long":"format","hasArg":true,"description":"specify a format"},{"short":"W","long":"emptyLargeCollectors","hasArg":false,"description":"empty large collectors"},{"short":"T","long":"collectAllErrors","hasArg":false,"description":"collect all errors (useful only for validating small number of records)"},{"short":"I","long":"ignorableIssueTypes","hasArg":true,"description":"comma separated list of issue types not to collect"}],"index":[{"short":"S","long":"solrUrl","hasArg":true,"description":"the URL of Solr server including the core (e.g. http://localhost:8983/solr/loc)"},{"short":"A","long":"doCommit","hasArg":false,"description":"commits Solr index regularly"},{"short":"T","long":"solrFieldType","hasArg":true,"description":"type of Solr fields, could be one of 'marc-tags', 'human-readable', or 'mixed'"},{"short":"B","long":"useEmbedded","hasArg":false,"description":"use embedded Solr server (used in tests only)"},{"short":"C","long":"indexWithTokenizedField","hasArg":false,"description":"index data elements as tokenized field as well"},{"short":"D","long":"commitAt","hasArg":true,"description":"commit index after this number of records"},{"short":"E","long":"indexFieldCounts","hasArg":false,"description":"index the count of field instances"},{"short":"F","long":"fieldPrefix","hasArg":true,"description":"field prefix"}],"classifications":[{"short":"A","long":"collectCollocations","hasArg":false,"description":"collect collocation of schemas"}],"authorities":[{"short":"G","long":"summaryFileName","hasArg":true,"description":"the summary file name (provides a summary of issues, such as the number of instance and number of records having the particular issue)"},{"short":"S","long":"summary","hasArg":false,"description":"show summary instead of record level display"},{"short":"H","long":"details","hasArg":false,"description":"show record level display"},{"short":"F","long":"detailsFileName","hasArg":true,"description":"the report file name (default is 'issue-details.csv')"},{"short":"R","long":"format","hasArg":true,"description":"specify a format"},{"short":"W","long":"emptyLargeCollectors","hasArg":false,"description":"empty large collectors"},{"short":"T","long":"collectAllErrors","hasArg":false,"description":"collect all errors (useful only for validating small number of records)"},{"short":"I","long":"ignorableIssueTypes","hasArg":true,"description":"comma separated list of issue types not to 
collect"}],"tt-completeness":[{"short":"F","long":"fileName","hasArg":true,"description":"the report file name (default is tt-completeness.csv)"}],"shelf-ready-completeness":[{"short":"F","long":"fileName","hasArg":true,"description":"the report file name (default is shelf-ready-completeness.csv)"}],"bl-classification":[],"serial-score":[{"short":"F","long":"fileName","hasArg":true,"description":"the report file name (default is serial-score.csv)"}],"formatter":[{"short":"l","long":"selector","hasArg":true,"description":"selectors"},{"short":"p","long":"separator","hasArg":true,"description":"separator between the parts (default: TAB)"},{"short":"f","long":"format","hasArg":true,"description":"specify a format"},{"short":"s","long":"search","hasArg":true,"description":"search string ([path]=[value])"},{"short":"w","long":"withId","hasArg":false,"description":"the generated CSV should contain record ID as first field"},{"short":"c","long":"countNr","hasArg":true,"description":"count number of the record (e.g. 1 means the first record)"},{"short":"e","long":"fileName","hasArg":true,"description":"output file (default: extracted.csv)"}],"functional-analysis":[{"short":"R","long":"format","hasArg":true,"description":"specify a format"},{"short":"V","long":"advanced","hasArg":false,"description":"advanced mode (not yet implemented)"},{"short":"P","long":"onlyPackages","hasArg":false,"description":"only packages (not yet implemented)"}],"network-analysis":[{"short":"l","long":"group-limit","hasArg":true,"description":"pair creation limit"},{"short":"a","long":"action","hasArg":true,"description":"action: 'primary' (default), 'pairing'"}],"record-patterns":[{"short":"R","long":"format","hasArg":true,"description":"specify a format"},{"short":"V","long":"advanced","hasArg":false,"description":"advanced mode (not yet implemented)"},{"short":"P","long":"onlyPackages","hasArg":false,"description":"only packages (not yet implemented)"}],"shacl4bib":[{"short":"C","long":"shaclConfigurationFile","hasArg":true,"description":"specify the configuration file"},{"short":"O","long":"shaclOutputFile","hasArg":true,"description":"output file"},{"short":"P","long":"shaclOutputType","hasArg":true,"description":"output type (STATUS: status only, SCORE: score only, BOTH: status and score"}]}
2 changes: 1 addition & 1 deletion postprocess-solr
@@ -14,4 +14,4 @@ CORE_DEV=${CORE}_dev
echo "Swap ${CORE_DEV} to ${CORE}"
swap_cores ${CORE_DEV} ${CORE}

echo "Solr preparation DONE"
echo "Solr index swapping DONE"
2 changes: 1 addition & 1 deletion scripts/sqlite/modify-tables.sql
@@ -1,6 +1,6 @@
--- issue_details indices
CREATE INDEX IF NOT EXISTS "recordId" ON "issue_details" ("id");
CREATE INDEX IF NOT EXISTS "errorId" ON "issue_details" ("errorId");
CREATE INDEX IF NOT EXISTS "recordId" ON "issue_details" ("recordId");

--- issue_summary indices
CREATE INDEX IF NOT EXISTS "id" ON "issue_summary" ("id");
@@ -50,6 +50,7 @@ public String exportAll() {
options.put("formatter", export(new FormatterParameters())); // TODO at common-script
options.put("functional-analysis", export(new CompletenessParameters())); // TODO
options.put("network-analysis", export(new NetworkParameters()));
options.put("marc-history", export(new CommonParameters()));
options.put("record-patterns", export(new CompletenessParameters())); // TODO
// options.put("export-schema", read(new MappingParameters()));
options.put("shacl4bib", export(new Shacl4bibParameters()));
@@ -21,21 +21,20 @@ public void test() {
CliParameterDefinitionsExporter extractor = new CliParameterDefinitionsExporter();
String json = extractor.exportAll();
assertNotNull(json);
System.err.println(json);
assertTrue(json.contains("\"common\""));
ObjectMapper mapper = new ObjectMapper();
Map firstItem = null;
LinkedHashMap parameters = null;
try {
parameters = (LinkedHashMap) mapper.readValue(json, Object.class);

assertEquals(15, parameters.size());
assertEquals(16, parameters.size());
assertEquals(
Set.of(
"common", "completeness", "validate", "index", "classifications",
"authorities", "tt-completeness", "shelf-ready-completeness",
"bl-classification", "serial-score", "formatter", "functional-analysis",
"network-analysis", "record-patterns", "shacl4bib"
"network-analysis", "marc-history", "record-patterns", "shacl4bib"
),
parameters.keySet());

@@ -152,6 +151,9 @@ public void test() {
assertEquals(true, firstItem.get("hasArg"));
assertEquals("pair creation limit", firstItem.get("description"));

assertTrue(parameters.containsKey("marc-history"));
assertEquals(0, ((List) parameters.get("marc-history")).size());

assertTrue(parameters.containsKey("record-patterns"));
assertEquals(3, ((List) parameters.get("record-patterns")).size());
firstItem = (Map) ((List) parameters.get("record-patterns")).get(0);
