Skip to content

Commit

Permalink
Allow to run solr-indexing without an existing collection #289: fixi…
Browse files Browse the repository at this point in the history
…ng the issue around 'validation' Solr instance
  • Loading branch information
pkiraly committed Nov 26, 2023
1 parent 29fa7ca commit 94e7ff1
Show file tree
Hide file tree
Showing 11 changed files with 166 additions and 30 deletions.
2 changes: 1 addition & 1 deletion catalogues/nls.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
. ./setdir.sh
NAME=nls
MARC_DIR=${BASE_INPUT_DIR}/nls
TYPE_PARAMS="--marcxml --emptyLargeCollectors --indexWithTokenizedField --indexFieldCounts"
TYPE_PARAMS="--marcxml --emptyLargeCollectors --indexWithTokenizedField --indexFieldCounts --solrForScoresUrl http://localhost:8983/solr/nls_validation"
MASK=NBS_v2_validated_marcxml.xml.gz

. ./common-script
10 changes: 9 additions & 1 deletion common-script
Original file line number Diff line number Diff line change
Expand Up @@ -316,12 +316,20 @@ EOF
rm ${OUTPUT_DIR}/id-groupid_noheader.csv
fi

SOLR_FOR_SCORES_URL=$(echo $TYPE_PARAMS | grep -P -o --regexp='--solrForScoresUrl \K([^ ]+)')

if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
log "index"
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/sqlite/modify-tables.sql &>> ${PREFIX}/sqlite.log
if [[ "${SOLR_FOR_SCORES_URL}" != "" ]]; then
echo "index at ${SOLR_FOR_SCORES_URL}"
# index id-groupid.csv and issue-details.csv
scripts/sqlite/index-issue-details.sh ${OUTPUT_DIR} ${NAME} ${HAS_GROUP_PARAM} ${SOLR_FOR_SCORES_URL}
fi
else
log "index (grouped)"
scripts/sqlite/calculate-aggregated-numbers.grouped.sh ${OUTPUT_DIR} ${NAME}
scripts/sqlite/index-issue-details.sh ${OUTPUT_DIR} ${NAME} ${HAS_GROUP_PARAM} ${SOLR_FOR_SCORES_URL}
scripts/sqlite/calculate-aggregated-numbers.grouped.sh ${OUTPUT_DIR} ${NAME} ${HAS_GROUP_PARAM} ${SOLR_FOR_SCORES_URL}
fi
}

Expand Down
27 changes: 8 additions & 19 deletions scripts/sqlite/calculate-aggregated-numbers.grouped.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ log() {
}

# Positional arguments, in the order common-script passes them:
#   $1 - output directory of the analysis
#   $2 - catalogue name (used to derive the default Solr core name)
#   $3 - HAS_GROUP_PARAM (not used by this script)
#   $4 - optional URL of the Solr core holding the validation scores
OUTPUT_DIR=$1
NAME=$2
SOLR_FOR_SCORES_URL=$4

log "OUTPUT_DIR: ${OUTPUT_DIR}"

Expand All @@ -21,33 +22,21 @@ else
. ./../../solr-functions
fi

SOLR_CORE=${NAME}_validation

log "create Solr core"

CORE_EXISTS=$(check_core $SOLR_CORE)
log "$SOLR_CORE exists: $CORE_EXISTS"
if [[ $CORE_EXISTS != 1 ]]; then
echo "Create Solr core '$SOLR_CORE'"
create_core $SOLR_CORE
prepare_schema $SOLR_CORE
if [[ "${SOLR_FOR_SCORES_URL}" != "" ]]; then
SOLR_HOST=$(extract_host $SOLR_FOR_SCORES_URL)
SOLR_CORE=$(extract_core $SOLR_FOR_SCORES_URL)
else
purge_core $SOLR_CORE
SOLR_CORE=${NAME}_validation
fi

log "populate Solr core"

php scripts/sqlite/validation-result-indexer.php ${OUTPUT_DIR} $SOLR_CORE

optimize_core $SOLR_CORE
log "using Solr at ${SOLR_HOST} with core: ${SOLR_CORE}"

log "calculate numbers"

# creating
# ${OUTPUT_DIR}/issue-grouped-types.csv
# ${OUTPUT_DIR}/issue-grouped-categories.csv
# ${OUTPUT_DIR}/issue-grouped-paths.csv
Rscript scripts/sqlite/qa_catalogue.grouping.R ${OUTPUT_DIR} $SOLR_CORE
Rscript scripts/sqlite/qa_catalogue.grouping.R ${OUTPUT_DIR} ${SOLR_HOST} $SOLR_CORE

log "import issue_grouped_types"
tail -n +2 ${OUTPUT_DIR}/issue-grouped-types.csv > ${OUTPUT_DIR}/issue-grouped-types-noheader.csv
Expand Down
55 changes: 55 additions & 0 deletions scripts/sqlite/index-issue-details.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env bash
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (re)create the Solr core that holds the validation results and
# populate it from the CSV files found in the output directory
#
# usage:
#   index-issue-details.sh <OUTPUT_DIR> <NAME> <HAS_GROUP_PARAM> [<SOLR_FOR_SCORES_URL>]
#
#   OUTPUT_DIR          directory containing the CSV files to index
#   NAME                catalogue name; the default core is <NAME>_validation
#   HAS_GROUP_PARAM     "0" selects the simple indexer, anything else the grouped one
#   SOLR_FOR_SCORES_URL optional URL of the target core,
#                       e.g. http://localhost:8983/solr/nls_validation
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#

# print a message prefixed with the current date and time
log() {
  echo "$(date +'%F %T')> $1"
}

OUTPUT_DIR=$1
NAME=$2
HAS_GROUP_PARAM=$3
SOLR_FOR_SCORES_URL=$4

log "OUTPUT_DIR: ${OUTPUT_DIR}"

# solr-functions provides extract_host, extract_core and the *_core helpers
# used below; it is looked up in the current directory first, then two
# levels up
if [[ -f $(pwd)/solr-functions ]]; then
  . ./solr-functions
else
  . ./../../solr-functions
fi

if [[ "${SOLR_FOR_SCORES_URL}" != "" ]]; then
  SOLR_HOST=$(extract_host "${SOLR_FOR_SCORES_URL}")
  SOLR_CORE=$(extract_core "${SOLR_FOR_SCORES_URL}")
else
  SOLR_CORE=${NAME}_validation
fi
# Without a host the PHP indexers below would receive one argument less and
# read the core name as the host, so fall back to the local Solr instance
# (the same default qa_catalogue.grouping.R uses).
SOLR_HOST=${SOLR_HOST:-http://localhost:8983}

log "using Solr at ${SOLR_HOST}/${SOLR_CORE}"
log "create Solr core"

# NOTE(review): check_core, create_core, prepare_schema, purge_core and
# optimize_core come from solr-functions and only receive the core name here;
# confirm which Solr host they talk to.
CORE_EXISTS=$(check_core "${SOLR_CORE}")
log "$SOLR_CORE exists: $CORE_EXISTS"
if [[ $CORE_EXISTS != 1 ]]; then
  echo "Create Solr core '$SOLR_CORE'"
  create_core "${SOLR_CORE}"
  prepare_schema "${SOLR_CORE}"
else
  # the core is already there: empty it before re-indexing
  purge_core "${SOLR_CORE}"
fi

log "populate Solr core"

if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
  # index issue-details.csv
  php scripts/sqlite/validation-result-indexer-simple.php "${OUTPUT_DIR}" "${SOLR_HOST}" "${SOLR_CORE}"
else
  # index id-groupid.csv and issue-details.csv
  php scripts/sqlite/validation-result-indexer-grouped.php "${OUTPUT_DIR}" "${SOLR_HOST}" "${SOLR_CORE}"
fi

optimize_core "${SOLR_CORE}"
9 changes: 8 additions & 1 deletion scripts/sqlite/qa_catalogue.grouping.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,21 @@ if (length(args) == 0) {
} else if (length(args) == 1) {
# only the output dir is given: use the default Solr host and core
OUTPUT_DIR <- args[1]
SOLR_HOST <- 'http://localhost:8983'
SOLR_CORE <- 'validation'
} else if (length(args) == 2) {
# output dir and Solr core are given: use the default Solr host
OUTPUT_DIR <- args[1]
SOLR_CORE <- args[2]
SOLR_HOST <- 'http://localhost:8983'
} else if (length(args) == 3) {
# output dir, Solr host and Solr core are all given
OUTPUT_DIR <- args[1]
SOLR_HOST <- args[2]
SOLR_CORE <- args[3]
}
print(sprintf('[parameters] OUTPUT_DIR: %s, SOLR_CORE: %s', OUTPUT_DIR, SOLR_CORE))
URL <- paste0('http://localhost:8983/solr/', SOLR_CORE, '/select?q=%s&rows=0')
URL <- paste0(SOLR_HOST, '/solr/', SOLR_CORE, '/select?q=%s&rows=0')

print('reading issue-summary.csv')
summary <- read_csv(sprintf('%s/%s', OUTPUT_DIR, 'issue-summary.csv'))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?php

define('LN', "\n");
define('CMD', "curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/%s/update' --data-binary '%s'");
const LN = "\n";
const CMD = "curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/%s/update' --data-binary '%s'";

function index($records) {
global $solrUrl;
Expand Down Expand Up @@ -95,13 +95,14 @@ function processIdGroupid() {
}

$dir = $argv[1];
$host = $argv[2];
$core = $argv[3];
if (preg_match('/[^\/]$/', $dir))
$dir .= '/';

$core = $argv[2];
$solrUrl = sprintf('http://localhost:8983/solr/%s/update', $core);
$inDeFirst = true;
$solrUrl = sprintf('%s/solr/%s/update', $host, $core);

$inDeFirst = true;
$fileDetails = $dir . 'issue-details.csv';
echo 'fileDetails: ', $fileDetails, "\n";
$inDe = fopen($fileDetails, "r");
Expand Down
64 changes: 64 additions & 0 deletions scripts/sqlite/validation-result-indexer-simple.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<?php

// Line terminator appended to the progress counter that is echoed while indexing.
define('LN', "\n");
// Template of a curl command line posting JSON to a core's update handler.
// NOTE(review): not referenced anywhere in this script and it hard-codes
// localhost:8983, while the actual requests use $solrUrl — confirm whether it is still needed.
define('CMD', "curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/%s/update' --data-binary '%s'");

/**
 * POST a JSON payload to the Solr update URL stored in the global $solrUrl.
 *
 * The payload is whatever json_encode() produces from $records: the callers
 * in this script pass either an array of documents or a {"commit": {}} object.
 * Transport errors and HTTP error statuses are reported on STDERR instead of
 * being dropped silently; the function itself still returns nothing.
 *
 * @param mixed $records value to serialise as the request body
 */
function index($records) {
  global $solrUrl;

  $ch = curl_init($solrUrl);
  curl_setopt($ch, CURLOPT_POST, 1);
  curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($records));
  // capture the response instead of printing it to the standard output
  curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
  curl_setopt($ch, CURLOPT_HTTPHEADER, array('Content-Type: application/json'));
  $result = curl_exec($ch);

  if ($result === false) {
    // the request could not be sent or no response was received
    fwrite(STDERR, sprintf("Solr update request to %s failed: %s\n", $solrUrl, curl_error($ch)));
  } else {
    $status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    if ($status >= 400) {
      fwrite(STDERR, sprintf("Solr update request to %s returned HTTP %d: %s\n", $solrUrl, $status, $result));
    }
  }
  curl_close($ch);
}

/**
 * Read issue-details.csv from $dir and send its rows to Solr via index().
 *
 * Each data row is expected to hold a record identifier in the first column
 * and a ';'-separated list of "errorId:count" pairs in the second one. Every
 * row becomes a document with the fields 'id' and 'errorId_is' (the error
 * identifiers as integers). Documents are sent in batches of 10 000, followed
 * by a commit request.
 *
 * @param string $dir directory of the CSV file, ending with a slash
 */
function processIssueDetails($dir) {
  $fileDetails = $dir . 'issue-details.csv';
  echo 'fileDetails: ', $fileDetails, "\n";
  $handle = fopen($fileDetails, "r");
  if (!$handle) {
    fwrite(STDERR, sprintf("Could not open %s for reading\n", $fileDetails));
    return;
  }

  $i = 0;
  $records = [];
  // strict comparison: a line consisting of "0" must not end the loop
  while (($line = fgets($handle)) !== false) {
    $values = str_getcsv($line);
    // blank or truncated lines do not have both columns: skip them
    if (count($values) < 2 || $values[0] === null)
      continue;

    $id = $values[0];
    // the header line starts with 'recordId'
    if ($id === 'recordId')
      continue;

    if (++$i % 100000 == 0)
      echo $i, LN;

    $record = (object)['id' => $id, 'errorId_is' => []];
    foreach (explode(';', (string) $values[1]) as $pair) {
      // an empty pair (empty column, trailing ';') would be cast to error id 0
      if ($pair === '')
        continue;
      // only the part before ':' (the error identifier) is indexed
      $eid = explode(':', $pair, 2)[0];
      $record->errorId_is[] = (int) $eid;
    }
    $records[] = $record;

    if (count($records) == 10000) {
      index($records);
      $records = [];
    }
  }

  // send the last, incomplete batch (if any), then make the changes visible
  if (count($records) > 0)
    index($records);
  index((object)["commit" => (object)[]]);
  fclose($handle);
}

// Command line entry point:
//   php validation-result-indexer-simple.php <output dir> <Solr host> <Solr core>
if (count($argv) < 4) {
  fwrite(STDERR, "Usage: php validation-result-indexer-simple.php <output dir> <Solr host> <Solr core>\n");
  exit(1);
}

$dir = $argv[1];
// a trailing slash on the host would produce '//solr/...' in the update URL
$host = rtrim($argv[2], '/');
$core = $argv[3];

// make sure the directory name ends with a slash, as processIssueDetails()
// appends the file name directly to it
if (preg_match('/[^\/]$/', $dir))
  $dir .= '/';

// read by index() as a global variable
$solrUrl = sprintf('%s/solr/%s/update', $host, $core);

processIssueDetails($dir);

echo "indexing is DONE\n";
exit();
11 changes: 11 additions & 0 deletions solr-functions
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,14 @@ store_fields() {
> $OUTPUT_DIR/solr-fields.json
}

# Print the scheme and authority part of a URL, e.g.
#   http://localhost:8983/solr/nls_validation -> http://localhost:8983
# Prints an empty line if the argument does not start with http:// or https://.
#
# $1 - the URL
extract_host() {
  # local variables: the function must not overwrite URL or HOST of the
  # script that sourced this file
  local url=$1
  local pattern='^https?://[^/]+'
  local host=''
  # bash's own regex matching avoids depending on GNU grep's -P option
  if [[ "${url}" =~ ${pattern} ]]; then
    host=${BASH_REMATCH[0]}
  fi
  echo "${host}"
}

# Print the last path segment of a URL, which is the core name in a Solr
# core URL, e.g.
#   http://localhost:8983/solr/nls_validation -> nls_validation
# Trailing slashes are ignored, so .../solr/nls_validation/ gives the same result.
#
# $1 - the URL
extract_core() {
  # local variable: the function must not overwrite URL or CORE of the
  # script that sourced this file
  local url=$1
  # drop trailing slashes, otherwise the last segment would be empty
  while [[ "${url}" == */ ]]; do
    url=${url%/}
  done
  # remove everything up to and including the last slash
  echo "${url##*/}"
}
4 changes: 3 additions & 1 deletion src/main/assembly/release.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@
<include>sqlite/qa_catalogue.grouped.sqlite.sql</include>
<include>sqlite/calculate-aggregated-numbers.grouped.sh</include>
<include>sqlite/qa_catalogue.grouping.R</include>
<include>sqlite/validation-result-indexer.php</include>
<include>sqlite/validation-result-indexer-grouped.php</include>
<include>sqlite/validation-result-indexer-simple.php</include>
<include>sqlite/index-issue-details.sh</include>

<include>timeline/timeline.R</include>
<include>timeline/timeline.sh</include>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ public Marc21DataFieldDefinition(String tag, String label, boolean repeatable, b
this.label = label;
this.fixed = fixed;
cardinality = repeatable ? Cardinality.Repeatable : Cardinality.Nonrepeatable;
this.descriptionUrl = String.format("https://www.loc.gov/marc/authority/ad%s.html", tag);
}

public void setInd1(Indicator indicator) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,6 @@ private void readStream(InputStream inputStream) throws IOException, ParseExcept
private void process(JSONObject jsonObject) throws IOException, ParseException, URISyntaxException {
for (Map.Entry<String, Object> entry : jsonObject.entrySet()) {
String id = entry.getKey();
// if (id.equals("leader"))
// continue;

JSONObject field = (JSONObject) entry.getValue();
DataFieldDefinition tag = new Marc21DataFieldDefinition(
Expand Down

0 comments on commit 94e7ff1

Please sign in to comment.