From 8b525f8348069dae304bd33eddaed6b481702ac9 Mon Sep 17 00:00:00 2001
From: Fabian Steeg
Date: Thu, 26 Oct 2023 17:12:33 +0200
Subject: [PATCH 1/3] Use daily Allegro dump in transformAndIndex.sh script (RPB-101)

Add extracted files to .gitignore, backup ZIP with timestamp
---

Notes (review; text in this section is not part of the commit message):
    * conf/rpb-titel-to-lobid.flux: the bulk index name changes from
      "resources-alma-fix-staging" to "resources-rpb-test". The commit
      message does not mention this - confirm it is intended.
    * transformAndIndex.sh: wget is called without -O. If an RPBEXP.ZIP
      from an aborted earlier run is still present in conf/ (the script
      runs under `set -eu`, so a failing unzip skips the mv), wget keeps
      the old file and stores the new download as RPBEXP.ZIP.1; unzip
      would then read the old archive. TODO confirm / consider -O.
    * conf/RPBEXP/.empty is added as an empty file; presumably a
      placeholder so the mv target directory exists in a fresh checkout.

 .gitignore                   | 6 ++++++
 conf/RPBEXP/.empty           | 0
 conf/rpb-titel-to-lobid.flux | 2 +-
 transformAndIndex.sh         | 9 +++++++++
 4 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 conf/RPBEXP/.empty

diff --git a/.gitignore b/.gitignore
index 8ba9df23..70d9b4c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,10 @@ RPB-Export_HBZ_SW.txt
 RPB-Export_HBZ_Tit.txt
 RPB-Export_HBZ_Tit_hbzIds.txt
 RPB-Export_HBZ_Bio.txt
+RPB-Export_HBZ_Ort.txt
+RPB-Export_HBZ_Raum.txt
+RPB-Export_HBZ_SWN.txt
+RPB-Export_HBZ_Syst.txt
+RPB-Export_HBZ_ZSS.txt
+conf/RPBEXP/*.ZIP
 nohup.out*
diff --git a/conf/RPBEXP/.empty b/conf/RPBEXP/.empty
new file mode 100644
index 00000000..e69de29b
diff --git a/conf/rpb-titel-to-lobid.flux b/conf/rpb-titel-to-lobid.flux
index 87ebf44b..c0ca0400 100644
--- a/conf/rpb-titel-to-lobid.flux
+++ b/conf/rpb-titel-to-lobid.flux
@@ -6,6 +6,6 @@ default outfile = "conf/output/bulk/bulk-${i}.ndjson";
 | fix(FLUX_DIR + "rpb-titel-to-lobid.fix")
 | batch-reset(batchsize="1000")
 | encode-json(prettyPrinting="false")
-| json-to-elasticsearch-bulk(idkey="id", type="resource", index="resources-alma-fix-staging")
+| json-to-elasticsearch-bulk(idkey="id", type="resource", index="resources-rpb-test")
 | write(outfile)
 ;
diff --git a/transformAndIndex.sh b/transformAndIndex.sh
index c49504c6..2cba2b7e 100644
--- a/transformAndIndex.sh
+++ b/transformAndIndex.sh
@@ -2,10 +2,19 @@
 set -eu
 IFS=$'\n\t'
 
+# Get the daily Allegro dump:
+cd conf
+wget https://rpb.lbz-rlp.de/rpb04/intern/RPBEXP.ZIP
+unzip -o RPBEXP.ZIP
+mv RPBEXP.ZIP RPBEXP/RPBEXP-$(date "+%Y%m%d-%H%M").ZIP
+cd ..
+
+# Transform the data:
 sbt "runMain rpb.ETL conf/rpb-sw.flux"
 sbt "runMain rpb.ETL conf/rpb-titel-to-strapi.flux"
 sbt "runMain rpb.ETL conf/rpb-titel-to-lobid.flux"
 
+# Index to Elasticsearch:
 unset http_proxy # for posting to weywot3
 for filename in conf/output/bulk/bulk-*.ndjson
 do

From 54be8d23b584601acc4456d9e40eb58f50ee49b3 Mon Sep 17 00:00:00 2001
From: Fabian Steeg
Date: Fri, 10 Nov 2023 12:20:52 +0100
Subject: [PATCH 2/3] Update URL for daily Allegro dump (RPB-101)

---

Notes (review; text in this section is not part of the commit message):
    * Only the wget URL is switched to the lower-case name RPBEXP.zip;
      the unchanged context lines still run `unzip -o RPBEXP.ZIP` and
      `mv RPBEXP.ZIP ...`. On a case-sensitive filesystem the script at
      this commit would stop at unzip (it runs under `set -eu`). The
      names are aligned in [PATCH 3/3]; consider squashing 2/3 and 3/3
      so every commit in the series is runnable.
    * The download moves from an https:// URL to an http:// URL.
      NOTE(review): confirm whether the new host offers https.

 transformAndIndex.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformAndIndex.sh b/transformAndIndex.sh
index 2cba2b7e..74ebc326 100644
--- a/transformAndIndex.sh
+++ b/transformAndIndex.sh
@@ -4,7 +4,7 @@ IFS=$'\n\t'
 
 # Get the daily Allegro dump:
 cd conf
-wget https://rpb.lbz-rlp.de/rpb04/intern/RPBEXP.ZIP
+wget http://www.rpb-rlp.de/rpb/rpb04/intern/RPBEXP.zip
 unzip -o RPBEXP.ZIP
 mv RPBEXP.ZIP RPBEXP/RPBEXP-$(date "+%Y%m%d-%H%M").ZIP
 cd ..

From fde3f2af54c6e8c8c9049f9f99a70d02e9c754ab Mon Sep 17 00:00:00 2001
From: Fabian Steeg
Date: Wed, 15 Nov 2023 09:57:55 +0100
Subject: [PATCH 3/3] Update script for renamed `zip` file extension in new URL (RPB-101)

---

Notes (review; text in this section is not part of the commit message):
    * The timestamped backups are now written as
      conf/RPBEXP/RPBEXP-<timestamp>.zip, but the ignore rule added in
      [PATCH 1/3] is `conf/RPBEXP/*.ZIP`. Unless core.ignorecase is
      enabled, that pattern does not match the lower-case extension, so
      the backups would show up as untracked files. This series does
      not touch .gitignore again; a follow-up adding
      `conf/RPBEXP/*.zip` looks necessary - TODO confirm.

 transformAndIndex.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformAndIndex.sh b/transformAndIndex.sh
index 74ebc326..d6d86ac1 100644
--- a/transformAndIndex.sh
+++ b/transformAndIndex.sh
@@ -5,8 +5,8 @@ IFS=$'\n\t'
 # Get the daily Allegro dump:
 cd conf
 wget http://www.rpb-rlp.de/rpb/rpb04/intern/RPBEXP.zip
-unzip -o RPBEXP.ZIP
-mv RPBEXP.ZIP RPBEXP/RPBEXP-$(date "+%Y%m%d-%H%M").ZIP
+unzip -o RPBEXP.zip
+mv RPBEXP.zip RPBEXP/RPBEXP-$(date "+%Y%m%d-%H%M").zip
 cd ..
 
 # Transform the data: