From 8bcff29d995beafa276503948ce4914db3a61042 Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Wed, 18 Oct 2023 10:13:23 -0300 Subject: [PATCH 01/20] handle duplicate license values --- indexer/indexer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/indexer/indexer.py b/indexer/indexer.py index c7cb076..fb9739a 100755 --- a/indexer/indexer.py +++ b/indexer/indexer.py @@ -330,7 +330,9 @@ def genericType_toAtts(orig, rid=None): if 'txt_region' in ret: ret['txt_region'] = list(set(ret['txt_region'])) if 'txt_nationality' in ret: - ret['txt_nationality'] = list(set(ret['txt_nationality'])) + ret['txt_nationality'] = list(set(ret['txt_nationality'])) + if 'txt_license' in ret: + ret['txt_license'] = list(set(ret['txt_license'])) return ret def _merge_prov(orig, prov): From 37ed4827d8a0278cd5f32ad8c1c40084b7317921 Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Wed, 18 Oct 2023 10:27:26 -0300 Subject: [PATCH 02/20] handle duplicate license values --- indexer/indexer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/indexer/indexer.py b/indexer/indexer.py index fb9739a..aab0aeb 100755 --- a/indexer/indexer.py +++ b/indexer/indexer.py @@ -332,7 +332,9 @@ def genericType_toAtts(orig, rid=None): if 'txt_nationality' in ret: ret['txt_nationality'] = list(set(ret['txt_nationality'])) if 'txt_license' in ret: - ret['txt_license'] = list(set(ret['txt_license'])) + #remove trailing slash in urls, for performing comparison + stripped_vals = [url.rstrip('/') for url in ret['txt_license']] + ret['txt_license'] = list(set(stripped_vals)) return ret def _merge_prov(orig, prov): From a44c97d1b6e68331a8fd38b2e481179ddab13908 Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Fri, 3 Nov 2023 09:50:54 -0300 Subject: [PATCH 03/20] initial graph-solr indexer --- indexer/indexer-graph-solr.py | 89 +++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 indexer/indexer-graph-solr.py diff --git a/indexer/indexer-graph-solr.py b/indexer/indexer-graph-solr.py new file mode 100644 index 0000000..60d0409 --- /dev/null +++ b/indexer/indexer-graph-solr.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +""" +Purpose: Load a directory of JSON files, that are generated from + the ODIS graph->Solr process + +Steps: 1) set your Solr core endpoint variable at the commandline by: + + export SOLR_URL=http://127.0.0.1:8983/solr/cioos + + 2) set the path to the directory of JSON files, at the commandline by: + + export DATA_DIR=/home/apps/oih-ui-jmckenna/indexer/data/test + + 3) python indexer-graph-solr.py + +Output: Records indexed into the Solr core. Look for the "added resource" link + in the command window, such as: + + ***Processing filename: /home/apps/oih-ui-jmckenna/indexer/data/test/ttt1.json + added resource https://catalogue.cioos.ca/dataset/00863729-b5a8-4ac6-b73a-523d463f9963.jsonld: schema:Dataset to index + ***Processing filename: /home/apps/oih-ui-jmckenna/indexer/data/test/ttt2.json + added resource https://catalogue.cioos.ca/dataset/d1391e91-1ed2-4600-901a-5a5408fd1a6f.jsonld: schema:Dataset to index + +Requires: Python 3.x + +Notes: + + Input files are JSON (not JSON-LD that the orginal "indexer.py" required) + +""" + +import requests +import json +import os +from pathlib import Path +from test_utils import test_generation, dump_exception + +#set urls +BASE_SOLR_URL=os.environ.get('SOLR_URL', '') +solr_url = BASE_SOLR_URL + "/update/json/docs" +delete_url = BASE_SOLR_URL + "/update" +query_url = BASE_SOLR_URL + "/select" + +DATA_DIR=os.environ.get('DATA_DIR') +BASE_DIR=Path(DATA_DIR) + +session = requests.Session() + +# set Solr params +solr_params = { + 'commit': 'true', + # echo implies a dry run +# 'echo': 'true', +} + +#loop through directory +def import_file(file): + with open(file, 'rb') as f: + print ("***Processing filename: " + f.name) + try: + orig = json.load(f) + except UnicodeDecodeError: + f.seek(0) + file_bytes= f.read() + try: + file_string = file_bytes.decode('latin1') + orig = json.loads(file_string) + except Exception as msg: + print ("Issue decoding %s, continuing" % filename) + shutil.copy(src, os.path.join('exceptions', filename.split('/')[-1])) + return + + data = orig + data['keys'] = list(data.keys()) + # print (json.dumps(data, indent=2)) + data['json_source'] = json.dumps(data) + solr_post = session.post(solr_url, params=solr_params, json=data) + try: + solr_post.raise_for_status() + print("added resource %s: %s to index" % (data['id'], data['type'])) + except: + dump_exception(orig, solr_post.text) + return + #print(solr_post.text) + +for item in os.scandir(BASE_DIR): + import_file(item) + From 9979b564833397aba03073c16d0f1316f745a9c5 Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Fri, 3 Nov 2023 09:58:37 -0300 Subject: [PATCH 04/20] initial graph-solr indexer --- indexer/indexer-graph-solr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/indexer/indexer-graph-solr.py b/indexer/indexer-graph-solr.py index 60d0409..ea32253 100644 --- a/indexer/indexer-graph-solr.py +++ b/indexer/indexer-graph-solr.py @@ -14,8 +14,8 @@ 3) python indexer-graph-solr.py -Output: Records indexed into the Solr core. Look for the "added resource" link - in the command window, such as: +Output: Records indexed into the Solr core. Look for the "added resource" message + in the command window (which means it successfully indexed into Solr) such as: ***Processing filename: /home/apps/oih-ui-jmckenna/indexer/data/test/ttt1.json added resource https://catalogue.cioos.ca/dataset/00863729-b5a8-4ac6-b73a-523d463f9963.jsonld: schema:Dataset to index From dfc1180766c406385d06f591159faea1c475b6af Mon Sep 17 00:00:00 2001 From: "a.lambert@unesco.org" Date: Tue, 7 Nov 2023 14:48:08 +0100 Subject: [PATCH 05/20] we don't want people to visit this page, redirect to homepage --- frontend/public/index.html | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/frontend/public/index.html b/frontend/public/index.html index f650831..7377358 100644 --- a/frontend/public/index.html +++ b/frontend/public/index.html @@ -18,10 +18,16 @@ +
\ No newline at end of file From 5bd7ae60a3b8b2f2d28ad399da91f02ba2f6541a Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Mon, 27 Nov 2023 16:13:54 -0400 Subject: [PATCH 06/20] handle geocoordinates --- indexer/conversions.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/indexer/conversions.py b/indexer/conversions.py index d34c4a2..f7f20c1 100644 --- a/indexer/conversions.py +++ b/indexer/conversions.py @@ -132,6 +132,21 @@ def GeoShape(geo): raise UnhandledFormatException("Didn't handle %s in GeoShape" % json.dumps(geo)) +def GeoCoordinates(geo): + #print('here [GeoCoordinates]') + + lat = geo.get("latitude",None) + long = geo.get("longitude",None) + if lat is not None and long is not None: + print ("Generating a Point from the GeoCoordinates...") + newPoint = "POINT (" + str(long) + " " + str(lat) + ")" + print(newPoint) + return _geo('point', newPoint) + + raise UnhandledFormatException("Didn't handle %s in GeoCoordinates" % json.dumps(geo)) + + + def CourseInstance(data): atts = [_dispatch(field, data.get(field, None)) for field in ('startDate', 'endDate')] if 'location' in data: From 5676d064e082022760b0b554441be56a9710205c Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Tue, 28 Nov 2023 09:33:13 -0400 Subject: [PATCH 07/20] handle creditText --- frontend/src/components/results/types/Dataset.json | 1 + 1 file changed, 1 insertion(+) diff --git a/frontend/src/components/results/types/Dataset.json b/frontend/src/components/results/types/Dataset.json index a33d66e..c5416c9 100644 --- a/frontend/src/components/results/types/Dataset.json +++ b/frontend/src/components/results/types/Dataset.json @@ -3,6 +3,7 @@ { "key": "txt_sameAs", "type": ["list", "truncated", "link"] }, { "key": "txt_license", "type": "list", "label": "License" }, { "key": "txt_citation", "type": "list", "label": "Related Works" }, + { "key": "txt_creditText", "type": "list", "label": "Recommended Citation" }, "txt_version", { "key": "txt_keywords", "type": "keywords" }, { From 9e60661c38dfe987c2161cbe02d03ad634935736 Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Wed, 29 Nov 2023 10:35:51 -0400 Subject: [PATCH 08/20] expose Distribution for Documents --- frontend/src/components/results/types/CreativeWork.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/frontend/src/components/results/types/CreativeWork.json b/frontend/src/components/results/types/CreativeWork.json index 689e71f..1f71948 100644 --- a/frontend/src/components/results/types/CreativeWork.json +++ b/frontend/src/components/results/types/CreativeWork.json @@ -17,5 +17,6 @@ "key": "txt_contributor", "type": "list", "label": "Contributor(s)" - } + }, + { "key": "txt_distribution", "type": ["list", "link"] } ] \ No newline at end of file From cb0f4c43d3d231c0d0326086ce015b5678d739c8 Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Thu, 30 Nov 2023 10:39:16 -0400 Subject: [PATCH 09/20] handle Name as list (as well as Dict) --- indexer/indexer.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/indexer/indexer.py b/indexer/indexer.py index aab0aeb..e1c0f1e 100755 --- a/indexer/indexer.py +++ b/indexer/indexer.py @@ -185,14 +185,18 @@ def genericType_toAtts(orig, rid=None): #handle case of name as list for i in v: pos = 0 - print(i.values()) - for val in i.values(): - if val == "en": - listForm = list(i.values()) - print('***Name: ' + listForm[pos+1]) - data.append(Att(None, listForm[pos+1], k)) - data.append(Att('txt', listForm[pos+1], k)) - data.append(Att('txt', regions.regionForName(listForm[pos+1]), 'region')) + if isinstance(i, dict) == True: + print(i.values()) + for val in i.values(): + if val == "en": + listForm = list(i.values()) + print('***Name: ' + listForm[pos+1]) + data.append(Att(None, listForm[pos+1], k)) + data.append(Att('txt', listForm[pos+1], k)) + data.append(Att('txt', regions.regionForName(listForm[pos+1]), 'region')) + else: + data.append(Att(None, i, k)) + data.append(Att('txt', i, k)) elif k == 'description': if isinstance(v, list) == False: #print('type is: ',type(v)) From 7ba1cb565a37a8fe0a94a5cd45580ceb3d32e5b7 Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Thu, 30 Nov 2023 11:55:38 -0400 Subject: [PATCH 10/20] handle type:Movie --- indexer/indexer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/indexer/indexer.py b/indexer/indexer.py index e1c0f1e..6795614 100755 --- a/indexer/indexer.py +++ b/indexer/indexer.py @@ -151,9 +151,9 @@ def genericType_toAtts(orig, rid=None): if orig['@type'] == 'Project' or orig['@type'] == 'ResearchProject': print('***changing type:Project to type:ResearchProject') origType = 'ResearchProject' - #handle type:DigitalDocument as type:CreativeWork (see https://github.com/iodepo/odis-arch/issues/337 ) - elif orig['@type'] == 'CreativeWork' or orig['@type'] == 'DigitalDocument': - print('***changing type:DigitalDocument to type:CreativeWork') + #handle CreativeWork subsets as type:CreativeWork (see https://github.com/iodepo/odis-arch/issues/337 ) + elif orig['@type'] == 'CreativeWork' or orig['@type'] == 'DigitalDocument' or orig['@type'] == 'Movie': + print('***changing type:' + orig['@type'] + ' to type:CreativeWork') origType = 'CreativeWork' else: origType = orig['@type'] From f1cfb66d7172c5f1672627c5492a9e49b7d80a0d Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Thu, 30 Nov 2023 13:34:31 -0400 Subject: [PATCH 11/20] handle name as list in Solr --- solr/conf/schema.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/conf/schema.xml b/solr/conf/schema.xml index 875c6bc..b81f40c 100644 --- a/solr/conf/schema.xml +++ b/solr/conf/schema.xml @@ -104,7 +104,7 @@ schema. In this case the version should be set to the next CKAN version number. - + From 078a0eaec814332f833e6448a2519c88a0486c2d Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Mon, 4 Dec 2023 15:45:08 -0400 Subject: [PATCH 12/20] handle ISO date with Z --- indexer/conversions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/indexer/conversions.py b/indexer/conversions.py index f7f20c1..d034596 100644 --- a/indexer/conversions.py +++ b/indexer/conversions.py @@ -254,7 +254,7 @@ def _parseDate(field, d): try: dt = isoparse(d) return [ - Att('dt', dt.isoformat(), field), + Att('dt', dt.isoformat(timespec='seconds').replace('+00:00', 'Z'), field), Att('n', dt.year, field.replace('Date', 'Year')), ] except ValueError: From 277fe8ee8e2eacaafce339bd80a95881e7d3feff Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Thu, 7 Dec 2023 08:25:55 -0400 Subject: [PATCH 13/20] modify script description --- indexer/indexer-graph-solr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/indexer/indexer-graph-solr.py b/indexer/indexer-graph-solr.py index ea32253..b4eb3dc 100644 --- a/indexer/indexer-graph-solr.py +++ b/indexer/indexer-graph-solr.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """ -Purpose: Load a directory of JSON files, that are generated from - the ODIS graph->Solr process +Purpose: Load a directory of JSON files, generated from the ODIS graph->Solr + process (mdp2solr.sh), into an existing Solr core. Steps: 1) set your Solr core endpoint variable at the commandline by: From b6533c3d5fd3436864b19c3f877c27d287f65cef Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Mon, 29 Jan 2024 11:06:00 -0400 Subject: [PATCH 14/20] update license year --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 961f29a..f5a508b 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,6 @@ This repo contains the code for the Ocean Info Hub Global Search Portal. * `/indexer` contains all of the code to ingest the OIH graph into the SOLR Instance * `/solr` contains the configuration required for the solr instance, including the schema. * `/frontend` contains the code for the static javascript app. This will produce a container in dev mode running a live server, and a static html/javascript site in production mode. -* `/regions` contains the QGIS file defining the gographical regions. +* `/regions` contains the QGIS file defining the geographical regions. See the individual README files for more information. From 11bd81ee74937e3f5778e946f4b41f87047ab22c Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Mon, 29 Jan 2024 11:07:48 -0400 Subject: [PATCH 15/20] update license year --- LICENSE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE.md b/LICENSE.md index 9f4a111..f8097f0 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Ocean InfoHub +Copyright (c) 2024 Ocean InfoHub Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 50bc388a84e03dbe2c1a90b35ee6b315901d5a8e Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Fri, 12 Apr 2024 09:39:33 -0300 Subject: [PATCH 16/20] add Dependabot for GitHub Actions --- .github/dependabot.yml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..c11a11f --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,8 @@ +# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file + +version: 2 +updates: + - package-ecosystem: "github-actions" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" From 4b7a8524c9af4d9da15659835375e9ccc9b61398 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 12 Apr 2024 12:40:09 +0000 Subject: [PATCH 17/20] Bump rectalogic/notify-irc from 1 to 2 Bumps [rectalogic/notify-irc](https://github.com/rectalogic/notify-irc) from 1 to 2. - [Release notes](https://github.com/rectalogic/notify-irc/releases) - [Commits](https://github.com/rectalogic/notify-irc/compare/v1...v2) --- updated-dependencies: - dependency-name: rectalogic/notify-irc dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/irc_notify.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/irc_notify.yml b/.github/workflows/irc_notify.yml index f2aba83..dda0be8 100644 --- a/.github/workflows/irc_notify.yml +++ b/.github/workflows/irc_notify.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest steps: - name: irc push - uses: rectalogic/notify-irc@v1 + uses: rectalogic/notify-irc@v2 if: github.event_name == 'push' with: channel: "#oih" @@ -26,7 +26,7 @@ jobs: ${{ github.actor }} pushed ${{ github.event.ref }} ${{ github.event.compare }} ${{ join(github.event.commits.*.message) }} - name: irc pull request - uses: rectalogic/notify-irc@v1 + uses: rectalogic/notify-irc@v2 if: github.event_name == 'pull_request' with: channel: "#oih" @@ -36,7 +36,7 @@ jobs: message: | ${{ github.actor }} opened PR ${{ github.event.pull_request.html_url }} - name: irc tag created - uses: rectalogic/notify-irc@v1 + uses: rectalogic/notify-irc@v2 if: github.event_name == 'create' && github.event.ref_type == 'tag' with: channel: "#oih" From ed535e4f61c88f0a10abcd50384de76493ecdaa9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 12 Apr 2024 12:40:11 +0000 Subject: [PATCH 18/20] Bump actions/checkout from 3 to 4 Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/check-crlf.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-crlf.yml b/.github/workflows/check-crlf.yml index e87dcbe..73c0bc8 100644 --- a/.github/workflows/check-crlf.yml +++ b/.github/workflows/check-crlf.yml @@ -12,7 +12,7 @@ jobs: steps: - name: Checkout repository contents - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Use action to check for CRLF endings uses: erclu/check-crlf@v1.2.0 From d364327d3f5bbc64233127a59d10c6a559710f6c Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Fri, 12 Apr 2024 12:08:40 -0300 Subject: [PATCH 19/20] add title to PR notification --- .github/workflows/irc_notify.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/irc_notify.yml b/.github/workflows/irc_notify.yml index dda0be8..482f505 100644 --- a/.github/workflows/irc_notify.yml +++ b/.github/workflows/irc_notify.yml @@ -35,6 +35,7 @@ jobs: notice: true message: | ${{ github.actor }} opened PR ${{ github.event.pull_request.html_url }} + ${{ github.event.pull_request.title }} - name: irc tag created uses: rectalogic/notify-irc@v2 if: github.event_name == 'create' && github.event.ref_type == 'tag' From 3abe7fa16c62ed276a7b93c9c6a975af6625fb9f Mon Sep 17 00:00:00 2001 From: Jeff McKenna Date: Tue, 7 May 2024 12:11:15 -0300 Subject: [PATCH 20/20] handle type SoftwareSourceCode ( issue #131 ) --- indexer/indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/indexer/indexer.py b/indexer/indexer.py index 6795614..206b6f5 100755 --- a/indexer/indexer.py +++ b/indexer/indexer.py @@ -152,7 +152,7 @@ def genericType_toAtts(orig, rid=None): print('***changing type:Project to type:ResearchProject') origType = 'ResearchProject' #handle CreativeWork subsets as type:CreativeWork (see https://github.com/iodepo/odis-arch/issues/337 ) - elif orig['@type'] == 'CreativeWork' or orig['@type'] == 'DigitalDocument' or orig['@type'] == 'Movie': + elif orig['@type'] == 'CreativeWork' or orig['@type'] == 'DigitalDocument' or orig['@type'] == 'Movie' or orig['@type'] == 'SoftwareSourceCode': print('***changing type:' + orig['@type'] + ' to type:CreativeWork') origType = 'CreativeWork' else: