
Commit

Merge pull request #50 from IQSS/develop
Update from IQSS develop
lubitchv authored Mar 30, 2020
2 parents 6844b6b + 6e2c4b6 commit ad9e2f9
Showing 52 changed files with 3,529 additions and 1,043 deletions.
4 changes: 0 additions & 4 deletions doc/release-notes/6485-multiple-stores.md
@@ -29,8 +29,4 @@ Any additional S3 options you have set will need to be replaced as well, followi

Once these options are set, restarting the glassfish service is all that is needed to complete the change.

<<<<<<< HEAD
Note that the "\-Ddataverse.files.directory", if defined, continues to control where temporary files are stored (in the /temp subdir of that directory), independent of the location of any 'file' store defined above.
=======
Note that the "\-Ddataverse.files.directory", if defined, continues to control where temporary files are stored (in the /temp subdir of that directory), independent of the location of any 'file' store defined above.
>>>>>>> branch 'IQSS/6485' of https://github.com/TexasDigitalLibrary/dataverse.git
17 changes: 17 additions & 0 deletions doc/release-notes/6489-release-notes.md
@@ -0,0 +1,17 @@
# S3 Direct Upload support

S3 stores can now optionally be configured to support direct upload of files, as one option for supporting upload of larger files.

General information about this capability can be found in the <a href="http://guides.dataverse.org/en/latest/developers/big-data-support.html">Big Data Support Guide</a> with specific information about how to enable it in the <a href="http://guides.dataverse.org/en/latest/installation/config.html">Configuration Guide</a> - File Storage section.

**Upgrade Information:**

Direct upload to S3 is enabled per store by one new jvm option:

    ./asadmin create-jvm-options "-Ddataverse.files.<id>.upload-redirect=true"

The existing :MaxFileUploadSizeInBytes property and ```dataverse.files.<id>.url-expiration-minutes``` jvm option for the same store also apply to direct upload.

Direct upload via the Dataverse web interface is transparent to the user and handled automatically by the browser. Some minor differences in file upload exist: directly uploaded files are not unzipped and Dataverse does not scan their content to help in assigning a MIME type. Ingest of tabular files and metadata extraction from FITS files will occur, but can be turned off for files above a specified size limit through the new dataverse.files.<id>.ingestsizelimit jvm option.

API calls to support direct upload also exist, and, if direct upload is enabled for a store in Dataverse, the latest DVUploader (v1.0.8) provides a '-directupload' flag that enables its use.
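
A sketch of the upgrade steps above, assuming a store with id `s3` and the default admin API endpoint on localhost; the store id, expiration time, and byte values are illustrative, not defaults:

```shell
# Enable direct upload for the (hypothetical) store "s3"
./asadmin create-jvm-options "-Ddataverse.files.s3.upload-redirect=true"

# Optional: how long pre-signed upload/download URLs remain valid (minutes; example value)
./asadmin create-jvm-options "-Ddataverse.files.s3.url-expiration-minutes=60"

# Optional: skip ingest for directly uploaded files larger than this size (bytes; example value)
./asadmin create-jvm-options "-Ddataverse.files.s3.ingestsizelimit=2147483648"

# The existing upload size limit still applies to direct upload (example: 5 GB)
curl -X PUT -d 5368709120 http://localhost:8080/api/admin/settings/:MaxFileUploadSizeInBytes
```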
1 change: 1 addition & 0 deletions doc/release-notes/6514-shib-updates
@@ -0,0 +1 @@
New DB option :ShibAffiliationAttribute
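
Since this is a database setting, a minimal sketch of enabling it would use the admin settings API; `<attribute-name>` is a placeholder for whichever Shibboleth attribute carries affiliation in your IdP's assertions:

```shell
# Placeholder attribute name; consult your Shibboleth attribute map for the real one
curl -X PUT -d "<attribute-name>" http://localhost:8080/api/admin/settings/:ShibAffiliationAttribute
```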
3 changes: 3 additions & 0 deletions doc/release-notes/6650-export-import-mismatch
@@ -0,0 +1,3 @@
Run ReExportall to update JSON Exports

http://guides.dataverse.org/en/4.19/admin/metadataexport.html?highlight=export#batch-exports-through-the-api
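
A sketch of the batch re-export call described at the link above, assuming the admin API is reachable on localhost:

```shell
# Re-export metadata for all published datasets in all supported formats
curl http://localhost:8080/api/admin/metadata/reExportAll
```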
1 change: 1 addition & 0 deletions doc/sphinx-guides/source/_static/api/file-provenance.json
@@ -0,0 +1 @@
{"prefix": {"pre_0": "http://www.w3.org/2001/XMLSchema", "s-prov": "http://s-prov/ns/#", "provone": "http://purl.dataone.org/provone/2015/01/15/ontology#", "vargen": "http://openprovenance.org/vargen#", "foaf": "http://xmlns.com/foaf/0.1/", "dcterms": "http://purl.org/dc/terms/", "tmpl": "http://openprovenance.org/tmpl#", "var": "http://openprovenance.org/var#", "vcard": "http://www.w3.org/2006/vcard/ns#", "swirrl": "http://project-dare.eu/ns#"}, "bundle": {"vargen:SessionSnapshot": {"prefix": {"s-prov": "http://s-prov/ns/#", "provone": "http://purl.dataone.org/provone/2015/01/15/ontology#", "vargen": "http://openprovenance.org/vargen#", "tmpl": "http://openprovenance.org/tmpl#", "var": "http://openprovenance.org/var#", "vcard": "http://www.w3.org/2006/vcard/ns#", "swirrl": "http://project-dare.eu/ns#"}, "entity": {"vargen:inData": {"swirrl:volumeId": {"$": "var:rawVolumeId", "type": "prov:QUALIFIED_NAME"}, "prov:type": {"$": "provone:Data", "type": "prov:QUALIFIED_NAME"}}, "vargen:inFile": {"prov:atLocation": {"$": "var:atLocation", "type": "prov:QUALIFIED_NAME"}, "s-prov:format": {"$": "var:format", "type": "prov:QUALIFIED_NAME"}, "s-prov:checksum": {"$": "var:checksum", "type": "prov:QUALIFIED_NAME"}}, "vargen:WorkData": {"swirrl:volumeId": {"$": "var:workVolumeId", "type": "prov:QUALIFIED_NAME"}, "prov:type": {"$": "provone:Data", "type": "prov:QUALIFIED_NAME"}}, "var:JupSnapshot": {"prov:generatedAt": {"$": "var:generatedAt", "type": "prov:QUALIFIED_NAME"}, "prov:atLocation": {"$": "var:repoUrl", "type": "prov:QUALIFIED_NAME"}, "s-prov:description": {"$": "var:description", "type": "prov:QUALIFIED_NAME"}, "prov:type": {"$": "swirrl:NotebookSnapshot", "type": "prov:QUALIFIED_NAME"}, "swirrl:sessionId": {"$": "var:sessionId", "type": "prov:QUALIFIED_NAME"}}}, "used": {"_:id1": {"prov:activity": "vargen:snapshot", "prov:entity": "var:Jupyter"}, "_:id2": {"prov:activity": "vargen:snapshot", "prov:entity": "vargen:WorkData"}, "_:id3": {"prov:activity": "vargen:snapshot", "prov:entity": "vargen:inData"}}, "wasDerivedFrom": {"_:id4": {"prov:usedEntity": "var:Jupyter", "prov:generatedEntity": "var:JupSnapshot"}}, "wasAssociatedWith": {"_:id5": {"prov:activity": "vargen:snapshot", "prov:agent": "var:snapAgent"}}, "actedOnBehalfOf": {"_:id6": {"prov:delegate": "var:snapAgent", "prov:responsible": "var:user"}}, "activity": {"vargen:snapshot": {"prov:atLocation": {"$": "var:method_path", "type": "prov:QUALIFIED_NAME"}, "tmpl:startTime": {"$": "var:startTime", "type": "prov:QUALIFIED_NAME"}, "tmpl:endTime": {"$": "var:endTime", "type": "prov:QUALIFIED_NAME"}}}, "wasGeneratedBy": {"_:id7": {"prov:activity": "vargen:snapshot", "prov:entity": "var:JupSnapshot"}}, "agent": {"var:user": {"vcard:uid": {"$": "var:name", "type": "prov:QUALIFIED_NAME"}, "swirrl:authMode": {"$": "var:authmode", "type": "prov:QUALIFIED_NAME"}, "swirrl:group": {"$": "var:group", "type": "prov:QUALIFIED_NAME"}, "prov:type": {"$": "prov:Person", "type": "prov:QUALIFIED_NAME"}}, "var:snapAgent": {"vcard:uid": {"$": "var:name_api", "type": "prov:QUALIFIED_NAME"}, "prov:type": {"$": "prov:SoftwareAgent", "type": "prov:QUALIFIED_NAME"}}}, "hadMember": {"_:id8": {"prov:collection": "vargen:inData", "prov:entity": "vargen:inFile"}}}}}
2 changes: 2 additions & 0 deletions doc/sphinx-guides/source/api/native-api.rst
@@ -2190,6 +2190,8 @@ The fully expanded example above (without environment variables) looks like this
curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X POST "https://demo.dataverse.org/api/files/:persistentId/prov-freeform?persistentId=doi:10.5072/FK2/AAA000" -H "Content-type:application/json" --upload-file provenance.json
See a sample JSON file :download:`file-provenance.json <../_static/api/file-provenance.json>` from http://openprovenance.org (cf. Huynh, Trung Dong and Moreau, Luc (2014) ProvStore: a public provenance repository. At 5th International Provenance and Annotation Workshop (IPAW'14), Cologne, Germany, 09-13 Jun 2014. pp. 275-277).
Delete Provenance JSON for an uploaded file
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
47 changes: 46 additions & 1 deletion doc/sphinx-guides/source/developers/big-data-support.rst
@@ -6,7 +6,52 @@ Big data support is highly experimental. Eventually this content will move to th
.. contents:: |toctitle|
:local:

Various components need to be installed and configured for big data support.
Various components need to be installed and/or configured for big data support.

S3 Direct Upload and Download
-----------------------------

A lightweight option for supporting file sizes beyond a few gigabytes - a size that can cause performance issues when uploaded through the Dataverse server itself - is to configure an S3 store to provide direct upload and download via 'pre-signed URLs'. When these options are configured, file uploads and downloads are made directly to and from a configured S3 store using secure (https) connections that enforce Dataverse's access controls. (The upload and download URLs are signed with a unique key that only allows access for a short time period and Dataverse will only generate such a URL if the user has permission to upload/download the specific file in question.)

This option can handle files >40GB and could be appropriate for files up to a TB. Other options can scale farther, but this option has the advantages that it is simple to configure and does not require any user training - uploads and downloads are done via the same interface as normal uploads to Dataverse.

To configure these options, an administrator must set two JVM options for the Dataverse server using the same process as for other configuration options:

``./asadmin create-jvm-options "-Ddataverse.files.<id>.download-redirect=true"``
``./asadmin create-jvm-options "-Ddataverse.files.<id>.upload-redirect=true"``


With multiple stores configured, it is possible to configure one S3 store with direct upload and/or download to support large files (in general or for specific dataverses), while configuring another store with only direct download, or with no direct access at all.

It is also possible to set file upload size limits per store. See the :MaxFileUploadSizeInBytes setting described in the :doc:`/installation/config` guide.
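
As an illustrative sketch only (the exact syntax is documented in the Configuration Guide), a per-store limit keys values by store id; the store ids and byte counts below are assumptions:

.. code-block:: bash

    # Hypothetical store ids "default" and "s3big"; byte values are examples only
    curl -X PUT -d '{"default":"12582912","s3big":"5368709120"}' \
      http://localhost:8080/api/admin/settings/:MaxFileUploadSizeInBytes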

At present, one potential drawback of direct upload is that files are only partially 'ingested': tabular and FITS files are processed, but zip files are not unzipped, and the file contents are not inspected to evaluate their MIME type. This could be appropriate for large files, or it may be useful to completely turn off ingest processing for performance reasons (ingest processing requires a copy of the file to be retrieved by Dataverse from the S3 store). A store using direct upload can be configured to disable all ingest processing for files above a given size limit:

``./asadmin create-jvm-options "-Ddataverse.files.<id>.ingestsizelimit=<size in bytes>"``


**IMPORTANT:** One additional step required to enable direct download to work with previewers is to allow cross-site (CORS) requests on your S3 store.
The example below shows how to enable the minimum needed CORS rules on a bucket using the AWS CLI. Note that you may need to add more methods and/or locations if you also need to support certain previewers and external tools.

``aws s3api put-bucket-cors --bucket <BUCKET_NAME> --cors-configuration file://cors.json``

with the contents of the file cors.json as follows:

.. code-block:: json

    {
       "CORSRules": [
          {
            "AllowedOrigins": ["https://<DATAVERSE SERVER>"],
            "AllowedHeaders": ["*"],
            "AllowedMethods": ["PUT", "GET"]
          }
       ]
    }

Alternatively, you can enable CORS using the AWS S3 web interface, using JSON-encoded rules as in the example above.
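
To sanity-check the configuration, the AWS CLI can also read the rules back; this verification step is optional and not required by Dataverse:

.. code-block:: bash

    # Print the CORS rules currently attached to the bucket
    aws s3api get-bucket-cors --bucket <BUCKET_NAME>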

Since the direct upload mechanism creates the final file rather than an intermediate temporary file, user actions such as closing the browser page without either saving or canceling the upload session can leave an abandoned file in the store. The direct upload mechanism attempts to use S3 Tags to aid in identifying/removing such files. Upon upload, files are given a "dv-status":"temp" tag which is removed when the dataset changes are saved and the new file(s) are added in Dataverse. Note that not all S3 implementations support Tags: Minio does not. With such stores, direct upload works, but Tags are not used.
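
For stores that do support Tags, the sketch below illustrates how an administrator might look for leftover objects still carrying the temporary tag. It uses plain AWS CLI calls (not a Dataverse-provided tool) and scans every key, so add a ``--prefix`` in practice:

.. code-block:: bash

    # List object keys still tagged "dv-status":"temp" (illustrative sketch; assumes the AWS CLI is configured)
    aws s3api list-objects-v2 --bucket <BUCKET_NAME> --query 'Contents[].Key' --output text | tr '\t' '\n' | \
    while read -r key; do
        tag=$(aws s3api get-object-tagging --bucket <BUCKET_NAME> --key "$key" \
              --query "TagSet[?Key=='dv-status'].Value" --output text)
        [ "$tag" = "temp" ] && echo "$key"
    done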

Data Capture Module (DCM)
-------------------------
15 changes: 9 additions & 6 deletions doc/sphinx-guides/source/developers/deployment.rst
@@ -82,23 +82,26 @@ Download and Run the "Create Instance" Script

Once you have done the configuration above, you are ready to try running the "ec2-create-instance.sh" script to spin up Dataverse in AWS.

Download :download:`ec2-create-instance.sh <../../../../scripts/installer/ec2-create-instance.sh>` and put it somewhere reasonable. For the purpose of these instructions we'll assume it's in the "Downloads" directory in your home directory.
Download :download:`ec2-create-instance.sh <https://raw.githubusercontent.com/IQSS/dataverse-ansible/master/ec2/ec2-create-instance.sh>` and put it somewhere reasonable. For the purpose of these instructions we'll assume it's in the "Downloads" directory in your home directory.

ec2-create-instance accepts a number few command-line switches:
To run it with default values you just need the script, but you may also want a current copy of the Ansible :download:`group vars <https://raw.githubusercontent.com/IQSS/dataverse-ansible/master/defaults/main.yml>` file.

ec2-create-instance accepts a number of command-line switches, including:

* -r: GitHub Repository URL (defaults to https://github.com/IQSS/dataverse.git)
* -b: branch to build (defaults to develop)
* -p: pemfile directory (defaults to $HOME)
* -g: Ansible GroupVars file (if you wish to override role defaults)
* -h: help (displays usage for each available option)

``bash ~/Downloads/ec2-create-instance.sh -b develop -r https://github.com/scholarsportal/dataverse.git -g main.yml``

Now you will need to wait around 15 minutes until the deployment is finished. Eventually, the output should tell you how to access the installation of Dataverse in a web browser or via ssh. It will also provide instructions on how to delete the instance when you are finished with it. Please be aware that AWS charges per minute for a running instance. You can also delete your instance from https://console.aws.amazon.com/console/home?region=us-east-1 .
You will need to wait for 15 minutes or so until the deployment is finished, longer if you've enabled sample data and/or the API test suite. Eventually, the output should tell you how to access the installation of Dataverse in a web browser or via SSH. It will also provide instructions on how to delete the instance when you are finished with it. Please be aware that AWS charges per minute for a running instance. You may also delete your instance from https://console.aws.amazon.com/console/home?region=us-east-1 .
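
If you prefer the command line to the AWS console, the standard AWS CLI call below terminates an instance; the instance id is whatever the script's output reported (shown here as a placeholder):

.. code-block:: bash

    # Terminate the EC2 instance created by the script (substitute the real instance id)
    aws ec2 terminate-instances --instance-ids <instance-id>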

Caveats
~~~~~~~
Caveat Recipiens
~~~~~~~~~~~~~~~~

Please note that while the script should work fine on newish branches, older branches that have different dependencies such as an older version of Solr may not produce a working Dataverse installation. Your mileage may vary.
Please note that while the script should work well on new-ish branches, older branches that have different dependencies such as an older version of Solr may not produce a working Dataverse installation. Your mileage may vary.

----

29 changes: 20 additions & 9 deletions doc/sphinx-guides/source/developers/testing.rst
@@ -108,22 +108,33 @@ Unfortunately, the term "integration tests" can mean different things to differe
Running the Full API Test Suite Using EC2
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To run the API test suite on EC2 you should first follow the steps in the :doc:`deployment` section to get set up for AWS in general and EC2 in particular.
To run the API test suite in an EC2 instance you should first follow the steps in the :doc:`deployment` section to get set up for AWS in general and EC2 in particular.

Then read the instructions in https://github.com/IQSS/dataverse-sample-data for EC2 but be sure to make the adjustments below.
You may always retrieve a current copy of the ec2-create-instance.sh script and accompanying group_var.yml file from the `dataverse-ansible repo <https://github.com/IQSS/dataverse-ansible/>`_:

Edit ``ec2config.yaml`` to change ``test_suite`` to ``true``.
- `ec2-create-instance.sh <https://raw.githubusercontent.com/IQSS/dataverse-ansible/master/ec2/ec2-create-instance.sh>`_
- `main.yml <https://raw.githubusercontent.com/IQSS/dataverse-ansible/master/defaults/main.yml>`_

Pass in the repo and branch you are testing. You should also specify a local directory where server.log and other useful information will be written so you can start debugging any failures.
Edit ``main.yml`` to set the desired GitHub repo and branch, and to ensure that the API test suite is enabled:

- ``dataverse_repo: https://github.com/IQSS/dataverse.git``
- ``dataverse_branch: develop``
- ``dataverse.api.test_suite: true``
- ``dataverse.sampledata.enabled: true``

If you wish, you may pass the local path of a logging directory, which will tell ec2-create-instance.sh to `grab glassfish, maven and other logs <https://github.com/IQSS/dataverse-ansible/blob/master/ec2/ec2-create-instance.sh#L185>`_ for your review.

Finally, run the script:

.. code-block:: bash

    export REPO=https://github.com/IQSS/dataverse.git
    export BRANCH=123-my-branch
    export LOGS=/tmp/123
    $ ./ec2-create-instance.sh -g main.yml -l log_dir

Near the beginning and at the end of the ec2-create-instance.sh output you will see instructions for connecting to the instance via SSH. If you are actively working on a branch and want to refresh the warfile after each commit, you may wish to call a `redeploy.sh <https://github.com/IQSS/dataverse-ansible/blob/master/templates/redeploy.sh.j2>`_ script placed by the Ansible role, which will do a "git pull" against your branch, build the warfile, deploy the warfile, then restart glassfish. By default this script is written to /tmp/dataverse/redeploy.sh. You may invoke the script by appending it to the SSH command in ec2-create's output:

.. code-block:: bash

    mkdir $LOGS
    ./ec2-create-instance.sh -g ec2config.yaml -r $REPO -b $BRANCH -l $LOGS
    $ ssh -i your_pem.pem [email protected] /tmp/dataverse/redeploy.sh

Running the full API test suite using Docker
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~