diff --git a/.gitignore b/.gitignore
index 7b83762..e21b78b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,6 @@ dist/
build/
*egg-info/
test.py
-misc_documentation.md
\ No newline at end of file
+misc_documentation.md
+*.log
+collision_reports/
\ No newline at end of file
diff --git a/README.md b/README.md
index bf12321..b7048cb 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Biomarkerkb Backend Dataset Viewer
+# Biomarker Backend API

Work in progress.

@@ -10,6 +10,8 @@ Work in progress.
- [Populate Database](#populate-database)
- [Creating and Starting Docker Container for the APIs](#creating-and-starting-docker-container-for-the-apis)
- [Config File Definitions](#config-file-definitions)
+- [Internal Backend Documentation](#internal-backend-documentation)
+  - [ID Assignment System](#id-assignment-system)

API documentation can be found [here](./api/biomarker/README.md).

@@ -44,13 +46,13 @@ python create_mongodb_container.py -s $SER
docker ps --all
```

-The first command will navigate you into the api directory. The second command will run the script. The `$SER` argument should be replaced with the server you are running on (dev, tst, beta, prd). The last command lists all docker containers. You should see the docker mongodb docker container that the script created, in the format of `running_biomarkerkb_mongo_$SER` where `$SER` is the specified server.
+The first command will navigate you into the api directory. The second command will run the script. The `$SER` argument should be replaced with the server you are running on (dev, tst, beta, prd). The last command lists all docker containers. You should see the MongoDB docker container that the script created, in the format of `running_biomarker-api_mongo_$SER` where `$SER` is the specified server.

Expected output should look something like this:

```bash
-Found container: running_biomarkerkb_mongo_dev
-Found network: biomarkerkb_backend_network_dev
+Found container: running_biomarker-api_mongo_dev
+Found network: biomarker-api_backend_network_dev
e6c50502da1b
5e1146780c4fa96a6af6e4555cd119368e9907c4d50ad4790f9f5e54e13bf043
@@ -59,6 +61,8 @@ e6c50502da1b
The first two print statements indicate that an old instance of the container and docker network were found. These will be removed by the script. The `e6c50502da1b` is the ID of the removed container. This indicates that the `docker rm -f ...` command executed successfully and removed the existing container. The second to last line is the ID of the newly created docker network. The last line is the ID of the newly created docker container.
+Start the MongoDB container using the `docker start {container}` command.
+
## Initialize MongoDB User

Stay in the `/api` subdirectory and run the `init_mongodb.py` script:
@@ -74,17 +78,19 @@ Where the `$SER` argument is the specified server.
This should only be run once.

To load data, run the `load_data.py` script from the `/api` directory.

```bash
-python load_data.py -s $SER -f $FP
+python load_data.py -s $SER -v $VER
```

-Where the `$SER` argument is the specified server and `$FP` is the filepath to the seed csv data.
+Where the `$SER` argument is the specified server and `$VER` is the version of the data release to load.

If testing on a local machine, you can test using code or a GUI option such as MongoDB Compass.
The connection string should look something along the lines of:

```bash
-mongodb://biomarkeradmin:biomarkerpass@localhost:27017/?authMechanism=SCRAM-SHA-1&authSource=biomarkerkbdb
+mongodb://biomarkeradmin:biomarkerpass@localhost:27017/?authMechanism=SCRAM-SHA-1&authSource=biomarkerdb_api
```

+The `load_data.py` script will handle the biomarker ID assignment. More information about the underlying implementation is available in the [ID Assignment System](#id-assignment-system) section. If any collisions are detected during the ID assignment process, an output message will be printed indicating the file, document, core values string, and resulting hash value that caused the collision. In this case, the record is NOT added to the MongoDB instance. If no collision is found, the record is assigned a new ordinal ID, added to the biomarker collection, and the updated JSON (with the `biomarker_id` value replaced) is written back out to overwrite the input file.
+
## Creating and Starting Docker Container for the APIs

To create the API container, run the `create_api_container.py` script from the `/api` directory.

@@ -96,13 +102,7 @@ docker ps --all
The first command will run the script. The `$SER` argument should be replaced with the server you are running on (dev, tst, beta, prd). The last command lists all docker containers. You should see the api container that the script created, in the format of `running_biomarkerkb_api_$SER` where `$SER` is the specified server.

-After the container is up and running, you can manually test the API using Python's `request` library, curl, or in the web browser. An example API call:
-
-```bash
-http://localhost:8081/dataset/randomsample?sample=5
-```
-
-API documentation can be found [here](https://github.com/biomarker-ontology/biomarkerkb-backend-datasetviewer/tree/main/api/biomarkerkb#endpoints).
+API documentation can be found [here](./api/biomarker/README.md).
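+
+After the container is up and running, you can manually test the API, for example with Python's `requests` library. The sketch below is illustrative only; it assumes the container is reachable on localhost at the dev API port `8089` from `config.json`, so adjust the host and port to your deployment:
+
+```python
+# hypothetical smoke test against a locally running dev API container
+import requests  # pip install requests
+
+resp = requests.get("http://localhost:8089/dataset/randomsample", params={"sample": 5})
+resp.raise_for_status()  # raise if the API returned an error status
+print(resp.json())       # list of randomly sampled biomarker records
+```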
# Config File Definitions
@@ -115,11 +115,6 @@ API documentation can be found [here](https://github.com/biom
        "tst": "test server api port",
        "dev": "development server api port"
    },
-    "mail":{
-        "server": "not used for now",
-        "port": "not used for now",
-        "sender": "not used for now"
-    },
    "data_path": "prefix filepath for the bind-mounted directory",
    "dbinfo": {
        "dbname": "database name",
@@ -135,11 +130,47 @@ API documentation can be found [here](https://github.com/biom
            "user": "admin username",
            "password": "admin password"
        },
-        "biomarkerkb": {
-            "db": "biomarkerkbdb database",
-            "user": "biomarkerkb database username",
-            "password": "biomarkerkb database password"
+        "biomarkerdb_api": {
+            "db": "database name",
+            "collection": "data collection",
+            "id_collection": "ID map",
+            "user": "biomarker database username",
+            "password": "biomarker database password"
        }
    }
}
```
+
+# Internal Backend Documentation
+
+## ID Assignment System
+
+The high-level workflow for the ID assignment system is as follows:
+
+```mermaid
+flowchart TD
+    A[Data Release with JSON Data] --> B{load_data.py}
+    B --> C[Extracts the core field elements]
+    C --> D[Preprocesses core field values]
+    D --> E[Concatenates core fields in alphabetical order]
+    E --> F[Resulting string is hashed]
+    F --> G[Check the id_map collection for potential collision]
+    G --> H[If collision:\nDon't load and add to output message]
+    G --> I[If no collision:\nAssign new ordinal ID in id_map collection]
+    I --> J[Assign new ordinal ID to record and load into MongoDB]
+```
+
+The core fields are defined in the Biomarker-Partnership RFC (which can be found in [this](https://github.com/biomarker-ontology/biomarker-partnership) repository).
+
+When loading data into the project, the core field values are extracted, cleaned, and concatenated. The resulting string is hashed and that hash value is checked for a potential collision in the MongoDB `id_map_collection`. If no collision is found, a new entry is added to the `id_map_collection` which stores the hash value and a human readable ordinal ID. The core values string that generated the hash value is also stored with each entry for potential debugging purposes.
+
+Example:
+```json
+{
+    "hash_value": "",
+    "ordinal_id": "",
+    "core_values_str": ""
+}
+```
+
+The ordinal ID format is two letters followed by four digits. The ID space goes from `AA0000` to `ZZ9999`.
\ No newline at end of file
diff --git a/api/Dockerfile b/api/Dockerfile
index a814fa0..2c60b8a 100644
--- a/api/Dockerfile
+++ b/api/Dockerfile
@@ -2,16 +2,16 @@ FROM python:3.10.4

WORKDIR /app

-ENV FLASK_APP=biomarkerkb
+ENV FLASK_APP=biomarker
ENV FLASK_ENV=production

COPY ./requirements.txt .
RUN pip install -r requirements.txt

# copy wheel distribution and install it
-COPY ./dist/biomarkerkb-1.0-py3-none-any.whl .
-RUN pip install biomarkerkb-1.0-py3-none-any.whl
+COPY ./dist/biomarker-1.0-py3-none-any.whl .
+RUN pip install biomarker-1.0-py3-none-any.whl

COPY . .
-ENTRYPOINT ["gunicorn", "-b", ":80", "biomarkerkb:create_app()"]
\ No newline at end of file
+ENTRYPOINT FLASK_APP=biomarker gunicorn -b :80 'biomarker:create_app()' --timeout 120 --graceful-timeout 60
\ No newline at end of file
diff --git a/api/README.md b/api/README.md
index b71d3c2..d0966a8 100644
--- a/api/README.md
+++ b/api/README.md
@@ -4,12 +4,13 @@
| Directory/File | |
|-------------------------------|-------------------------------------------------------------------|
-| `biomarkerkb/` | The biomarkerkb data api. |
+| `biomarker/` | The biomarker api. |
| `config.json` | Config file for the api setup. |
| `create_api_container.py` | Creates the api container. |
| `create_mongodb_container.py` | Creates the initial MongoDB container. |
| `Dockerfile` | Dockerfile for the api image (used in `create_api_container.py`) |
+| `id.py` | Defines the logic for the ID assignment system. |
| `init_mongodb.py` | Creates the database user scoped to the biomarkerkbdb. |
| `load_data.py` | Loads the MongoDB collection (`biomarker_collection`) with the seed data (from a csv file). |
| `requirements.txt` | Requirements file for the api image. |
-| `setup.py` | Setup script for packaging the biomarkerkb project. |
\ No newline at end of file
+| `setup.py` | Setup script for packaging the biomarker project. |
\ No newline at end of file
diff --git a/api/biomarker/README.md b/api/biomarker/README.md
index 5234769..0e94d22 100644
--- a/api/biomarker/README.md
+++ b/api/biomarker/README.md
@@ -1,41 +1,51 @@
# API

-- [Documentation](#documentation)
-  - [Endpoints](#endpoints)
-  - [Models](#models)
+All endpoints are hosted at the root URL https://hivelab.biochemistry.gwu.edu/biomarker/api/.
+
+- [Endpoints](#endpoints)
+  - [Dataset Endpoints](#dataset-endpoints)
+  - [ID Endpoints](#id-endpoints)
+- [Models](#models)
- [Directory Strucutre](#directory-structure)

-## Documentation
+## Endpoints

-### Endpoints
+### Dataset Endpoints

-`GET /dataset/getall`
-Returns the entire dataset.
-- Example call: `http://{HOST}:8081/dataset/getall`
-- Parameters:
-  - x-fields (optional): optional fields mask
-- Return schema: [`data_model`](#models)
+`GET /dataset/getall?page={page}&per_page={per_page}`
+- Parameters:
+  - `page`: The page number to return (default = 1).
+  - `per_page`: The number of records to return per page (default = 50).
+- Returns:
+  - `200 Success`: The biomarker records.
+
+
+`GET /dataset/randomsample?sample={sample}`
+- Parameters:
+  - `sample`: The number of samples to return (default = 1).
+- Returns:
+  - `200 Success`: The random subset of biomarker records.
+  - `400 Bad Request`: Error indicating an invalid sample size was provided (sample must be a positive integer).

----

-`GET /dataset/randomsample`
-Returns a random subset of the dataset.
-- Example call: `http://{HOST}:8081/dataset/randomsample?sample={NUMBER}`
+### ID Endpoints
+
+`GET /id/getbiomarker?biomarker_id={biomarker_id}`
- Parameters:
-  - sample (optional, default = 1): number of samples to return
-  - x-fields (optional): optional fields mask
-- Return schema: [`data_model`](#models)
+  - `biomarker_id`: The biomarker ID to query for.
+- Returns:
+  - `200 Success`: A single biomarker record corresponding to the `biomarker_id` param.
+  - `400 No biomarker ID provided`: Error indicating the `biomarker_id` param was not included.
+  - `404 Not Found`: Error indicating the biomarker ID was not found.
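+
+As an illustration, the endpoints above can be called with Python's `requests` library. This is a minimal sketch: the root URL is the one stated at the top of this document, and the `biomarker_id` value is hypothetical.
+
+```python
+import requests  # pip install requests
+
+ROOT = "https://hivelab.biochemistry.gwu.edu/biomarker/api"
+
+# page through the dataset ten records at a time
+page_one = requests.get(f"{ROOT}/dataset/getall", params={"page": 1, "per_page": 10}).json()
+
+# fetch a single record by its ordinal biomarker ID (hypothetical ID value)
+record = requests.get(f"{ROOT}/id/getbiomarker", params={"biomarker_id": "AA0001"}).json()
+print(record)
+```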
-### Models +## Models -`data_model`: -| Field | Type | Description | -|-----------------------|-----------|-----------------------------------| +The data models can be seen [here](data_models.py). ## Directory Structure | Directory/File | | |-------------------------------|-------------------------------------------------------------------| | `__init__.py` | Entry point for the api module. | -| `dataset.py` | The local dataset module, which defines the dataset API. | -| `config/` | Config files for flask instance. | \ No newline at end of file +| `dataset.py` | The general dataset API endpoints. | +| `id.py` | The biomarker ID specific API endpoints. | +| `data_models.py` | Defines the data models for the API documentation. | diff --git a/api/biomarker/__init__.py b/api/biomarker/__init__.py index 88e9784..0c47070 100644 --- a/api/biomarker/__init__.py +++ b/api/biomarker/__init__.py @@ -2,26 +2,30 @@ from flask_cors import CORS from flask_restx import Api from .dataset import api as dataset_api -from flask_pymongo import PyMongo +from .id import api as id_api +from pymongo import MongoClient +import os + +MONGO_URI = os.getenv('MONGODB_CONNSTRING') +DB_NAME = 'biomarkerdb_api' +DB_COLLECTION = 'biomarker_collection' def create_app(): # create flask instance app = Flask(__name__) - app.config['ENV'] = 'development' - - if app.config['ENV'] == 'production': - app.config.from_pyfile('./config/config.py') - else: - app.config.from_pyfile('./config/config_dev.py') - CORS(app) - mongo = PyMongo(app) - app.mongo = mongo + + # initialize mongo client + mongo_client = MongoClient(MONGO_URI) + mongo_db = mongo_client[DB_NAME] + app.mongo_db = mongo_db + app.config['DB_COLLECTION'] = DB_COLLECTION # setup the api using the flask_restx library api = Api(app, version = '1.0', title = 'Biomarker APIs', description = 'Biomarker Knowledgebase API') api.add_namespace(dataset_api) + api.add_namespace(id_api) return app \ No newline at end of file diff --git a/api/biomarker/config/config.py b/api/biomarker/config/config.py deleted file mode 100644 index a6c35cd..0000000 --- a/api/biomarker/config/config.py +++ /dev/null @@ -1,10 +0,0 @@ -DEBUG = False -TESTING = False - -SERVER = 'prd' -DB_AUTH = 'SCRAM-SHA-1' -DB_NAME = 'biomarkerkbdb' -DB_COLLECTION = 'biomarker_collection' -DB_USERNAME = 'biomarkeradmin' -DB_PASSWORD = 'biomarkerpass' -MONGO_URI = f'mongodb://{DB_USERNAME}:{DB_PASSWORD}@172.18.0.2:27017/{DB_NAME}?authMechanism={DB_AUTH}&authSource={DB_NAME}' \ No newline at end of file diff --git a/api/biomarker/config/config_dev.py b/api/biomarker/config/config_dev.py deleted file mode 100644 index 286d037..0000000 --- a/api/biomarker/config/config_dev.py +++ /dev/null @@ -1,10 +0,0 @@ -DEBUG = True -TESTING = False - -SERVER = 'dev' -DB_AUTH = 'SCRAM-SHA-1' -DB_NAME = 'biomarkerkbdb' -DB_COLLECTION = 'biomarker_collection' -DB_USERNAME = 'biomarkeradmin' -DB_PASSWORD = 'biomarkerpass' -MONGO_URI = f'mongodb://{DB_USERNAME}:{DB_PASSWORD}@172.18.0.2:27017/{DB_NAME}?authMechanism={DB_AUTH}&authSource={DB_NAME}' \ No newline at end of file diff --git a/api/biomarker/data_models.py b/api/biomarker/data_models.py new file mode 100644 index 0000000..f9120ac --- /dev/null +++ b/api/biomarker/data_models.py @@ -0,0 +1,323 @@ +''' Defines the data models for the biomarker API. Currently written for schema v0.3.1. 
+''' + +from flask_restx import Namespace, fields + +api = Namespace('dataset', description = 'Dataset operations API') + +### define the biomarker component models + +simple_synonym_model = api.model('SimpleSynonym', { + 'synonym': fields.String( + required = False, + description = 'The synonym.' + ) +}) + +assessed_biomarker_entity_model = api.model('AssessedBiomarkerEntity', { + 'recommended_name': fields.String( + required = True, + description = 'The recommended name of the biomarker entity.' + ), + 'synonym': fields.List( + fields.Nested( + simple_synonym_model, + default = [] + ) + ) +}) + +specimen_model = api.model('Specimen', { + 'name': fields.String( + required = False, + description = 'The specimen name.' + ), + 'specimen_id': fields.String( + required = False, + description = 'The specimen name space and ID.' + ), + 'name_space': fields.String( + required = False, + description = 'The name space of the specimen ID.' + ), + 'url': fields.String( + required = False, + description = 'The URL for the specimen within the name space resource.' + ), + 'loinc_code': fields.String( + required = False, + description = 'The LOINC code for the specimen.' + ) +}) + +evidence_list_model = api.model('EvidenceList', { + 'evidence': fields.String( + required = True, + description = 'The evidence.' + ) +}) + +tag_model = api.model('Tag', { + 'tag': fields.String( + required = True, + description = 'The tag.' + ) +}) + +evidence_source_model = api.model('EvidenceSource', { + 'evidence_id': fields.String( + required = True, + description = 'The evidence ID.' + ), + 'database': fields.String( + required = True, + description = 'The database the evidence is from.' + ), + 'url': fields.String( + required = False, + description = 'The URL for the evidence source.' + ), + 'evidence_list': fields.List( + fields.Nested( + evidence_list_model, + required = True + ) + ), + 'tags': fields.List( + fields.Nested( + tag_model, + required = True + ) + ) +}) + +condition_recommended_name_model = api.model('ConditionRecommendedName', { + 'condition_id': fields.String( + required = True, + description = 'The condition resource identifier and ID.' + ), + 'name': fields.String( + required = True, + description = 'The recommended name of the condition.' + ), + 'description': fields.String( + required = False, + description = 'The description of the condition.' + ), + 'resource': fields.String( + required = False, + description = 'The resource for the condition.' + ), + 'url': fields.String( + required = False, + description = 'The URL to the condition in the resource.' + ) +}) + +condition_synonym_model = api.model('ConditionSynonym', { + 'synonym_id': fields.String( + required = False, + description = 'The synonym resource identifier and ID.' + ), + 'name': fields.String( + required = False, + description = 'The synonym name.' + ), + 'resource': fields.String( + required = False, + description = 'The resource for the synonym.' + ), + 'url': fields.String( + required = False, + description = 'The URL to the synonym in the resource.' + ) +}) + +exposure_agent_recommended_name_model = api.model('ExposureAgentRecommendedName', { + 'exposure_agent_id': fields.String( + required = True, + description = 'The exposure agent resource identifier and ID.' + ), + 'name': fields.String( + required = True, + description = 'The recommended name of the exposure agent.' + ), + 'description': fields.String( + required = False, + description = 'The description of the exposure agent.' 
+ ), + 'resource': fields.String( + required = False, + description = 'The resource for the exposure agent.' + ), + 'url': fields.String( + required = False, + description = 'The URL to the exposure agent in the resource.' + ) +}) + +reference_model = api.model('Reference', { + 'reference_id': fields.String( + required = False, + description = 'The reference ID.' + ), + 'type': fields.String( + required = False, + description = 'The reference type.' + ), + 'url': fields.String( + required = False, + description = 'The URL to the reference.' + ) +}) + +simple_evidence_source_model = api.model('SimpleEvidenceSource', { + 'evidence_id': fields.String( + required = False, + description = 'The evidence ID.' + ), + 'database': fields.String( + required = False, + description = 'The database the evidence is from.' + ), + 'url': fields.String( + required = False, + description = 'The URL for the evidence source.' + ) +}) + +### top level object models + +biomarker_component_model = api.model('BiomarkerComponent', { + 'biomarker': fields.String( + required = True, + description = 'The entity change.' + ), + 'assessed_biomarker_entity': fields.Nested( + assessed_biomarker_entity_model, + required = True + ), + 'assessed_entity_type': fields.String( + required = True, + description = 'The entity type.' + ), + 'specimen': fields.List( + fields.Nested( + specimen_model + ), + required = False + ), + 'evidence_source': fields.List( + fields.Nested( + evidence_source_model + ), + default = [] + ) +}) + +biomarker_role_model = api.model('BiomarkerRole', { + 'role': fields.String( + required = True, + description = 'The role of the biomarker.' + ), +}) + +condition_model = api.model('Condition', { + 'condition_id': fields.String( + required = True, + description = 'The condition resource identifier and ID.' + ), + 'recommended_name': fields.Nested( + condition_recommended_name_model, + required = True + ), + 'synonyms': fields.List( + fields.Nested( + condition_synonym_model + ), + default = [] + ), +}) + +exposure_agent_model = api.model('ExposureAgent', { + 'exposure_agent_id': fields.String( + required = True, + description = 'The exposure agent resource identifier and ID.' + ), + 'recommended_name': fields.Nested( + exposure_agent_recommended_name_model, + required = True + ) +}) + +citation_model = api.model('Citation', { + 'citation_title': fields.String( + required = False, + description = 'The title of the citation.' + ), + 'journal': fields.String( + required = False, + description = 'The journal the citation is from.' + ), + 'authors': fields.String( + required = False, + description = 'The authors of the citation.' + ), + 'date': fields.String( + required = False, + description = 'The date of the citation.' + ), + 'reference': fields.List( + fields.Nested( + reference_model, + required = False + ) + ), + 'evidence_source': fields.List( + fields.Nested( + simple_evidence_source_model + ), + default = [] + ) +}) + +### define the top level data model + +data_model = api.model('DataModel', { + 'biomarker_id': fields.String( + required = True, + description = 'The unique ID for the biomarker.' 
+ ), + 'biomarker_component': fields.List( + fields.Nested( + biomarker_component_model + ), + required = True + ), + 'best_biomarker_role': fields.List( + fields.Nested( + biomarker_role_model, + required = True + ) + ), + 'condition': fields.Nested( + condition_model, + required = False + ), + 'exposure_agent': fields.Nested( + exposure_agent_model, + required = False + ), + 'evidence_source': fields.List( + fields.Nested( + evidence_source_model + ), + default = [] + ), + 'citation': fields.List( + fields.Nested( + citation_model + ), + default = [] + ) +}) \ No newline at end of file diff --git a/api/biomarker/dataset.py b/api/biomarker/dataset.py index 52f7750..be1af5c 100644 --- a/api/biomarker/dataset.py +++ b/api/biomarker/dataset.py @@ -1,51 +1,42 @@ -from flask_restx import Namespace, Resource, fields +from flask_restx import Namespace, Resource, inputs from flask import current_app as app from flask import request +from .data_models import data_model -api = Namespace ('dataset', description = 'Dataset operations API') +api = Namespace('dataset', description = 'Dataset operations API') def get_collection_name(): return app.config['DB_COLLECTION'] -data_model = api.model('Data', { - 'biomarker_id': fields.String(description = 'Unique biomarker identifier.'), - 'main_x_ref': fields.String(description = 'Accession or identifier that most closely matches the biomarker term.'), - 'assessed_biomarker_entity': fields.String(description = 'Change in entity and common name.'), - 'biomarker_status': fields.String(description = 'Change measured in disease versus healthy individual.'), - 'best_biomarker_type': fields.String(description = 'Category of best biomarker.'), - 'specimen_type': fields.String(description = 'Type of speciment used to access the biomarker (with Uberon ID)'), - 'loinc_code': fields.String(description = 'Lab test ID associated with biomarker.'), - 'condition_name': fields.String(description = 'Condition name with DOID.'), - 'assessed_entity_type': fields.String(description = 'Entity type of the biomarker.'), - 'evidence_source': fields.String(description = 'Source of biomarker with corresponding link to data page within the source.'), - 'notes': fields.String(description = 'Meta data, if applicable.') -}) - class DatasetGetAll(Resource): ''' Get the entire dataset. ''' - @api.doc(description = 'Returns all the data records.') + @api.doc(description = 'Returns all the data records Supports pagination and per page filtering.') + @api.param('page', 'The page number to return.', type = int, default = 1) + @api.param('per_page', 'The number of records to return per page.', type = int, default = 50) @api.response(200, 'Success', data_model) - @api.marshal_list_with(data_model) def get(self): - data = app.mongo.db[get_collection_name()].find() + page = request.args.get('page', default = 1, type = int) + per_page = request.args.get('per_page', default = 50, type = int) + data = app.mongo_db[get_collection_name()].find({}, {'_id': 0}).skip((page - 1) * per_page).limit(per_page) return list(data) class DatasetRandomSample(Resource): ''' Get a random subset of data. ''' - @api.doc(description = 'Returns a random subset of the data.') - @api.param('sample', 'The number of samples to return.') + @api.doc(description = 'Returns a random subset of the data. 
The sample size must be a positive integer.') + @api.param('sample', 'The number of samples to return.', type = inputs.positive, default = 1) @api.response(200, 'Success', data_model) - @api.marshal_list_with(data_model) + @api.response(400, 'Bad Request') def get(self): try: - sample_size = int(request.args.get('sample', default = 1)) + sample_size = request.args.get('sample', default = 1, type = int) except ValueError: - return {'message': 'Invalid sample size provided'}, 400 - if sample_size <= 0: - return {'message': 'Sample size must be a positive integer.'}, 400 - data = app.mongo.db[get_collection_name()].aggregate([{'$sample': {'size': sample_size}}]) + return {'message': 'Invalid sample size provided. Sample must be a positive integer.'}, 400 + data = app.mongo_db[get_collection_name()].aggregate([ + {'$sample': {'size': sample_size}}, + {'$project': {'_id': 0}} + ]) return list(data) api.add_resource(DatasetGetAll, '/getall') diff --git a/api/biomarker/id.py b/api/biomarker/id.py new file mode 100644 index 0000000..f02c815 --- /dev/null +++ b/api/biomarker/id.py @@ -0,0 +1,31 @@ +from flask_restx import Namespace, Resource +from flask import current_app as app +from flask import request +from .data_models import data_model + +api = Namespace('id', description = 'Get records by biomarker ID.') + +def get_collection_name(): + return app.config['DB_COLLECTION'] + +class Biomarker(Resource): + ''' Get a single biomarker record by biomarker ID. + ''' + @api.doc(description = 'Returns a single biomarker record by biomarker ID.') + @api.response(200, 'Success', data_model) + @api.response(400, 'No biomarker ID provided') + @api.response(404, 'Not Found') + @api.param('biomarker_id', 'The biomarker ID.', type = str, required = True) + def get(self): + biomarker_id = request.args.get('biomarker_id', default = None) + if not biomarker_id: + return {'message': 'No biomarker ID provided'}, 400 + try: + data = app.mongo_db[get_collection_name()].find_one({'biomarker_id': biomarker_id}, {'_id': 0}) + except Exception as e: + return {'message': 'Invalid biomarker ID provided'}, 400 + if not data: + return {'message': 'Biomarker ID not found'}, 404 + return data + +api.add_resource(Biomarker, '/getbiomarker') \ No newline at end of file diff --git a/api/config.json b/api/config.json index 40f5cc5..7160be8 100644 --- a/api/config.json +++ b/api/config.json @@ -1,34 +1,30 @@ { - "project":"biomarkerkb", + "project":"biomarker-api", "api_port":{ - "prd":"8081", - "beta":"8881", - "tst":"8081", - "dev":"8081" + "prd":"8089", + "beta":"8889", + "tst":"8089", + "dev":"8089" }, - "mail":{ - "server":"x.x.x.x", - "port":"25", - "sender":"no-reply@glygen.gwu.edu" - }, - "data_path":"/data/shared/biomarkerkb/", + "data_path":"/data/shared/biomarkerdb/", "dbinfo":{ - "dbname":"biomarkerkbdb", + "dbname":"biomarkerdb_api", "port":{ - "prd":"7070", + "prd":"7071", "beta":"7770", - "tst":"6060", + "tst":"6061", "dev":"27017" }, - "bridge_network":"biomarkerkb_backend_network", + "bridge_network":"biomarker_api_backend_network", "admin":{ "db":"admin", "user":"superadmin", "password":"superpass" }, - "biomarkerkbdb":{ - "db":"biomarkerkbdb", + "biomarkerdb_api":{ + "db":"biomarkerdb_api", "collection": "biomarker_collection", + "id_collection": "id_map_collection", "user":"biomarkeradmin", "password":"biomarkerpass" } diff --git a/api/create_api_container.py b/api/create_api_container.py index 673372e..885e470 100644 --- a/api/create_api_container.py +++ b/api/create_api_container.py @@ -58,6 +58,8 @@ def 
main(): cmd_list = [] # command to package the api cmd_list.append('python setup.py bdist_wheel') + # if no python error, use this + # cmd_list.append('python3 setup.py bdist_wheel') # command to build a docker image from the dockerfile cmd_list.append(f'docker build -t {api_image} .') @@ -75,6 +77,8 @@ def main(): def run_command(cmd): result = subprocess.run(cmd, shell = True, text = True, encoding = 'utf-8', errors = 'replace', capture_output = True) + # for python 3.6 and below + # result = subprocess.run(cmd, shell = True, universal_newlines = True, errors = 'replace', stdout = subprocess.PIPE, stderr = subprocess.PIPE) if result.returncode != 0: print(f'Command failed with error code {result.returncode}: {result.stderr}') else: diff --git a/api/create_mongodb_container.py b/api/create_mongodb_container.py index 3f86d41..3405772 100644 --- a/api/create_mongodb_container.py +++ b/api/create_mongodb_container.py @@ -69,7 +69,7 @@ def main(): # create mongo container command # the external port is dynamically assigned using mongo_port, and the container's port of 27017 is used because that is the default port that MongoDB listens on inside the container mongo_cmd = f'docker create --name {mongo_container_name} --network {mongo_network_name} -p 127.0.0.1:{mongo_port}:27017' - mongo_cmd += f' -v {data_path}/db/{server}:/data/db {e_params} mongo' + mongo_cmd += f' -v {data_path}/api_db/{server}:/data/db {e_params} mongo' cmd_list.append(mongo_cmd) # run the commands diff --git a/api/id.py b/api/id.py new file mode 100644 index 0000000..4c5b370 --- /dev/null +++ b/api/id.py @@ -0,0 +1,156 @@ +''' Handles the ID assignment/collision operations. +''' + +import sys +import hashlib +import re +import pymongo +import logging + +def clean_value(value: str) -> str: + ''' Cleans the passed value using regex. Removes all non-alphanumeric + characters and makes the value lowercase. + + Parameters + ---------- + value: str + The value to clean. + + Returns + ------- + str + The cleaned value. + ''' + value = re.sub(r'[^a-zA-Z0-9]', '', value).lower() + return value + +def generate_custom_id(document: dict) -> tuple: + ''' Generates the custom hash ID for the document. + + Parameters + ---------- + document: json + The document to generate the ID for. + + Returns + ------- + tuple: (str, str) + Returns the custom hash ID and the concatenated core values string. + ''' + # hold the core field values + core_values = [] + + # grab the core fields from the biomarker component + for component in document['biomarker_component']: + core_values.append(component['biomarker']) + core_values.append(component['assessed_biomarker_entity']['recommended_name']) + core_values.append(component['assessed_biomarker_entity_id']) + + # grab top level core fields + if 'condition' in document.keys() and document['condition'] != None: + core_values.append(document['condition']['condition_id']) + elif 'exposure_agent' in document.keys() and document['exposure_agent'] != None: + core_values.append(document['exposure_agent']['exposure_agent_id']) + + # clean the core values + core_values = [clean_value(v) for v in core_values] + # inplace sort the core_values alphabetically + core_values.sort() + core_values_str = '_'.join(core_values) + + # generate the SHA-256 hash of the core values + return hashlib.sha256(core_values_str.encode('utf-8')).hexdigest(), core_values_str + +def check_collision(hash_value: str, dbh, id_collection: str) -> bool: + ''' Checks if the hash value already exists in the database. 
+ + Parameters + ---------- + hash_value: str + The hash value to check. + dbh: pymongo.MongoClient + The database handle. + id_collection: str + The name of the collection to check for the hash value. + + Returns + ------- + bool + True if the hash value already exists in the database, False otherwise. + ''' + # check if the hash value already exists in the database + if dbh[id_collection].find_one({'hash_value': hash_value}) != None: + return True + return False + +def _increment_ordinal_id(ordinal_id: str) -> str: + ''' Increments the ordinal id. + + Parameters + ---------- + ordinal_id: str + The current max ordinal ID to increment. + + Returns + ------- + str + The incremented ordinal ID. + ''' + # extract the letters and numbers from the ordinal ID + letters = ordinal_id[:2] + numbers = int(ordinal_id[2:]) + + # increment the numbers + if numbers < 9999: + return letters + str(numbers + 1).zfill(4) + # check if the maximum ordinal ID has been reached + if letters == 'ZZ': + raise ValueError('Maximum ordinal ID reached.') + + # increment the letters + first_letter, second_letter = letters + # roll over the second letter + if second_letter == 'Z': + first_letter = chr(ord(first_letter) + 1) + second_letter = 'A' + else: + second_letter = chr(ord(second_letter) + 1) + + return first_letter + second_letter + '0000' + +def add_hash_and_increment_ordinal(hash_value: str, core_values_str: str, dbh, id_collection: str) -> str: + ''' Adds the hash value and core values string to the id collection and assigns and incremented + ordinal ID. + + Parameters + ---------- + hash_value: str + The hash value to add. + core_values_str: str + The core values string to add. + dbh: pymongo.MongoClient + The database handle. + id_collection: str + The name of the collection to add the hash value and core values string to. + + Returns + ------- + str + The newly assigned ordinal ID to be used as the human readable biomarker ID. 
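+
+    Example
+    -------
+    If the current maximum ordinal ID stored in the id collection is 'AB0042',
+    the new entry is inserted with (and this function returns) 'AB0043'.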
+ ''' + # grab the current max ordinal ID + max_entry = dbh[id_collection].find_one(sort=[('ordinal_id', pymongo.DESCENDING)]) + max_ordinal_id = max_entry['ordinal_id'] if max_entry else 'AA0000' + + # increment the ordinal ID + try: + new_ordinal_id = _increment_ordinal_id(max_ordinal_id) + except ValueError as e: + print(f'ValueError: {e}') + logging.error(e) + sys.exit(1) + + # add the hash value, incremented ordinal id, and core values string to the id collection + dbh[id_collection].insert_one({'hash_value': hash_value, 'ordinal_id': new_ordinal_id, 'core_values_str': core_values_str}) + + return new_ordinal_id \ No newline at end of file diff --git a/api/init_mongodb.py b/api/init_mongodb.py index 15e46c4..064c17b 100644 --- a/api/init_mongodb.py +++ b/api/init_mongodb.py @@ -50,11 +50,11 @@ def main(): # establish the database connection and create the db user try: client = pymongo.MongoClient(host, - username = admin_user, - password = admin_pass, - authSource = admin_db, - authMechanism = 'SCRAM-SHA-1', - serverSelectionTimeoutMS = 10000) + username = admin_user, + password = admin_pass, + authSource = admin_db, + authMechanism = 'SCRAM-SHA-1', + serverSelectionTimeoutMS = 10000) # test the connection client.server_info() # create db user diff --git a/api/load_data.py b/api/load_data.py index be01522..6154ddd 100644 --- a/api/load_data.py +++ b/api/load_data.py @@ -1,8 +1,94 @@ import sys import json -import csv +import glob +import os import pymongo +from id import * from optparse import OptionParser +import logging +from datetime import datetime +import copy + +BATCH_SIZE = 1000 + +def process_data(data: dict, dbh, db_collection: str, id_collection: str, fp: str) -> tuple: + ''' Processes the data for the current data file and inserts into the database if applicable. + + Parameters + ---------- + data: dict + The data to process. + dbh: pymongo.MongoClient + The database handle. + db_collection: str + The name of the collection to insert the data into. + id_collection: str + The name of the collection to check for hash collisions. + fp: str + The filepath to the data file. + + Returns + ------- + tuple + The updated data with the new biomarker ids and a message indicating the status of the insert operation. 
+ ''' + bulk_ops = [] + output_messages = [] + collisions = {} + collision_count = 1 + collision_report_filename = f'{os.path.splitext(os.path.split(fp)[1])[0]}_collisions.json' + collision_report_path = f'./collision_reports/{collision_report_filename}' + + # iterate over entries in the data + for document in data: + # generate hash value for data record + hash_value, core_values_str = generate_custom_id(document) + # if there is a hash collision, don't add and add to output messages + if check_collision(hash_value, dbh, id_collection): + output_message = f'\nCollision detected for record in:\n\tFile: {fp}:\n\tDocument: {document}\n\tCore Values Str: {core_values_str}\n\tHash Value: {hash_value}\n' + collisions[collision_count] = { + 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + 'file': fp, + 'core_values_str': core_values_str, + 'hash_value': hash_value, + 'document': document + } + collision_count += 1 + print(output_message) + output_messages.append(output_message) + else: + biomarker_id = add_hash_and_increment_ordinal(hash_value, core_values_str, dbh, id_collection) + document['biomarker_id'] = biomarker_id + # add insert operation to bulk operations list + document_copy = copy.deepcopy(document) + bulk_ops.append(pymongo.InsertOne(document_copy)) + + # if bulk operations list is full, execute the bulk write to avoid memory issues + if len(bulk_ops) >= BATCH_SIZE: + dbh[db_collection].bulk_write(bulk_ops) + bulk_ops = [] + + # execute the remaining bulk operations + if bulk_ops: + dbh[db_collection].bulk_write(bulk_ops) + + if not output_messages: + return data, f'Successfully inserted all data records with no collisions for the file: {fp}.' + else: + with open(collision_report_path, 'w') as f: + json.dump(collisions, f, indent = 4) + return data, '\n'.join(output_messages) + f'\nWriting collision report to: {collision_report_path}.' + +def setup_logging(log_path: str) -> None: + ''' Configures the logger to write to a file. + + Parameters + ---------- + log_path: str + The path to the log file. 
+ ''' + logging.basicConfig(filename = log_path, level = logging.DEBUG, + format = '%(asctime)s %(levelname)s %(message)s') def main(): @@ -14,16 +100,16 @@ def main(): # -s or --server, takes the values of dev, tst, beta, or prd parser.add_option('-s', '--server', action = 'store', dest = 'server', help = 'dev/tst/beta/prd') # -f or --file, takes the filepath to the input file - parser.add_option('-f', '--file', action = 'store', dest = 'csvfile', help = 'Filepath of the input CSV file') + parser.add_option('-v', '--ver', action = 'store', dest = 'release_ver', help = 'data release version') # parse the command line arguments (options, _) = parser.parse_args() # check the input arguments - if not options.server or not options.csvfile: + if not options.server or not options.release_ver: parser.print_help() sys.exit(1) server = options.server - fp = options.csvfile + data_ver = options.release_ver # read in config file with open('config.json', 'r') as f: @@ -35,8 +121,12 @@ def main(): host = f'mongodb://127.0.0.1:{mongo_port}' # database db_name = config_obj['dbinfo']['dbname'] - # collection + # data root path + data_root_path = config_obj['data_path'] + # data collection db_collection = config_obj['dbinfo'][db_name]['collection'] + # id collection + id_collection = config_obj['dbinfo'][db_name]['id_collection'] # database user info db_user = config_obj['dbinfo'][db_name]['user'] db_pass = config_obj['dbinfo'][db_name]['password'] @@ -44,11 +134,11 @@ def main(): # establish database connection try: client = pymongo.MongoClient(host, - username = db_user, - password = db_pass, - authSource = db_name, - authMechanism = 'SCRAM-SHA-1', - serverSelectionTimeoutMS = 10000) + username = db_user, + password = db_pass, + authSource = db_name, + authMechanism = 'SCRAM-SHA-1', + serverSelectionTimeoutMS = 10000) # test the connection client.server_info() except pymongo.errors.ServerSelectionTimeoutError as err: @@ -61,12 +151,23 @@ def main(): # get the database handle dbh = client[db_name] - # open the input csv file and populate the database collection - # TODO rework this logic to bulk load - with open(fp, 'r') as f: - reader = csv.DictReader(f) - for row in reader: - dbh[db_collection].insert_one(row) + # setup logging in current directory + log_path = f'./collision_reports/load_data_{server}.log' + setup_logging(log_path) + logging.info(f'Loading data for server: {server} and data release version: {data_ver}. #####################') + + # glob pattern for JSON data model files + data_release_glob_pattern = f'{data_root_path}/releases/data/v-{data_ver}/datamodeldb/*.json' + # process each file + for fp in glob.glob(data_release_glob_pattern): + with open(fp, 'r') as f: + data = json.load(f) + updated_data, output_message = process_data(data, dbh, db_collection, id_collection, fp) + logging.info(output_message) + with open(fp, 'w') as f: + json.dump(updated_data, f, indent = 4) + + logging.info(f'Finished loading data for server: {server} and data release version: {data_ver}. ---------------------') if __name__ == '__main__': main() diff --git a/api/setup.py b/api/setup.py index 95348d9..74970b4 100644 --- a/api/setup.py +++ b/api/setup.py @@ -2,7 +2,7 @@ def main(): - setup(name = 'biomarkerkb', + setup(name = 'biomarker', version = '1.0', packages = find_packages(), include_package_data = True,