diff --git a/.github/workflows/cli-coverage.yml b/.github/workflows/cli-coverage.yml index 65269542..15820a3a 100644 --- a/.github/workflows/cli-coverage.yml +++ b/.github/workflows/cli-coverage.yml @@ -16,8 +16,8 @@ jobs: image: postgres env: POSTGRES_USER: postgres - POSTGRES_PASSWORD: dockerpassword - POSTGRES_DB: pipestat-test + POSTGRES_PASSWORD: docker + POSTGRES_DB: bedbase POSTGRES_HOST: localhost ports: - 5432:5432 diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 0b37a9fe..1389362c 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -20,8 +20,8 @@ jobs: image: postgres env: POSTGRES_USER: postgres - POSTGRES_PASSWORD: dockerpassword - POSTGRES_DB: pipestat-test + POSTGRES_PASSWORD: docker + POSTGRES_DB: bedbase POSTGRES_HOST: localhost ports: - 5432:5432 diff --git a/MANIFEST.in b/MANIFEST.in index 9547f624..0126442a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,3 +2,6 @@ include README.md include LICENSE.txt include requirements/* include bbconf/schemas/* +include bbconf/modules/* +include bbconf/config_parser/* +include bbconf/models/* diff --git a/README.md b/README.md index a7076339..4ff58b7e 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ ## What is this? `bbconf` is a configuration and management tool for BEDbase, facilitating the reading of configuration files, -setting up connections to PostgreSQL and Qdrant databases, managing file paths, and storing transformer models. +setting up connections to PostgreSQL, PEPhub, S3, and Qdrant databases, managing file paths, and storing transformer models. It formalizes communication pathways for pipelines and downstream tools, ensuring seamless interaction." 
--- diff --git a/bbconf/__init__.py b/bbconf/__init__.py index 2b5105dc..c225ce3c 100644 --- a/bbconf/__init__.py +++ b/bbconf/__init__.py @@ -1,11 +1,13 @@ import logging + import coloredlogs -from bbconf.bbconf import BedBaseConf, get_bedbase_cfg +from bbconf.bbagent import BedBaseAgent + from ._version import __version__ from .const import PKG_NAME -__all__ = ["BedBaseConf", "get_bedbase_cfg", "__version__"] +__all__ = ["BedBaseAgent", "__version__"] _LOGGER = logging.getLogger(PKG_NAME) coloredlogs.install( diff --git a/bbconf/_version.py b/bbconf/_version.py index df124332..3d187266 100644 --- a/bbconf/_version.py +++ b/bbconf/_version.py @@ -1 +1 @@ -__version__ = "0.4.2" +__version__ = "0.5.0" diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py new file mode 100644 index 00000000..ec847e21 --- /dev/null +++ b/bbconf/bbagent.py @@ -0,0 +1,62 @@ +from pathlib import Path +from typing import Union + +from sqlalchemy.orm import Session +from sqlalchemy.sql import distinct, func, select + +from bbconf.config_parser.bedbaseconfig import BedBaseConfig +from bbconf.db_utils import Bed, BedSets +from bbconf.models.base_models import StatsReturn +from bbconf.modules.bedfiles import BedAgentBedFile +from bbconf.modules.bedsets import BedAgentBedSet +from bbconf.modules.objects import BBObjects + + +class BedBaseAgent(object): + def __init__( + self, + config: Union[Path, str], + ): + """ + Initialize connection to the pep_db database. You can use The basic connection parameters + or libpq connection string. 
+ + """ + + self.config = BedBaseConfig(config) + + self.__bed = BedAgentBedFile(self.config) + self.__bedset = BedAgentBedSet(self.config) + self.__objects = BBObjects(self.config) + + @property + def bed(self) -> BedAgentBedFile: + return self.__bed + + @property + def bedset(self) -> BedAgentBedSet: + return self.__bedset + + @property + def objects(self) -> BBObjects: + return self.__objects + + def get_stats(self) -> StatsReturn: + """ + Get statistics for a bed file + + :return: statistics + """ + with Session(self.config.db_engine.engine) as session: + number_of_bed = session.execute(select(func.count(Bed.id))).one()[0] + number_of_bedset = session.execute(select(func.count(BedSets.id))).one()[0] + + number_of_genomes = session.execute( + select(func.count(distinct(Bed.genome_alias))) + ).one()[0] + + return StatsReturn( + bedfiles_number=number_of_bed, + bedsets_number=number_of_bedset, + genomes_number=number_of_genomes, + ) diff --git a/bbconf/bbconf.py b/bbconf/bbconf.py deleted file mode 100644 index 28a03523..00000000 --- a/bbconf/bbconf.py +++ /dev/null @@ -1,722 +0,0 @@ -import os -from logging import getLogger -from typing import List, Optional, Dict, Union, Literal -from textwrap import indent - -import yacman -from pipestat import PipestatManager -from pipestat.exceptions import RecordNotFoundError, SchemaError - -from sqlmodel import SQLModel, Field, select -import qdrant_client - -from sqlalchemy.orm import relationship -from sqlalchemy import inspect - -from bbconf.const import ( - CFG_PATH_KEY, - CFG_PATH_PIPELINE_OUTPUT_KEY, - CFG_PATH_BEDSTAT_DIR_KEY, - CFG_PATH_TEXT2VEC_KEY, - DEFAULT_SECTION_VALUES, - CFG_PATH_BEDBUNCHER_DIR_KEY, - BED_TABLE, - BED_TABLE_SCHEMA, - BEDSET_TABLE, - BEDSET_TABLE_SCHEMA, - BEDFILE_BEDSET_ASSOCIATION_TABLE_KEY, - CFG_REMOTE_KEY, - BEDSETS_REL_KEY, - BEDFILES_REL_KEY, - CFG_PATH_REGION2VEC_KEY, - CFG_PATH_VEC2VEC_KEY, - CFG_QDRANT_KEY, - CFG_QDRANT_PORT_KEY, - CFG_QDRANT_API_KEY, - CFG_QDRANT_HOST_KEY, - 
CFG_QDRANT_COLLECTION_NAME_KEY, - DEFAULT_TEXT2VEC_MODEL, - DEFAULT_VEC2VEC_MODEL, - DEFAULT_REGION2_VEC_MODEL, - CFG_ACCESS_METHOD_KEY, - PKG_NAME, -) -from bbconf.exceptions import ( - BedBaseConfError, - MissingConfigDataError, - MissingThumbnailError, - MissingObjectError, - BadAccessMethodError, -) -from bbconf.helpers import raise_missing_key, get_bedbase_cfg -from bbconf.models import DRSModel, AccessMethod, AccessURL - -from geniml.text2bednn import text2bednn -from geniml.search import QdrantBackend -from fastembed.embedding import FlagEmbedding -from geniml.region2vec import Region2VecExModel -from geniml.io import RegionSet - -_LOGGER = getLogger(PKG_NAME) - - -class BedBaseConf: - """ - This class standardizes reporting of bedstat and bedbuncher results. - It formalizes a way for these pipelines and downstream tools - to communicate -- the produced results can easily and reliably become an - input for the server. The object exposes API for interacting with the - results and is backed by a [PostgreSQL](https://www.postgresql.org/) - database. 
- """ - - def __init__(self, config_path: str = None, database_only: bool = False): - """ - Initialize the object - - :param str config_path: path to the bedbase configuration file - :param bool database_only: whether the database managers should not - keep an in-memory copy of the data in the database - """ - - cfg_path = get_bedbase_cfg(config_path) - - self._config = self._read_config_file(cfg_path) - - # Create Pipestat objects and tables if they do not exist - _LOGGER.debug("Creating pipestat objects...") - self.__pipestats = { - BED_TABLE: PipestatManager( - config_file=cfg_path, - schema_path=BED_TABLE_SCHEMA, - database_only=database_only, - ), - BEDSET_TABLE: PipestatManager( - config_file=cfg_path, - schema_path=BEDSET_TABLE_SCHEMA, - database_only=database_only, - ), - } - - self._create_bedset_bedfiles_table() - - # setup t2bsi object - self._t2bsi = None - try: - self._senta2vec_hg_model_name = self.config[CFG_PATH_KEY].get( - CFG_PATH_TEXT2VEC_KEY, DEFAULT_TEXT2VEC_MODEL - ) - _LOGGER.debug("Setting up qdrant database connection...") - self._qdrant_backend = self._init_qdrant_backend() - - if self.config[CFG_PATH_KEY].get(CFG_PATH_REGION2VEC_KEY) and self.config[ - CFG_PATH_KEY - ].get(CFG_PATH_VEC2VEC_KEY): - self.region2vec_model = self.config[CFG_PATH_KEY].get( - CFG_PATH_REGION2VEC_KEY - ) - self._t2bsi = self._create_t2bsi_object() - else: - if not self.config[CFG_PATH_KEY].get(CFG_PATH_REGION2VEC_KEY): - _LOGGER.debug( - f"{CFG_PATH_REGION2VEC_KEY} was not provided in config file! Using default.." - ) - self.region2vec_model = DEFAULT_REGION2_VEC_MODEL - else: - self.region2vec_model = self.config[CFG_PATH_KEY].get( - CFG_PATH_REGION2VEC_KEY - ) - - if not self.config[CFG_PATH_KEY].get(CFG_PATH_VEC2VEC_KEY): - self.config[CFG_PATH_KEY][ - CFG_PATH_VEC2VEC_KEY - ] = DEFAULT_VEC2VEC_MODEL - - except qdrant_client.http.exceptions.ResponseHandlingException as err: - _LOGGER.error(f"error in Connection to qdrant! skipping... 
Error: {err}") - - def _read_config_file(self, config_path: str) -> dict: - """ - Read configuration file and insert default values if not set - - :param config_path: configuration file path - :return: None - :raises: raise_missing_key (if config key is missing) - """ - _config = yacman.YAMLConfigManager(filepath=config_path).exp - - if CFG_PATH_KEY not in _config: - raise_missing_key(CFG_PATH_KEY) - - if not _config[CFG_PATH_KEY]: - _config[CFG_PATH_KEY] = {} - - if CFG_PATH_PIPELINE_OUTPUT_KEY not in _config[CFG_PATH_KEY]: - raise_missing_key(CFG_PATH_PIPELINE_OUTPUT_KEY) - - if CFG_PATH_BEDSTAT_DIR_KEY not in _config[CFG_PATH_KEY]: - raise_missing_key(CFG_PATH_BEDSTAT_DIR_KEY) - - if CFG_PATH_BEDBUNCHER_DIR_KEY not in _config[CFG_PATH_KEY]: - raise_missing_key(CFG_PATH_BEDBUNCHER_DIR_KEY) - - # Setting default values if doesn't exist in config file - for section, mapping in DEFAULT_SECTION_VALUES.items(): - if section not in _config: - _config[section] = {} - for key, default in mapping.items(): - if key not in _config[section]: - _LOGGER.debug( - f"Config lacks '{section}.{key}' key. 
Setting to: {default}" - ) - _config[section][key] = default - - if CFG_PATH_REGION2VEC_KEY not in _config[CFG_PATH_KEY]: - _LOGGER.warning("Region2vec config key is missing in configuration file") - _config[CFG_PATH_KEY][CFG_PATH_REGION2VEC_KEY] = None - - return _config - - def search_bed_by_text( - self, query: str - ) -> List[Dict[str, Union[int, float, Dict[str, str], List[float]]]]: - """ - Search for bed files by text query in the qdrant database - - :param query: strign query provided by user - :return: a list of dictionary that contains the search results in this format: - { - "id": - "score": - "payload": { - - } - "vector": [] - } - """ - if self._t2bsi is None: - raise BedBaseConfError( - "Can't perform search, ensure qdrant_db credentials in config file" - ) - return self._t2bsi.nl_vec_search(query) - - def __str__(self): - """ - Generate string representation of the object - - :return str: string representation of the object - """ - - res = f"{self.__class__.__name__}\n" - res += f"{BED_TABLE}:\n" - res += f"{indent(str(self.bed), ' ')}" - res += f"\n{BEDSET_TABLE}:\n" - res += f"{indent(str(self.bedset), ' ')}" - res += "\nconfig:\n" - res += f"{indent(str(self.config), ' ')}" - return res - - @property - def config(self) -> dict: - """ - Config used to initialize the object - - :return dict: bedbase configuration file contents - """ - return self._config - - @property - def bed(self) -> PipestatManager: - """ - PipestatManager of the bedfiles table - - :return pipestat.PipestatManager: manager of the bedfiles table - """ - return self.__pipestats[BED_TABLE] - - @property - def bedset(self) -> PipestatManager: - """ - PipestatManager of the bedsets table - - :return pipestat.PipestatManager: manager of the bedsets table - """ - return self.__pipestats[BEDSET_TABLE] - - def _check_table_exists(self, table_name: str) -> bool: - """ - Check if the specified table exists on the 'bed' pipestatmanager object - - :param str table_name: table name to be 
checked - :return bool: whether the specified table exists - """ - with self.bed.backend.session as s: - return inspect(s.bind).has_table(table_name=table_name) - - def _get_output_path( - self, table_name: str, remote_key: str, remote: bool = False - ) -> str: - """ - Get path to the output of the selected pipeline - - :param str table_name: name of the table that is populated by the - pipeline to return the output path for - :param str remote_key: - :param bool remote: whether to use remote url base - :return str: path to the selected pipeline output - """ - dir_key = ( - CFG_PATH_BEDBUNCHER_DIR_KEY - if table_name == BEDSET_TABLE - else CFG_PATH_BEDSTAT_DIR_KEY - ) - base = ( - self.config[CFG_REMOTE_KEY][remote_key]["prefix"] - if remote - else self.config[CFG_PATH_KEY][CFG_PATH_PIPELINE_OUTPUT_KEY] - ) - if remote and not base: - raise MissingConfigDataError( - f"{CFG_REMOTE_KEY} key value is invalid: {base}" - ) - return os.path.join(base, self.config[CFG_PATH_KEY][dir_key]) - - def get_bedbuncher_output_path(self, remote_key, remote=False) -> str: - """ - Get path to the output of the bedbuncher pipeline - - :param bool remote: whether to use remote url base - :return str: path to the bedbuncher pipeline output - """ - return self._get_output_path( - table_name=BEDSET_TABLE, remote_key=remote_key, remote=remote - ) - - def get_bedstat_output_path(self, remote_key, remote=False) -> str: - """ - Get path to the output of the bedstat pipeline - - :param bool remote: whether to use remote url base - :return str: path to the bedstat pipeline output - """ - return self._get_output_path( - table_name=BED_TABLE, remote_key=remote_key, remote=remote - ) - - def _create_bedset_bedfiles_table(self): - """ - Create a relationship table - """ - - class BedFileBedSetAssociation(SQLModel, table=True): - __tablename__ = BEDFILE_BEDSET_ASSOCIATION_TABLE_KEY - bedfile_id: Optional[int] = Field( - default=None, - foreign_key=f"{self.bed.cfg['pipeline_name']}__sample.id", - 
primary_key=True, - ) - bedset_id: Optional[int] = Field( - default=None, - foreign_key=f"{self.bedset.cfg['pipeline_name']}__sample.id", - primary_key=True, - ) - - __table_args__ = {"extend_existing": True} - - self.rel_table = BedFileBedSetAssociation - returned_model = BedFileBedSetAssociation.__table__ - - # this will create a relationship between bedfiles and bedsets, and will have mapping in both tables (bedfiles, bedsets) - self.BedfileORM.__mapper__.add_property( - BEDSETS_REL_KEY, - relationship( - self.BedsetORM, - secondary=returned_model, - backref=BEDFILES_REL_KEY, - ), - ) - - SQLModel.metadata.create_all(bind=self.bed.backend.db_engine_key) - - def report_relationship( - self, bedset_record_id: str, bedfile_record_id: str - ) -> None: - """ - Report a bedfile for bedset. - - Inserts the ID pair into the relationship table, which allows to - manage many to many bedfile bedset relationships - - :param int bedset_record_id: record identifier of the bedset to report bedfile for - :param int bedfile_record_id: record identifier of the bedfile to report - """ - - if not self._check_table_exists( - table_name=BEDFILE_BEDSET_ASSOCIATION_TABLE_KEY - ): - self._create_bedset_bedfiles_table() - - with self.bed.backend.session as s: - bedset_statement = select(self.BedsetORM).where( - self.BedsetORM.record_identifier == bedset_record_id - ) - bedset = s.exec(bedset_statement).one() - - bedfile_statement = select(self.BedfileORM).where( - self.BedfileORM.record_identifier == bedfile_record_id - ) - bedfile = s.exec(bedfile_statement).one() - - if not bedfile: - raise BedBaseConfError( - f"Bedfile with if: {bedfile_record_id} doesn't exists. 
Can't add bedfile to bedset" - ) - - # add relationship - bedset.bedfiles.append(bedfile) - s.add(bedfile) - s.commit() - - return None - - def remove_relationship( - self, bedset_record_id: str, bedfile_record_id: Union[str, List[str]] = None - ) -> None: - """ - Remove entries from the relationships table - - :param str bedset_record_id: id of the bedset to remove - :param list[str] bedfile_record_id: ids of the bedfiles to remove for the - selected bedset. If none provided, all the relationsips for the - selected bedset will be removed. - """ - - if not self._check_table_exists( - table_name=BEDFILE_BEDSET_ASSOCIATION_TABLE_KEY - ): - raise BedBaseConfError( - f"Can't remove a relationship, '{BEDFILE_BEDSET_ASSOCIATION_TABLE_KEY}' does not exist" - ) - - with self.bedset.backend.session as s: - bedset_statement = select(self.BedsetORM).where( - self.BedsetORM.record_identifier == bedset_record_id - ) - bedset = s.exec(bedset_statement).one() - - if bedfile_record_id is None: - list_of_bed = [bed for bed in bedset.bedfiles] - - for bed in list_of_bed: - bedset.bedfiles.remove(bed) - else: - if isinstance(bedfile_record_id, str): - bedfile_record_id = [bedset_record_id] - - for bedfile_id in bedfile_record_id: - bedfile_statement = select(self.BedfileORM).where( - self.BedfileORM.record_identifier == bedfile_id - ) - bedfile = s.exec(bedfile_statement).one() - - bedset.bedfiles.remove(bedfile) - s.add(bedset) - s.commit() - - def select_bedfiles_from_bedset( - self, - bedset_record_id: str, - metadata: bool = False, - ) -> List[dict]: - """ - Select bedfiles that are part of a bedset that matches the query - - :param: bedset_record_id: record identifier of the bedset to query - :param: metadata: whether to include metadata in the result - :return: matched bedfiles table contents - """ - if metadata: - with self.bed.backend.session as session: - statement = select(self.BedsetORM).where( - self.BedsetORM.record_identifier == bedset_record_id - ) - results = 
session.exec(statement).one().bedfiles - bedfile_list = [bedfile.model_dump() for bedfile in results] - else: - # Probably we can do it in more simple way - with self.bed.backend.session as session: - statement = select(self.BedfileORM.record_identifier).where( - self.BedfileORM.id.in_( - select(self.rel_table.bedfile_id).where( - self.rel_table.bedset_id - == select(self.BedsetORM.id) - .where(self.BedsetORM.record_identifier == bedset_record_id) - .scalar_subquery() - ) - ) - ) - bedfile_list = session.exec(statement).all() - bedfile_list = [ - {"record_identifier": bedset_id} for bedset_id in bedfile_list - ] - return bedfile_list - - def select_unique(self, table_name: str, column: str = None) -> List[dict]: - """ - Select unique value in given column and table - - :param str table_name: table to query in - :param str column: column to include in the result - :return list[dict]: unique entries in the column - """ - - if table_name == "bedfile__sample": - with self.bed.backend.session: - values = self.bed.backend.select_records(columns=column)["records"] - elif table_name == "bedsets__sample": - with self.bedset.backend.session: - values = self.bedset.backend.select_records(columns=column)["records"] - else: - raise SchemaError(f"Incorrect table name provided {table_name}") - - return [i for n, i in enumerate(values) if i not in values[n + 1 :]] - - @property - def BedfileORM(self) -> SQLModel: - """ - return: ORM of bedfile table (SQLModelMetaclass) - """ - return self.bed.backend.get_model("bedfile__sample") - - @property - def BedsetORM(self) -> SQLModel: - """ - return: ORM of bedset table (SQLModelMetaclass) - """ - return self.bedset.backend.get_model("bedsets__sample") - - @property - def t2bsi(self) -> text2bednn.Text2BEDSearchInterface: - """ - :return: object with search functions - """ - return self._t2bsi - - @property - def qdrant_backend(self) -> QdrantBackend: - return self._qdrant_backend - - def _init_qdrant_backend(self) -> QdrantBackend: - """ 
- Create qdrant client object using credentials provided in config file - :return: QdrantClient - """ - return QdrantBackend( - collection=self._config[CFG_QDRANT_KEY][CFG_QDRANT_COLLECTION_NAME_KEY], - qdrant_host=self._config[CFG_QDRANT_KEY][CFG_QDRANT_HOST_KEY], - qdrant_port=self._config[CFG_QDRANT_KEY][CFG_QDRANT_PORT_KEY], - qdrant_api_key=self._config[CFG_QDRANT_KEY][CFG_QDRANT_API_KEY], - ) - - def _create_t2bsi_object(self) -> Union[text2bednn.Text2BEDSearchInterface, None]: - """ - Create Text 2 BED search interface and return this object - :return: Text2BEDSearchInterface object - """ - - try: - return text2bednn.Text2BEDSearchInterface( - nl2vec_model=FlagEmbedding(model_name=self._senta2vec_hg_model_name), - vec2vec_model=self._config[CFG_PATH_KEY][CFG_PATH_VEC2VEC_KEY], - search_backend=self.qdrant_backend, - ) - except Exception as e: - _LOGGER.error("Error in creating Text2BEDSearchInterface object: " + str(e)) - return None - - def add_bed_to_qdrant( - self, - bed_id: str, - bed_file: Union[str, RegionSet], - payload: dict = None, - region_to_vec: Region2VecExModel = None, - ) -> None: - """ - Convert bed file to vector and add it to qdrant database - - :param bed_id: bed file id - :param bed_file: path to the bed file, or RegionSet object - :param payload: additional metadata to store alongside vectors - :param region_to_vec: initiated region to vector model. If None, new object will be created. - :return: None - """ - - _LOGGER.info(f"Adding bed file to qdrant. bed_id: {bed_id}") - # Convert bedfile to vector - if isinstance(bed_file, str): - bed_region_set = RegionSet(bed_file) - elif isinstance(bed_file, RegionSet): - bed_region_set = bed_file - else: - raise BedBaseConfError( - "Could not add add region to qdrant. Invalid type, or path. 
" - ) - if not region_to_vec or isinstance(self.region2vec_model, str): - reg_2_vec_obj = Region2VecExModel(self.region2vec_model) - else: - reg_2_vec_obj = region_to_vec - bed_embedding = reg_2_vec_obj.encode( - bed_region_set, - pooling="mean", - ) - - # Upload bed file vector to the database - vec_dim = bed_embedding.shape[0] - self.qdrant_backend.load( - ids=[bed_id], - vectors=bed_embedding.reshape(1, vec_dim), - payloads=[{**payload}], - ) - return None - - def get_prefixed_uri(self, postfix: str, access_id: str) -> str: - """ - Return uri with correct prefix (schema) - - :param postfix: postfix of the uri (or everything after uri schema) - :param access_id: access method name - :return: full uri path - """ - - try: - prefix = self.config[CFG_ACCESS_METHOD_KEY][access_id]["prefix"] - return os.path.join(prefix, postfix) - except KeyError: - _LOGGER.error(f"Access method {access_id} is not defined.") - raise BadAccessMethodError(f"Access method {access_id} is not defined.") - - def get_thumbnail_uri( - self, - record_type: Literal["bed", "bedset"], - record_id: str, - result_id: str, - access_id: str = "http", - ) -> str: - """ - Create URL to access a bed- or bedset-associated thumbnail - - :param record_type: table_name ["bed", "bedset"] - :param record_id: record identifier - :param result_id: column name (result name) - :param access_id: access id (e.g. http, s3, etc.) - :return: string with thumbnail - """ - - try: - result = self.get_result(record_type, record_id, result_id) - return self.get_prefixed_uri(result["thumbnail_path"], access_id) - except KeyError: - _LOGGER.error( - f"Thumbnail for {record_type} {record_id} {result_id} is not defined." - ) - raise MissingThumbnailError( - f"Thumbnail for {record_type} {record_id} {result_id} is not defined." 
- ) - - def get_object_uri( - self, - record_type: Literal["bed", "bedset"], - record_id: str, - result_id: str, - access_id: str, - ) -> str: - """ - Create URL to access a bed- or bedset-associated file - - :param record_type: table_name ["bed", "bedset"] - :param record_id: record identifier - :param result_id: column name (result name) - :param access_id: access id (e.g. http, s3, etc.) - :return: - """ - result = self.get_result(record_type, record_id, result_id) - return self.get_prefixed_uri(result["path"], access_id) - - def get_result( - self, - record_type: Literal["bed", "bedset"], - record_id: str, - result_id: Union[str, List[str]], - ) -> dict: - """ - Generic getter that can return a result from either bed or bedset - - :param record_type: table_name ["bed", "bedset"] - :param record_id: record identifier - :param result_id: column name (result name) - :return: pipestat result - """ - if record_type == "bed": - result = self.bed.retrieve_one(record_id, result_id) - elif record_type == "bedset": - result = self.bedset.retrieve_one(record_id, result_id) - else: - raise BedBaseConfError( - f"Record type {record_type} is not supported. Only bed and bedset are supported." 
- ) - - _LOGGER.info(f"Getting uri for {record_type} {record_id} {result_id}") - _LOGGER.info(f"Result: {result}") - return result - - def get_drs_metadata( - self, - record_type: Literal["bed", "bedset"], - record_id: str, - result_id: str, - base_uri: str, - ) -> DRSModel: - """ - Get DRS metadata for a bed- or bedset-associated file - - :param record_type: bed or bedset - :param record_id: record identifier - :param result_id: name of the result file to get metadata for - :param base_uri: base uri to use for the self_uri field (server hostname of DRS broker) - :return: DRS metadata - """ - - access_methods = [] - object_id = f"{record_type}.{record_id}.{result_id}" - result_ids = [result_id, "pipestat_created_time", "pipestat_modified_time"] - record_metadata = self.get_result( - record_type, record_id, result_ids - ) # only get result once - if not record_metadata: - raise RecordNotFoundError("This record does not exist") - - if not record_metadata[result_id] or not record_metadata[result_id]["path"]: - raise MissingObjectError("This object does not exist") - - path = record_metadata[result_id]["path"] - for access_id in self.config[CFG_ACCESS_METHOD_KEY].keys(): - access_dict = AccessMethod( - type=access_id, - access_id=access_id, - access_url=AccessURL(url=self.get_prefixed_uri(path, access_id)), - region=self.config[CFG_ACCESS_METHOD_KEY][access_id].get( - "region", None - ), - ) - access_methods.append(access_dict) - drs_dict = DRSModel( - id=object_id, - self_uri=f"drs://{base_uri}/{object_id}", - size=record_metadata[result_id].get("size", "unknown"), - created_time=record_metadata.get("pipestat_created_time", "unknown"), - updated_time=record_metadata.get("pipestat_modified_time", "unknown"), - checksums=object_id, - access_methods=access_methods, - ) - - return drs_dict diff --git a/bbconf/config_parser/__init__.py b/bbconf/config_parser/__init__.py new file mode 100644 index 00000000..3d089de4 --- /dev/null +++ b/bbconf/config_parser/__init__.py @@ 
-0,0 +1,3 @@ +from .bedbaseconfig import BedBaseConfig + +__all__ = ["BedBaseConfig"] diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py new file mode 100644 index 00000000..92fc08fd --- /dev/null +++ b/bbconf/config_parser/bedbaseconfig.py @@ -0,0 +1,412 @@ +import logging +import os +import warnings +from pathlib import Path +from typing import List, Literal, Union + +import boto3 +import qdrant_client +import yacman +from botocore.exceptions import EndpointConnectionError + +from geniml.search import QdrantBackend, BED2BEDSearchInterface, Text2BEDSearchInterface +from geniml.search.query2vec import BED2Vec, Text2Vec +from geniml.region2vec import Region2VecExModel + +from pephubclient import PEPHubClient + +from bbconf.config_parser.const import ( + S3_BEDSET_PATH_FOLDER, + S3_FILE_PATH_FOLDER, + S3_PLOTS_PATH_FOLDER, +) +from bbconf.config_parser.models import ConfigFile +from bbconf.const import ( + PKG_NAME, +) +from bbconf.db_utils import BaseEngine +from bbconf.exceptions import ( + BadAccessMethodError, + BedBaseConfError, + BedbaseS3ConnectionError, +) +from bbconf.helpers import get_absolute_path, get_bedbase_cfg +from bbconf.models.base_models import FileModel +from bbconf.models.bed_models import BedFiles, BedPlots +from bbconf.models.bedset_models import BedSetPlots +from bbconf.models.drs_models import AccessMethod, AccessURL + +_LOGGER = logging.getLogger(PKG_NAME) + + +class BedBaseConfig: + def __init__(self, config: Union[Path, str]): + self.cfg_path = get_bedbase_cfg(config) + self._config = self._read_config_file(self.cfg_path) + + self._db_engine = self._init_db_engine() + self._qdrant_engine = self._init_qdrant_backend() + self._t2bsi = self._init_t2bsi_object() + self._b2bsi = self._init_b2bsi_object() + self._r2v = self._init_r2v_object() + + self._phc = self._init_pephubclient() + self._boto3_client = self._init_boto3_client() + + @staticmethod + def _read_config_file(config_path: str) -> ConfigFile: 
+ """ + Read configuration file and insert default values if not set + + :param config_path: configuration file path + :return: None + :raises: raise_missing_key (if config key is missing) + """ + _config = yacman.YAMLConfigManager(filepath=config_path).exp + + config_dict = {} + for field_name, annotation in ConfigFile.model_fields.items(): + try: + config_dict[field_name] = annotation.annotation( + **_config.get(field_name) + ) + except TypeError: + # TODO: this should be more specific + config_dict[field_name] = annotation.annotation() + + return ConfigFile(**config_dict) + + @property + def config(self) -> ConfigFile: + """ + Get configuration + + :return: configuration object + """ + return self._config + + @property + def db_engine(self) -> BaseEngine: + """ + Get database engine + + :return: database engine + """ + return self._db_engine + + @property + def t2bsi(self) -> Union[Text2BEDSearchInterface, None]: + """ + Get text2bednn object + + :return: text2bednn object + """ + return self._t2bsi + + @property + def b2bsi(self) -> Union[BED2BEDSearchInterface, None]: + """ + Get bed2bednn object + + :return: bed2bednn object + """ + return self._b2bsi + + @property + def r2v(self) -> Region2VecExModel: + """ + Get region2vec object + + :return: region2vec object + """ + return self._r2v + + @property + def qdrant_engine(self) -> QdrantBackend: + """ + Get qdrant engine + + :return: qdrant engine + """ + return self._qdrant_engine + + @property + def phc(self) -> PEPHubClient: + """ + Get PEPHub client + + :return: PEPHub client + """ + return self._phc + + @property + def boto3_client(self) -> boto3.client: + """ + Get boto3 client + + :return: boto3 client + """ + return self._boto3_client + + def _init_db_engine(self) -> BaseEngine: + return BaseEngine( + host=self._config.database.host, + port=self._config.database.port, + database=self._config.database.database, + user=self._config.database.user, + password=self._config.database.password, + 
drivername=f"{self._config.database.dialect}+{self._config.database.driver}", + ) + + def _init_qdrant_backend(self) -> QdrantBackend: + """ + Create qdrant client object using credentials provided in config file + + :return: QdrantClient + """ + try: + return QdrantBackend( + collection=self._config.qdrant.collection, + qdrant_host=self._config.qdrant.host, + qdrant_port=self._config.qdrant.port, + qdrant_api_key=self._config.qdrant.api_key, + ) + except qdrant_client.http.exceptions.ResponseHandlingException as err: + _LOGGER.error(f"error in Connection to qdrant! skipping... Error: {err}") + warnings.warn( + f"error in Connection to qdrant! skipping... Error: {err}", UserWarning + ) + + def _init_t2bsi_object(self) -> Union[Text2BEDSearchInterface, None]: + """ + Create Text 2 BED search interface and return this object + + :return: Text2BEDSearchInterface object + """ + + try: + return Text2BEDSearchInterface( + backend=self.qdrant_engine, + query2vec=Text2Vec( + text_embedder=self._config.path.text2vec, + v2v=self._config.path.vec2vec, + ), + ) + except Exception as e: + _LOGGER.error("Error in creating Text2BEDSearchInterface object: " + str(e)) + warnings.warn( + "Error in creating Text2BEDSearchInterface object: " + str(e), + UserWarning, + ) + return None + + def _init_b2bsi_object(self) -> Union[BED2BEDSearchInterface, None]: + """ + Create Bed 2 BED search interface and return this object + + :return: Bed2BEDSearchInterface object + """ + try: + return BED2BEDSearchInterface( + backend=self.qdrant_engine, + query2vec=BED2Vec(model=self._config.path.region2vec), + ) + except Exception as e: + _LOGGER.error("Error in creating BED2BEDSearchInterface object: " + str(e)) + warnings.warn( + "Error in creating BED2BEDSearchInterface object: " + str(e), + UserWarning, + ) + return None + + @staticmethod + def _init_pephubclient() -> Union[PEPHubClient, None]: + """ + Create Pephub client object using credentials provided in config file + + :return: PephubClient 
+ """ + try: + return PEPHubClient() + except Exception as e: + _LOGGER.error(f"Error in creating PephubClient object: {e}") + warnings.warn(f"Error in creating PephubClient object: {e}", UserWarning) + return None + + def _init_boto3_client( + self, + ) -> boto3.client: + """ + Create Pephub client object using credentials provided in config file + + :return: PephubClient + """ + try: + return boto3.client( + "s3", + endpoint_url=self._config.s3.endpoint_url, + aws_access_key_id=self._config.s3.aws_access_key_id, + aws_secret_access_key=self._config.s3.aws_secret_access_key, + ) + except Exception as e: + _LOGGER.error(f"Error in creating boto3 client object: {e}") + warnings.warn(f"Error in creating boto3 client object: {e}", UserWarning) + return None + + def _init_r2v_object(self) -> Region2VecExModel: + """ + Create Region2VecExModel object using credentials provided in config file + """ + return Region2VecExModel(self.config.path.region2vec) + + def upload_s3(self, file_path: str, s3_path: Union[Path, str]) -> None: + """ + Upload file to s3. + + :param file_path: local path to the file + :param s3_path: path to the file in s3 with file name + :return: None + """ + if not self._boto3_client: + _LOGGER.warning( + "Could not upload file to s3. Connection to s3 not established. Skipping.." + ) + raise BedbaseS3ConnectionError( + "Could not upload file to s3. Connection error." + ) + if not os.path.exists(file_path): + raise BedBaseConfError(f"File {file_path} does not exist.") + _LOGGER.info(f"Uploading file to s3: {s3_path}") + return self._boto3_client.upload_file(file_path, self.config.s3.bucket, s3_path) + + def upload_files_s3( + self, + identifier: str, + files: Union[BedFiles, BedPlots, BedSetPlots], + base_path: str, + type: Literal["files", "plots", "bedsets"] = "files", + ) -> Union[BedFiles, BedPlots, BedSetPlots]: + """ + Upload files to s3. 
+ + :param identifier: bed file identifier + :param files: dictionary with files to upload + :param base_path: local path to the output files + :param type: type of files to upload [files, plots, bedsets] + :return: None + """ + + if type == "files": + s3_output_base_folder = S3_FILE_PATH_FOLDER + elif type == "plots": + s3_output_base_folder = S3_PLOTS_PATH_FOLDER + elif type == "bedsets": + s3_output_base_folder = S3_BEDSET_PATH_FOLDER + else: + raise BedBaseConfError( + f"Invalid type: {type}. Should be 'files', 'plots', or 'bedsets'" + ) + + for key, value in files: + if not value: + continue + file_base_name = os.path.basename(value.path) + file_path = get_absolute_path(value.path, base_path) + s3_path = os.path.join( + s3_output_base_folder, + identifier[0], + identifier[1], + file_base_name, + ) + self.upload_s3(file_path, s3_path=s3_path) + + setattr(value, "name", key) + setattr(value, "size", os.path.getsize(file_path)) + setattr(value, "path", s3_path) + + if value.path_thumbnail: + file_base_name_thumbnail = os.path.basename(value.path_thumbnail) + file_path_thumbnail = get_absolute_path(value.path_thumbnail, base_path) + s3_path_thumbnail = os.path.join( + s3_output_base_folder, + identifier[0], + identifier[1], + file_base_name_thumbnail, + ) + self.upload_s3(file_path_thumbnail, s3_path=s3_path_thumbnail) + setattr(value, "path_thumbnail", s3_path_thumbnail) + + return files + + def delete_s3(self, s3_path: str) -> None: + """ + Delete file from s3. + + :param s3_path: path to the file in s3 + :return: None + """ + if not self._boto3_client: + _LOGGER.warning( + "Could not delete file from s3. Connection to s3 not established. Skipping.." + ) + raise BedbaseS3ConnectionError( + "Could not delete file from s3. Connection error." 
+ ) + try: + _LOGGER.info(f"Deleting file from s3: {s3_path}") + return self._boto3_client.delete_object( + Bucket=self.config.s3.bucket, Key=s3_path + ) + except EndpointConnectionError: + raise BedbaseS3ConnectionError( + "Could not delete file from s3. Connection error." + ) + + def delete_files_s3(self, files: List[FileModel]) -> None: + """ + Delete files from s3. + + :param files: list of file objects + :return: None + """ + for file in files: + self.delete_s3(file.path) + if file.path_thumbnail: + self.delete_s3(file.path_thumbnail) + return None + + def get_prefixed_uri(self, postfix: str, access_id: str) -> str: + """ + Return uri with correct prefix (schema) + + :param postfix: postfix of the uri (or everything after uri schema) + :param access_id: access method name, e.g. http, s3, etc. + :return: full uri path + """ + + try: + prefix = getattr(self.config.access_methods, access_id).prefix + return os.path.join(prefix, postfix) + except KeyError: + _LOGGER.error(f"Access method {access_id} is not defined.") + raise BadAccessMethodError(f"Access method {access_id} is not defined.") + + def construct_access_method_list(self, rel_path: str) -> List[AccessMethod]: + """ + Construct access method list for a given record + + :param rel_path: relative path to the record + :return: list of access methods + """ + access_methods = [] + for access_id in self.config.access_methods.model_dump().keys(): + access_dict = AccessMethod( + type=access_id, + access_id=access_id, + access_url=AccessURL(url=self.get_prefixed_uri(rel_path, access_id)), + region=self.config.access_methods.model_dump()[access_id].get( + "region", None + ), + ) + access_methods.append(access_dict) + return access_methods diff --git a/bbconf/config_parser/const.py b/bbconf/config_parser/const.py new file mode 100644 index 00000000..5dee2c44 --- /dev/null +++ b/bbconf/config_parser/const.py @@ -0,0 +1,27 @@ +DEFAULT_DB_NAME = "bedbase" +DEFAULT_DB_PORT = 5432 +DEFAULT_DB_DIALECT = "postgresql" 
+DEFAULT_DB_DRIVER = "psycopg" + +DEFAULT_QDRANT_HOST = "localhost" +DEFAULT_QDRANT_PORT = 6333 +DEFAULT_QDRANT_COLLECTION_NAME = "bedbase" +DEFAULT_QDRANT_API_KEY = None + +DEFAULT_SERVER_PORT = 80 +DEFAULT_SERVER_HOST = "0.0.0.0" + +DEFAULT_TEXT2VEC_MODEL = "sentence-transformers/all-MiniLM-L6-v2" +DEFAULT_VEC2VEC_MODEL = "databio/v2v-MiniLM-v2-ATAC-hg38" +DEFAULT_REGION2_VEC_MODEL = "databio/r2v-ChIP-atlas-hg38" + +DEFAULT_PEPHUB_NAMESPACE = "databio" +DEFAULT_PEPHUB_NAME = "bedbase_all" +DEFAULT_PEPHUB_TAG = "default" + +DEFAULT_S3_BUCKET = "bedbase" + + +S3_FILE_PATH_FOLDER = "files" +S3_PLOTS_PATH_FOLDER = "stats" +S3_BEDSET_PATH_FOLDER = "bedsets" diff --git a/bbconf/config_parser/models.py b/bbconf/config_parser/models.py new file mode 100644 index 00000000..6f2c2569 --- /dev/null +++ b/bbconf/config_parser/models.py @@ -0,0 +1,88 @@ +from typing import Optional, Union + +from pydantic import BaseModel, ConfigDict + +from bbconf.config_parser.const import ( + DEFAULT_DB_DIALECT, + DEFAULT_DB_DRIVER, + DEFAULT_DB_NAME, + DEFAULT_DB_PORT, + DEFAULT_PEPHUB_NAME, + DEFAULT_PEPHUB_NAMESPACE, + DEFAULT_PEPHUB_TAG, + DEFAULT_QDRANT_COLLECTION_NAME, + DEFAULT_QDRANT_PORT, + DEFAULT_REGION2_VEC_MODEL, + DEFAULT_S3_BUCKET, + DEFAULT_SERVER_HOST, + DEFAULT_SERVER_PORT, + DEFAULT_TEXT2VEC_MODEL, + DEFAULT_VEC2VEC_MODEL, +) + + +class ConfigDB(BaseModel): + host: str + port: int = DEFAULT_DB_PORT + user: str + password: str + database: str = DEFAULT_DB_NAME + dialect: str = DEFAULT_DB_DIALECT + driver: Optional[str] = DEFAULT_DB_DRIVER + + model_config = ConfigDict(extra="forbid") + + +class ConfigQdrant(BaseModel): + host: str + port: int = DEFAULT_QDRANT_PORT + api_key: Optional[str] = None + collection: str = DEFAULT_QDRANT_COLLECTION_NAME + + +class ConfigServer(BaseModel): + host: str = DEFAULT_SERVER_HOST + port: int = DEFAULT_SERVER_PORT + + +class ConfigPath(BaseModel): + region2vec: str = DEFAULT_REGION2_VEC_MODEL + vec2vec: str = DEFAULT_VEC2VEC_MODEL + 
text2vec: str = DEFAULT_TEXT2VEC_MODEL + + +class AccessMethodsStruct(BaseModel): + type: str + description: str = None + prefix: str + + +class AccessMethods(BaseModel): + http: AccessMethodsStruct = None + s3: AccessMethodsStruct = None + local: AccessMethodsStruct = None + + +class ConfigS3(BaseModel): + endpoint_url: Union[str, None] = None + aws_access_key_id: Union[str, None] = None + aws_secret_access_key: Union[str, None] = None + bucket: Union[str, None] = DEFAULT_S3_BUCKET + + +class ConfigPepHubClient(BaseModel): + namespace: Union[str, None] = DEFAULT_PEPHUB_NAMESPACE + name: Union[str, None] = DEFAULT_PEPHUB_NAME + tag: Union[str, None] = DEFAULT_PEPHUB_TAG + + +class ConfigFile(BaseModel): + database: ConfigDB + qdrant: ConfigQdrant = None + server: ConfigServer + path: ConfigPath + access_methods: AccessMethods = None + s3: ConfigS3 = None + phc: ConfigPepHubClient = None + + model_config = ConfigDict(extra="allow") diff --git a/bbconf/const.py b/bbconf/const.py index 68f5cbff..b41ddfb8 100644 --- a/bbconf/const.py +++ b/bbconf/const.py @@ -1,102 +1,4 @@ -""" -Constant variables shared among packages that constitute bedbase project -""" - -import os - -SCHEMA_DIRNAME = "schemas" -SCHEMAS_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), SCHEMA_DIRNAME) -BED_TABLE_SCHEMA = os.path.join(SCHEMAS_PATH, "bedfiles_schema.yaml") -BEDSET_TABLE_SCHEMA = os.path.join(SCHEMAS_PATH, "bedsets_schema.yaml") -DIST_TABLE_SCHEMA = os.path.join(SCHEMAS_PATH, "distance_schema.yaml") - PKG_NAME = "bbconf" -DOC_URL = "TBA" # TODO: add documentation URL once it's established - -BED_TABLE = "bedfile__sample" -BEDSET_TABLE = "bedsets__sample" - -BEDFILES_REL_KEY = "bedfiles" -BEDSETS_REL_KEY = "bedsets" - -BEDFILE_BEDSET_ASSOCIATION_TABLE_KEY = "bedset_bedfiles_new" - -CFG_ENV_VARS = ["BEDBASE"] - -PIPESTATS_KEY = "__pipestats" -COMMON_DECL_BASE_KEY = "__common_declarative_base" - -HIDDEN_ATTR_KEYS = [PIPESTATS_KEY, COMMON_DECL_BASE_KEY] - -# config file 
constants -CFG_PATH_KEY = "path" -CFG_PATH_BEDSTAT_DIR_KEY = "bedstat_dir" -CFG_PATH_BEDBUNCHER_DIR_KEY = "bedbuncher_dir" -CFG_PATH_PIPELINE_OUTPUT_KEY = "pipeline_output_path" -CFG_PATH_REGION2VEC_KEY = "region2vec" -CFG_PATH_VEC2VEC_KEY = "vec2vec" -CFG_PATH_TEXT2VEC_KEY = "text2vec" - - -CFG_DATABASE_KEY = "database" -CFG_DATABASE_NAME_KEY = "name" -CFG_DATABASE_HOST_KEY = "host" -CFG_DATABASE_PORT_KEY = "port" -CFG_DATABASE_PASSWORD_KEY = "password" -CFG_DATABASE_USER_KEY = "user" - -CFG_QDRANT_KEY = "qdrant" - -CFG_QDRANT_HOST_KEY = "host" -CFG_QDRANT_PORT_KEY = "port" -CFG_QDRANT_API_KEY = "api_key" -CFG_QDRANT_COLLECTION_NAME_KEY = "collection" - -CFG_SERVER_KEY = "server" -CFG_SERVER_HOST_KEY = "host" -CFG_SERVER_PORT_KEY = "port" - -CFG_REMOTE_KEY = "remotes" - -DB_DEFAULT_HOST = "localhost" -DB_DEFAULT_USER = "postgres" -DB_DEFAULT_PASSWORD = "bedbasepassword" -DB_DEFAULT_NAME = "postgres" -DB_DEFAULT_PORT = 5432 -DB_DEFAULT_DIALECT = "postgresql" - -CFG_ACCESS_METHOD_KEY = "access_methods" - -DEFAULT_QDRANT_HOST = "localhost" -DEFAULT_QDRANT_PORT = 6333 -DEFAULT_QDRANT_COLLECTION_NAME = "bedbase" -DEFAULT_QDRANT_API_KEY = None - -SERVER_DEFAULT_PORT = 80 -SERVER_DEFAULT_HOST = "0.0.0.0" - -DEFAULT_SECTION_VALUES = { - CFG_DATABASE_KEY: { - CFG_DATABASE_USER_KEY: DB_DEFAULT_USER, - CFG_DATABASE_PASSWORD_KEY: DB_DEFAULT_PASSWORD, - CFG_DATABASE_NAME_KEY: DB_DEFAULT_NAME, - CFG_DATABASE_PORT_KEY: DB_DEFAULT_PORT, - CFG_DATABASE_HOST_KEY: DB_DEFAULT_HOST, - }, - CFG_SERVER_KEY: { - CFG_SERVER_HOST_KEY: SERVER_DEFAULT_HOST, - CFG_SERVER_PORT_KEY: SERVER_DEFAULT_PORT, - }, - CFG_QDRANT_KEY: { - CFG_QDRANT_HOST_KEY: DEFAULT_QDRANT_HOST, - CFG_QDRANT_PORT_KEY: DEFAULT_QDRANT_PORT, - CFG_QDRANT_COLLECTION_NAME_KEY: DEFAULT_QDRANT_COLLECTION_NAME, - CFG_QDRANT_API_KEY: DEFAULT_QDRANT_API_KEY, - }, -} -DEFAULT_TEXT2VEC_MODEL = "sentence-transformers/all-MiniLM-L6-v2" -DEFAULT_VEC2VEC_MODEL = "databio/v2v-MiniLM-v2-ATAC-hg38" -DEFAULT_REGION2_VEC_MODEL = 
"databio/r2v-ChIP-atlas-hg38" DRS_ACCESS_URL = "{server_url}/objects/{object_id}/access/{access_id}" diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py new file mode 100644 index 00000000..f08e5b32 --- /dev/null +++ b/bbconf/db_utils.py @@ -0,0 +1,357 @@ +import datetime +import logging +from typing import List, Optional + +from sqlalchemy import ( + TIMESTAMP, + BigInteger, + ForeignKey, + Result, + Select, + event, + select, +) +from sqlalchemy.dialects.postgresql import JSON +from sqlalchemy.engine import URL, Engine, create_engine +from sqlalchemy.exc import ProgrammingError +from sqlalchemy.ext.compiler import compiles +from sqlalchemy.orm import ( + DeclarativeBase, + Mapped, + Session, + mapped_column, + relationship, +) +from sqlalchemy_schemadisplay import create_schema_graph + +from bbconf.const import PKG_NAME + +_LOGGER = logging.getLogger(PKG_NAME) + + +POSTGRES_DIALECT = "postgresql+psycopg" + + +class SchemaError(Exception): + def __init__(self): + super().__init__( + """PEP_db connection error! 
The schema of connected db is incorrect!""" + ) + + +class BIGSERIAL(BigInteger): + pass + + +@compiles(BIGSERIAL, POSTGRES_DIALECT) +def compile_bigserial_pg(type_, compiler, **kw): + return "BIGSERIAL" + + +@compiles(JSON, POSTGRES_DIALECT) +def compile_jsonb_pg(type_, compiler, **kw): + return "JSONB" + + +class Base(DeclarativeBase): + type_annotation_map = {datetime.datetime: TIMESTAMP(timezone=True)} + + +@event.listens_for(Base.metadata, "after_create") +def receive_after_create(target, connection, tables, **kw): + """ + listen for the 'after_create' event + """ + if tables: + _LOGGER.info("A table was created") + else: + _LOGGER.info("A table was not created") + + +def deliver_update_date(context): + return datetime.datetime.now(datetime.timezone.utc) + + +class Bed(Base): + __tablename__ = "bed" + + id: Mapped[str] = mapped_column(primary_key=True, index=True) + name: Mapped[Optional[str]] + genome_alias: Mapped[Optional[str]] + genome_digest: Mapped[Optional[str]] + description: Mapped[Optional[str]] + bed_type: Mapped[str] = mapped_column(default="bed3") + bed_format: Mapped[str] = mapped_column(default="bed") + indexed: Mapped[bool] = mapped_column( + default=False, comment="Whether sample was added to qdrant" + ) + pephub: Mapped[bool] = mapped_column( + default=False, comment="Whether sample was added to pephub" + ) + + submission_date: Mapped[datetime.datetime] = mapped_column( + default=deliver_update_date + ) + last_update_date: Mapped[Optional[datetime.datetime]] = mapped_column( + default=deliver_update_date, + onupdate=deliver_update_date, + ) + + files: Mapped[List["Files"]] = relationship( + "Files", back_populates="bedfile", cascade="all, delete-orphan" + ) + + bedsets: Mapped[List["BedFileBedSetRelation"]] = relationship( + "BedFileBedSetRelation", back_populates="bedfile", cascade="all, delete-orphan" + ) + + stats: Mapped["BedStats"] = relationship( + back_populates="bed", cascade="all, delete-orphan" + ) + + +class BedStats(Base): + 
__tablename__ = "bed_stats" + + id: Mapped[str] = mapped_column( + ForeignKey("bed.id", ondelete="CASCADE"), + primary_key=True, + index=True, + ) + number_of_regions: Mapped[Optional[float]] + gc_content: Mapped[Optional[float]] + median_tss_dist: Mapped[Optional[float]] + mean_region_width: Mapped[Optional[float]] + exon_frequency: Mapped[Optional[float]] + intron_frequency: Mapped[Optional[float]] + promoterprox_frequency: Mapped[Optional[float]] + intergenic_frequency: Mapped[Optional[float]] + promotercore_frequency: Mapped[Optional[float]] + fiveutr_frequency: Mapped[Optional[float]] + threeutr_frequency: Mapped[Optional[float]] + fiveutr_percentage: Mapped[Optional[float]] + threeutr_percentage: Mapped[Optional[float]] + promoterprox_percentage: Mapped[Optional[float]] + exon_percentage: Mapped[Optional[float]] + intron_percentage: Mapped[Optional[float]] + intergenic_percentage: Mapped[Optional[float]] + promotercore_percentage: Mapped[Optional[float]] + tssdist: Mapped[Optional[float]] + + bed: Mapped["Bed"] = relationship("Bed", back_populates="stats") + + +class Files(Base): + __tablename__ = "files" + + id: Mapped[int] = mapped_column(primary_key=True, index=True) + name: Mapped[str] = mapped_column( + nullable=False, comment="Name of the file, e.g. bed, bigBed" + ) + title: Mapped[Optional[str]] + type: Mapped[str] = mapped_column( + default="file", comment="Type of the object, e.g. file, plot, ..." 
+ ) + path: Mapped[str] + path_thumbnail: Mapped[str] = mapped_column( + nullable=True, comment="Thumbnail path of the file" + ) + description: Mapped[Optional[str]] + size: Mapped[Optional[int]] = mapped_column(default=0, comment="Size of the file") + + bedfile_id: Mapped[int] = mapped_column( + ForeignKey("bed.id", ondelete="CASCADE"), nullable=True, index=True + ) + bedset_id: Mapped[int] = mapped_column( + ForeignKey("bedsets.id", ondelete="CASCADE"), nullable=True, index=True + ) + + bedfile: Mapped["Bed"] = relationship("Bed", back_populates="files") + bedset: Mapped["BedSets"] = relationship("BedSets", back_populates="files") + + +# class Plots(Base): +# __tablename__ = "plots" +# +# id: Mapped[int] = mapped_column(primary_key=True) +# name: Mapped[str] = mapped_column(nullable=False, comment="Name of the plot") +# description: Mapped[Optional[str]] = mapped_column( +# comment="Description of the plot" +# ) +# path: Mapped[str] = mapped_column(comment="Path to the plot file") +# path_thumbnail: Mapped[str] = mapped_column( +# nullable=True, comment="Path to the thumbnail of the plot file" +# ) +# +# bedfile_id: Mapped[int] = mapped_column(ForeignKey("bed.id"), nullable=True) +# bedset_id: Mapped[int] = mapped_column(ForeignKey("bedsets.id"), nullable=True) +# +# bedfile: Mapped["Bed"] = relationship("Bed", back_populates="plots") +# bedset: Mapped["BedSets"] = relationship("BedSets", back_populates="plots") + + +class BedFileBedSetRelation(Base): + __tablename__ = "bedfile_bedset_relation" + bedset_id: Mapped[int] = mapped_column( + ForeignKey("bedsets.id", ondelete="CASCADE"), primary_key=True + ) + bedfile_id: Mapped[int] = mapped_column( + ForeignKey("bed.id", ondelete="CASCADE"), primary_key=True + ) + + bedset: Mapped["BedSets"] = relationship("BedSets", back_populates="bedfiles") + bedfile: Mapped["Bed"] = relationship("Bed", back_populates="bedsets") + + +class BedSets(Base): + __tablename__ = "bedsets" + + id: Mapped[str] = 
mapped_column(primary_key=True, index=True) + name: Mapped[str] = mapped_column(nullable=False, comment="Name of the bedset") + description: Mapped[Optional[str]] = mapped_column( + comment="Description of the bedset" + ) + submission_date: Mapped[datetime.datetime] = mapped_column( + default=deliver_update_date + ) + last_update_date: Mapped[Optional[datetime.datetime]] = mapped_column( + default=deliver_update_date, + onupdate=deliver_update_date, + ) + md5sum: Mapped[Optional[str]] = mapped_column(comment="MD5 sum of the bedset") + + bedset_means: Mapped[Optional[dict]] = mapped_column( + JSON, comment="Mean values of the bedset" + ) + bedset_standard_deviation: Mapped[Optional[dict]] = mapped_column( + JSON, comment="Median values of the bedset" + ) + + bedfiles: Mapped[List["BedFileBedSetRelation"]] = relationship( + "BedFileBedSetRelation", back_populates="bedset", cascade="all, delete-orphan" + ) + # plots: Mapped[List["Plots"]] = relationship("Plots", back_populates="bedset") + files: Mapped[List["Files"]] = relationship("Files", back_populates="bedset") + + +class BaseEngine: + """ + A class with base methods, that are used in several classes. + """ + + def __init__( + self, + *, + host: str = "localhost", + port: int = 5432, + database: str = "bedbase", + user: str = None, + password: str = None, + drivername: str = POSTGRES_DIALECT, + dsn: str = None, + echo: bool = False, + ): + """ + Initialize connection to the bedbase database. You can use The basic connection parameters + or libpq connection string. + + :param host: database server address e.g., localhost or an IP address. + :param port: the port number that defaults to 5432 if it is not provided. + :param database: the name of the database that you want to connect. + :param user: the username used to authenticate. + :param password: password used to authenticate. + :param drivername: driver used in + :param dsn: libpq connection string using the dsn parameter + (e.g. 
'postgresql://user_name:password@host_name:port/db_name') + """ + if not dsn: + dsn = URL.create( + host=host, + port=port, + database=database, + username=user, + password=password, + drivername=drivername, + ) + + self._engine = create_engine(dsn, echo=echo) + self.create_schema(self._engine) + self.check_db_connection() + + def create_schema(self, engine=None): + """ + Create sql schema in the database. + + :param engine: sqlalchemy engine [Default: None] + :return: None + """ + if not engine: + engine = self._engine + Base.metadata.create_all(engine) + return None + + def delete_schema(self, engine=None) -> None: + """ + Delete sql schema in the database. + + :param engine: sqlalchemy engine [Default: None] + :return: None + """ + if not engine: + engine = self._engine + Base.metadata.drop_all(engine) + return None + + def session_execute(self, statement: Select) -> Result: + """ + Execute statement using sqlalchemy statement + + :param statement: SQL query or a SQL expression that is constructed using + SQLAlchemy's SQL expression language + :return: query result represented with declarative base + """ + _LOGGER.debug(f"Executing statement: {statement}") + with Session(self._engine) as session: + query_result = session.execute(statement) + + return query_result + + @property + def session(self): + """ + :return: started sqlalchemy session + """ + return self._start_session() + + @property + def engine(self) -> Engine: + """ + :return: sqlalchemy engine + """ + return self._engine + + def _start_session(self): + session = Session(self.engine) + try: + session.execute(select(Bed).limit(1)) + except ProgrammingError: + raise SchemaError() + + return session + + def check_db_connection(self): + try: + self.session_execute(select(Bed).limit(1)) + except ProgrammingError: + raise SchemaError() + + def create_schema_graph(self, output_file: str = "schema.svg"): + """ + Create schema graph of the database. 
+ + :param output_file: path to the output file + :return: None + """ + graph = create_schema_graph(engine=self.engine, metadata=Base.metadata) + graph.write(output_file, format="svg", prog="dot") + return None diff --git a/bbconf/exceptions.py b/bbconf/exceptions.py index c9a910ce..6d891d7f 100644 --- a/bbconf/exceptions.py +++ b/bbconf/exceptions.py @@ -1,5 +1,4 @@ import abc -from .const import DOC_URL class BedBaseConfError(Exception): @@ -8,19 +7,16 @@ class BedBaseConfError(Exception): __metaclass__ = abc.ABCMeta -class BadAccessMethodError(BedBaseConfError): - """Access ID is not well defined""" +class BedbaseS3ConnectionError(BedBaseConfError): + """connection error to s3""" pass -class MissingConfigDataError(BedBaseConfError): - """Exception for invalid config file.""" +class BadAccessMethodError(BedBaseConfError): + """Access ID is not well defined""" - def __init__(self, msg): - spacing = " " if msg[-1] in ["?", ".", "\n"] else "; " - suggest = "For config format documentation please see: " + DOC_URL - super(MissingConfigDataError, self).__init__(msg + spacing + suggest) + pass class BedBaseConnectionError(BedBaseConfError): @@ -35,7 +31,25 @@ class MissingThumbnailError(BedBaseConfError): pass +class BedFIleExistsError(BedBaseConfError): + """Error where files exists, and should not be overwritten""" + + pass + + class MissingObjectError(BedBaseConfError): """Error type for missing object""" pass + + +class BEDFileNotFoundError(BedBaseConfError): + """Error type for missing bedfile""" + + pass + + +class BedSetNotFoundError(BedBaseConfError): + """Error type for missing bedset""" + + pass diff --git a/bbconf/helpers.py b/bbconf/helpers.py index 9672fde5..47fe67c3 100644 --- a/bbconf/helpers.py +++ b/bbconf/helpers.py @@ -1,14 +1,16 @@ import logging +import os from yacman import select_config -from .const import CFG_ENV_VARS -from .exceptions import MissingConfigDataError, BedBaseConnectionError -from typing import NoReturn +from bbconf.exceptions 
import BedBaseConnectionError _LOGGER = logging.getLogger(__name__) +CFG_ENV_VARS = ["BEDBASE"] + + def get_bedbase_cfg(cfg: str = None) -> str: """ Determine path to the bedbase configuration file @@ -29,9 +31,15 @@ def get_bedbase_cfg(cfg: str = None) -> str: return selected_cfg -def raise_missing_key(key: str) -> NoReturn: - """ - Raise missing key with message +def get_absolute_path(path: str, base_path: str) -> str: """ + Get absolute path to the file and create it if it doesn't exist - raise MissingConfigDataError(f"Config lacks '{key}' key") + :param path: path to the file (abs or relative) + :param base_path: base path to the file (will be added to the relative path) + + :return: absolute path to the file + """ + if not os.path.isabs(path) or not os.path.exists(path): + return os.path.join(base_path, path) + return path diff --git a/bbconf/models/__init__.py b/bbconf/models/__init__.py new file mode 100644 index 00000000..935a5e3f --- /dev/null +++ b/bbconf/models/__init__.py @@ -0,0 +1,2 @@ +# from bedfiles import BedAgentBedFile +# from bedsets import BedAgentBedSet diff --git a/bbconf/models/base_models.py b/bbconf/models/base_models.py new file mode 100644 index 00000000..91e4e5d6 --- /dev/null +++ b/bbconf/models/base_models.py @@ -0,0 +1,24 @@ +from typing import List, Optional, Union + +from pydantic import BaseModel, ConfigDict, Field + +from .drs_models import AccessMethod + + +class FileModel(BaseModel): + name: str + title: Optional[str] = None + path: str + path_thumbnail: Optional[Union[str, None]] = Field(None, alias="thumbnail_path") + description: Optional[str] = None + size: Optional[int] = None + object_id: Optional[str] = None + access_methods: List[AccessMethod] = None + + model_config = ConfigDict(populate_by_name=True, extra="ignore") + + +class StatsReturn(BaseModel): + bedfiles_number: int = 0 + bedsets_number: int = 0 + genomes_number: int = 0 diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py new file mode 
100644 index 00000000..b0a646af --- /dev/null +++ b/bbconf/models/bed_models.py @@ -0,0 +1,138 @@ +import datetime +from typing import List, Optional, Union + +from pydantic import BaseModel, ConfigDict, Field + +from .base_models import FileModel + + +class BedPlots(BaseModel): + chrombins: FileModel = None + gccontent: FileModel = None + partitions: FileModel = None + expected_partitions: FileModel = None + cumulative_partitions: FileModel = None + widths_histogram: FileModel = None + neighbor_distances: FileModel = None + open_chromatin: FileModel = None + + model_config = ConfigDict(extra="ignore") + + +class BedFiles(BaseModel): + bed_file: Union[FileModel, None] = None + bigbed_file: Union[FileModel, None] = None + + model_config = ConfigDict( + populate_by_name=True, + extra="ignore", + ) + + +class BedClassification(BaseModel): + name: Optional[str] = None + genome_alias: str = None + genome_digest: Union[str, None] = None + bed_type: str = Field( + default="bed3", pattern="^bed(?:[3-9]|1[0-5])(?:\+|$)[0-9]?+$" + ) + bed_format: str = None + + model_config = ConfigDict(extra="ignore") + + +class BedStatsModel(BaseModel): + number_of_regions: Optional[float] = Field(None, alias="regions_no") + gc_content: Optional[float] = None + median_tss_dist: Optional[float] = None + mean_region_width: Optional[float] = None + + exon_frequency: Optional[float] = None + exon_percentage: Optional[float] = None + + intron_frequency: Optional[float] = None + intron_percentage: Optional[float] = None + + intergenic_percentage: Optional[float] = None + intergenic_frequency: Optional[float] = None + + promotercore_frequency: Optional[float] = None + promotercore_percentage: Optional[float] = None + + fiveutr_frequency: Optional[float] = None + fiveutr_percentage: Optional[float] = None + + threeutr_frequency: Optional[float] = None + threeutr_percentage: Optional[float] = None + + promoterprox_frequency: Optional[float] = None + promoterprox_percentage: Optional[float] = None + 
+ model_config = ConfigDict(extra="ignore", populate_by_name=True) + + +class BedPEPHub(BaseModel): + sample_name: str + genome: str = "" + organism: str = "" + species_id: str = "" + cell_type: str = "" + cell_line: str = "" + exp_protocol: str = Field("", description="Experimental protocol (e.g. ChIP-seq)") + library_source: str = Field( + "", description="Library source (e.g. genomic, transcriptomic)" + ) + genotype: str = Field("", description="Genotype of the sample") + target: str = Field("", description="Target of the assay (e.g. H3K4me3)") + antibody: str = Field("", description="Antibody used in the assay") + treatment: str = Field( + "", description="Treatment of the sample (e.g. drug treatment)" + ) + tissue: str = Field("", description="Tissue type") + global_sample_id: str = Field("", description="Global sample identifier") + global_experiment_id: str = Field("", description="Global experiment identifier") + description: str = Field("", description="Description of the sample") + + model_config = ConfigDict(extra="allow", populate_by_name=True) + + +class BedMetadata(BedClassification): + id: str + name: Optional[Union[str, None]] = "" + description: Optional[str] = None + submission_date: datetime.datetime = None + last_update_date: Optional[datetime.datetime] = None + stats: Union[BedStatsModel, None] = None + # classification: BedClassification = None + plots: Union[BedPlots, None] = None + files: Union[BedFiles, None] = None + raw_metadata: Optional[Union[BedPEPHub, None]] = None + + # genome_alias: str = None + # genome_digest: str = None + # bed_type: str = Field( + # default="bed3", pattern="^bed(?:[3-9]|1[0-5])(?:\+|$)[0-9]?+$" + # ) + # bed_format: str = None + # full_response: bool = False + + +class BedListResult(BaseModel): + count: int + limit: int + offset: int + results: List[BedMetadata] + + +class QdrantSearchResult(BaseModel): + id: str + payload: dict + score: float + metadata: Union[BedMetadata, None] = None + + +class 
BedListSearchResult(BaseModel): + count: int + limit: int + offset: int + results: List[QdrantSearchResult] = None diff --git a/bbconf/models/bedset_models.py b/bbconf/models/bedset_models.py new file mode 100644 index 00000000..20175429 --- /dev/null +++ b/bbconf/models/bedset_models.py @@ -0,0 +1,41 @@ +from typing import List, Union + +from pydantic import BaseModel, ConfigDict + +from .base_models import FileModel +from .bed_models import BedMetadata, BedStatsModel + + +class BedSetStats(BaseModel): + mean: BedStatsModel = None + sd: BedStatsModel = None + + +class BedSetPlots(BaseModel): + region_commonality: FileModel = None + + model_config = ConfigDict(extra="ignore") + + +class BedSetMetadata(BaseModel): + id: str + name: str + md5sum: str + statistics: Union[BedSetStats, None] = None + plots: Union[BedSetPlots, None] = None + description: str = None + bed_ids: List[str] = None + + +class BedSetListResult(BaseModel): + count: int + limit: int + offset: int + results: List[BedSetMetadata] + + +class BedSetBedFiles(BaseModel): + count: int + limit: int + offset: int + results: List[BedMetadata] diff --git a/bbconf/models.py b/bbconf/models/drs_models.py similarity index 86% rename from bbconf/models.py rename to bbconf/models/drs_models.py index cd1cb403..01435a56 100644 --- a/bbconf/models.py +++ b/bbconf/models/drs_models.py @@ -1,9 +1,10 @@ import datetime -from typing import Optional, List +from typing import List, Optional, Union from pydantic import BaseModel +# DRS Models class AccessURL(BaseModel): url: str headers: Optional[dict] = None @@ -20,7 +21,7 @@ class DRSModel(BaseModel): id: str name: Optional[str] = None self_uri: str - size: str + size: Union[int, None] = None created_time: Optional[datetime.datetime] = None updated_time: Optional[datetime.datetime] = None checksums: str diff --git a/bbconf/modules/__init__.py b/bbconf/modules/__init__.py new file mode 100644 index 00000000..935a5e3f --- /dev/null +++ b/bbconf/modules/__init__.py @@ -0,0 
+1,2 @@ +# from bedfiles import BedAgentBedFile +# from bedsets import BedAgentBedSet diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py new file mode 100644 index 00000000..31e7e117 --- /dev/null +++ b/bbconf/modules/bedfiles.py @@ -0,0 +1,785 @@ +from logging import getLogger +from typing import Dict, Union + +import numpy as np +from geniml.bbclient import BBClient +from geniml.io import RegionSet +from pephubclient.exceptions import ResponseError +from qdrant_client.models import Distance, PointIdsList, VectorParams +from sqlalchemy import select +from sqlalchemy.orm import Session + +from bbconf.config_parser.bedbaseconfig import BedBaseConfig +from bbconf.const import ( + PKG_NAME, +) +from bbconf.db_utils import Bed, BedStats, Files +from bbconf.exceptions import ( + BedBaseConfError, + BedFIleExistsError, + BEDFileNotFoundError, +) +from bbconf.models.bed_models import ( + BedClassification, + BedFiles, + BedListResult, + BedListSearchResult, + BedMetadata, + BedPEPHub, + BedPlots, + BedStatsModel, + FileModel, + QdrantSearchResult, +) + +_LOGGER = getLogger(PKG_NAME) + +QDRANT_GENOME = "hg38" + + +class BedAgentBedFile: + """ + Class that represents Bedfile in Database. + + This class has method to add, delete, get files and metadata from the database. + """ + + def __init__(self, config: BedBaseConfig): + """ + :param config: config object with database and qdrant engine and credentials + """ + self._sa_engine = config.db_engine.engine + self._db_engine = config.db_engine + self._qdrant_engine = config.qdrant_engine + self._boto3_client = config.boto3_client + self._config = config + + def get(self, identifier: str, full: bool = False) -> BedMetadata: + """ + Get file metadata by identifier. 
+ + :param identifier: bed file identifier + :param full: if True, return full metadata, including statistics, files, and raw metadata from pephub + :return: project metadata + """ + statement = select(Bed).where(Bed.id == identifier) + + bed_plots = BedPlots() + bed_files = BedFiles() + + with Session(self._sa_engine) as session: + bed_object = session.scalar(statement) + if not bed_object: + raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.") + + if full: + for result in bed_object.files: + # PLOTS + if result.name in BedPlots.model_fields: + setattr( + bed_plots, + result.name, + FileModel( + **result.__dict__, + object_id=f"bed.{identifier}.{result.name}", + access_methods=self._config.construct_access_method_list( + result.path + ), + ), + ) + # FILES + elif result.name in BedFiles.model_fields: + ( + setattr( + bed_files, + result.name, + FileModel( + **result.__dict__, + object_id=f"bed.{identifier}.{result.name}", + access_methods=self._config.construct_access_method_list( + result.path + ), + ), + ), + ) + + else: + _LOGGER.error( + f"Unknown file type: {result.name}. And is not in the model fields. Skipping.." + ) + bed_stats = BedStatsModel(**bed_object.stats.__dict__) + else: + bed_plots = None + bed_files = None + bed_stats = None + + try: + if full: + bed_metadata = BedPEPHub( + **self._config.phc.sample.get( + namespace=self._config.config.phc.namespace, + name=self._config.config.phc.name, + tag=self._config.config.phc.tag, + sample_name=identifier, + ) + ) + else: + bed_metadata = None + except Exception as e: + _LOGGER.warning(f"Could not retrieve metadata from pephub. 
Error: {e}") + bed_metadata = None + + return BedMetadata( + id=bed_object.id, + name=bed_object.name, + stats=bed_stats, + plots=bed_plots, + files=bed_files, + description=bed_object.description, + submission_date=bed_object.submission_date, + last_update_date=bed_object.last_update_date, + raw_metadata=bed_metadata, + genome_alias=bed_object.genome_alias, + genome_digest=bed_object.genome_digest, + bed_type=bed_object.bed_type, + bed_format=bed_object.bed_format, + full_response=full, + ) + + def get_stats(self, identifier: str) -> BedStatsModel: + """ + Get file statistics by identifier. + + :param identifier: bed file identifier + + :return: project statistics as BedStats object + """ + statement = select(BedStats).where(BedStats.id == identifier) + + with Session(self._sa_engine) as session: + bed_object = session.scalar(statement) + if not bed_object: + raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.") + bed_stats = BedStatsModel(**bed_object.__dict__) + + return bed_stats + + def get_plots(self, identifier: str) -> BedPlots: + """ + Get file plots by identifier. + + :param identifier: bed file identifier + :return: project plots + """ + statement = select(Bed).where(Bed.id == identifier) + + with Session(self._sa_engine) as session: + bed_object = session.scalar(statement) + if not bed_object: + raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.") + bed_plots = BedPlots() + for result in bed_object.files: + if result.name in BedPlots.model_fields: + setattr( + bed_plots, + result.name, + FileModel( + **result.__dict__, + object_id=f"bed.{identifier}.{result.name}", + access_methods=self._config.construct_access_method_list( + result.path + ), + ), + ) + return bed_plots + + def get_files(self, identifier: str) -> BedFiles: + """ + Get file files by identifier. 
+ + :param identifier: bed file identifier + :return: project files + """ + statement = select(Bed).where(Bed.id == identifier) + + with Session(self._sa_engine) as session: + bed_object = session.scalar(statement) + if not bed_object: + raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.") + bed_files = BedFiles() + for result in bed_object.files: + if result.name in BedFiles.model_fields: + setattr( + bed_files, + result.name, + FileModel( + **result.__dict__, + object_id=f"bed.{identifier}.{result.name}", + access_methods=self._config.construct_access_method_list( + result.path + ), + ), + ) + return bed_files + + def get_raw_metadata(self, identifier: str) -> BedPEPHub: + """ + Get file metadata by identifier. + + :param identifier: bed file identifier + :return: project metadata + """ + try: + bed_metadata = self._config.phc.sample.get( + namespace=self._config.config.phc.namespace, + name=self._config.config.phc.name, + tag=self._config.config.phc.tag, + sample_name=identifier, + ) + except Exception as e: + _LOGGER.warning(f"Could not retrieve metadata from pephub. Error: {e}") + bed_metadata = {} + return BedPEPHub(**bed_metadata) + + def get_classification(self, identifier: str) -> BedClassification: + """ + Get file classification by identifier. 
+ + :param identifier: bed file identifier + :return: project classification + """ + statement = select(Bed).where(Bed.id == identifier) + + with Session(self._sa_engine) as session: + bed_object = session.scalar(statement) + if not bed_object: + raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.") + bed_classification = BedClassification(**bed_object.__dict__) + + return bed_classification + + def get_objects(self, identifier: str) -> Dict[str, FileModel]: + """ + Get all object related to bedfile + + :param identifier: bed file identifier + :return: project objects dict + """ + statement = select(Bed).where(Bed.id == identifier) + return_dict = {} + + with Session(self._sa_engine) as session: + bed_object = session.scalar(statement) + if not bed_object: + raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.") + for result in bed_object.files: + return_dict[result.name] = FileModel(**result.__dict__) + + return return_dict + + def get_ids_list( + self, + limit: int = 100, + offset: int = 0, + genome: str = None, + bed_type: str = None, + full: bool = False, + ) -> BedListResult: + """ + Get list of bed file identifiers. + + :param limit: number of results to return + :param offset: offset to start from + :param genome: filter by genome + :param bed_type: filter by bed type. e.g. 'bed6+4' + :param full: if True, return full metadata, including statistics, files, and raw metadata from pephub + + :return: list of bed file identifiers + """ + # TODO: question: Return Annotation? 
+ statement = select(Bed.id) + + # TODO: make it generic, like in pephub + if genome: + statement = statement.where(Bed.genome_alias == genome) + + if bed_type: + statement = statement.where(Bed.bed_type == bed_type) + + statement = statement.limit(limit).offset(offset) + + with Session(self._sa_engine) as session: + bed_ids = session.execute(statement).all() + + return BedListResult( + count=len(bed_ids), + limit=limit, + offset=offset, + results=[self.get(result[0], full=full) for result in bed_ids], + ) + + def add( + self, + identifier: str, + stats: dict, + metadata: dict = None, + plots: dict = None, + files: dict = None, + classification: dict = None, + upload_qdrant: bool = False, + upload_pephub: bool = False, + upload_s3: bool = False, + local_path: str = None, + overwrite: bool = False, + nofail: bool = False, + ) -> None: + """ + Add bed file to the database. + + :param identifier: bed file identifier + :param stats: bed file results {statistics, plots, files, metadata} + :param metadata: bed file metadata (will be saved in pephub) + :param plots: bed file plots + :param files: bed file files + :param classification: bed file classification + :param upload_qdrant: add bed file to qdrant indexs + :param upload_pephub: add bed file to pephub + :param upload_s3: upload files to s3 + :param local_path: local path to the output files + :param overwrite: overwrite bed file if it already exists + :param nofail: do not raise an error for error in pephub/s3/qdrant or record exsist and not overwrite + :return: None + """ + _LOGGER.info(f"Adding bed file to database. bed_id: {identifier}") + + if self.exists(identifier): + _LOGGER.warning(f"Bed file with id: {identifier} exists in the database.") + if not overwrite: + if not nofail: + raise BedFIleExistsError( + f"Bed file with id: {identifier} already exists in the database." + ) + _LOGGER.warning("Overwrite set to False. 
Skipping..") + return None + else: + self.delete(identifier) + + stats = BedStatsModel(**stats) + # TODO: we should not check for specific keys, of the plots! + plots = BedPlots(**plots) + files = BedFiles(**files) + + classification = BedClassification(**classification) + if upload_pephub: + metadata = BedPEPHub(**metadata) + try: + self.upload_pephub(identifier, metadata.model_dump(), overwrite) + except Exception as e: + _LOGGER.warning( + f"Could not upload to pephub. Error: {e}. nofail: {nofail}" + ) + if not nofail: + raise e + else: + _LOGGER.info("upload_pephub set to false. Skipping pephub..") + + if upload_qdrant: + self.upload_file_qdrant( + identifier, files.bed_file.path, {"bed_id": identifier} + ) + _LOGGER.info(f"File uploaded to qdrant. {identifier}") + else: + _LOGGER.info("upload_qdrant set to false. Skipping qdrant..") + + # Upload files to s3 + if upload_s3: + if files: + files = self._config.upload_files_s3( + identifier, files=files, base_path=local_path, type="files" + ) + + if plots: + plots = self._config.upload_files_s3( + identifier, files=plots, base_path=local_path, type="plots" + ) + with Session(self._sa_engine) as session: + new_bed = Bed( + id=identifier, + **classification.model_dump(), + indexed=upload_qdrant, + pephub=upload_pephub, + ) + session.add(new_bed) + if upload_s3: + for k, v in files: + if v: + new_file = Files( + **v.model_dump( + exclude_none=True, + exclude_unset=True, + exclude={"object_id", "access_methods"}, + ), + bedfile_id=identifier, + type="file", + ) + session.add(new_file) + for k, v in plots: + if v: + new_plot = Files( + **v.model_dump( + exclude_none=True, + exclude_unset=True, + exclude={"object_id", "access_methods"}, + ), + bedfile_id=identifier, + type="plot", + ) + session.add(new_plot) + + new_bedstat = BedStats(**stats.model_dump(), id=identifier) + session.add(new_bedstat) + + session.commit() + + return None + + def update( + self, + identifier: str, + stats: dict, + metadata: dict = None, + 
plots: dict = None, + files: dict = None, + classification: dict = None, + add_to_qdrant: bool = False, + upload_pephub: bool = False, + upload_s3: bool = False, + local_path: str = None, + overwrite: bool = False, + nofail: bool = False, + ): + """ + Update bed file to the database. + + :param identifier: bed file identifier + :param stats: bed file results {statistics, plots, files, metadata} + :param metadata: bed file metadata (will be saved in pephub) + :param plots: bed file plots + :param files: bed file files + :param classification: bed file classification + :param add_to_qdrant: add bed file to qdrant indexs + :param upload_pephub: add bed file to pephub + :param upload_s3: upload files to s3 + :param local_path: local path to the output files + :param overwrite: overwrite bed file if it already exists + :param nofail: do not raise an error for error in pephub/s3/qdrant or record exsist and not overwrite + :return: None + """ + if not self.exists(identifier): + raise BEDFileNotFoundError( + f"Bed file with id: {identifier} not found. Cannot update." + ) + + stats = BedStatsModel(**stats) + plots = BedPlots(**plots) + files = BedFiles(**files) + classification = BedClassification(**classification) + + if upload_pephub: + metadata = BedPEPHub(**metadata) + try: + self.update_pephub(identifier, metadata.model_dump(), overwrite) + except Exception as e: + _LOGGER.warning( + f"Could not upload to pephub. Error: {e}. nofail: {nofail}" + ) + if not nofail: + raise e + else: + _LOGGER.info("upload_pephub set to false. 
Skipping pephub..") + + if add_to_qdrant: + self.upload_file_qdrant( + identifier, files.bed_file.path, {"bed_id": identifier} + ) + + statement = select(Bed).where(Bed.id == identifier) + + if upload_s3: + _LOGGER.warning("S3 upload is not implemented yet") + # if files: + # files = self._config.upload_files_s3( + # identifier, files=files, base_path=local_path, type="files" + # ) + # + # if plots: + # plots = self._config.upload_files_s3( + # identifier, files=plots, base_path=local_path, type="plots" + # ) + + with Session(self._sa_engine) as session: + bed_object = session.scalar(statement) + + setattr(bed_object, **stats.model_dump()) + setattr(bed_object, **classification.model_dump()) + + bed_object.indexed = add_to_qdrant + bed_object.pephub = upload_pephub + + if upload_s3: + _LOGGER.warning("S3 upload is not implemented yet") + # for k, v in files: + # if v: + # new_file = Files( + # **v.model_dump(exclude_none=True, exclude_unset=True), + # bedfile_id=identifier, + # type="file", + # ) + # session.add(new_file) + # for k, v in plots: + # if v: + # new_plot = Files( + # **v.model_dump(exclude_none=True, exclude_unset=True), + # bedfile_id=identifier, + # type="plot", + # ) + # session.add(new_plot) + + session.commit() + + raise NotImplementedError + + def delete(self, identifier: str) -> None: + """ + Delete bed file from the database. + + :param identifier: bed file identifier + :return: None + """ + _LOGGER.info(f"Deleting bed file from database. 
bed_id: {identifier}") + if not self.exists(identifier): + raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.") + + with Session(self._sa_engine) as session: + statement = select(Bed).where(Bed.id == identifier) + bed_object = session.scalar(statement) + + files = [FileModel(**k.__dict__) for k in bed_object.files] + delete_pephub = bed_object.pephub + delete_qdrant = bed_object.indexed + + session.delete(bed_object) + session.commit() + + if delete_pephub: + self.delete_pephub_sample(identifier) + if delete_qdrant: + self.delete_qdrant_point(identifier) + self._config.delete_files_s3(files) + + def upload_pephub(self, identifier: str, metadata: dict, overwrite: bool = False): + if not metadata: + _LOGGER.warning("No metadata provided. Skipping pephub upload..") + return False + self._config.phc.sample.create( + namespace=self._config.config.phc.namespace, + name=self._config.config.phc.name, + tag=self._config.config.phc.tag, + sample_name=identifier, + sample_dict=metadata, + overwrite=overwrite, + ) + + def update_pephub(self, identifier: str, metadata: dict, overwrite: bool = False): + if not metadata: + _LOGGER.warning("No metadata provided. Skipping pephub upload..") + return False + self._config.phc.sample.update( + namespace=self._config.config.phc.namespace, + name=self._config.config.phc.name, + tag=self._config.config.phc.tag, + sample_name=identifier, + sample_dict=metadata, + ) + + def delete_pephub_sample(self, identifier: str): + """ + Delete sample from pephub + + :param identifier: bed file identifier + """ + try: + self._config.phc.sample.remove( + namespace=self._config.config.phc.namespace, + name=self._config.config.phc.name, + tag=self._config.config.phc.tag, + sample_name=identifier, + ) + except ResponseError as e: + _LOGGER.warning(f"Could not delete from pephub. 
Error: {e}") + + def upload_file_qdrant( + self, + bed_id: str, + bed_file: Union[str, RegionSet], + payload: dict = None, + ) -> None: + """ + Convert bed file to vector and add it to qdrant database + + !Warning: only hg38 genome can be added to qdrant! + + :param bed_id: bed file id + :param bed_file: path to the bed file, or RegionSet object + :param payload: additional metadata to store alongside vectors + :return: None + """ + + _LOGGER.info(f"Adding bed file to qdrant. bed_id: {bed_id}") + if isinstance(bed_file, str): + bed_region_set = RegionSet(bed_file) + elif isinstance(bed_file, RegionSet): + bed_region_set = bed_file + else: + raise BedBaseConfError( + "Could not add add region to qdrant. Invalid type, or path. " + ) + bed_embedding = np.mean(self._config.r2v.encode(bed_region_set), axis=0) + + # Upload bed file vector to the database + vec_dim = bed_embedding.shape[0] + self._qdrant_engine.load( + ids=[bed_id], + vectors=bed_embedding.reshape(1, vec_dim), + payloads=[{**payload}], + ) + return None + + def text_to_bed_search( + self, query: str, limit: int = 10, offset: int = 0 + ) -> BedListSearchResult: + """ + Search for bed files by text query in qdrant database + + :param query: text query + :param limit: number of results to return + :param offset: offset to start from + + :return: list of bed file metadata + """ + _LOGGER.info(f"Looking for: {query}") + _LOGGER.info(f"Using backend: {self._config.t2bsi}") + + results = self._config.t2bsi.query_search(query, limit=limit, offset=offset) + results_list = [] + for result in results: + result_id = result["id"].replace("-", "") + try: + result_meta = self.get(result_id) + except BEDFileNotFoundError as e: + _LOGGER.warning( + f"Could not retrieve metadata for bed file: {result_id}. 
Error: {e}" + ) + continue + if result_meta: + results_list.append(QdrantSearchResult(**result, metadata=result_meta)) + return BedListSearchResult( + count=len(results), limit=limit, offset=offset, results=results_list + ) + + def bed_to_bed_search( + self, + region_set: RegionSet, + limit: int = 10, + offset: int = 0, + ) -> BedListSearchResult: + results = self._config.b2bsi.query_search( + region_set, limit=limit, offset=offset + ) + results_list = [] + for result in results: + result_id = result["id"].replace("-", "") + try: + result_meta = self.get(result_id) + except BEDFileNotFoundError as e: + _LOGGER.warning( + f"Could not retrieve metadata for bed file: {result_id}. Error: {e}" + ) + continue + if result_meta: + results_list.append(QdrantSearchResult(**result, metadata=result_meta)) + return BedListSearchResult( + count=len(results_list), + limit=limit, + offset=offset, + results=results_list, + ) + + def reindex_qdrant(self) -> None: + """ + Re-upload all files to quadrant. + !Warning: only hg38 genome can be added to qdrant! + + If you want want to fully reindex/reupload to qdrant, first delete collection and create new one. + + Upload all files to qdrant. + """ + bb_client = BBClient() + + statement = select(Bed.id).where(Bed.genome_alias == QDRANT_GENOME) + + with Session(self._db_engine.engine) as session: + bed_ids = session.execute(statement).all() + + bed_ids = [bed_result[0] for bed_result in bed_ids] + + for record_id in bed_ids: + bed_region_set_obj = bb_client.load_bed(record_id) + + self.upload_file_qdrant( + bed_id=record_id, + bed_file=bed_region_set_obj, + payload={"bed_id": record_id}, + ) + + return None + + def delete_qdrant_point(self, identifier: str) -> None: + """ + Delete bed file from qdrant. 
+ + :param identifier: bed file identifier + :return: None + """ + + result = self._config.qdrant_engine.qd_client.delete( + collection_name=self._config.config.qdrant.collection, + points_selector=PointIdsList( + points=[identifier], + ), + ) + return result + + def create_qdrant_collection(self) -> bool: + """ + Create qdrant collection for bed files. + """ + return self._config.qdrant_engine.qd_client.create_collection( + collection_name=self._config.config.qdrant.collection, + vectors_config=VectorParams(size=100, distance=Distance.DOT), + ) + + def exists(self, identifier: str) -> bool: + """ + Check if bed file exists in the database. + + :param identifier: bed file identifier + :return: True if bed file exists, False otherwise + """ + statement = select(Bed).where(Bed.id == identifier) + + with Session(self._sa_engine) as session: + bed_object = session.scalar(statement) + if not bed_object: + return False + return True diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py new file mode 100644 index 00000000..ffa80aad --- /dev/null +++ b/bbconf/modules/bedsets.py @@ -0,0 +1,447 @@ +import logging + +# TODO: will be available in the next geniml release +# from geniml.io.utils import compute_md5sum_bedset +from hashlib import md5 +from typing import Dict, List + +from sqlalchemy import Float, Numeric, func, or_, select +from sqlalchemy.orm import Session + +from bbconf.config_parser import BedBaseConfig +from bbconf.const import PKG_NAME +from bbconf.db_utils import BedFileBedSetRelation, BedSets, BedStats, Files +from bbconf.exceptions import BEDFileNotFoundError, BedSetNotFoundError +from bbconf.models.bed_models import BedStatsModel +from bbconf.models.bedset_models import ( + BedSetBedFiles, + BedSetListResult, + BedSetMetadata, + BedSetPlots, + BedSetStats, + FileModel, +) +from bbconf.modules.bedfiles import BedAgentBedFile + +_LOGGER = logging.getLogger(PKG_NAME) + + +class BedAgentBedSet: + """ + Class that represents Bedset in Database. 
+ + This class has method to add, delete, get files and metadata from the database. + """ + + def __init__(self, config: BedBaseConfig): + """ + :param config: config object + """ + self.config = config + self._db_engine = self.config.db_engine + + def get(self, identifier: str, full: bool = False) -> BedSetMetadata: + """ + Get file metadata by identifier. + + :param identifier: bed file identifier + :param full: return full record with stats, plots, files and metadata + :return: project metadata + """ + + statement = select(BedSets).where(BedSets.id == identifier) + + with Session(self._db_engine.engine) as session: + bedset_obj = session.scalar(statement) + if not bedset_obj: + raise BedSetNotFoundError(identifier) + list_of_bedfiles = [ + bedset_obj.bedfile_id for bedset_obj in bedset_obj.bedfiles + ] + if full: + plots = BedSetPlots() + for plot in bedset_obj.files: + setattr(plots, plot.name, FileModel(**plot.__dict__)) + + stats = BedSetStats( + mean=BedStatsModel(**bedset_obj.bedset_means), + sd=BedStatsModel(**bedset_obj.bedset_standard_deviation), + ).model_dump() + else: + plots = None + stats = None + + bedset_metadata = BedSetMetadata( + id=bedset_obj.id, + name=bedset_obj.name, + description=bedset_obj.description, + md5sum=bedset_obj.md5sum, + statistics=stats, + plots=plots, + bed_ids=list_of_bedfiles, + ) + + return bedset_metadata + + def get_plots(self, identifier: str) -> BedSetPlots: + """ + Get plots for bedset by identifier. 
+ + :param identifier: bedset identifier + :return: bedset plots + """ + statement = select(BedSets).where(BedSets.id == identifier) + + with Session(self._db_engine.engine) as session: + bedset_object = session.scalar(statement) + if not bedset_object: + raise BedSetNotFoundError(f"Bed file with id: {identifier} not found.") + bedset_files = BedSetPlots() + for result in bedset_object.files: + if result.name in bedset_files.model_fields: + setattr( + bedset_files, + result.name, + FileModel( + **result.__dict__, + object_id=f"bed.{identifier}.{result.name}", + access_methods=self.config.construct_access_method_list( + result.path + ), + ), + ) + return bedset_files + + def get_objects(self, identifier: str) -> Dict[str, FileModel]: + """ + Get objects for bedset by identifier. + + :param identifier: bedset identifier + :return: bedset objects + """ + statement = select(BedSets).where(BedSets.id == identifier) + return_dict = {} + + with Session(self._db_engine.engine) as session: + bedset_object = session.scalar(statement) + if not bedset_object: + raise BedSetNotFoundError(f"Bedset with id: {identifier} not found.") + for result in bedset_object.files: + return_dict[result.name] = FileModel( + **result.__dict__, + object_id=f"bed.{identifier}.{result.name}", + access_methods=self.config.construct_access_method_list( + result.path + ), + ) + + return return_dict + + def get_statistics(self, identifier: str) -> BedSetStats: + """ + Get statistics for bedset by identifier. 
+ + :param identifier: bedset identifier + :return: bedset statistics + """ + statement = select(BedSets).where(BedSets.id == identifier) + with Session(self._db_engine.engine) as session: + bedset_object = session.scalar(statement) + if not bedset_object: + raise BedSetNotFoundError(f"Bedset with id: {identifier} not found.") + return BedSetStats( + mean=BedStatsModel(**bedset_object.bedset_means), + sd=BedStatsModel(**bedset_object.bedset_standard_deviation), + ) + + def create( + self, + identifier: str, + name: str, + bedid_list: List[str], + description: str = None, + statistics: bool = False, + plots: dict = None, + upload_pephub: bool = False, + upload_s3: bool = False, + local_path: str = "", + no_fail: bool = False, + ) -> None: + """ + Create bedset in the database. + + :param identifier: bedset identifier + :param name: bedset name + :param description: bedset description + :param bedid_list: list of bed file identifiers + :param statistics: calculate statistics for bedset + :param plots: dictionary with plots + :param upload_pephub: upload bedset to pephub (create view in pephub) + :param upload_s3: upload bedset to s3 + :param local_path: local path to the output files + :param no_fail: do not raise an error if bedset already exists + :return: None + """ + _LOGGER.info(f"Creating bedset '{identifier}'") + + if statistics: + stats = self._calculate_statistics(bedid_list) + else: + stats = None + + if upload_pephub: + try: + self._create_pephub_view(identifier, description, bedid_list, no_fail) + except Exception as e: + _LOGGER.error(f"Failed to create view in pephub: {e}") + if not no_fail: + raise e + + new_bedset = BedSets( + id=identifier, + name=name, + description=description, + bedset_means=stats.mean.model_dump() if stats else None, + bedset_standard_deviation=stats.sd.model_dump() if stats else None, + # md5sum=compute_md5sum_bedset(bedid_list), + md5sum=md5("".join(bedid_list).encode()).hexdigest(), + ) + + if upload_s3: + plots = 
BedSetPlots(**plots) + plots = self.config.upload_files_s3( + identifier, files=plots, base_path=local_path, type="bedsets" + ) + + with Session(self._db_engine.engine) as session: + session.add(new_bedset) + + for bedfile in bedid_list: + session.add( + BedFileBedSetRelation(bedset_id=identifier, bedfile_id=bedfile) + ) + if upload_s3: + for k, v in plots: + if v: + new_file = Files( + **v.model_dump(exclude_none=True, exclude_unset=True), + bedset_id=identifier, + type="plot", + ) + session.add(new_file) + + session.commit() + + _LOGGER.info(f"Bedset '{identifier}' was created successfully") + return None + + def _calculate_statistics(self, bed_ids: List[str]) -> BedSetStats: + """ + Calculate statistics for bedset. + + :param bed_ids: list of bed file identifiers + :return: statistics + """ + + _LOGGER.info("Calculating bedset statistics") + numeric_columns = BedStatsModel.model_fields + + bedset_sd = {} + bedset_mean = {} + with Session(self._db_engine.engine) as session: + for column_name in numeric_columns: + mean_bedset_statement = select( + func.round( + func.avg(getattr(BedStats, column_name)).cast(Numeric), 4 + ).cast(Float) + ).where(BedStats.id.in_(bed_ids)) + + sd_bedset_statement = select( + func.round( + func.stddev(getattr(BedStats, column_name)).cast(Numeric), + 4, + ).cast(Float) + ).where(BedStats.id.in_(bed_ids)) + + bedset_sd[column_name] = session.execute(mean_bedset_statement).one()[0] + bedset_mean[column_name] = session.execute(sd_bedset_statement).one()[0] + + bedset_stats = BedSetStats( + mean=bedset_mean, + sd=bedset_sd, + ) + + _LOGGER.info("Bedset statistics were calculated successfully") + return bedset_stats + + def _create_pephub_view( + self, + bedset_id: str, + description: str = None, + bed_ids: list = None, + nofail: bool = False, + ) -> None: + """ + Create view in pephub for bedset. 
+ + :param bedset_id: bedset identifier + :param description: bedset description + :param bed_ids: list of bed file identifiers + :param nofail: do not raise an error if sample not found + + :return: None + """ + + _LOGGER.info(f"Creating view in pephub for bedset '{bedset_id}'") + try: + self.config.phc.view.create( + namespace=self.config.config.phc.namespace, + name=self.config.config.phc.name, + tag=self.config.config.phc.tag, + view_name=bedset_id, + # description=description, + sample_list=bed_ids, + ) + except Exception as e: + _LOGGER.error(f"Failed to create view in pephub: {e}") + if not nofail: + raise e + return None + + def get_ids_list( + self, query: str = None, limit: int = 10, offset: int = 0 + ) -> BedSetListResult: + """ + Get list of bedsets from the database. + + :param query: search query + :param limit: limit of results + :param offset: offset of results + :return: list of bedsets + """ + statement = select(BedSets.id) + if query: + sql_search_str = f"%{query}%" + statement = statement.where( + or_( + BedSets.name.ilike(sql_search_str), + BedSets.description.ilike(sql_search_str), + ) + ) + with Session(self._db_engine.engine) as session: + bedset_list = session.execute(statement.limit(limit).offset(offset)) + + result_list = [] + for bedset_id in bedset_list: + result_list.append(self.get(bedset_id[0])) + return BedSetListResult( + count=len(result_list), + limit=limit, + offset=offset, + results=result_list, + ) + + def get_bedset_bedfiles( + self, identifier: str, full: bool = False, limit: int = 100, offset: int = 0 + ) -> BedSetBedFiles: + """ + Get list of bedfiles in bedset. 
+ + :param identifier: bedset identifier + :param full: return full records with stats, plots, files and metadata + :param limit: limit of results + :param offset: offset of results + + :return: list of bedfiles + """ + bed_object = BedAgentBedFile(self.config) + + statement = ( + select(BedFileBedSetRelation) + .where(BedFileBedSetRelation.bedset_id == identifier) + .limit(limit) + .offset(offset) + ) + + with Session(self._db_engine.engine) as session: + bedfiles = session.execute(statement).all() + results = [] + for bedfile in bedfiles: + try: + results.append(bed_object.get(bedfile[0].bedfile_id, full=full)) + except BEDFileNotFoundError as _: + _LOGGER.error(f"Bedfile {bedfile[0].bedfile_id} not found") + + return BedSetBedFiles( + count=len(results), + limit=limit, + offset=offset, + results=results, + ) + + def delete(self, identifier: str) -> None: + """ + Delete bed file from the database. + + :param identifier: bedset identifier + :return: None + """ + if not self.exists(identifier): + raise BedSetNotFoundError(identifier) + + _LOGGER.info(f"Deleting bedset '{identifier}'") + + with Session(self._db_engine.engine) as session: + statement = select(BedSets).where(BedSets.id == identifier) + + bedset_obj = session.scalar(statement) + files = [FileModel(**k.__dict__) for k in bedset_obj.files] + + session.delete(bedset_obj) + session.commit() + + self.delete_phc_view(identifier, nofail=True) + if files: + self.config.delete_files_s3(files) + + def delete_phc_view(self, identifier: str, nofail: bool = False) -> None: + """ + Delete view in pephub. 
+ + :param identifier: bedset identifier + :param nofail: do not raise an error if view not found + :return: None + """ + _LOGGER.info(f"Deleting view in pephub for bedset '{identifier}'") + try: + self.config.phc.view.delete( + namespace=self.config.config.phc.namespace, + name=self.config.config.phc.name, + tag=self.config.config.phc.tag, + view_name=identifier, + ) + except Exception as e: + _LOGGER.error(f"Failed to delete view in pephub: {e}") + if not nofail: + raise e + return None + + def exists(self, identifier: str) -> bool: + """ + Check if bedset exists in the database. + + :param identifier: bedset identifier + :return: True if bedset exists, False otherwise + """ + statement = select(BedSets).where(BedSets.id == identifier) + with Session(self._db_engine.engine) as session: + result = session.execute(statement).one_or_none() + if result: + return True + return False + + def add_bedfile(self, identifier: str, bedfile: str) -> None: + raise NotImplementedError + + def delete_bedfile(self, identifier: str, bedfile: str) -> None: + raise NotImplementedError diff --git a/bbconf/modules/objects.py b/bbconf/modules/objects.py new file mode 100644 index 00000000..19054cfe --- /dev/null +++ b/bbconf/modules/objects.py @@ -0,0 +1,183 @@ +import datetime +import logging +from typing import List, Literal, Union + +from bbconf.config_parser.bedbaseconfig import BedBaseConfig +from bbconf.const import PKG_NAME +from bbconf.exceptions import ( + BedBaseConfError, + MissingObjectError, + MissingThumbnailError, +) +from bbconf.models.bed_models import FileModel +from bbconf.models.drs_models import DRSModel +from bbconf.modules.bedfiles import BedAgentBedFile +from bbconf.modules.bedsets import BedAgentBedSet + +_LOGGER = logging.getLogger(PKG_NAME) + + +class BBObjects: + """ """ + + def __init__(self, config: BedBaseConfig): + """ + :param config: config object + """ + self.config = config + self.bed = BedAgentBedFile(self.config) + self.bedset = 
BedAgentBedSet(self.config) + + def get_thumbnail_uri( + self, + record_type: Literal["bed", "bedset"], + record_id: str, + result_id: str, + access_id: str = "http", + ) -> str: + """ + Create URL to access a bed- or bedset-associated thumbnail + + :param record_type: table_name ["bed", "bedset"] + :param record_id: record identifier + :param result_id: column name (result name) + :param access_id: access id (e.g. http, s3, etc.) + :return: string with thumbnail + """ + result = self._get_result(record_type, record_id, result_id) + if result.path_thumbnail: + return self.config.get_prefixed_uri(result.path_thumbnail, access_id) + + else: + _LOGGER.error( + f"Thumbnail for {record_type} {record_id} {result_id} is not defined." + ) + raise MissingThumbnailError( + f"Thumbnail for {record_type} {record_id} {result_id} is not defined." + ) + + def get_object_uri( + self, + record_type: Literal["bed", "bedset"], + record_id: str, + result_id: str, + access_id: str, + ) -> str: + """ + Create URL to access a bed- or bedset-associated file + + :param record_type: table_name ["bed", "bedset"] + :param record_id: record identifier + :param result_id: column name (result name) + :param access_id: access id (e.g. http, s3, etc.) + :return: + """ + result = self._get_result(record_type, record_id, result_id) + return self.config.get_prefixed_uri(result.path, access_id) + + def _get_result( + self, + record_type: Literal["bed", "bedset"], + record_id: str, + result_id: Union[str, List[str]], + ) -> FileModel: + """ + Generic getter that can return a result from either bed or bedset + + :param record_type: table_name ["bed", "bedset"] + :param record_id: record identifier + :param result_id: column name (result name). e.g. 
"bigbedfile", "bed_file", "open_chromatin" + :return: pipestat result + """ + if record_type == "bed": + try: + result = self.bed.get_objects(identifier=record_id)[result_id] + except KeyError: + _LOGGER.error(f"Result {result_id} is not defined for bed {record_id}") + raise MissingObjectError( + f"Result {result_id} is not defined for bed {record_id}" + ) + elif record_type == "bedset": + try: + result = self.bedset.get_objects(identifier=record_id)[result_id] + _LOGGER.error(f"Result {result_id} is not defined for bed {record_id}") + except KeyError: + raise MissingObjectError( + f"Result {result_id} is not defined for bed {record_id}" + ) + + else: + raise BedBaseConfError( + f"Record type {record_type} is not supported. Only bed and bedset are supported." + ) + + _LOGGER.info(f"Getting uri for {record_type} {record_id} {result_id}") + _LOGGER.info(f"Result: {result}") + return result + + def get_drs_metadata( + self, + record_type: Literal["bed", "bedset"], + record_id: str, + result_id: str, + base_uri: str, + ) -> DRSModel: + """ + Get DRS metadata for a bed- or bedset-associated file + + :param record_type: bed or bedset + :param record_id: record identifier + :param result_id: name of the result file to get metadata for + :param base_uri: base uri to use for the self_uri field (server hostname of DRS broker) + :return: DRS metadata + """ + + object_id = f"{record_type}.{record_id}.{result_id}" + bed_result = self.bed.get(record_id) + created_time = bed_result.submission_date + modified_time = bed_result.last_update_date + record_metadata = self._get_result( + record_type, record_id, result_id + ) # only get result once + if not record_metadata: + raise MissingObjectError("Record not found") + + drs_dict = self.construct_drs_metadata( + base_uri, + object_id, + record_metadata, + created_time, + modified_time, + ) + + return drs_dict + + def construct_drs_metadata( + self, + base_uri: str, + object_id: str, + record_metadata: FileModel, + created_time: 
datetime.datetime = None, + modified_time: datetime.datetime = None, + ): + """ + Construct DRS metadata object + + :param base_uri: base uri to use for the self_uri field (server hostname of DRS broker) + :param object_id: record identifier + :param record_metadata: metadata of the record + :param created_time: time of creation + :param modified_time: time of last modification + :return: DRS metadata + """ + access_methods = self.config.construct_access_method_list(record_metadata.path) + drs_dict = DRSModel( + id=object_id, + self_uri=f"drs://{base_uri}/{object_id}", + size=record_metadata.size or None, + created_time=created_time, + updated_time=modified_time, + checksums=object_id, + access_methods=access_methods, + ) + return drs_dict diff --git a/bbconf/schemas/bedfiles_schema.yaml b/bbconf/schemas/bedfiles_schema.yaml deleted file mode 100644 index 650c3b33..00000000 --- a/bbconf/schemas/bedfiles_schema.yaml +++ /dev/null @@ -1,152 +0,0 @@ -title: BED files schema -description: An output schema for bedfile results. -type: object -properties: - pipeline_name: bedfile - samples: - type: object - properties: - name: - type: string - description: BED file name - genome: - type: object - description: genome assembly of the BED files - bed_type: - type: string - description: type of BED file, eg. 
"bed3", "bed6+4" - bed_format: - type: string - description: format type of BED file, narrowpeak, broadpeak, or bed - bedfile: - $ref: "#/$defs/file" - label: bedfile - description: BED file - bigbedfile: - $ref: "#/$defs/file" - label: bigBed - description: bigBed file - regions_no: - type: integer - description: Number of regions - gc_content: - type: number - description: GC content - median_tss_dist: - type: number - description: Median TSS distance - mean_region_width: - type: number - description: Mean region width - exon_frequency: - type: number - description: Exon frequency - intron_frequency: - type: number - description: Intron frequency - promoterprox_frequency: - type: number - description: Promoter frequency - intergenic_frequency: - type: number - description: Intergenic frequency - promotercore_frequency: - type: number - description: Promoter core frequency - fiveutr_frequency: - type: number - description: 5' UTR frequency - threeutr_frequency: - type: number - description: 3' UTR frequency - fiveutr_percentage: - type: number - description: 5' UTR percentage - threeutr_percentage: - type: number - description: 3' UTR percentage - promoterprox_percentage: - type: number - description: Promoter proc percentage - exon_percentage: - type: number - description: Exon percentage - intron_percentage: - type: number - description: Intron percentage - intergenic_percentage: - type: number - description: Intergenic percentage - promotercore_percentage: - type: number - description: Promoter core percentage - tssdist: - $ref: "#/$defs/image" - label: TSS_distance - description: Region-TSS distance plot - chrombins: - $ref: "#/$defs/image" - label: Chromosome_bins - description: Regions distribution over chromosomes plot - gccontent: - $ref: "#/$defs/image" - label: GC_content - description: GC content plot - partitions: - $ref: "#/$defs/image" - label: Regions_dist_partitions - description: Regions distribution over genomic partitions plot - 
expected_partitions: - $ref: "#/$defs/image" - label: Expected_partitions - description: Expected distribution over genomic partitions plot - cumulative_partitions: - $ref: "#/$defs/image" - label: Cumulative_partitions - description: Cumulative distribution over genomic partitions plot - widths_histogram: - $ref: "#/$defs/image" - label: Widths_histogram - description: Quantile-trimmed histogram of widths - neighbor_distances: - $ref: "#/$defs/image" - label: Neighbor_distance - description: Distance between neighbor regions - open_chromatin: - $ref: "#/$defs/image" - label: Open_chromatin - description: Cell specific enrichment for open chromatin plot - other: - type: object - description: Other, not categorized BED file metadata - upload_status: - type: object - description: "Status of the upload to the databases: Dict[s3, qdrant, pephub]" - - -$defs: - image: - type: object - object_type: image - properties: - path: - type: string - thumbnail_path: - type: string - title: - type: string - required: - - path - - thumbnail_path - - title - file: - type: object - object_type: file - properties: - path: - type: string - title: - type: string - required: - - path - - title \ No newline at end of file diff --git a/bbconf/schemas/bedsets_schema.yaml b/bbconf/schemas/bedsets_schema.yaml deleted file mode 100644 index e492a0e7..00000000 --- a/bbconf/schemas/bedsets_schema.yaml +++ /dev/null @@ -1,54 +0,0 @@ -title: bedset_output_schema -description: An output schema for bedset results. 
-type: object -properties: - pipeline_name: bedsets - samples: - type: object - properties: - name: - type: string - description: BED set name - description: - type: string - description: description of the BED set - md5sum: - type: string - description: digest of the BED set - genome: - type: object - description: genome assembly of the BED sets - bedset_means: - type: object - description: Mean statistics of the BED files in this BED set - bedset_standard_deviation: - type: object - description: Standard deviations of statistics of the BED files in this BED set - region_commonality: - $ref: "#/$defs/image" - label: Region_commonality - description: Region commonality plot - processed: - type: boolean - label: BED_set_status - description: whether the BED set stats/plot are completed - pephub_path: - type: string - description: PEPhub registry path - - -$defs: - image: - type: object - object_type: image - properties: - path: - type: string - thumbnail_path: - type: string - title: - type: string - required: - - path - - thumbnail_path - - title \ No newline at end of file diff --git a/docs/bedbase_overview.svg b/docs/bedbase_overview.svg new file mode 100644 index 00000000..d775db29 --- /dev/null +++ b/docs/bedbase_overview.svg @@ -0,0 +1,3409 @@ + + + + + +BBclientBED2BED searchText2BED searchBedmakerBedstat.bed.bed.bedBedbuncherFastAPIBEDhostPEPhubBBconfOther toolsBBuploaderPEPhubgeofetchgeofetchGEO diff --git a/docs/changelog.md b/docs/changelog.md index d15891cd..bf97d730 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,17 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. 
+# [0.5.0] - 2024-04-08 +### Changed + +- Rebuild bbconf +- Introduced new DB schema +- Added bbagent that will be used to interact with the database +- Updated config schema +- Added new functionality to the bbagent +- New tests + + ## [0.4.2] - 2024-03-12 ### Change - Updated logger diff --git a/docs/schema.svg b/docs/schema.svg new file mode 100644 index 00000000..1b3b8e4b --- /dev/null +++ b/docs/schema.svg @@ -0,0 +1,755 @@ + + + + + + + + + + bed + bed + + - id : VARCHAR + - name : VARCHAR + - genome_alias : VARCHAR + - genome_digest : VARCHAR + - description : VARCHAR + - bed_type : VARCHAR + - bed_format : VARCHAR + - indexed : BOOLEAN + - pephub : BOOLEAN + - submission_date : TIMESTAMP + - last_update_date : TIMESTAMP + + UNIQUE (id) + INDEX (id) + + + + bed_stats + bed_stats + + - id : VARCHAR + - number_of_regions : FLOAT + - gc_content : FLOAT + - median_tss_dist : FLOAT + - mean_region_width : FLOAT + - exon_frequency : FLOAT + - intron_frequency : FLOAT + - promoterprox_frequency : FLOAT + - intergenic_frequency : FLOAT + - promotercore_frequency : FLOAT + - fiveutr_frequency : FLOAT + - threeutr_frequency : FLOAT + - fiveutr_percentage : FLOAT + - threeutr_percentage : FLOAT + - promoterprox_percentage : FLOAT + - exon_percentage : FLOAT + - intron_percentage : FLOAT + - intergenic_percentage : FLOAT + - promotercore_percentage : FLOAT + - tssdist : FLOAT + + UNIQUE (id) + INDEX (id) + + + + id + + id + + bedfile_bedset_relation + bedfile_bedset_relation + + - bedset_id : VARCHAR + - bedfile_id : VARCHAR + + UNIQUE (bedset_id, bedfile_id) + + + + id + + bedfile_id + + files + files + + - id : INTEGER + - name : VARCHAR + - title : VARCHAR + - type : VARCHAR + - path : VARCHAR + - path_thumbnail : VARCHAR + - description : VARCHAR + - size : INTEGER + - bedfile_id : VARCHAR + - bedset_id : VARCHAR + + UNIQUE (id) + INDEX (bedset_id) + INDEX (id) + INDEX (bedfile_id) + + + + files->bed + + + + + id + + bedfile_id + + + bedsets + bedsets + + - id : 
VARCHAR + - name : VARCHAR + - description : VARCHAR + - submission_date : TIMESTAMP + - last_update_date : TIMESTAMP + - md5sum : VARCHAR + - bedset_means : JSON + - bedset_standard_deviation : JSON + + UNIQUE (id) + INDEX (id) + + + + files->bedsets + + + + + id + + bedset_id + + + id + + bedset_id + + + + diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index b27670ad..af05779d 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,5 +1,12 @@ -yacman>=0.9.1 -pipestat[dbbackend]>=0.8.0 -geniml>=0.2.0 -psycopg>=3.1.15 -colorlogs \ No newline at end of file +yacman >= 0.9.1 +sqlalchemy >= 2.0.0 +geniml >= 0.3.0 +psycopg >= 3.1.15 +colorlogs +pydantic >= 2.6.4 +botocore >= 1.34.54 +boto3 >= 1.34.54 +pephubclient >= 0.4.1 +sqlalchemy_schemadisplay +# quick fix for gesnim +scipy <= 1.11.0 \ No newline at end of file diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index 483af844..65d4db9f 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -1,5 +1,11 @@ -pytest-cov +black +ruff pytest -coverage -smokeshow python-dotenv +pytest-mock +flake8 +coveralls +pytest-cov +pre-commit +coverage +smokeshow \ No newline at end of file diff --git a/tests/README.md b/tests/README.md index a619e52d..ee346086 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,11 +1,13 @@ # How to setup the test environment -### To create a test database for testing : +# before running the tests, make sure you have the following installed: + +### Create database before running the tests: ``` -docker run --rm -it --name pipestat-test \ +docker run --rm -it --name bedbase-test \ -e POSTGRES_USER=postgres \ - -e POSTGRES_PASSWORD=dockerpassword \ - -e POSTGRES_DB=pipestat-test \ + -e POSTGRES_PASSWORD=docker\ + -e POSTGRES_DB=bedbase \ -p 5432:5432 postgres ``` \ No newline at end of file diff --git a/tests/config_test.yaml b/tests/config_test.yaml new 
file mode 100644 index 00000000..6373dba5 --- /dev/null +++ b/tests/config_test.yaml @@ -0,0 +1,36 @@ +path: + region2vec: databio/r2v-encode-hg38 + vec2vec: databio/v2v-geo-hg38 +database: + host: localhost + port: 5432 + password: docker + user: postgres + database: bedbase +server: + host: 0.0.0.0 + port: 8000 +qdrant: + host: localhost + port: 6333 + collection: "bedbase" +s3: + bucket: bedbase + endpoint_url: "None" +phc: + namespace: bedbase + name: bedbase + tag: test +access_methods: + http: + type: "https" + description: HTTP compatible path + prefix: https://data2.bedbase.org/ + s3: + type: "s3" + description: S3 compatible path + prefix: s3://data2.bedbase.org/ + local: + type: "https" + description: How to serve local files. + prefix: /static \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 57a718ae..1ef4a853 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,37 +1,112 @@ -""" Test suite shared objects and setup """ - import os + import pytest +from bbconf.bbagent import BedBaseAgent -@pytest.fixture -def test_data_bed(): - s = "test_string" - return {"name": s, "bedfile": {"path": s, "title": s}, "regions_no": 1} +from .utils import BED_TEST_ID +TESTS_DIR = os.path.dirname(os.path.abspath(__file__)) -@pytest.fixture -def test_data_bedset(): - s = "test_string" +CONFIG_PATH = os.path.join( + TESTS_DIR, + "config_test.yaml", +) +DATA_PATH = os.path.join( + TESTS_DIR, + "data", +) + + +def get_bbagent(): + return BedBaseAgent(config=CONFIG_PATH) + + +@pytest.fixture(scope="function") +def bbagent_obj(): + yield BedBaseAgent(config=CONFIG_PATH) + + +@pytest.fixture() +def example_bedset_plot(): return { - "name": s, - "bedset_means": { - "exon_frequency": 271, - "exon_percentage": 0.081, - }, + "name": "chrombins", + "description": "Regions distribution over chromosomes", + "title": "Regions distribution over chromosomes", + "path": "data/plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.pdf", + 
"path_thumbnail": "data/plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.png", + "bedset_id": BED_TEST_ID, } -@pytest.fixture -def data_path(): - return os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") +@pytest.fixture() +def example_dict(): + plots = { + "chrombins": { + "name": "Regions distribution over chromosomes", + "path": "plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.pdf", + "path_thumbnail": "plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.png", + } + } + files = { + "bedfile": { + "name": "Bed file", + "path": os.path.join( + DATA_PATH, "files/bbad85f21962bb8d972444f7f9a3a932.bed.gz" + ), + "description": "Bed file with regions", + } + } + classification = { + "bed_format": "narrowpeak", + "bed_type": "bed6+4", + "genome_alias": "hg38", + "genome_digest": "2230c535660fb4774114bfa966a62f823fdb6d21acf138d4", + "name": "bbad85f21962bb8d972444f7f9a3a932", + } + + return dict( + identifier="bbad85f21962bb8d972444f7f9a3a932", + stats={ + "number_of_regions": 1, + "median_tss_dist": 2, + "mean_region_width": 3, + "exon_frequency": 4, + "exon_percentage": 5, + "intron_frequency": 6, + "intron_percentage": 7, + "intergenic_percentage": 8, + "intergenic_frequency": 9, + "promotercore_frequency": 10, + "promotercore_percentage": 11, + "fiveutr_frequency": 12, + "fiveutr_percentage": 13, + "threeutr_frequency": 14, + "threeutr_percentage": 15, + "promoterprox_frequency": 16, + "promoterprox_percentage": 17, + }, + metadata={"sample_name": "sample_name_1"}, + plots=plots, + files=files, + classification=classification, + upload_qdrant=False, + upload_pephub=False, + upload_s3=True, + local_path=DATA_PATH, + overwrite=False, + nofail=False, + ) @pytest.fixture -def cfg_pth(data_path): - return os.path.join(data_path, "config.yaml") +def load_test_data(): + get_bbagent().config.db_engine() -@pytest.fixture -def invalid_cfg_pth(data_path): - return os.path.join(data_path, "config_invalid.yaml") +@pytest.fixture() +def mocked_phc(mocker): + 
mocker.patch( + "pephubclient.modules.sample.PEPHubSample.get", + return_value={"sample_name": BED_TEST_ID, "other_metadata": "other_metadata_1"}, + ) diff --git a/tests/data/config.yaml b/tests/data/config.yaml deleted file mode 100644 index 459966fb..00000000 --- a/tests/data/config.yaml +++ /dev/null @@ -1,37 +0,0 @@ -database: - name: pipestat-test - user: postgres - password: dockerpassword - host: localhost - dialect: postgresql - driver: psycopg - port: 5432 -path: - pipeline_output_path: $BEDBASE_DATA_PATH/outputs - bedstat_dir: bedstat_output - bedbuncher_dir: bedbuncher_output - remote_url_base: null - region2vec: "add/path/here" -server: - host: 0.0.0.0 - port: 8000 -qdrant: - host: localhost - port: 6333 - api_key: None - collection: bedbase -remotes: - https: - prefix: https://data2.bedbase.org/ - description: HTTP compatible path - s3: - prefix: s3://data2.bedbase.org/ - description: S3 compatible path - -access_methods: - https: - server_url: https://data2.bedbase.org/ - description: HTTP compatible path - s3: - server_url: s3://data2.bedbase.org/ - description: S3 compatible path \ No newline at end of file diff --git a/tests/data/config_invalid.yaml b/tests/data/config_invalid.yaml deleted file mode 100644 index 4d953e3b..00000000 --- a/tests/data/config_invalid.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# full config example. Refer to bbconf/const.py for key names and default values - -path: - pipeline_output_path: $HOME/bedbase \ No newline at end of file diff --git a/tests/data/config_min.yaml b/tests/data/config_min.yaml deleted file mode 100644 index 50caa023..00000000 --- a/tests/data/config_min.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# min config example. 
Refer to bbconf/const.py for key names and default values - -path: - pipeline_output_path: $HOME/bedbase - bedstat_dir: bedstat_output - bedbuncher_dir: bedbuncher_output \ No newline at end of file diff --git a/tests/data/config_noport.yaml b/tests/data/config_noport.yaml deleted file mode 100644 index 0e6b463b..00000000 --- a/tests/data/config_noport.yaml +++ /dev/null @@ -1,16 +0,0 @@ -database: - name: pipestat-test - user: postgres - password: pipestat-password - host: localhost - dialect: postgresql - driver: psycopg - # port: 5432; intentionally commented out to test the defaults setting system -path: - pipeline_output_path: $BEDBASE_DATA_PATH/outputs - bedstat_dir: bedstat_output - bedbuncher_dir: bedbuncher_output - remote_url_base: null -server: - host: 0.0.0.0 - port: 8000 \ No newline at end of file diff --git a/tests/data/files/bbad85f21962bb8d972444f7f9a3a932.bed.gz b/tests/data/files/bbad85f21962bb8d972444f7f9a3a932.bed.gz new file mode 100644 index 00000000..8ab86ede Binary files /dev/null and b/tests/data/files/bbad85f21962bb8d972444f7f9a3a932.bed.gz differ diff --git a/tests/data/plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.pdf b/tests/data/plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.pdf new file mode 100644 index 00000000..aacf018f Binary files /dev/null and b/tests/data/plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.pdf differ diff --git a/tests/data/plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.png b/tests/data/plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.png new file mode 100644 index 00000000..2229cb83 Binary files /dev/null and b/tests/data/plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.png differ diff --git a/tests/test_bbconf.py b/tests/test_bbconf.py deleted file mode 100644 index 9550ccf8..00000000 --- a/tests/test_bbconf.py +++ /dev/null @@ -1,188 +0,0 @@ -""" Tests for BedBaseConf database features """ - -import pytest -from pipestat import PipestatManager -from sqlalchemy.exc import IntegrityError -from sqlmodel 
import SQLModel, create_engine -from sqlmodel.main import default_registry -import os -import warnings -import sqlalchemy - -from bbconf import BedBaseConf, get_bedbase_cfg -from bbconf.exceptions import MissingConfigDataError - - -DB_URL = "postgresql+psycopg://postgres:dockerpassword@127.0.0.1:5432/pipestat-test" -DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") -pytest_db_skip_reason = "Database is not set up... To run this test, set up the database. Go to test/README.md for more information." - - -def db_setup(): - # Check if the database is setup - try: - BedBaseConf(os.path.join(DATA_PATH, "config.yaml")) - except sqlalchemy.exc.OperationalError: - warnings.warn(UserWarning(f"{pytest_db_skip_reason}")) - return False - return True - - -class ContextManagerDBTesting: - """ - Creates context manager to connect to database at db_url and drop everything from the database upon exit to ensure - the db is empty for each new test. - """ - - def __init__(self, db_url): - self.db_url = db_url - - def __enter__(self): - self.engine = create_engine(self.db_url, echo=True) - self.connection = self.engine.connect() - return self.connection - - def __exit__(self, exc_type, exc_value, exc_traceback): - SQLModel.metadata.drop_all(self.engine) - default_registry.dispose() - self.connection.close() - - -@pytest.mark.skipif( - not db_setup(), - reason=pytest_db_skip_reason, -) -class TestAll: - def test_invalid_config(self, invalid_cfg_pth): - with ContextManagerDBTesting(DB_URL): - with pytest.raises(MissingConfigDataError): - BedBaseConf(get_bedbase_cfg(cfg=invalid_cfg_pth)) - - def test_tables_creation(self, cfg_pth): - # with ContextManagerDBTesting(DB_URL): - bbc = BedBaseConf(get_bedbase_cfg(cfg=cfg_pth)) - for table in ["bed", "bedset"]: - assert isinstance(getattr(bbc, table), PipestatManager) - - def test_data_insert(self, cfg_pth, test_data_bed, test_data_bedset): - with ContextManagerDBTesting(DB_URL): - bbc = 
BedBaseConf(get_bedbase_cfg(cfg=cfg_pth)) - # bedfiles table - ori_cnt = bbc.bed.record_count - bbc.bed.report(record_identifier="bed1", values=test_data_bed) - assert ori_cnt + 1 == bbc.bed.record_count - # bedsets table - ori_cnt = bbc.bedset.record_count - bbc.bedset.report(record_identifier="bedset1", values=test_data_bedset) - assert ori_cnt + 1 == bbc.bedset.record_count - - def test_nonunique_digest_insert_error( - self, cfg_pth, test_data_bed, test_data_bedset - ): - with ContextManagerDBTesting(DB_URL): - bbc = BedBaseConf(get_bedbase_cfg(cfg=cfg_pth)) - bbc.bed.report(record_identifier="bed1", values=test_data_bed) - assert not bbc.bed.report(record_identifier="bed1", values=test_data_bed) - bbc.bedset.report(record_identifier="bedset1", values=test_data_bedset) - assert not bbc.bedset.report( - record_identifier="bedset1", values=test_data_bedset - ) - - def test_reporting_relationships(self, cfg_pth, test_data_bed, test_data_bedset): - with ContextManagerDBTesting(DB_URL): - bbc = BedBaseConf(get_bedbase_cfg(cfg=cfg_pth)) - bbc.bed.report(record_identifier="bed1", values=test_data_bed) - bbc.bedset.report(record_identifier="bedset1", values=test_data_bedset) - bbc.report_relationship( - bedfile_record_id="bed1", bedset_record_id="bedset1" - ) - - # def test_cant_remove_record_if_in_reltable( - # self, cfg_pth, test_data_bed, test_data_bedset - # ): - # with ContextManagerDBTesting(DB_URL): - # bbc = BedBaseConf(get_bedbase_cfg(cfg=cfg_pth)) - # bbc.bed.report(record_identifier="bed1", values=test_data_bed) - # bbc.bedset.report(record_identifier="bedset1", values=test_data_bedset) - # bbc.report_relationship( - # bedfile_record_id="bed1", bedset_record_id="bedset1" - # ) - # with pytest.raises(IntegrityError): - # bbc.bed.remove(record_identifier="bed1") - # with pytest.raises(IntegrityError): - # bbc.bedset.remove(record_identifier="bedset1") - - def test_select(self, cfg_pth, test_data_bed, test_data_bedset): - with ContextManagerDBTesting(DB_URL): - 
bbc = BedBaseConf(get_bedbase_cfg(cfg=cfg_pth)) - bbc.bed.report(record_identifier="bed1", values=test_data_bed) - bbc.bedset.report(record_identifier="bedset1", values=test_data_bedset) - bbc.report_relationship( - bedfile_record_id="bed1", bedset_record_id="bedset1" - ) - - unique_bedfiles = bbc.select_unique(table_name="bedfile__sample") - assert unique_bedfiles[0]["record_identifier"] == "bed1" - unique_bedsets = bbc.select_unique(table_name="bedsets__sample") - assert unique_bedsets[0]["record_identifier"] == "bedset1" - - def test_removal(self, cfg_pth, test_data_bed, test_data_bedset): - with ContextManagerDBTesting(DB_URL): - bbc = BedBaseConf(get_bedbase_cfg(cfg=cfg_pth)) - bbc.bed.report(record_identifier="bed1", values=test_data_bed) - bbc.bedset.report(record_identifier="bedset1", values=test_data_bedset) - bbc.report_relationship( - bedfile_record_id="bed1", - bedset_record_id="bedset1", - ) - bbc.remove_relationship( - bedset_record_id="bedset1", bedfile_record_id=["bed1"] - ) - ori_cnt = bbc.bed.record_count - bbc.bed.remove(record_identifier="bed1") - assert ori_cnt - 1 == bbc.bed.record_count - ori_cnt = bbc.bedset.record_count - bbc.bedset.remove(record_identifier="bedset1") - assert ori_cnt - 1 == bbc.bedset.record_count - - def test_config_variables_are_set(self, cfg_pth, test_data_bed, test_data_bedset): - with ContextManagerDBTesting(DB_URL): - bbc = BedBaseConf(get_bedbase_cfg(cfg=cfg_pth)) - - print(bbc.config["qdrant"]["host"]) - assert bbc.config["qdrant"]["host"] == "localhost" - assert bbc.config["path"]["region2vec"] is not None - assert bbc.config["database"]["host"] in ["localhost", "127.0.0.1"] - - def test_select_bedfiles_from_bedset( - self, cfg_pth, test_data_bed, test_data_bedset - ): - with ContextManagerDBTesting(DB_URL): - bbc = BedBaseConf(get_bedbase_cfg(cfg=cfg_pth)) - bbc.bed.report(record_identifier="bed1", values=test_data_bed) - bbc.bedset.report(record_identifier="bedset1", values=test_data_bedset) - 
bbc.report_relationship( - bedfile_record_id="bed1", bedset_record_id="bedset1" - ) - result = bbc.select_bedfiles_from_bedset(bedset_record_id="bedset1") - assert result[0]["record_identifier"] == "bed1" - - def test_select_bedfiles_from_bedset_with_metadata( - self, cfg_pth, test_data_bed, test_data_bedset - ): - with ContextManagerDBTesting(DB_URL): - bbc = BedBaseConf(get_bedbase_cfg(cfg=cfg_pth)) - bbc.bed.report(record_identifier="bed1", values=test_data_bed) - bbc.bedset.report(record_identifier="bedset1", values=test_data_bedset) - bbc.report_relationship( - bedfile_record_id="bed1", bedset_record_id="bedset1" - ) - result = bbc.select_bedfiles_from_bedset( - bedset_record_id="bedset1", metadata=True - ) - assert result[0]["name"] == "test_string" - - @pytest.mark.skipif(True, reason="not implemented") - def test_get_bed_drs_metadata(self): - # TODO: add test - assert True diff --git a/tests/test_bedfile.py b/tests/test_bedfile.py new file mode 100644 index 00000000..b01b4175 --- /dev/null +++ b/tests/test_bedfile.py @@ -0,0 +1,237 @@ +import pytest +from sqlalchemy.orm import Session +from sqlalchemy.sql import select + +from bbconf.bbagent import BedBaseAgent +from bbconf.db_utils import Bed, Files +from bbconf.exceptions import BedFIleExistsError, BEDFileNotFoundError + +from .conftest import get_bbagent +from .utils import BED_TEST_ID, ContextManagerDBTesting + + +def test_bb_database(): + agent = get_bbagent() + assert isinstance(agent, BedBaseAgent) + + +class Test_BedFile_Agent: + def test_upload(self, bbagent_obj, example_dict, mocker): + upload_s3_mock = mocker.patch( + "bbconf.config_parser.bedbaseconfig.BedBaseConfig.upload_s3", + return_value=True, + ) + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=False): + bbagent_obj.bed.add(**example_dict) + + assert upload_s3_mock.called + assert bbagent_obj.bed.exists(example_dict["identifier"]) + + def test_upload_exists(self, bbagent_obj, example_dict, mocker): + mocker.patch( + 
"bbconf.config_parser.bedbaseconfig.BedBaseConfig.upload_s3", + return_value=True, + ) + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=False): + bbagent_obj.bed.add(**example_dict) + with pytest.raises(BedFIleExistsError): + bbagent_obj.bed.add(**example_dict) + + def test_add_nofail(self, bbagent_obj, example_dict, mocker): + mocker.patch( + "bbconf.config_parser.bedbaseconfig.BedBaseConfig.upload_s3", + return_value=True, + ) + + example_dict["nofail"] = True + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=False): + bbagent_obj.bed.add(**example_dict) + bbagent_obj.bed.add(**example_dict) + assert bbagent_obj.bed.exists(example_dict["identifier"]) + + def test_get_all(self, bbagent_obj, mocked_phc): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get(BED_TEST_ID, full=True) + assert return_result is not None + assert return_result.files is not None + assert return_result.plots is not None + assert return_result.raw_metadata is not None + + assert return_result.genome_alias == "hg38" + assert return_result.stats.number_of_regions == 1 + + assert return_result.files.bed_file is not None + assert return_result.plots.chrombins is not None + + def test_get_all_not_found(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get(BED_TEST_ID, full=False) + + assert return_result is not None + assert return_result.files is None + assert return_result.plots is None + assert return_result.raw_metadata is None + assert return_result.stats is None + + assert return_result.genome_alias == "hg38" + assert return_result.id == BED_TEST_ID + + def test_get_raw_metadata(self, bbagent_obj, mocked_phc): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_raw_metadata(BED_TEST_ID) + + assert return_result is not None + assert return_result.sample_name == 
BED_TEST_ID + + def test_get_stats(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_stats(BED_TEST_ID) + + assert return_result is not None + assert return_result.number_of_regions == 1 + + def test_get_files(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_files(BED_TEST_ID) + + assert return_result is not None + assert return_result.bed_file.path is not None + + def test_get_plots(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_plots(BED_TEST_ID) + + assert return_result is not None + assert return_result.chrombins is not None + + def test_get_objects(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_objects(BED_TEST_ID) + + assert "bed_file" in return_result + assert "chrombins" in return_result + + def test_get_classification(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_classification(BED_TEST_ID) + + assert return_result is not None + assert return_result.bed_type == "bed6+4" + + def test_get_list(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_ids_list(limit=100, offset=0) + + assert len(return_result.results) == 1 + assert return_result.count == 1 + assert return_result.results[0].id == BED_TEST_ID + assert return_result.limit == 100 + assert return_result.offset == 0 + + def test_get_list_genome_true(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_ids_list( + limit=100, offset=0, genome="hg38" + ) + + assert len(return_result.results) == 1 + assert return_result.count == 1 + assert 
return_result.results[0].id == BED_TEST_ID + assert return_result.limit == 100 + assert return_result.offset == 0 + + def test_get_list_genome_false(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_ids_list( + limit=100, offset=0, genome="hg381" + ) + + assert len(return_result.results) == 0 + assert return_result.count == 0 + + def test_get_list_bed_type_true(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_ids_list( + limit=100, offset=0, bed_type="bed6+4" + ) + + assert len(return_result.results) == 1 + assert return_result.count == 1 + assert return_result.results[0].id == BED_TEST_ID + assert return_result.limit == 100 + assert return_result.offset == 0 + + def test_get_list_bed_type_false(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_ids_list( + limit=100, offset=0, bed_type="bed6+5" + ) + + assert len(return_result.results) == 0 + assert return_result.count == 0 + + def test_get_list_bed_offset(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_ids_list( + limit=100, + offset=1, + ) + + assert len(return_result.results) == 0 + assert return_result.count == 0 + assert return_result.offset == 1 + + def test_bed_delete(self, bbagent_obj, mocker): + mocker.patch("bbconf.config_parser.bedbaseconfig.BedBaseConfig.delete_s3") + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + bbagent_obj.bed.delete(BED_TEST_ID) + + assert not bbagent_obj.bed.exists(BED_TEST_ID) + + with Session(bbagent_obj.config.db_engine.engine) as session: + result = session.scalar(select(Bed).where(Bed.id == BED_TEST_ID)) + assert result is None + + result = session.scalars(select(Files)) + assert len([k for k in result]) == 0 + + def 
test_bed_delete_not_found(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + with pytest.raises(BEDFileNotFoundError): + bbagent_obj.bed.delete("not_found") + + @pytest.mark.skip("Skipped, not fully implemented") + def test_bed_update(self): + # agent = BedBaseAgent(config=config) + # ff = agent.bed.update("91b2754c8ff01769bacfc80e6923c46e", {"number_of_regions": 44}) + # print(ff) + # assert ff != None + pass + + +@pytest.mark.skip("Skipped, because ML models and qdrant needed") +class TestVectorSearch: + def test_qdrant_search(self, bbagent_obj, mocker): + mocker.patch( + "geniml.text2bednn.text2bednn.Text2BEDSearchInterface.nl_vec_search", + return_value={ + "id": BED_TEST_ID, + "payload": {"bed_id": "39b686ec08206b92b540ed434266ec9b"}, + "score": 0.2146723, + }, + ) + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.text_to_bed_search("something") + assert return_result + + def test_delete_qdrant_point(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + bbagent_obj.bed.delete_qdrant_point(BED_TEST_ID) + + def test_create_qdrant_collection(self): + agent = BedBaseAgent( + config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml" + ) + ff = agent.bed.create_qdrant_collection() + ff + assert True diff --git a/tests/test_bedset.py b/tests/test_bedset.py new file mode 100644 index 00000000..492c5292 --- /dev/null +++ b/tests/test_bedset.py @@ -0,0 +1,217 @@ +import os + +import pytest +from sqlalchemy.orm import Session +from sqlalchemy.sql import select + +from bbconf.db_utils import BedSets +from bbconf.exceptions import BedbaseS3ConnectionError, BedSetNotFoundError + +from .conftest import DATA_PATH +from .utils import BED_TEST_ID, BEDSET_TEST_ID, ContextManagerDBTesting + + +class TestBedset: + def test_calculate_stats(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, 
add_data=True): + results = bbagent_obj.bedset._calculate_statistics([BED_TEST_ID]) + + assert results is not None + assert results.sd is not None + assert results.mean is not None + + def test_crate_bedset_all(self, bbagent_obj, mocker): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=False + ): + mocker.patch( + "bbconf.config_parser.bedbaseconfig.BedBaseConfig.upload_s3", + return_value=True, + ) + bbagent_obj.bedset.create( + "testinoo", + "test_name", + description="this is test description", + bedid_list=[ + BED_TEST_ID, + ], + plots={ + "region_commonality": { + "name": "region_commonality", + "description": "Regions distribution over chromosomes", + "title": "Regions distribution over chromosomes", + "path": os.path.join( + DATA_PATH, + "plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.pdf", + ), + "path_thumbnail": os.path.join( + DATA_PATH, + "/plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.png", + ), + }, + }, + statistics=True, + upload_s3=True, + upload_pephub=False, + no_fail=True, + ) + with Session(bbagent_obj.config.db_engine.engine) as session: + result = session.scalar(select(BedSets).where(BedSets.id == "testinoo")) + assert result is not None + assert result.name == "test_name" + assert len([k for k in result.files]) == 1 + + def test_get_metadata_full(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + result = bbagent_obj.bedset.get(BEDSET_TEST_ID, full=True) + + assert result.id == BEDSET_TEST_ID + assert result.md5sum == "bbad0000000000000000000000000000" + assert result.statistics.sd is not None + assert result.statistics.mean is not None + assert result.plots is not None + + def test_get_metadata_not_full(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + result = bbagent_obj.bedset.get(BEDSET_TEST_ID, full=False) + + assert result.id == BEDSET_TEST_ID + assert result.md5sum == 
"bbad0000000000000000000000000000" + assert result.statistics is None + assert result.plots is None + + def test_get_not_found(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + with pytest.raises(BedSetNotFoundError): + bbagent_obj.bedset.get("not_uid", full=True) + + def test_get_object(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + result = bbagent_obj.bedset.get_objects(BEDSET_TEST_ID) + + assert len(result) == 1 + + def test_get_plots(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + result = bbagent_obj.bedset.get_plots(BEDSET_TEST_ID) + + assert result is not None + + def test_get_stats(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + result = bbagent_obj.bedset.get_statistics(BEDSET_TEST_ID) + + assert result.sd is not None + assert result.mean is not None + + def test_get_bedset_list(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + result = bbagent_obj.bedset.get_ids_list(limit=100, offset=0) + + assert result.count == 1 + assert result.limit == 100 + assert result.offset == 0 + assert len(result.results) == 1 + assert result.results[0].id == BEDSET_TEST_ID + + def test_get_bedset_list_offset(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + result = bbagent_obj.bedset.get_ids_list(limit=100, offset=1) + + # assert result.count == 1 + assert result.limit == 100 + assert result.offset == 1 + assert len(result.results) == 0 + + def test_get_idset_list_query_found(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + result = bbagent_obj.bedset.get_ids_list(query="rando", limit=100, offset=0) + + assert 
result.count == 1 + assert result.limit == 100 + assert result.offset == 0 + assert len(result.results) == 1 + + def test_get_idset_list_query_fail(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + result = bbagent_obj.bedset.get_ids_list( + query="rando1", limit=100, offset=0 + ) + + assert result.count == 0 + assert result.limit == 100 + assert result.offset == 0 + assert len(result.results) == 0 + + def test_get_get_bedset_bedfiles(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + result = bbagent_obj.bedset.get_bedset_bedfiles(BEDSET_TEST_ID) + + assert result.count == 1 + assert result.limit == 100 + assert result.offset == 0 + assert len(result.results) == 1 + + def test_delete(self, bbagent_obj, mocker): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + mocker.patch( + "bbconf.config_parser.bedbaseconfig.BedBaseConfig.delete_s3", + return_value=True, + ) + bbagent_obj.bedset.delete(BEDSET_TEST_ID) + + assert not bbagent_obj.bedset.exists(BEDSET_TEST_ID) + + def test_delete_none(self, bbagent_obj, mocker): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + mocker.patch( + "bbconf.config_parser.bedbaseconfig.BedBaseConfig.delete_s3", + return_value=True, + ) + bbagent_obj.bedset.delete(BEDSET_TEST_ID) + + with pytest.raises(BedSetNotFoundError): + bbagent_obj.bedset.delete(BEDSET_TEST_ID) + + def test_delete_s3_error(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + with pytest.raises(BedbaseS3ConnectionError): + bbagent_obj.bedset.delete(BEDSET_TEST_ID) + + +def test_get_stats(bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True, bedset=True): + return_result = bbagent_obj.get_stats() + + assert return_result + assert return_result.bedfiles_number 
== 1 + assert return_result.bedsets_number == 1 + assert return_result.genomes_number == 1 diff --git a/tests/test_objects.py b/tests/test_objects.py new file mode 100644 index 00000000..635b0f03 --- /dev/null +++ b/tests/test_objects.py @@ -0,0 +1,46 @@ +import pytest + +from bbconf.exceptions import BEDFileNotFoundError, MissingThumbnailError + +from .utils import BED_TEST_ID, ContextManagerDBTesting + + +class TestObjects: + def test_object_path(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + result = bbagent_obj.objects.get_object_uri( + "bed", BED_TEST_ID, "bed_file", "http" + ) + + assert isinstance(result, str) + + def test_object_path_error(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + with pytest.raises(BEDFileNotFoundError): + bbagent_obj.objects.get_object_uri("bed", "not_f", "bed_file", "http") + + def test_object_path_thumbnail(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + result = bbagent_obj.objects.get_thumbnail_uri( + "bed", BED_TEST_ID, "chrombins", "http" + ) + assert isinstance(result, str) + + def test_object_path_thumbnail_error(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + with pytest.raises(MissingThumbnailError): + bbagent_obj.objects.get_thumbnail_uri( + "bed", BED_TEST_ID, "bed_file", "http" + ) + + def test_object_metadata(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + result = bbagent_obj.objects.get_drs_metadata( + "bed", BED_TEST_ID, "bed_file", "localhost" + ) + assert result is not None + + +@pytest.mark.skip("Used to visualize the schema") +def test_create_schema_graph(bbagent_obj): + bbagent_obj.config.db_engine.create_schema_graph() diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 00000000..9566ebdb --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,147 @@ +from typing 
import Union + +from sqlalchemy.orm import Session + +from bbconf.config_parser.bedbaseconfig import BedBaseConfig +from bbconf.db_utils import Bed, BedFileBedSetRelation, BedSets, BedStats, Files + +BED_TEST_ID = "bbad85f21962bb8d972444f7f9a3a932" +BEDSET_TEST_ID = "test_bedset_id" + + +stats = { + "id": BED_TEST_ID, + "number_of_regions": 1, + "median_tss_dist": 2, + "mean_region_width": 3, + "exon_frequency": 4, + "exon_percentage": 5, + "intron_frequency": 6, + "intron_percentage": 7, + "intergenic_percentage": 8, + "intergenic_frequency": 9, + "promotercore_frequency": 10, + "promotercore_percentage": 11, + "fiveutr_frequency": 12, + "fiveutr_percentage": 13, + "threeutr_frequency": 14, + "threeutr_percentage": 15, + "promoterprox_frequency": 16, + "promoterprox_percentage": 17, +} + + +def get_example_dict() -> dict: + value = { + "id": BED_TEST_ID, + "bed_format": "narrowpeak", + "bed_type": "bed6+4", + "genome_alias": "hg38", + "genome_digest": "2230c535660fb4774114bfa966a62f823fdb6d21acf138d4", + "name": "random_name", + } + return value + + +def get_bedset_files() -> dict: + return { + "title": "region_commonality", + "name": "region_commonality", + "path": "data/files/bbad85f21962bb8d972444f7f9a3a932.bed.gz", + "description": "Bfffffff", + "bedset_id": BEDSET_TEST_ID, + } + + +def get_files() -> dict: + return { + "title": "Bed file", + "name": "bed_file", + "path": "data/files/bbad85f21962bb8d972444f7f9a3a932.bed.gz", + "description": "Bed file with regions", + "bedfile_id": BED_TEST_ID, + } + + +def get_plots() -> dict: + return { + "name": "chrombins", + "description": "Regions distribution over chromosomes", + "title": "Regions distribution over chromosomes", + "path": "data/plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.pdf", + "path_thumbnail": "data/plots/bbad85f21962bb8d972444f7f9a3a932_chrombins.png", + "bedfile_id": BED_TEST_ID, + } + + +class ContextManagerDBTesting: + """ + Creates context manager to connect to database at db_url adds data 
and drops everything from the database upon exit to ensure