diff --git a/etc/alembic.ini b/etc/alembic.ini new file mode 100644 index 0000000..6777380 --- /dev/null +++ b/etc/alembic.ini @@ -0,0 +1,116 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = src/alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = postgresql://uta_admin:@localhost/uta + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/etc/scripts/create-new-schema.sh b/etc/scripts/create-new-schema.sh index 0b6aa2b..5200e8d 100755 --- a/etc/scripts/create-new-schema.sh +++ b/etc/scripts/create-new-schema.sh @@ -21,4 +21,5 @@ pg_dump -U uta_admin -h localhost -d uta -n "$source_uta_v" | \ # create new schema gzip -cdq $dumps_dir/"$source_uta_v".pgd.gz | \ sbin/pg-dump-schema-rename "$source_uta_v" "$dest_uta_v" | \ + sbin/pg-dump-schema-rename "uta_1_1" "$dest_uta_v" | \ psql -U uta_admin -h localhost -d uta -aeE \ No newline at end of file diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh index 166c6cb..f4dec9a 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/run-uta-build.sh @@ -19,7 +19,7 @@ then fi # set local variables and create working directories -loading_uta_v="uta_1_1" +loading_uta_v="uta" loading_dir="$working_dir/loading" dumps_dir="$working_dir/dumps" logs_dir="$working_dir/logs" @@ -31,6 +31,10 @@ done etc/scripts/delete-schema.sh "$loading_uta_v" etc/scripts/create-new-schema.sh "$source_uta_v" "$loading_uta_v" +## for now set up Alembic for schema migrations +alembic -c etc/alembic.ini stamp edadb97f6502 +alembic -c etc/alembic.ini upgrade head + ## Load SeqRepo with new sequences seqrepo load -n NCBI -i "$seqrepo_data_release" \ $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz \ diff --git a/pyproject.toml b/pyproject.toml index 5b6b4a2..652c22d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ classifiers = [ ] dependencies = [ + "alembic", "attrs", "biocommons.seqrepo", "biopython>=1.69", diff --git a/src/alembic/README b/src/alembic/README new file mode 100644 index 0000000..98e4f9c --- /dev/null +++ b/src/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/src/alembic/env.py b/src/alembic/env.py new file mode 100644 index 0000000..1e8e830 --- /dev/null +++ b/src/alembic/env.py @@ -0,0 +1,92 @@ +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool + +from alembic import context + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +from uta.models import Base +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def include_name(name, type_, parent_names) -> bool: + if type_ == "schema": + return name in ["uta"] + else: + return True + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, + target_metadata=target_metadata, + version_table_schema=target_metadata.schema, + include_schemas=True, + include_name=include_name, + ) + + with context.begin_transaction(): + context.execute(f'create schema if not exists {target_metadata.schema};') + context.execute(f'set search_path to {target_metadata.schema}') + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/src/alembic/script.py.mako b/src/alembic/script.py.mako new file mode 100644 index 0000000..fbc4b07 --- /dev/null +++ b/src/alembic/script.py.mako @@ -0,0 +1,26 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/src/alembic/versions/cc51f50ae896_add_sqlalchemy_model_for_assocacs.py b/src/alembic/versions/cc51f50ae896_add_sqlalchemy_model_for_assocacs.py new file mode 100644 index 0000000..c8ee756 --- /dev/null +++ b/src/alembic/versions/cc51f50ae896_add_sqlalchemy_model_for_assocacs.py @@ -0,0 +1,44 @@ +"""add sqlalchemy model for assocacs + +Revision ID: cc51f50ae896 +Revises: edadb97f6502 +Create Date: 2024-04-05 00:33:40.105587 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'cc51f50ae896' +down_revision: Union[str, None] = 'edadb97f6502' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('associated_accessions', 'tx_ac', + existing_type=sa.TEXT(), + nullable=False, + schema='uta') + op.alter_column('associated_accessions', 'pro_ac', + existing_type=sa.TEXT(), + nullable=False, + schema='uta') + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('associated_accessions', 'pro_ac', + existing_type=sa.TEXT(), + nullable=True, + schema='uta') + op.alter_column('associated_accessions', 'tx_ac', + existing_type=sa.TEXT(), + nullable=True, + schema='uta') + # ### end Alembic commands ### diff --git a/src/alembic/versions/edadb97f6502_initial_state.py b/src/alembic/versions/edadb97f6502_initial_state.py new file mode 100644 index 0000000..86ea9cd --- /dev/null +++ b/src/alembic/versions/edadb97f6502_initial_state.py @@ -0,0 +1,386 @@ +"""initial state + +Revision ID: edadb97f6502 +Revises: +Create Date: 2024-04-03 21:41:05.875580 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + + +# revision identifiers, used by Alembic. +revision: str = 'edadb97f6502' +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('gene', + sa.Column('hgnc', sa.Text(), nullable=False), + sa.Column('maploc', sa.Text(), nullable=True), + sa.Column('descr', sa.Text(), nullable=True), + sa.Column('summary', sa.Text(), nullable=True), + sa.Column('aliases', sa.Text(), nullable=True), + sa.Column('added', sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint('hgnc'), + schema='uta' + ) + op.create_table('meta', + sa.Column('key', sa.Text(), nullable=False), + sa.Column('value', sa.Text(), nullable=False), + sa.PrimaryKeyConstraint('key'), + schema='uta' + ) + op.create_table('origin', + sa.Column('origin_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('name', sa.Text(), nullable=False), + sa.Column('descr', sa.Text(), nullable=True), + sa.Column('updated', sa.DateTime(), nullable=True), + sa.Column('url', sa.Text(), nullable=True), + sa.Column('url_ac_fmt', sa.Text(), nullable=True), + sa.PrimaryKeyConstraint('origin_id'), + sa.UniqueConstraint('name'), + schema='uta' + ) + op.create_table('seq', + sa.Column('seq_id', sa.Text(), nullable=False), + sa.Column('len', sa.Integer(), nullable=False), + sa.Column('seq', sa.Text(), nullable=True), + sa.PrimaryKeyConstraint('seq_id'), + schema='uta' + ) + op.create_table('seq_anno', + sa.Column('seq_anno_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('seq_id', sa.Text(), nullable=True), + sa.Column('origin_id', sa.Integer(), nullable=False), + sa.Column('ac', sa.Text(), nullable=False), + sa.Column('descr', sa.Text(), nullable=True), + sa.Column('added', sa.DateTime(), nullable=False), + sa.ForeignKeyConstraint(['origin_id'], ['uta.origin.origin_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.ForeignKeyConstraint(['seq_id'], ['uta.seq.seq_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('seq_anno_id'), + schema='uta' + ) + op.create_index(op.f('ix_uta_seq_anno_ac'), 'seq_anno', ['ac'], unique=False, schema='uta') + op.create_index(op.f('ix_uta_seq_anno_seq_id'), 'seq_anno', ['seq_id'], unique=False, schema='uta') + op.create_index('seq_anno_ac_unique_in_origin', 'seq_anno', ['origin_id', 'ac'], unique=True, schema='uta') + op.create_table('transcript', + sa.Column('ac', sa.Text(), nullable=False), + sa.Column('origin_id', sa.Integer(), nullable=False), + sa.Column('hgnc', sa.Text(), nullable=True), + sa.Column('cds_start_i', sa.Integer(), nullable=True), + sa.Column('cds_end_i', sa.Integer(), nullable=True), + sa.Column('cds_md5', sa.Text(), nullable=True), + sa.Column('added', sa.DateTime(), nullable=False), + sa.CheckConstraint('cds_start_i <= cds_end_i', name='cds_start_i_must_be_le_cds_end_i'), + sa.ForeignKeyConstraint(['origin_id'], ['uta.origin.origin_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('ac'), + schema='uta' + ) + op.create_index(op.f('ix_uta_transcript_cds_md5'), 'transcript', ['cds_md5'], unique=False, schema='uta') + op.create_index(op.f('ix_uta_transcript_origin_id'), 'transcript', ['origin_id'], unique=False, schema='uta') + op.create_table('exon_set', + sa.Column('exon_set_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('tx_ac', sa.Text(), nullable=False), + sa.Column('alt_ac', sa.Text(), nullable=False), + sa.Column('alt_strand', sa.SmallInteger(), nullable=False), + sa.Column('alt_aln_method', sa.Text(), nullable=False), + sa.Column('added', sa.DateTime(), nullable=False), + sa.ForeignKeyConstraint(['tx_ac'], ['uta.transcript.ac'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('exon_set_id'), + sa.UniqueConstraint('tx_ac', 'alt_ac', 'alt_aln_method', name=' must be unique'), + schema='uta' + ) + op.create_table('exon', + sa.Column('exon_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('exon_set_id', sa.Integer(), nullable=False), + sa.Column('start_i', sa.Integer(), nullable=False), + sa.Column('end_i', sa.Integer(), nullable=False), + sa.Column('ord', sa.Integer(), nullable=False), + sa.Column('name', sa.Text(), nullable=True), + sa.CheckConstraint('start_i < end_i', name='exon_start_i_must_be_lt_end_i'), + sa.ForeignKeyConstraint(['exon_set_id'], ['uta.exon_set.exon_set_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('exon_id'), + sa.UniqueConstraint('exon_set_id', 'end_i', name='end_i_must_be_unique_in_exon_set'), + sa.UniqueConstraint('exon_set_id', 'start_i', name='start_i_must_be_unique_in_exon_set'), + schema='uta' + ) + op.create_index(op.f('ix_uta_exon_exon_set_id'), 'exon', ['exon_set_id'], unique=False, schema='uta') + op.create_table('exon_aln', + sa.Column('exon_aln_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('tx_exon_id', sa.Integer(), nullable=False), + sa.Column('alt_exon_id', sa.Integer(), nullable=False), + sa.Column('cigar', sa.Text(), nullable=False), + sa.Column('added', sa.DateTime(), nullable=False), + sa.Column('tx_aseq', sa.Text(), nullable=True), + sa.Column('alt_aseq', sa.Text(), nullable=True), + sa.ForeignKeyConstraint(['alt_exon_id'], ['uta.exon.exon_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.ForeignKeyConstraint(['tx_exon_id'], ['uta.exon.exon_id'], onupdate='CASCADE', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('exon_aln_id'), + schema='uta' + ) + op.create_index(op.f('ix_uta_exon_aln_alt_exon_id'), 'exon_aln', ['alt_exon_id'], unique=False, schema='uta') + op.create_index(op.f('ix_uta_exon_aln_tx_exon_id'), 'exon_aln', ['tx_exon_id'], unique=False, schema='uta') + # ### end Alembic commands ### + + # ### custom commands to match the initial UTA database schema 1.1 ### + op.create_table('associated_accessions', + sa.Column('associated_accession_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('tx_ac', sa.Text(), nullable=True), + sa.Column('pro_ac', sa.Text(), nullable=True), + sa.Column('origin', sa.Text(), nullable=False), + sa.Column('added', postgresql.TIMESTAMP(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.PrimaryKeyConstraint('associated_accession_id'), + schema='uta' + ) + op.create_index('associated_accessions_tx_ac', 'associated_accessions', ['tx_ac'], unique=False, schema='uta') + op.create_index('associated_accessions_pro_ac', 'associated_accessions', ['pro_ac'], unique=False, schema='uta') + op.create_index('unique_pair_in_origin', 'associated_accessions', ['origin', 'tx_ac', 'pro_ac'], unique=True, schema='uta') + op.create_table_comment('associated_accessions', 'transcript-protein accession pairs associated in source databases', schema='uta') + # ### end custom commands ### + + # ### custom SQL to add views to match the initial UTA database schema 1.1 ### + op.execute(""" + CREATE VIEW _cds_exons_v AS + WITH cds_exons as ( + SELECT ES.exon_set_id, T.ac AS tx_ac, E.ord, + E.start_i, E.end_i, + CASE WHEN E.end_i >= T.cds_start_i AND E.start_i <= T.cds_end_i THEN greatest(E.start_i,T.cds_start_i) ELSE NULL end AS cds_ex_start_i, + CASE WHEN E.end_i >= T.cds_start_i AND E.start_i <= T.cds_end_i THEN least(E.end_i,T.cds_end_i) ELSE NULL end AS cds_ex_end_i + FROM transcript T + JOIN exon_set ES ON T.ac = ES.tx_ac AND ES.alt_aln_METHOD = 'transcript' + JOIN exon E ON ES.exon_set_id=E.exon_set_id + WHERE T.cds_start_i IS NOT NULL AND T.cds_end_i IS NOT NULL + ) + select *, end_i - start_i as ex_len, cds_ex_end_i - cds_ex_start_i as cds_ex_len from cds_exons; + """) + op.execute(""" + CREATE VIEW _cds_exons_flat_v AS + SELECT exon_set_id,tx_ac,MIN(ord) AS cds_start_exon,MAX(ord) AS cds_end_exon, + ARRAY_TO_STRING(ARRAY_AGG(format('%s,%s',cds_ex_start_i,cds_ex_end_i) ORDER BY ord),';') AS cds_se_i, + ARRAY_TO_STRING(ARRAY_AGG(cds_ex_len ORDER BY ord),';') AS cds_exon_lengths + FROM _cds_exons_v + WHERE cds_ex_start_i IS NOT NULL + GROUP BY exon_set_id, tx_ac; + """) + op.execute(""" + CREATE VIEW _seq_anno_most_recent AS + SELECT DISTINCT ON (ac) * + FROM seq_anno + ORDER BY ac,added DESC; + """) + op.execute(""" + CREATE VIEW _cds_exons_fp_v AS + SELECT SA.seq_id, md5(format('%s;%s',LOWER(SA.seq_id),CTEF.cds_se_i)) AS cds_es_fp, + md5(cds_exon_lengths) AS cds_exon_lengths_fp, CTEF.* + FROM _cds_exons_flat_v CTEF + JOIN _seq_anno_most_recent SA ON CTEF.tx_ac=SA.ac; + """) + op.execute(""" + CREATE VIEW _discontiguous_tx AS + SELECT t.hgnc, + es.exon_set_id, + es.tx_ac, + format('[%s-%s]'::text, e1.end_i, e2.start_i) AS gap, + e1.exon_id AS e1_exon_id, + e1.ord AS e1_ord, + e1.start_i AS e1_start_i, + e1.end_i AS e1_end_i, + e2.exon_id AS e2_exon_id, + e2.ord AS e2_ord, + e2.start_i AS e2_start_i, + e2.end_i AS e2_end_i + FROM exon_set es + LEFT JOIN transcript t ON es.tx_ac = t.ac + JOIN exon e1 ON es.exon_set_id = e1.exon_set_id + JOIN exon e2 ON es.exon_set_id = e2.exon_set_id AND e2.ord = (e1.ord + 1) AND e1.end_i <> e2.start_i + WHERE es.alt_aln_method = 'transcript'::text; + """) + op.execute(""" + CREATE VIEW tx_alt_exon_pairs_v AS + SELECT T.hgnc,TES.exon_SET_id AS tes_exon_SET_id,AES.exon_SET_id AS aes_exon_SET_id, + TES.tx_ac AS tx_ac,AES.alt_ac AS alt_ac,AES.alt_strand,AES.alt_aln_method, + TEX.ORD AS ORD,TEX.exon_id AS tx_exon_id,AEX.exon_id AS alt_exon_id, + TEX.start_i AS tx_start_i,TEX.END_i AS tx_END_i, AEX.start_i AS alt_start_i,AEX.END_i AS alt_END_i, + EA.exon_aln_id,EA.cigar + FROM exon_SET tes + JOIN transcript t ON tes.tx_ac=t.ac + JOIN exon_set aes ON tes.tx_ac=aes.tx_ac AND tes.alt_aln_method='transcript' AND aes.alt_aln_method!='transcript' + JOIN exon tex ON tes.exon_SET_id=tex.exon_SET_id + JOIN exon aex ON aes.exon_SET_id=aex.exon_SET_id AND tex.ORD=aex.ORD + LEFT JOIN exon_aln ea ON ea.tx_exon_id=tex.exon_id AND ea.alt_exon_id=AEX.exon_id; + """) + op.execute(""" + CREATE VIEW tx_exon_aln_v AS + SELECT T.hgnc,T.ac as tx_ac,AES.alt_ac,AES.alt_aln_method,AES.alt_strand, + TE.ord, TE.start_i as tx_start_i,TE.end_i as tx_end_i, + AE.start_i as alt_start_i, AE.end_i as alt_end_i, + EA.cigar, EA.tx_aseq, EA.alt_aseq, + TES.exon_set_id AS tx_exon_set_id,AES.exon_set_id as alt_exon_set_id, + TE.exon_id as tx_exon_id, AE.exon_id as alt_exon_id, + EA.exon_aln_id + FROM transcript T + JOIN exon_set TES ON T.ac=TES.tx_ac AND TES.alt_aln_method ='transcript' + JOIN exon_set AES on T.ac=AES.tx_ac and AES.alt_aln_method!='transcript' + JOIN exon TE ON TES.exon_set_id=TE.exon_set_id + JOIN exon AE ON AES.exon_set_id=AE.exon_set_id AND TE.ord=AE.ord + LEFT JOIN exon_aln EA ON TE.exon_id=EA.tx_exon_id AND AE.exon_id=EA.alt_exon_id; + """) + op.execute(""" + CREATE VIEW exon_set_exons_v AS + SELECT ES.*,EL.n_exons,EL.se_i,EL.starts_i,EL.ends_i,EL.lengths + FROM exon_set ES + JOIN (SELECT + iES.exon_set_id, + count(*) AS n_exons, + array_to_string(array_agg(format('%s,%s',iE.start_i,iE.end_i) ORDER BY iE.ord),';') AS se_i, + array_agg(iE.start_i ORDER BY iE.ord) AS starts_i, + array_agg(iE.end_i ORDER BY iE.ord) AS ends_i, + array_agg((iE.end_i-iE.start_i) ORDER BY iE.ord) AS lengths + FROM exon_set iES + JOIN exon iE ON iES.exon_set_id=iE.exon_set_id + GROUP BY iES.exon_set_id) EL + ON ES.exon_set_id = EL.exon_set_id; + """) + op.execute(""" + COMMENT ON VIEW exon_set_exons_v IS 'defining view of "flat" (aggregated) exons on a sequence; use _mv; for faster materialized version'; + """) + op.execute(""" + CREATE VIEW exon_set_exons_fp_v AS + SELECT ESE.*,md5(format('%s;%s',lower(ASA.seq_id),ESE.se_i)) AS es_fingerprint + FROM exon_set_exons_v ESE + JOIN _seq_anno_most_recent ASA ON ESE.alt_ac=ASA.ac; + """) + op.execute(""" + COMMENT ON VIEW exon_set_exons_fp_v IS 'flattened (aggregated) exons with exon set fingerprint'; + """) + op.execute(""" + CREATE MATERIALIZED VIEW exon_set_exons_fp_mv AS SELECT * FROM exon_set_exons_fp_v WITH NO DATA; + CREATE INDEX exon_set_exons_fp_mv_tx_ac_ix ON exon_set_exons_fp_mv(tx_ac); + CREATE INDEX exon_set_exons_fp_mv_alt_ac_ix ON exon_set_exons_fp_mv(alt_ac); + CREATE INDEX exon_set_exons_fp_mv_alt_aln_method_ix ON exon_set_exons_fp_mv(alt_aln_method); + GRANT SELECT ON exon_set_exons_fp_mv TO public; + """) + op.execute(""" + CREATE OR replace VIEW tx_exon_set_summary_dv AS + SELECT hgnc,cds_md5,es_fingerprint,tx_ac,alt_ac,alt_aln_method,alt_strand,exon_set_id,n_exons,se_i,starts_i,ends_i,lengths + FROM transcript T + JOIN exon_set_exons_fp_mv ESE ON T.ac=ESE.tx_ac; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_exon_set_summary_mv AS SELECT * FROM tx_exon_set_summary_dv WITH NO DATA; + CREATE INDEX tx_exon_set_summary_mv_cds_md5_ix ON tx_exon_set_summary_mv(cds_md5); + CREATE INDEX tx_exon_set_summary_mv_es_fingerprint_ix ON tx_exon_set_summary_mv(es_fingerprint); + CREATE INDEX tx_exon_set_summary_mv_tx_ac_ix ON tx_exon_set_summary_mv(tx_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_ac_ix ON tx_exon_set_summary_mv(alt_ac); + CREATE INDEX tx_exon_set_summary_mv_alt_aln_method_ix ON tx_exon_set_summary_mv(alt_aln_method); + GRANT SELECT ON tx_exon_set_summary_mv TO public; + """) + op.execute(""" + CREATE VIEW tx_def_summary_dv AS + SELECT TESS.exon_set_id, TESS.tx_ac, TESS.alt_ac, TESS.alt_aln_method, TESS.alt_strand, + TESS.hgnc, TESS.cds_md5, TESS.es_fingerprint, CEF.cds_es_fp, CEF.cds_exon_lengths_fp, + TESS.n_exons, TESS.se_i, CEF.cds_se_i, TESS.starts_i, TESS.ends_i, TESS.lengths, + T.cds_start_i, T.cds_end_i, CEF.cds_start_exon, CEF.cds_end_exon + FROM tx_exon_set_summary_mv TESS + JOIN transcript T ON TESS.tx_ac=T.ac + LEFT JOIN _cds_exons_fp_v CEF ON TESS.exon_set_id=CEF.exon_set_id + WHERE TESS.alt_aln_method = 'transcript'; + """) + op.execute(""" + COMMENT ON VIEW tx_def_summary_dv IS 'transcript definitions, with exon structures'; + """) + op.execute(""" + CREATE MATERIALIZED VIEW tx_def_summary_mv AS SELECT * FROM tx_def_summary_dv WITH NO DATA; + """) + op.execute(""" + COMMENT ON MATERIALIZED VIEW tx_def_summary_mv IS 'transcript definitions, with exon structures and fingerprints'; + """) + op.execute(""" + create index tx_def_summary_mv_tx_ac on tx_def_summary_mv (tx_ac); + create index tx_def_summary_mv_alt_ac on tx_def_summary_mv (alt_ac); + create index tx_def_summary_mv_alt_aln_method on tx_def_summary_mv (alt_aln_method); + create index tx_def_summary_mv_hgnc on tx_def_summary_mv (hgnc); + """) + op.execute(""" + CREATE VIEW tx_def_summary_v AS + SELECT * FROM tx_def_summary_mv; + """) + op.execute(""" + CREATE OR REPLACE VIEW tx_similarity_v AS + SELECT DISTINCT + D1.tx_ac as tx_ac1, D2.tx_ac as tx_ac2, + D1.hgnc = D2.hgnc as hgnc_eq, + D1.cds_md5=D2.cds_md5 as cds_eq, + D1.es_fingerprint=D2.es_fingerprint as es_fp_eq, + D1.cds_es_fp=D2.cds_es_fp as cds_es_fp_eq, + D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp as cds_exon_lengths_fp_eq + FROM tx_def_summary_mv D1 + JOIN tx_def_summary_mv D2 on (D1.tx_ac != D2.tx_ac + and (D1.hgnc=D2.hgnc + or D1.cds_md5=D2.cds_md5 + or D1.es_fingerprint=D2.es_fingerprint + or D1.cds_es_fp=D2.cds_es_fp + or D1.cds_exon_lengths_fp=D2.cds_exon_lengths_fp + )); + """) + # ### end custom SQL commands ### + + +def downgrade() -> None: + # ### custom SQL to remove views ### + op.execute("DROP VIEW tx_similarity_v CASCADE;") + op.execute("DROP VIEW tx_def_summary_v CASCADE;") + op.execute("DROP INDEX tx_def_summary_mv_hgnc CASCADE") + op.execute("DROP INDEX tx_def_summary_mv_alt_aln_method CASCADE") + op.execute("DROP INDEX tx_def_summary_mv_alt_ac CASCADE") + op.execute("DROP INDEX tx_def_summary_mv_tx_ac CASCADE") + op.execute("DROP MATERIALIZED VIEW tx_def_summary_mv CASCADE;") + op.execute("DROP VIEW tx_def_summary_dv CASCADE;") + op.execute("DROP MATERIALIZED VIEW tx_exon_set_summary_mv CASCADE;") + op.execute("DROP VIEW tx_exon_set_summary_dv CASCADE;") + op.execute("DROP MATERIALIZED VIEW exon_set_exons_fp_mv CASCADE;") + op.execute("DROP VIEW exon_set_exons_fp_v CASCADE;") + op.execute("DROP VIEW exon_set_exons_v CASCADE;") + op.execute("DROP VIEW tx_exon_aln_v CASCADE;") + op.execute("DROP VIEW tx_alt_exon_pairs_v CASCADE;") + op.execute("DROP VIEW _discontiguous_tx CASCADE;") + op.execute("DROP VIEW _cds_exons_fp_v CASCADE;") + op.execute("DROP VIEW _seq_anno_most_recent CASCADE;") + op.execute("DROP VIEW _cds_exons_flat_v CASCADE;") + op.execute("DROP VIEW _cds_exons_v CASCADE;") + # ### end custom SQL commands ### + + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('ix_uta_exon_aln_tx_exon_id'), table_name='exon_aln', schema='uta') + op.drop_index(op.f('ix_uta_exon_aln_alt_exon_id'), table_name='exon_aln', schema='uta') + op.drop_table('exon_aln', schema='uta') + op.drop_index(op.f('ix_uta_exon_exon_set_id'), table_name='exon', schema='uta') + op.drop_table('exon', schema='uta') + op.drop_table('exon_set', schema='uta') + op.drop_index(op.f('ix_uta_transcript_origin_id'), table_name='transcript', schema='uta') + op.drop_index(op.f('ix_uta_transcript_cds_md5'), table_name='transcript', schema='uta') + op.drop_table('transcript', schema='uta') + op.drop_index('seq_anno_ac_unique_in_origin', table_name='seq_anno', schema='uta') + op.drop_index(op.f('ix_uta_seq_anno_seq_id'), table_name='seq_anno', schema='uta') + op.drop_index(op.f('ix_uta_seq_anno_ac'), table_name='seq_anno', schema='uta') + op.drop_table('seq_anno', schema='uta') + op.drop_table('seq', schema='uta') + op.drop_table('origin', schema='uta') + op.drop_table('meta', schema='uta') + op.drop_table('gene', schema='uta') + # ### end Alembic commands ### + + # ### custom commands to remove items not autogenerated by Alembic ### + op.drop_index('unique_pair_in_origin', table_name='associated_accessions') + op.drop_index('associated_accessions_pro_ac', table_name='associated_accessions') + op.drop_index('associated_accessions_tx_ac', table_name='associated_accessions') + op.drop_table_comment('associated_accessions', existing_comment='transcript-protein accession pairs associated in source databases', schema='uta') + op.drop_table('associated_accessions') + # ### end custom commands ### \ No newline at end of file diff --git a/src/uta/models.py b/src/uta/models.py index 20c71b5..9a598b1 100644 --- a/src/uta/models.py +++ b/src/uta/models.py @@ -9,6 +9,7 @@ import sqlalchemy.types import sqlalchemy.sql.functions from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.dialects import postgresql ############################################################################ @@ -18,11 +19,9 @@ schema_version = "1.1" use_schema = True if use_schema: - schema_name = "uta_" + schema_version.replace(".","_") - schema_name_dot = schema_name + "." + schema_name = "uta" else: schema_name = None - schema_name_dot = "" ############################################################################ @@ -210,8 +209,8 @@ class ExonAln(Base): cigar = sa.Column(sa.Text, nullable=False) added = sa.Column( sa.DateTime, default=datetime.datetime.now(), nullable=False) - tx_aseq = sa.Column(sa.Text, nullable=False) - alt_aseq = sa.Column(sa.Text, nullable=False) + tx_aseq = sa.Column(sa.Text, nullable=True) + alt_aseq = sa.Column(sa.Text, nullable=True) # relationships: tx_exon = sao.relationship( @@ -225,9 +224,10 @@ class ExonAln(Base): class AssociatedAccessions(Base): __tablename__ = "associated_accessions" __table_args__ = ( - sa.UniqueConstraint("origin", "tx_ac", "pro_ac", name="unique_pair_in_origin"), + sa.Index("unique_pair_in_origin", "origin", "tx_ac", "pro_ac", unique=True), sa.Index("associated_accessions_pro_ac", "pro_ac"), sa.Index("associated_accessions_tx_ac", "tx_ac"), + {"comment": "transcript-protein accession pairs associated in source databases"}, ) # columns: @@ -236,7 +236,7 @@ class AssociatedAccessions(Base): pro_ac = sa.Column(sa.Text, nullable=False) origin = sa.Column(sa.Text, nullable=False) added = sa.Column( - sqlalchemy.types.TIMESTAMP, + postgresql.TIMESTAMP(timezone=True), server_default=sqlalchemy.sql.functions.now(), nullable=False, )