Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev 0.2.1 #15

Merged
merged 17 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ env/
seeds/
*.ipynb_checkpoints/
models/sources.yml
exported_database/
duckdb_fdw/
15 changes: 11 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.1.13
rev: v1.5.4
hooks:
- id: forbid-crlf
- id: remove-crlf
Expand All @@ -9,7 +9,7 @@ repos:
- id: remove-tabs
exclude_types: [csv]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.1.0
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
Expand All @@ -21,7 +21,14 @@ repos:
hooks:
- id: isort
- repo: https://github.com/ambv/black
rev: 22.3.0
rev: 24.1.1
hooks:
- id: black
language_version: python3.8
language_version: python3.11
- repo: https://github.com/sqlfluff/sqlfluff
rev: 2.3.5
hooks:
- id: sqlfluff-lint
args: [--dialect, postgres]
- id: sqlfluff-fix
args: [--dialect, postgres]
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,28 @@ dbt docs generate
dbt docs serve
```

### Exporting to newer version of DuckDB
Newer versions of DuckDB are currently not backward compatible. To migrate the data to a newer version, use the script [`export_duckdb.py`](scripts/export_duckdb.py):
```bash
$ python scripts/export_duckdb.py -h
usage: export_duckdb.py [-h] [--database_filename DATABASE_FILENAME] [--export_directory EXPORT_DIRECTORY] [--format {parquet,csv}]

Export a DuckDB database.

options:
-h, --help show this help message and exit
--database_filename DATABASE_FILENAME
The filename of the DuckDB database to export.
--export_directory EXPORT_DIRECTORY
The directory to save the exported database.
--format {parquet,csv}
The format to export the database in.
```

### [WIP] Migrating to PostgreSQL
```bash
bash scripts/migrate_postgres_workflow.sh
```

# Credits
This dbt template was inspired by and adapted from the [jaffle_shop_duckdb](https://github.com/dbt-labs/jaffle_shop_duckdb) example.
20 changes: 20 additions & 0 deletions models/raw/mibig.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
-- dbt staging model for MIBiG BGC (biosynthetic gene cluster) entries.
-- Reads the raw df_mibig_bgcs source table and projects only the columns
-- used by downstream models.
WITH src_mibig_bgcs AS (
    -- Raw rows from the bgcflow_tables source definition.
    SELECT * FROM {{ source('bgcflow_tables', 'df_mibig_bgcs') }}
),

stg_mibig_bgcs AS (
    -- Explicit projection: identifier, classification, chemistry,
    -- provenance and taxonomy columns only (avoids SELECT * downstream).
    SELECT
        mibig_id,
        biosyn_class,
        compounds,
        chem_acts,
        accession,
        completeness,
        evidence,
        organism_name,
        ncbi_tax_id,
        publications
    FROM src_mibig_bgcs
)

SELECT * FROM stg_mibig_bgcs
32 changes: 16 additions & 16 deletions models/raw/schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,22 +74,22 @@ models:
tests:
- not_null
- unique
#- name: region_id
# description: Foreign key to region table
# tests:
# - relationships:
# to: ref('regions')
# field: region_id
# - name: checkm
# description: CheckM information
# columns:
# - name: genome_id
# description: Foreign Key to genomes table
# tests:
# - not_null
# - relationships:
# to: ref('genomes')
# field: genome_id
- name: region_id
description: Foreign key to region table
tests:
- relationships:
to: ref('regions')
field: region_id
- name: checkm
description: CheckM information
columns:
- name: genome_id
description: Foreign Key to genomes table
tests:
- not_null
- relationships:
to: ref('genomes')
field: genome_id
- name: bigscape_cluster
description: GCF information created through BiG-SCAPE
columns:
Expand Down
7 changes: 4 additions & 3 deletions profiles.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@ dbt_bgcflow:
- parquet
external_root: "../data_warehouse"
prod:
dbname: dbt_bgcflow
dbname: bgcflow
schema: public
type: postgres
threads: 2
host: localhost
port: 5432
user: <user id>
pass: <password>
user: "{{ env_var('DBT_USER') }}"
pass: "{{ env_var('DBT_PASSWORD') }}"
extensions:
- parquet
external_root: "../data_warehouse"
35 changes: 35 additions & 0 deletions scripts/duckdb_fdw_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Build and install the duckdb_fdw PostgreSQL foreign data wrapper, then
# enable the extension in the target database.
#
# Usage:        duckdb_fdw_setup.sh <postgres password>
# Prerequisite: sudo apt-get install postgresql-server-dev-all
#
# Abort on the first failed command, unset variable, or failed pipeline so
# we never attempt to build/install after a broken download.
set -euo pipefail

# Step 1: Download source (reuse an existing checkout so re-runs don't fail)
if [ ! -d duckdb_fdw ]; then
    git clone https://github.com/alitrack/duckdb_fdw
fi

cd duckdb_fdw

# Step 2: Download the prebuilt DuckDB shared library.
# NOTE: the original used "wget -c ... -nc"; GNU wget rejects --continue
# together with --no-clobber, so keep only -nc (skip if already present).
wget -nc https://github.com/duckdb/duckdb/releases/download/v0.8.1/libduckdb-linux-amd64.zip
unzip -n -d . libduckdb-linux-amd64.zip

# Build against the PostgreSQL 14 toolchain.
export PATH=/usr/lib/postgresql/14/bin:$PATH
which pg_config

libdir="/usr/lib/postgresql/14/lib"
sudo cp libduckdb.so "$libdir"

# Step 3: Build and install duckdb_fdw.
# pg_config (added to PATH above) tells make where PostgreSQL lives.
sudo make NO_PGXS=1
sudo make install NO_PGXS=1

cd ..

# The database password is the first (required) argument; fail with a usage
# message instead of silently exporting an empty PGPASSWORD.
export PGPASSWORD="${1:?usage: $0 <postgres password>}"

# ${VAR:=default} assigns defaults as a side effect, so the plain variables
# below are safe under nounset.
echo "Using ${PSQL_HOST:=localhost}:${PSQL_PORT:=5432} with db ${PSQL_DB:=dbt_bgcflow}, schema ${PSQL_SCHEMA:=public} as ${PSQL_USER:=postgres}"
PSQL="psql -h $PSQL_HOST -p $PSQL_PORT -U $PSQL_USER"
PSQL_AS="$PSQL $PSQL_DB"

# IF NOT EXISTS keeps the script idempotent across re-runs.
$PSQL_AS -c "CREATE EXTENSION IF NOT EXISTS duckdb_fdw;"
46 changes: 46 additions & 0 deletions scripts/duckdb_schema_to_postgres.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import argparse
import re


def convert_sql_script(duckdb_script_path, postgres_script_path):
    """
    Convert a DuckDB SQL script to a PostgreSQL SQL script.

    Data types are rewritten with a single-pass, whole-word regex so that
    (a) identifiers that merely contain a type name (e.g. "DOUBLE_TOTAL")
    are left intact, and (b) already-valid "DOUBLE PRECISION" is not
    mangled into "DOUBLE PRECISION PRECISION" by a chained replacement.

    Args:
        duckdb_script_path (str): The path to the DuckDB SQL script.
        postgres_script_path (str): The path to save the PostgreSQL SQL script.
    """
    # Mapping of DuckDB data types to PostgreSQL data types.
    # "DOUBLE PRECISION" maps to itself so the single pass consumes it whole
    # instead of re-expanding the embedded "DOUBLE".
    data_type_mapping = {
        "BIGINT": "BIGINT",
        "BOOLEAN": "BOOLEAN",
        "DATE": "DATE",
        "DOUBLE PRECISION": "DOUBLE PRECISION",
        "DOUBLE": "DOUBLE PRECISION",
        "INTEGER": "INTEGER",
        "TEXT": "TEXT",
        "TIMESTAMP": "TIMESTAMP",
    }

    # Longest alternative first so "DOUBLE PRECISION" wins over "DOUBLE";
    # \b boundaries keep matches out of identifiers like DOUBLE_TOTAL.
    pattern = re.compile(
        r"\b("
        + "|".join(sorted(map(re.escape, data_type_mapping), key=len, reverse=True))
        + r")\b"
    )

    # Read the DuckDB SQL script.
    with open(duckdb_script_path, "r") as file:
        duckdb_script = file.read()

    # Single pass: each match is replaced exactly once, so replacements
    # cannot cascade into one another.
    postgres_script = pattern.sub(
        lambda match: data_type_mapping[match.group(1)], duckdb_script
    )

    # Write the converted script to the PostgreSQL SQL script path.
    with open(postgres_script_path, "w") as file:
        file.write(postgres_script)


if __name__ == "__main__":
    # CLI entry point: both paths are required positional arguments.
    parser = argparse.ArgumentParser(
        description="Convert a DuckDB SQL script to a PostgreSQL SQL script."
    )
    parser.add_argument("duckdb_script_path", help="The path to the DuckDB SQL script.")
    parser.add_argument(
        "postgres_script_path", help="The path to save the PostgreSQL SQL script."
    )
    args = parser.parse_args()

    convert_sql_script(args.duckdb_script_path, args.postgres_script_path)
50 changes: 50 additions & 0 deletions scripts/export_duckdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import argparse
import logging

import duckdb

# Set up logging: INFO-level messages with a timestamp so long-running
# exports can be followed from the console.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def export_database(database_filename, export_directory, format):
    """
    Export a DuckDB database to a specified directory.

    Args:
        database_filename (str): The filename of the DuckDB database to export.
        export_directory (str): The directory to save the exported database.
        format (str): The format to export the database in ("parquet" or "csv").

    Raises:
        ValueError: If ``format`` is not a supported export format.
    """
    # EXPORT DATABASE takes the format as a SQL keyword, not a bind
    # parameter, so validate it against a fixed whitelist before
    # interpolating it into the statement.
    fmt = format.upper()
    if fmt not in ("PARQUET", "CSV"):
        raise ValueError(f"Unsupported export format: {format!r}")

    logging.info(f"Connecting to database: {database_filename}")
    conn = duckdb.connect(database_filename)
    try:
        # The directory path is embedded as a SQL string literal; double any
        # single quotes so a path containing ' cannot break the statement.
        safe_directory = export_directory.replace("'", "''")
        logging.info(f"Exporting database to directory: {export_directory}")
        conn.execute(f"EXPORT DATABASE '{safe_directory}' (FORMAT {fmt})")
    finally:
        # Always release the connection, even if the export fails.
        conn.close()

    logging.info("Database export completed successfully")


if __name__ == "__main__":
    # CLI entry point: every option has a default, so the script can run
    # with no arguments against the default warehouse file.
    parser = argparse.ArgumentParser(description="Export a DuckDB database.")
    parser.add_argument(
        "--database_filename",
        help="The filename of the DuckDB database to export.",
        default="dbt_bgcflow.duckdb",
    )
    parser.add_argument(
        "--export_directory",
        help="The directory to save the exported database.",
        default="./exported_database",
    )
    parser.add_argument(
        "--format",
        help="The format to export the database in.",
        choices=["parquet", "csv"],
        default="parquet",
    )
    args = parser.parse_args()

    export_database(args.database_filename, args.export_directory, args.format)
48 changes: 48 additions & 0 deletions scripts/init_postgres.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash
# Create the target PostgreSQL database/schema and populate it by running
# the DDL + load scripts under scripts/sql/ in dependency order.
#
# Connection settings come from PSQL_HOST / PSQL_PORT / PSQL_DB /
# PSQL_SCHEMA / PSQL_USER (defaults assigned below); the password is
# expected via PGPASSWORD or ~/.pgpass — TODO confirm with the caller.

# Fail on use of any unset variable; the := expansions below provide defaults.
set -o nounset

echo "Setting up PostgreSQL connection..."
# ${VAR:=default} assigns the default as a side effect, so the plain
# variable references below are safe under nounset.
echo "Using ${PSQL_HOST:=localhost}:${PSQL_PORT:=5432} with db ${PSQL_DB:=dbt_bgcflow}, schema ${PSQL_SCHEMA:=public} as ${PSQL_USER:=postgres}"
PSQL="psql -h $PSQL_HOST -p $PSQL_PORT -U $PSQL_USER"
# Same connection options, but targeting the project database.
PSQL_AS="$PSQL $PSQL_DB"

echo "Checking if database ${PSQL_DB} exists..."
# CREATE DATABASE has no IF NOT EXISTS in PostgreSQL, so probe pg_database
# first and only create when the probe returns nothing.
$PSQL -tc "SELECT 1 FROM pg_database WHERE datname = '${PSQL_DB}';" | grep -q 1 || $PSQL -c "CREATE DATABASE $PSQL_DB;"

echo "Clearing out the whole schema"
# Start from a clean slate; errors (e.g. schema absent) are ignored.
$PSQL_AS -c "DROP SCHEMA IF EXISTS ${PSQL_SCHEMA} CASCADE;" > /dev/null 2>&1

echo "Creating schema ${PSQL_SCHEMA} if it does not exist..."
$PSQL_AS -c "CREATE SCHEMA IF NOT EXISTS ${PSQL_SCHEMA};" > /dev/null 2>&1

# Directory containing one <table>.sql script per entry in TABLES.
SQL_FILES="scripts/sql"

# tables not depending on other tables
TABLES="taxa mibig "

# tables depending on other tables. Please keep the order intact
TABLES="$TABLES genomes checkm dna_sequences seqfu regions bigscape_cluster cdss mibig_hits bigfam_hits bigfam_network bigscape_network"

# load tables
TABLES="$TABLES load"

for t in $TABLES; do
# Fail fast if a script referenced in TABLES is missing.
if [ -f "$SQL_FILES/${t}.sql" ]; then
echo "Processing $t"
else
echo "no such file: $SQL_FILES/${t}.sql"
exit 1
fi
echo "Executing SQL script for $t..."
# psql exits 0 even when individual statements fail, so capture the output
# in tmp and scan it for ERROR lines instead of trusting psql's status.
$PSQL_AS 2>&1 < "$SQL_FILES/${t}.sql" | tee tmp | grep ERROR
# $? here is grep's status: 0 means an ERROR line WAS found -> abort.
if [ "$?" -eq "0" ]; then
cat tmp
rm tmp
exit 1
fi
echo "SQL script for $t executed successfully."
rm tmp
done

echo "PostgreSQL setup completed."
7 changes: 7 additions & 0 deletions scripts/migrate_postgres_workflow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
echo "Starting export from DuckDB..."
python scripts/export_duckdb.py --format csv
echo "Export from DuckDB completed."

echo "Initializing PostgreSQL..."
bash scripts/init_postgres.sh
echo "PostgreSQL initialization completed."
13 changes: 13 additions & 0 deletions scripts/sql/bigfam_hits.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- BiG-FAM gene cluster family (GCF) hit summary.
-- One row per BiG-FAM family, keyed by bigfam_id; referenced by
-- bigfam_network, so this table must be created first.
CREATE TABLE bigfam_hits (
    bigfam_id BIGINT,
    core_member BIGINT,
    putative_member BIGINT,
    -- NOTE(review): *_mibig columns appear to hold lists of MIBiG
    -- accessions serialized into a single VARCHAR — confirm against the
    -- exported data before normalizing.
    core_member_mibig VARCHAR,
    putative_member_mibig VARCHAR,
    core_member_mibig_count BIGINT,
    core_member_mibig_bool BOOLEAN,
    putative_member_mibig_count BIGINT,
    putative_member_mibig_bool BOOLEAN,
    bigfam_link VARCHAR,
    CONSTRAINT bigfam_hits_pkey PRIMARY KEY (bigfam_id)
);
12 changes: 12 additions & 0 deletions scripts/sql/bigfam_network.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- Membership of BGC regions in BiG-FAM families (family <-> region edges).
-- NOTE(review): no primary key is declared; (bigfam_id, bgc_id) looks like
-- the natural key — confirm against the data before adding one.
CREATE TABLE bigfam_network (
    bigfam_id BIGINT,
    bgc_id VARCHAR,
    membership_value DOUBLE PRECISION,
    -- "rank" is non-reserved in PostgreSQL but reserved in several other
    -- dialects; quote it if this DDL is ever reused elsewhere.
    rank BIGINT,
    -- Rows disappear automatically when the parent family or region row
    -- is deleted (ON DELETE CASCADE).
    CONSTRAINT bigfam_network_bigfam_id_fkey FOREIGN KEY (bigfam_id)
        REFERENCES bigfam_hits (bigfam_id) MATCH SIMPLE
        ON DELETE CASCADE,
    CONSTRAINT bigfam_network_bgc_id_fkey FOREIGN KEY (bgc_id)
        REFERENCES regions (region_id) MATCH SIMPLE
        ON DELETE CASCADE
);
15 changes: 15 additions & 0 deletions scripts/sql/bigscape_cluster.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- BiG-SCAPE GCF assignment per BGC, keyed by bgc_id.
-- Depends on genomes, so it must be created after that table (see the
-- ordering in scripts/init_postgres.sh).
CREATE TABLE bigscape_cluster (
    bgc_id VARCHAR,
    genome_id VARCHAR,
    product VARCHAR,
    bigscape_class VARCHAR,
    accn_id VARCHAR,
    gcf BIGINT,
    fam_id BIGINT,
    fam_type VARCHAR,
    fam_known_compounds VARCHAR,
    CONSTRAINT bigscape_cluster_pkey PRIMARY KEY (bgc_id),
    -- Cluster rows are removed automatically when their genome is deleted.
    CONSTRAINT bigscape_cluster_genome_id_fkey FOREIGN KEY (genome_id)
        REFERENCES genomes (genome_id) MATCH SIMPLE
        ON DELETE CASCADE
);
17 changes: 17 additions & 0 deletions scripts/sql/bigscape_network.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- BiG-SCAPE similarity network: one row per pairwise edge between two
-- clusters (clustername_1 / clustername_2), keyed by a synthetic edge id.
CREATE TABLE bigscape_network (
    bigscape_edge_id BIGINT,
    clustername_1 VARCHAR,
    clustername_2 VARCHAR,
    -- Distance/similarity metrics reported by BiG-SCAPE for this edge;
    -- see the BiG-SCAPE output documentation for exact definitions.
    raw_distance DOUBLE PRECISION,
    squared_similarity DOUBLE PRECISION,
    jaccard_index DOUBLE PRECISION,
    dss_index DOUBLE PRECISION,
    adjacency_index DOUBLE PRECISION,
    raw_dss_non_anchor DOUBLE PRECISION,
    raw_dss_anchor DOUBLE PRECISION,
    non_anchor_domains BIGINT,
    anchor_domains BIGINT,
    combined_group VARCHAR,
    shared_group VARCHAR,
    CONSTRAINT bigscape_network_pkey PRIMARY KEY (bigscape_edge_id)
);
Loading
Loading