Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DC-3632] Remove irrelevant bq_utils import statements #1841

Merged
merged 6 commits into from
Jan 22, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
[DC-3632] remove 5 more usages of imports, some formatting occurred
Michael Schmidt committed Jan 3, 2024
commit 83a9ebd75dad4a2b8a035b8c96d0daeb7a54afa5
27 changes: 13 additions & 14 deletions data_steward/analytics/cdr_ops/ad_hoc_analyses/coverage.py
Original file line number Diff line number Diff line change
@@ -14,7 +14,6 @@

import warnings

import bq_utils
import utils.bq
from notebooks import parameters
warnings.filterwarnings('ignore')
@@ -33,11 +32,11 @@ def get_hpo_table_columns(hpo_id):
:param hpo_id: hpo site id
:return: dataframe with table name, column name and table row count
"""
query = """SELECT table_name, column_name, t.row_count as table_row_count, '{hpo_id}' as hpo_id
query = """SELECT table_name, column_name, t.row_count as table_row_count, '{hpo_id}' as hpo_id
FROM {dataset}.INFORMATION_SCHEMA.COLUMNS c
JOIN {dataset}.__TABLES__ t on c.table_name=t.table_id
WHERE STARTS_WITH(table_id, lower('{hpo_id}'))=true AND
NOT(table_id like '_mapping%') AND
NOT(table_id like '_mapping%') AND
(
table_id like '%person' OR
table_id like '%visit_occurrence' OR
@@ -59,25 +58,25 @@ def get_hpo_table_columns(hpo_id):


def create_hpo_completeness_query(table_columns, hpo_id):
query_with_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated
query_with_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated
FROM (
SELECT '{table_name}' as table_name, '{column_name}' as column_name,
'{hpo_id}' as site_name,
{table_row_count} as total_rows,
{table_row_count} as total_rows,
sum(case when {column_name}=0 then 0 else 1 end) as num_nonnulls_zeros,
({table_row_count} - count({column_name})) as non_populated_rows
FROM {dataset}.{table_name}
) as x
({table_row_count} - count({column_name})) as non_populated_rows
FROM {dataset}.{table_name}
) as x
"""
query_without_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated
query_without_concept_id = """SELECT current_datetime() as report_run_time, x.*, CASE WHEN total_rows=0 THEN 0 ELSE (num_nonnulls_zeros)/(total_rows) END as percent_field_populated
FROM (
SELECT '{table_name}' as table_name, '{column_name}' as column_name,
'{hpo_id}' as site_name,
{table_row_count} as total_rows,
count({column_name}) as num_nonnulls_zeros,
({table_row_count} - count({column_name})) as non_populated_rows
FROM {dataset}.{table_name}
) as x
{table_row_count} as total_rows,
count({column_name}) as num_nonnulls_zeros,
({table_row_count} - count({column_name})) as non_populated_rows
FROM {dataset}.{table_name}
) as x
"""
queries = []
for i, row in table_columns.iterrows():
Original file line number Diff line number Diff line change
@@ -26,7 +26,6 @@
client = bigquery.Client()
# %load_ext google.cloud.bigquery

import bq_utils
import utils.bq
from notebooks import parameters
# %matplotlib inline
@@ -95,18 +94,18 @@
racial_distribution_by_site_query = """
SELECT
DISTINCT
a.*, b.number_from_site, ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as percent_of_site_persons
a.*, b.number_from_site, ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as percent_of_site_persons
FROM
(SELECT
DISTINCT
mp.src_hpo_id, p.race_concept_id, c.concept_name,
mp.src_hpo_id, p.race_concept_id, c.concept_name,
COUNT(p.race_concept_id) as number_of_demographic,
FROM
`{DATASET}.unioned_ehr_person` p
LEFT JOIN
`{DATASET}._mapping_person` mp
ON
p.person_id = mp.src_person_id
p.person_id = mp.src_person_id
LEFT JOIN
`{DATASET}.concept` c
ON
@@ -141,17 +140,17 @@ def return_hpos_to_display(hpo_names, max_num_sites_to_display):
Function is intended to return a means of dividing the number of HPOs into an
appropriate number of lists based on the maximum number of sites a user
wants to display.

This is useful for creating graphs that will only display a fraction of the
total HPOs.

Parameters
----------
hpo_names (list): list of all the health provider organizations (in string form)

max_num_sites_to_display (int): user-specified maximum number of sites to display in each graph


Returns
-------
all_hpos (list): contains several lists, each of which contains a number of sites
@@ -195,17 +194,17 @@ def create_information_dictionary_for_sites(hpo_dfs, selected_hpo_names,
"""
Function is used to create a dictionary that contains the racial makeup of a selected
number of sites (expressed as a percentage, from a source dataframe)

Parameters
----------
hpo_dfs (dictionary): has the following structure
key: string representing an HPO ID
value: dataframe that contains information about the different race concepts (IDs
and names) and their relative spread within the site

selected_hpo_names (list): contains strings that represent the different HPOs that will
ultimately be translated to a dictionary


most_popular_race_cids (list): list of the most popular concept IDs (across all sites)

@@ -253,23 +252,23 @@ def create_information_dictionary_for_sites(hpo_dfs, selected_hpo_names,
def create_graphs(hpo_names_to_display, num_races_for_legend,
racial_percentages, img_name):
"""
Function is used to create and save graphs that show the racial distribution for
Function is used to create and save graphs that show the racial distribution for
a selected number of sites

Parameters
----------
hpo_names_to_display (list): list with a user-specified number of HPOs that are to
be displayed in the graph

num_races_for_legend (int): the number of races that are to be displayed next
to the graph

racial_percentages (dictionary): has the following structure
key: race concept ID
value: list, each index represents one of the sites in the 'selected_hpo_names'
parameter. the value represents the proportion of persons from the HPO
who have the reported race concept ID

img_name (string): name for the image to be displayed
"""
num_sites_to_display = len(hpo_names_to_display)
@@ -408,46 +407,46 @@ def create_query_for_particular_table(dataset, percent_of_table, table_name):
- number of IDs for that particular group in the specified table
- total number of IDs for the HPO
- percentage of the records for the site that belong to that demographic class

This query is then run through bigquery and returns a dataframe


Parameters
----------
dataset (str): dataset to be queried (defined at the top of the workbook)

percent_of_table (str): the string to represent the percentage of the records for the
site that belong to the particular demographic class

table_name (str): name of the table to be investigated


Returns
-------
dataframe (df): contains the information specified in the top of the docstring

"""

query = """
SELECT
DISTINCT
a.src_hpo_id, a.race_concept_id, a.concept_name,
ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as {percent_of_table}
a.src_hpo_id, a.race_concept_id, a.concept_name,
ROUND(a.number_of_demographic / b.number_from_site * 100, 2) as {percent_of_table}
FROM
(SELECT
DISTINCT
mp.src_hpo_id, p.race_concept_id, c.concept_name,
mp.src_hpo_id, p.race_concept_id, c.concept_name,
COUNT(p.race_concept_id) as number_of_demographic,
FROM
`{dataset}.unioned_ehr_{table_name}` x
LEFT JOIN
`{dataset}.unioned_ehr_person` p
ON
x.person_id = p.person_id
x.person_id = p.person_id
LEFT JOIN
`{dataset}._mapping_person` mp
ON
p.person_id = mp.src_person_id
p.person_id = mp.src_person_id
LEFT JOIN
`{dataset}.concept` c
ON
@@ -464,7 +463,7 @@ def create_query_for_particular_table(dataset, percent_of_table, table_name):
LEFT JOIN
`{dataset}.unioned_ehr_person` p
ON
x.person_id = p.person_id
x.person_id = p.person_id
LEFT JOIN
`{dataset}._mapping_person` mp
ON
@@ -549,13 +548,13 @@ def find_all_distributions_for_site_race_combo(df, hpo, race,
This function is used to calculate the relative 'underrepresentation' of a given
race for a particular table when compared to the race's overall representation in
the person table.

For instance, a site may have 65% participants who identify as 'White'. The persons
who identify with this race, however, only make up 60% of the drug_exposure_ids in
the drug exposure table. This would result in an 'underrepresentation' of 5% for
persons at this particular site for this particular table.


Parameters
----------
df (df): dataframe that contains the following information in its fields:
@@ -567,15 +566,15 @@ def find_all_distributions_for_site_race_combo(df, hpo, race,
aforementioned race_concept_id
e. the same metric as d but also for the condition, observation, procedure,
and visit tables

hpo (string): HPO whose 'representation' metric is going to be assessed

race (string): race concept name that will be evaluated for 'representation'

person_distribution: the proportion of person_ids for the particular site that
belong to the aforementioned race


Returns
-------
difference_df: contains the 'difference' between the proportion of records
11 changes: 5 additions & 6 deletions data_steward/analytics/cdr_ops/ad_hoc_analyses/site_mapping.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# +
import bq_utils
import utils.bq
from notebooks import render, parameters

@@ -12,10 +11,10 @@
# ## Row counts in combined `_mapping*` and deid `*_ext` tables

ROW_COUNTS_QUERY = """
SELECT dataset_id,
REPLACE(REPLACE(table_id, '_mapping_', ''), '_ext', '') mapped_table,
table_id,
creation_time,
SELECT dataset_id,
REPLACE(REPLACE(table_id, '_mapping_', ''), '_ext', '') mapped_table,
table_id,
creation_time,
last_modified_time,
row_count
FROM
@@ -25,7 +24,7 @@

UNION ALL

SELECT *
SELECT *
FROM {COMBINED}.__TABLES__ d1
WHERE table_id LIKE '\\\_mapping\\\_%')

Loading