Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DC-3629] Add the wear_study percentage with fitbit data check #1833

Merged
merged 1 commit into from
Dec 20, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 65 additions & 3 deletions data_steward/analytics/cdr_ops/fitbit_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@
fitbit_dataset: str = "" # identifies the name of the new fitbit dataset
sandbox_dataset: str = "" # the pipeline tables sandbox
source_dataset: str = "" # identifies the name of the rdr dataset
deid_dataset: str = "" # identifies the deid dataset containing the wear_study table
cutoff_date: str = "" # CDR cutoff date in YYYY-MM-DD format
run_as: str = "" # service account email to impersonate
# -

from common import JINJA_ENV, FITBIT_TABLES, PIPELINE_TABLES, SITE_MASKING_TABLE_ID
from utils import auth
from gcloud.bq import BigQueryClient
from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES
from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message

# Obtain credentials for the `run_as` service account, requesting IMPERSONATION_SCOPES as the target scopes.
impersonation_creds = auth.get_impersonation_credentials(
run_as, target_scopes=IMPERSONATION_SCOPES)
Expand Down Expand Up @@ -71,7 +72,8 @@
}

# ## Identify person_ids that are not in the person table
# This check verifies that person_ids are valid: that they exist in the CDM person table and are not null. There should be no bad rows.
# This check verifies that person_ids are valid: that they exist in the CDM person table and are not null.
# There should be no bad rows.
#
# In case of failure:
# - If the person_id is not in the CDM person table. Check that `RemoveNonExistingPids` was applied.
Expand Down Expand Up @@ -201,7 +203,8 @@

# ## Check for rows without a valid date field
# Fitbit table records must have at least one valid date in order to be deemed valid.
# This is a preliminary check as this circumstance (lacking a date) should not be possible. No CR currently exists to remove data of this type.
# This is a preliminary check as this circumstance (lacking a date) should not be possible. No CR currently exists to
# remove data of this type.
#
# If bad rows are found, a new CR may be required. Notify and receive guidance from the DST.

Expand Down Expand Up @@ -263,4 +266,63 @@

execute(client, union_all_query)
# -
# # Check percentage of wear_study participants lacking fitbit data
#
# This check requires a deid dataset containing the generated wear_study table.
#
# The check fails if one of the data sources is missing, or if the percentage of wear_study participants lacking
# fitbit data is 40% or more for vibrent participants or 10% or more for ce participants. In case of failure, the
# data analytics team should be notified.
# See DC-3629 for more information.

# +
# For each data source (src_id), compute the percentage of consenting wear_study participants
# that have no rows in the fitbit activity_summary table.
query = JINJA_ENV.from_string("""
WITH fb_person_ids AS ( -- identify pids with fitbit data --
SELECT DISTINCT person_id
FROM {{project_id}}.{{dataset}}.activity_summary
)
, consenting_ws_ids AS ( -- identify consenting pids --
SELECT person_id,research_id,
FROM {{project_id}}.{{pipeline}}.primary_pid_rid_mapping dm
WHERE research_id IN (SELECT person_id
FROM {{project_id}}.{{deid_dataset}}.wear_study
WHERE wear_consent_end_date IS NULL)
)
SELECT
src_id,
ROUND(COUNT(CASE WHEN fb.person_id IS NULL THEN 1 ELSE NULL END) * 100 / COUNT(c_ws),1) AS percent_without_fb,
FROM (SELECT * FROM {{project_id}}.{{raw_rdr}}.observation WHERE observation_source_concept_id = 2100000010) o
JOIN consenting_ws_ids c_ws USING(person_id)
LEFT JOIN fb_person_ids fb ON o.person_id = fb.person_id
GROUP BY 1
""").render(project_id=project_id,
            dataset=fitbit_dataset,
            raw_rdr=source_dataset,
            pipeline=sandbox_dataset,
            deid_dataset=deid_dataset)

df = execute(client, query)


def _percent_below(results, src_id, threshold):
    """
    Tell whether `src_id` is present in `results` and its percentage is under `threshold`.

    :param results: dataframe with `src_id` and `percent_without_fb` columns
    :param src_id: data source to look up, e.g. 'vibrent' or 'ce'
    :param threshold: exclusive upper bound for a passing percentage
    :return: True only when a row exists for `src_id` and its percentage is
        strictly less than `threshold`; False otherwise
    """
    percentages = results.loc[results['src_id'] == src_id,
                              'percent_without_fb']
    # A data source that is absent from the results is a failed check.
    # Indexing the empty selection with .iloc[0] would raise an IndexError
    # and prevent the failure message from being rendered.
    if percentages.empty:
        return False
    return bool(percentages.iloc[0] < threshold)


# conditions for a passing check
cond_vibrent_percentage = _percent_below(df, 'vibrent', 40)
cond_ce_percentage = _percent_below(df, 'ce', 10)
is_success = cond_vibrent_percentage and cond_ce_percentage

success_msg = "Conditions Pass"
failure_msg = (
    """
One of the following checks failed. Confirm failure, and notify the proper team(Data Analytics) <br>
(1) The percentage of wear_study participants lacking fitbit data should be less than 40% for vibrent. <br>
(2) The percentage of wear_study participants lacking fitbit data should be less than 10% for ce. <br>
(3) Both data sources (vibrent and ce) should be present in the results. <br>
"""
)

render_message(df,
               success_msg,
               failure_msg,
               is_success=is_success)
# -