
Commit

[DC-3633] Add a check to the raw and clean rdr qc notebooks for the presence of row duplicates (#1826)

[DC-3633] Add the duplicates check to the raw and clean notebooks
brendagutman authored Dec 15, 2023
1 parent 7b6be13 commit 58cebad
Showing 3 changed files with 112 additions and 10 deletions.
55 changes: 55 additions & 0 deletions data_steward/analytics/cdr_ops/clean_rdr_export_qc.py
@@ -1313,4 +1313,59 @@
query = tpl.render(new_rdr=new_rdr, raw_rdr=raw_rdr, project_id=project_id)
execute(client, query)

# # Check to catch duplicates in observation
# If this check fails, find more information on the issue types below.
# * "multiple rows with the same observation_id": There should not be any rows that share the same observation_id. A similar check exists in the raw rdr qc notebook and should have caught occurrences of this issue in the raw export, so a failure here most likely means an RDR cleaning rule is the culprit.
# * "whole row duplicates excluding obs_id": These are rows that contain identical data except for the observation_id. The cleaning rule 'drop_row_duplicates' was created to remove duplicates of this type.

# +
tpl = JINJA_ENV.from_string('''
WITH whole_row_dups AS (
SELECT
COUNT(*) as n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY -- all fields except observation_id --
person_id, observation_concept_id, observation_date, observation_datetime, observation_type_concept_id,
value_as_number, value_as_string, value_as_concept_id, qualifier_concept_id, unit_concept_id, provider_id,
visit_occurrence_id, visit_detail_id, observation_source_value, observation_source_concept_id, unit_source_value,
qualifier_source_value, value_source_concept_id, value_source_value, questionnaire_response_id
HAVING n > 1)
, observation_id_dups AS (
SELECT
observation_id,
COUNT(observation_id) AS n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY observation_id
HAVING n>1)
SELECT
"multiple rows with the same obs_id" as issue,
COUNT(*) AS n
FROM
observation_id_dups
UNION ALL
SELECT
"duplicates on whole row - excluding obs_id" as issue,
COUNT(*) AS n
FROM
whole_row_dups
''')
query = tpl.render(project_id=project_id, new_rdr=new_rdr)
df = execute(client, query)

is_success = sum(df['n']) == 0
success_msg = 'No issue found.'
failure_msg = 'Duplicates found. See check description.'

render_message(df,
success_msg,
failure_msg,
is_success=is_success)
# -
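Worth noting: the duplicates query always returns two rows, one per issue type, even when no duplicates exist, because COUNT(*) without a GROUP BY yields a row for empty input. A minimal sketch with hypothetical data shows why the notebook derives success from the counts rather than from the number of rows in the result:

import pandas as pd

# The UNION ALL query yields exactly one count row per issue type,
# so an empty dataframe can never signal success for this check.
df = pd.DataFrame({
    'issue': ['multiple rows with the same obs_id',
              'duplicates on whole row - excluding obs_id'],
    'n': [0, 0],  # zero duplicates of either kind
})

# render_message's default test (len(results_df) == 0) would report failure here,
# hence the explicit override computed from the counts.
is_success = sum(df['n']) == 0  # True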



7 changes: 5 additions & 2 deletions data_steward/analytics/cdr_ops/notebook_utils.py
@@ -106,7 +106,8 @@ def render_message(results_df,
success_msg='',
failure_msg='',
success_msg_args={},
failure_msg_args={}):
failure_msg_args={},
is_success=None):
"""
Renders a conditional success or failure message for a DQ check.
@@ -115,9 +116,11 @@
failure_msg: A templated string to describe failure.
success_msg_args: A dictionary of args to pass to success_msg template.
failure_msg_args: A dictionary of args to pass to failure_msg template.
is_success: Optional override of the default is_success value.
"""
is_success = len(results_df) == 0
if is_success is None:
is_success = len(results_df) == 0
status_msg = 'Success' if is_success else 'Failure'
if is_success:
display(
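The optional is_success argument keeps existing call sites unchanged while letting checks whose result always contains rows (such as the aggregate count query above) decide success themselves. A sketch of the two calling styles, assuming render_message is imported from notebook_utils; the dataframes below are illustrative, not from the repository:

import pandas as pd

# Hypothetical result dataframes for illustration only.
violating_rows_df = pd.DataFrame(columns=['person_id'])  # empty: the check passed
duplicate_counts_df = pd.DataFrame({'issue': ['whole row duplicates'], 'n': [0]})

# Default behavior: success is inferred from an empty results dataframe.
render_message(violating_rows_df, 'No issue found.', 'Issues found.')

# New override: the duplicates check always returns one count row per issue
# type, so the caller computes success from the counts and passes it in.
render_message(duplicate_counts_df,
               'No issue found.',
               'Duplicates found. See check description.',
               is_success=sum(duplicate_counts_df['n']) == 0)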
60 changes: 52 additions & 8 deletions data_steward/analytics/cdr_ops/raw_rdr_export_qc.py
@@ -1298,18 +1298,62 @@
success_msg,
failure_msg)
# -
# # Check to catch duplicates
# If this check fails, find more information on the issue types below.
# * "multiple rows with the same observation_id": No rows in the rdr export should share the same observation_id. Relay this issue back to RDR.
# * "whole row duplicates excluding obs_id": These are rows that contain identical data except for the observation_id. The rdr export should not have duplicates of this type; relay this issue back to RDR to fix. Note that the RDR cleaning rule 'drop_row_duplicates' would remove these duplicate rows if they are not corrected by RDR.

# +
tpl = JINJA_ENV.from_string('''
WITH whole_row_dups AS (
SELECT
COUNT(*) as n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY -- all fields except observation_id --
person_id, observation_concept_id, observation_date, observation_datetime, observation_type_concept_id,
value_as_number, value_as_string, value_as_concept_id, qualifier_concept_id, unit_concept_id, provider_id,
visit_occurrence_id, visit_detail_id, observation_source_value, observation_source_concept_id, unit_source_value,
qualifier_source_value, value_source_concept_id, value_source_value, questionnaire_response_id
HAVING n > 1)
, observation_id_dups AS (
SELECT
observation_id,
COUNT(observation_id) AS n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY observation_id
HAVING n>1)
SELECT
"multiple rows with the same obs_id" as issue,
COUNT(*) AS n
FROM
observation_id_dups
UNION ALL
SELECT
"duplicates on whole row - excluding obs_id" as issue,
COUNT(*) AS n
FROM
whole_row_dups
''')
query = tpl.render(project_id=project_id, new_rdr=new_rdr)
df = execute(client, query)

is_success = sum(df['n']) == 0
success_msg = 'No issue found.'
failure_msg = 'Duplicates found. See check description.'

render_message(df,
success_msg,
failure_msg,
is_success=is_success)
# -





