
Commit

[DC-3633] Add a check to the raw and clean rdr qc notebooks for the presence of row duplicates (#1826)

[DC-3633] Add the duplicates check to the raw and clean notebooks
brendagutman authored Dec 15, 2023
1 parent 7b6be13 commit 58cebad
Showing 3 changed files with 112 additions and 10 deletions.
55 changes: 55 additions & 0 deletions data_steward/analytics/cdr_ops/clean_rdr_export_qc.py
@@ -1313,4 +1313,59 @@
query = tpl.render(new_rdr=new_rdr, raw_rdr=raw_rdr, project_id=project_id)
execute(client, query)

# # Check to catch duplicates in observation
# If this check fails, find more information on the issue types below.
# * "multiple rows with the same observation_id": There should not be any rows that share the same observation_id. A similar check exists in the raw rdr qc notebook and should have caught occurrences of this issue in the raw export, so a failure here most likely means an RDR cleaning rule is the culprit.
# * "whole row duplicates excluding obs_id": These are rows that contain identical data except for the observation_id. The cleaning rule 'drop_row_duplicates' was created to remove duplicates of this type.

# +
tpl = JINJA_ENV.from_string('''
WITH whole_row_dups AS (
SELECT
COUNT(*) as n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY -- all fields except observation_id --
person_id, observation_concept_id, observation_date, observation_datetime, observation_type_concept_id,
value_as_number, value_as_string, value_as_concept_id, qualifier_concept_id, unit_concept_id, provider_id,
visit_occurrence_id, visit_detail_id, observation_source_value, observation_source_concept_id, unit_source_value,
qualifier_source_value, value_source_concept_id, value_source_value, questionnaire_response_id
HAVING n > 1)
, observation_id_dups AS (
SELECT
observation_id,
COUNT(observation_id) AS n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY observation_id
HAVING n>1)
SELECT
"multiple rows with the same obs_id" as issue,
COUNT(*) AS n
FROM
observation_id_dups
UNION ALL
SELECT
"duplicates on whole row - excluding obs_id" as issue,
COUNT(*) AS n
FROM
whole_row_dups
''')
query = tpl.render(project_id=project_id, new_rdr=new_rdr)
df = execute(client, query)

is_success = sum(df['n']) == 0
success_msg = 'No issue found.'
failure_msg = 'Duplicates found. See check description.'

render_message(df,
success_msg,
failure_msg,
is_success=is_success)
# -
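Worth noting: the duplicates query always returns two rows, one per issue type, even when no duplicates exist, because COUNT(*) without a GROUP BY yields a row for empty input. A minimal sketch with hypothetical data shows why the notebook derives success from the counts rather than from the number of rows in the result:

import pandas as pd

# The UNION ALL query yields exactly one count row per issue type,
# so an empty dataframe can never signal success for this check.
df = pd.DataFrame({
    'issue': ['multiple rows with the same obs_id',
              'duplicates on whole row - excluding obs_id'],
    'n': [0, 0],  # zero duplicates of either kind
})

# render_message's default test (len(results_df) == 0) would report failure here,
# hence the explicit override computed from the counts.
is_success = sum(df['n']) == 0  # True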



7 changes: 5 additions & 2 deletions data_steward/analytics/cdr_ops/notebook_utils.py
@@ -106,7 +106,8 @@ def render_message(results_df,
success_msg='',
failure_msg='',
success_msg_args={},
failure_msg_args={}):
failure_msg_args={},
is_success=None):
"""
Renders a conditional success or failure message for a DQ check.
@@ -115,9 +116,11 @@
failure_msg: A templated string to describe failure.
success_msg_args: A dictionary of args to pass to success_msg template.
failure_msg_args: A dictionary of args to pass to failure_msg template.
is_success: Optional override of the default is_success value.
"""
is_success = len(results_df) == 0
if is_success is None:
is_success = len(results_df) == 0
status_msg = 'Success' if is_success else 'Failure'
if is_success:
display(
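The optional is_success argument keeps existing call sites unchanged while letting checks whose result always contains rows (such as the aggregate count query above) decide success themselves. A sketch of the two calling styles, assuming render_message is imported from notebook_utils; the dataframes below are illustrative, not from the repository:

import pandas as pd

# Hypothetical result dataframes for illustration only.
violating_rows_df = pd.DataFrame(columns=['person_id'])  # empty: the check passed
duplicate_counts_df = pd.DataFrame({'issue': ['whole row duplicates'], 'n': [0]})

# Default behavior: success is inferred from an empty results dataframe.
render_message(violating_rows_df, 'No issue found.', 'Issues found.')

# New override: the duplicates check always returns one count row per issue
# type, so the caller computes success from the counts and passes it in.
render_message(duplicate_counts_df,
               'No issue found.',
               'Duplicates found. See check description.',
               is_success=sum(duplicate_counts_df['n']) == 0)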
60 changes: 52 additions & 8 deletions data_steward/analytics/cdr_ops/raw_rdr_export_qc.py
@@ -1298,18 +1298,62 @@
success_msg,
failure_msg)
# -
# # Check to catch duplicates
# If this check fails, find more information on the issue types below.
# * "multiple rows with the same observation_id": No rows in the rdr export should share the same observation_id. Relay this issue back to RDR.
# * "whole row duplicates excluding obs_id": These are rows that contain identical data except for the observation_id. The rdr export should not have duplicates of this type; relay this issue back to RDR to fix. Note that the RDR cleaning rule 'drop_row_duplicates' would remove these duplicate rows if they are not corrected by RDR.

# +
tpl = JINJA_ENV.from_string('''
WITH whole_row_dups AS (
SELECT
COUNT(*) as n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY -- all fields except observation_id --
person_id, observation_concept_id, observation_date, observation_datetime, observation_type_concept_id,
value_as_number, value_as_string, value_as_concept_id, qualifier_concept_id, unit_concept_id, provider_id,
visit_occurrence_id, visit_detail_id, observation_source_value, observation_source_concept_id, unit_source_value,
qualifier_source_value, value_source_concept_id, value_source_value, questionnaire_response_id
HAVING n > 1)
, observation_id_dups AS (
SELECT
observation_id,
COUNT(observation_id) AS n
FROM `{{project_id}}.{{new_rdr}}.observation`
GROUP BY observation_id
HAVING n>1)
SELECT
"multiple rows with the same obs_id" as issue,
COUNT(*) AS n
FROM
observation_id_dups
UNION ALL
SELECT
"duplicates on whole row - excluding obs_id" as issue,
COUNT(*) AS n
FROM
whole_row_dups
''')
query = tpl.render(project_id=project_id, new_rdr=new_rdr)
df = execute(client, query)

is_success = sum(df['n']) == 0
success_msg = 'No issue found.'
failure_msg = 'Duplicates found. See check description.'

render_message(df,
success_msg,
failure_msg,
is_success=is_success)
# -





