Skip to content

Commit

Permalink
Add sequence identifier when parsing sequencing accessioning file
Browse files: browse the repository at this point in the history
Calculates a sequence identifier by hashing the `strain_name` and appending a hyphen followed by the pathogen code (RSVA, RSVB, or HCOV19). This value is used as the identifier in the `warehouse.genomic_sequence` table.
  • Loading branch information
davereinhart committed Aug 6, 2024
1 parent 521db2b commit daff921
Showing 1 changed file with 15 additions and 5 deletions.
20 changes: 15 additions & 5 deletions lib/seattleflu/id3c/cli/command/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def clinical():
# Parse sequencing accessions subcommand
@clinical.command("parse-sequencing")
@click.argument("accession_ids_filename", metavar = "<Sequencing accession IDs filename>")
@click.argument("record_type", metavar="<record type>", type=click.Choice(['covid', 'rsv-a', 'rsv-b']))
@click.argument("record_type", metavar="<record type>", type=click.Choice(['hcov19', 'rsv-a', 'rsv-b']))
@click.option("-o", "--output", metavar="<output filename>",
help="The filename for the output of missing barcodes")

Expand Down Expand Up @@ -92,18 +92,28 @@ def parse_sequencing_accessions(accession_ids_filename, record_type, output):
.pipe(trim_whitespace)
.pipe(add_provenance, accession_ids_filename))

# only keep submitted records
clinical_records = clinical_records[clinical_records.status == 'submitted']

if record_type in ['rsv-a', 'rsv-b']:
clinical_records = clinical_records[clinical_records['pathogen'] == record_type]
elif record_type == 'hcov19':
assert 'pathogen' not in clinical_records.columns, 'Error: unexpected column `pathogen` in sequence records.'
clinical_records['pathogen'] = 'hcov19'

clinical_records['sequence_identifier'] = clinical_records.apply(
lambda row: generate_hash(row['strain_name']) + '-' + row['pathogen'].upper().replace('-', ''), axis=1
)
column_map = {
'sequence_identifier': 'sequence_identifier',
'sfs_sample_barcode': 'barcode',
'strain_name': 'strain_name',
'nwgc_id': 'nwgc_id',
'gisaid_accession': 'gisaid_accession',
'genbank_accession': 'genbank_accession',
'pathogen': 'pathogen',
'_provenance': '_provenance'
}
if record_type in ['rsv-a', 'rsv-b']:
clinical_records = clinical_records[clinical_records['pathogen'] == record_type]
column_map['pathogen'] = 'pathogen'

clinical_records = clinical_records[(clinical_records['sfs_sample_barcode'].notnull())&(clinical_records.status=='submitted')].rename(columns=column_map)

barcode_quality_control(clinical_records, output)
Expand Down

0 comments on commit daff921

Please sign in to comment.