From cdaa1dd8835709b8067e0d59fd3403e555909749 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 12 Dec 2024 17:41:23 +0100 Subject: [PATCH] Support MAF record writing --- .../export/ClinicalAttributeDataWriter.java | 10 +- .../file/export/MafRecordWriter.java | 79 ++++++++ .../org/cbioportal/file/export/TSVUtil.java | 13 ++ .../org/cbioportal/file/model/MafRecord.java | 191 ++++++++++++++++++ .../file/export/MafRecordWriterTest.java | 66 ++++++ 5 files changed, 352 insertions(+), 7 deletions(-) create mode 100644 src/main/java/org/cbioportal/file/export/MafRecordWriter.java create mode 100644 src/main/java/org/cbioportal/file/export/TSVUtil.java create mode 100644 src/main/java/org/cbioportal/file/model/MafRecord.java create mode 100644 src/test/java/org/cbioportal/file/export/MafRecordWriterTest.java diff --git a/src/main/java/org/cbioportal/file/export/ClinicalAttributeDataWriter.java b/src/main/java/org/cbioportal/file/export/ClinicalAttributeDataWriter.java index ced57e68e8c..84973cb5882 100644 --- a/src/main/java/org/cbioportal/file/export/ClinicalAttributeDataWriter.java +++ b/src/main/java/org/cbioportal/file/export/ClinicalAttributeDataWriter.java @@ -6,12 +6,11 @@ import java.io.IOException; import java.io.Writer; import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.StreamSupport; + +import static org.cbioportal.file.export.TSVUtil.composeRow; public class ClinicalAttributeDataWriter { - public static final String TAB = "\t"; private final Writer writer; /** @@ -55,8 +54,5 @@ private void writeCommentsRow(Iterable row) { } } - private static String composeRow(Iterable row) { - return StreamSupport.stream(row.spliterator(), false) - .map(s -> s.replace(TAB, "\\t")).collect(Collectors.joining(TAB)) + "\n"; - } + } diff --git a/src/main/java/org/cbioportal/file/export/MafRecordWriter.java b/src/main/java/org/cbioportal/file/export/MafRecordWriter.java new file mode 100644 index 00000000000..77e4bf3232b --- /dev/null +++ 
// ===== src/main/java/org/cbioportal/file/export/MafRecordWriter.java =====
package org.cbioportal.file.export;

import org.cbioportal.file.model.MafRecord;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.SequencedMap;

import static org.cbioportal.file.export.TSVUtil.composeRow;

/**
 * Writes {@link MafRecord}s to a {@link Writer} as tab-separated rows.
 *
 * <p>The column header row is emitted once, immediately before the first record, so an empty
 * input produces no output at all. A {@code null} record component is written as an empty cell.
 * This class only calls {@link Writer#write(String)}; it never flushes or closes the writer.
 */
public class MafRecordWriter {
    private final Writer writer;

    /**
     * @param writer destination of the tab-separated output
     */
    public MafRecordWriter(Writer writer) {
        this.writer = writer;
    }

    /**
     * Writes every record supplied by the iterator, preceded by a single header row.
     *
     * @param maf records to write; consumed until exhausted
     * @throws UncheckedIOException if the underlying writer fails
     */
    public void write(Iterator<MafRecord> maf) {
        boolean headerWritten = false;
        while (maf.hasNext()) {
            SequencedMap<String, String> mafRow = toRow(maf.next());
            if (!headerWritten) {
                // The header is derived from the first row's keys, so the column names and the
                // values can never drift out of order.
                writeRow(mafRow.sequencedKeySet());
                headerWritten = true;
            }
            writeRow(mafRow.sequencedValues());
        }
    }

    /**
     * Maps a record to its MAF columns in output order. Values may be {@code null};
     * {@link TSVUtil#composeRow(Iterable)} renders those as empty cells.
     */
    private static SequencedMap<String, String> toRow(MafRecord mafRecord) {
        SequencedMap<String, String> mafRow = new LinkedHashMap<>();
        mafRow.put("Hugo_Symbol", mafRecord.hugoSymbol());
        mafRow.put("Entrez_Gene_Id", mafRecord.entrezGeneId());
        mafRow.put("Center", mafRecord.center());
        mafRow.put("NCBI_Build", mafRecord.ncbiBuild());
        mafRow.put("Chromosome", mafRecord.chromosome());
        mafRow.put("Start_Position", asText(mafRecord.startPosition()));
        mafRow.put("End_Position", asText(mafRecord.endPosition()));
        mafRow.put("Strand", mafRecord.strand());
        mafRow.put("Variant_Classification", mafRecord.variantClassification());
        mafRow.put("Variant_Type", mafRecord.variantType());
        mafRow.put("Reference_Allele", mafRecord.referenceAllele());
        mafRow.put("Tumor_Seq_Allele1", mafRecord.tumorSeqAllele1());
        mafRow.put("Tumor_Seq_Allele2", mafRecord.tumorSeqAllele2());
        mafRow.put("dbSNP_RS", mafRecord.dbSnpRs());
        mafRow.put("dbSNP_Val_Status", mafRecord.dbSnpValStatus());
        mafRow.put("Tumor_Sample_Barcode", mafRecord.tumorSampleBarcode());
        mafRow.put("Matched_Norm_Sample_Barcode", mafRecord.matchedNormSampleBarcode());
        mafRow.put("Match_Norm_Seq_Allele1", mafRecord.matchNormSeqAllele1());
        mafRow.put("Match_Norm_Seq_Allele2", mafRecord.matchNormSeqAllele2());
        mafRow.put("Tumor_Validation_Allele1", mafRecord.tumorValidationAllele1());
        mafRow.put("Tumor_Validation_Allele2", mafRecord.tumorValidationAllele2());
        mafRow.put("Match_Norm_Validation_Allele1", mafRecord.matchNormValidationAllele1());
        mafRow.put("Match_Norm_Validation_Allele2", mafRecord.matchNormValidationAllele2());
        mafRow.put("Verification_Status", mafRecord.verificationStatus());
        mafRow.put("Validation_Status", mafRecord.validationStatus());
        mafRow.put("Mutation_Status", mafRecord.mutationStatus());
        mafRow.put("Sequencing_Phase", mafRecord.sequencingPhase());
        mafRow.put("Sequence_Source", mafRecord.sequenceSource());
        mafRow.put("Validation_Method", mafRecord.validationMethod());
        mafRow.put("Score", mafRecord.score());
        mafRow.put("BAM_File", mafRecord.bamFile());
        mafRow.put("Sequencer", mafRecord.sequencer());
        // The component holds the protein-level change (e.g. p.V600E), i.e. the MAF column
        // HGVSp_Short; the previous header "HGVSc_Short" was a typo.
        mafRow.put("HGVSp_Short", mafRecord.hgvspShort());
        mafRow.put("t_alt_count", asText(mafRecord.tAltCount()));
        mafRow.put("t_ref_count", asText(mafRecord.tRefCount()));
        mafRow.put("n_alt_count", asText(mafRecord.nAltCount()));
        mafRow.put("n_ref_count", asText(mafRecord.nRefCount()));
        return mafRow;
    }

    /** Converts a boxed number to text, passing {@code null} through instead of throwing. */
    private static String asText(Integer value) {
        return value == null ? null : value.toString();
    }

    private void writeRow(Iterable<String> row) {
        try {
            writer.write(composeRow(row));
        } catch (IOException e) {
            // UncheckedIOException is a RuntimeException, so existing callers are unaffected.
            throw new UncheckedIOException(e);
        }
    }
}

// ===== src/main/java/org/cbioportal/file/export/TSVUtil.java =====
package org.cbioportal.file.export;

import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

/**
 * Helpers for composing tab-separated value (TSV) rows.
 */
public class TSVUtil {
    public static final String TAB = "\t";

    private TSVUtil() {
        // static utility class, not meant to be instantiated
    }

    /**
     * Joins the cells with tabs and terminates the row with a line feed.
     *
     * <p>A tab inside a cell is replaced by the two-character sequence {@code \t} so it cannot
     * be mistaken for a column separator. A {@code null} cell becomes an empty cell. Line breaks
     * inside a cell are not escaped.
     *
     * @param row cells of the row, in column order
     * @return the composed row, always ending with {@code "\n"}
     */
    public static String composeRow(Iterable<String> row) {
        return StreamSupport.stream(row.spliterator(), false)
            .map(cell -> cell == null ? "" : cell.replace(TAB, "\\t"))
            .collect(Collectors.joining(TAB)) + "\n";
    }
}

// ===== src/main/java/org/cbioportal/file/model/MafRecord.java =====
package org.cbioportal.file.model;

/**
 * Represents a record in a Mutation Annotation Format (MAF) file.
 *
 * <p>The record performs no validation, so any component may be {@code null}.
 *
 * @param hugoSymbol A HUGO gene symbol.
 * @param entrezGeneId An Entrez Gene identifier.
 * @param center The sequencing center.
 * @param ncbiBuild The Genome Reference Consortium Build used by a variant calling software.
 *     It must be "GRCh37" or "GRCh38" for a human, and "GRCm38" for a mouse.
 * @param chromosome A chromosome number, e.g., "7".
 * @param startPosition Start position of event.
 * @param endPosition End position of event.
 * @param strand We assume that the mutation is reported for the + strand.
 * @param variantClassification Translational effect of variant allele, e.g. Missense_Mutation,
 *     Silent, etc.
 * @param variantType Variant Type, e.g. SNP, DNP, etc.
 * @param referenceAllele The plus strand reference allele at this position.
 * @param tumorSeqAllele1 Primary data genotype.
 * @param tumorSeqAllele2 Primary data genotype.
 * @param dbSnpRs Latest dbSNP rs ID.
 * @param dbSnpValStatus dbSNP validation status.
 * @param tumorSampleBarcode This is the sample ID. Either a TCGA barcode (patient identifier
 *     will be extracted), or for non-TCGA data, a literal SAMPLE_ID as listed in the clinical
 *     data file.
 * @param matchedNormSampleBarcode The sample ID for the matched normal sample.
 * @param matchNormSeqAllele1 Primary data.
 * @param matchNormSeqAllele2 Primary data.
 * @param tumorValidationAllele1 Secondary data from orthogonal technology.
 * @param tumorValidationAllele2 Secondary data from orthogonal technology.
 * @param matchNormValidationAllele1 Secondary data from orthogonal technology.
 * @param matchNormValidationAllele2 Secondary data from orthogonal technology.
 * @param verificationStatus Second pass results from independent attempt using same methods as
 *     primary data source. "Verified", "Unknown" or "NA".
 * @param validationStatus Second pass results from orthogonal technology. "Valid", "Invalid",
 *     "Untested", "Inconclusive", "Redacted", "Unknown" or "NA".
 * @param mutationStatus "Somatic" or "Germline" are supported by the UI in Mutations tab.
 *     "None", "LOH" and "Wildtype" will not be loaded. Other values will be displayed as text.
 * @param sequencingPhase Indicates current sequencing phase.
 * @param sequenceSource Molecular assay type used to produce the analytes used for sequencing.
 * @param validationMethod The assay platforms used for the validation call.
 * @param score Not used.
 * @param bamFile Not used.
 * @param sequencer Instrument used to produce primary data.
 * @param hgvspShort Amino Acid Change, e.g. p.V600E.
 * @param tAltCount Variant allele count (tumor).
 * @param tRefCount Reference allele count (tumor).
 * @param nAltCount Variant allele count (normal).
 * @param nRefCount Reference allele count (normal).
 */
public record MafRecord(
    String hugoSymbol,
    String entrezGeneId,
    String center,
    String ncbiBuild,
    String chromosome,
    Integer startPosition,
    Integer endPosition,
    String strand,
    String variantClassification,
    String variantType,
    String referenceAllele,
    String tumorSeqAllele1,
    String tumorSeqAllele2,
    String dbSnpRs,
    String dbSnpValStatus,
    String tumorSampleBarcode,
    String matchedNormSampleBarcode,
    String matchNormSeqAllele1,
    String matchNormSeqAllele2,
    String tumorValidationAllele1,
    String tumorValidationAllele2,
    String matchNormValidationAllele1,
    String matchNormValidationAllele2,
    String verificationStatus,
    String validationStatus,
    String mutationStatus,
    String sequencingPhase,
    String sequenceSource,
    String validationMethod,
    String score,
    String bamFile,
    String sequencer,
    String hgvspShort,
    Integer tAltCount,
    Integer tRefCount,
    Integer nAltCount,
    Integer nRefCount
) {}

// ===== src/test/java/org/cbioportal/file/export/MafRecordWriterTest.java =====
package org.cbioportal.file.export;

import org.cbioportal.file.model.MafRecord;
import org.junit.Test;

import java.io.StringWriter;
import java.util.Collections;
import java.util.List;

import static org.junit.Assert.assertEquals;

public class MafRecordWriterTest {

    /** The header row every non-empty export is expected to start with. */
    private static final String HEADER = String.join("\t",
        "Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
        "Start_Position", "End_Position", "Strand", "Variant_Classification", "Variant_Type",
        "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2", "dbSNP_RS",
        "dbSNP_Val_Status", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode",
        "Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2", "Tumor_Validation_Allele1",
        "Tumor_Validation_Allele2", "Match_Norm_Validation_Allele1",
        "Match_Norm_Validation_Allele2", "Verification_Status", "Validation_Status",
        "Mutation_Status", "Sequencing_Phase", "Sequence_Source", "Validation_Method",
        "Score", "BAM_File", "Sequencer", "HGVSp_Short",
        "t_alt_count", "t_ref_count", "n_alt_count", "n_ref_count") + "\n";

    StringWriter output = new StringWriter();
    MafRecordWriter writer = new MafRecordWriter(output);

    @Test
    public void testMafRecordWriter() {
        writer.write(List.of(sampleRecord("1.1", "bam_file", 100, 99)).iterator());

        String expectedRow = String.join("\t",
            "HUGO", "12345", "center1", "hg38", "X", "1000000", "1000100", "+",
            "Missense_Mutation", "SNP", "T", "C", "A", "DBSNPRS123", "byFrequency",
            "SAMPLE_1", "SAMPLE_2", "A", "T", "C", "G", "A", "T", "Verified", "Somatic",
            "Somatic", "Phase1", "Exome", "Sanger", "1.1", "bam_file", "Illumina hiseq 2000",
            "SHRT", "55", "33", "100", "99") + "\n";
        assertEquals(HEADER + expectedRow, output.toString());
    }

    @Test
    public void testNullComponentsAreWrittenAsEmptyCells() {
        writer.write(List.of(sampleRecord(null, null, null, null)).iterator());

        String expectedRow = String.join("\t",
            "HUGO", "12345", "center1", "hg38", "X", "1000000", "1000100", "+",
            "Missense_Mutation", "SNP", "T", "C", "A", "DBSNPRS123", "byFrequency",
            "SAMPLE_1", "SAMPLE_2", "A", "T", "C", "G", "A", "T", "Verified", "Somatic",
            "Somatic", "Phase1", "Exome", "Sanger", "", "", "Illumina hiseq 2000",
            "SHRT", "55", "33", "", "") + "\n";
        assertEquals(HEADER + expectedRow, output.toString());
    }

    @Test
    public void testEmptyInputWritesNothing() {
        writer.write(Collections.emptyIterator());

        assertEquals("", output.toString());
    }

    /** Builds the fixture record; only the components that the tests vary are parameters. */
    private static MafRecord sampleRecord(
        String score, String bamFile, Integer nAltCount, Integer nRefCount) {
        return new MafRecord(
            "HUGO",
            "12345",
            "center1",
            "hg38",
            "X",
            1000000,
            1000100,
            "+",
            "Missense_Mutation",
            "SNP",
            "T",
            "C",
            "A",
            "DBSNPRS123",
            "byFrequency",
            "SAMPLE_1",
            "SAMPLE_2",
            "A",
            "T",
            "C",
            "G",
            "A",
            "T",
            "Verified",
            "Somatic",
            "Somatic",
            "Phase1",
            "Exome",
            "Sanger",
            score,
            bamFile,
            "Illumina hiseq 2000",
            "SHRT",
            55,
            33,
            nAltCount,
            nRefCount
        );
    }
}