Skip to content

Commit

Permalink
Support MAF record writing
Browse files Browse the repository at this point in the history
  • Loading branch information
forus committed Dec 12, 2024
1 parent 1f0c37c commit cdaa1dd
Show file tree
Hide file tree
Showing 5 changed files with 352 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@
import java.io.IOException;
import java.io.Writer;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

import static org.cbioportal.file.export.TSVUtil.composeRow;

public class ClinicalAttributeDataWriter {

public static final String TAB = "\t";
private final Writer writer;

/**
Expand Down Expand Up @@ -55,8 +54,5 @@ private void writeCommentsRow(Iterable<String> row) {
}
}

private static String composeRow(Iterable<String> row) {
return StreamSupport.stream(row.spliterator(), false)
.map(s -> s.replace(TAB, "\\t")).collect(Collectors.joining(TAB)) + "\n";
}

}
79 changes: 79 additions & 0 deletions src/main/java/org/cbioportal/file/export/MafRecordWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package org.cbioportal.file.export;

import org.cbioportal.file.model.MafRecord;

import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.LinkedHashMap;

import static org.cbioportal.file.export.TSVUtil.composeRow;

/**
 * Writes MAF records to a writer as tab-separated rows.
 *
 * <p>The first record triggers a header row with the MAF column names, followed by one
 * data row per record. Fields that are {@code null} on a record are written as empty cells.
 */
public class MafRecordWriter {
    private final Writer writer;

    /**
     * @param writer destination that receives the header and data rows
     */
    public MafRecordWriter(Writer writer) {
        this.writer = writer;
    }

    /**
     * Writes every record supplied by the iterator.
     *
     * <p>The header row is emitted once, immediately before the first data row. When the
     * iterator has no elements nothing is written at all (not even the header).
     *
     * @param maf the records to write, consumed in iteration order
     * @throws RuntimeException wrapping the {@link IOException} if the underlying writer fails
     */
    public void write(Iterator<MafRecord> maf) {
        boolean headerWritten = false;
        while (maf.hasNext()) {
            LinkedHashMap<String, String> mafRow = toRow(maf.next());
            if (!headerWritten) {
                writeRow(mafRow.sequencedKeySet());
                headerWritten = true;
            }
            writeRow(mafRow.sequencedValues());
        }
    }

    /**
     * Maps a record to its column-name/value pairs; insertion order defines the column order.
     */
    private static LinkedHashMap<String, String> toRow(MafRecord mafRecord) {
        LinkedHashMap<String, String> mafRow = new LinkedHashMap<>();
        mafRow.put("Hugo_Symbol", text(mafRecord.hugoSymbol()));
        mafRow.put("Entrez_Gene_Id", text(mafRecord.entrezGeneId()));
        mafRow.put("Center", text(mafRecord.center()));
        mafRow.put("NCBI_Build", text(mafRecord.ncbiBuild()));
        mafRow.put("Chromosome", text(mafRecord.chromosome()));
        mafRow.put("Start_Position", text(mafRecord.startPosition()));
        mafRow.put("End_Position", text(mafRecord.endPosition()));
        mafRow.put("Strand", text(mafRecord.strand()));
        mafRow.put("Variant_Classification", text(mafRecord.variantClassification()));
        mafRow.put("Variant_Type", text(mafRecord.variantType()));
        mafRow.put("Reference_Allele", text(mafRecord.referenceAllele()));
        mafRow.put("Tumor_Seq_Allele1", text(mafRecord.tumorSeqAllele1()));
        mafRow.put("Tumor_Seq_Allele2", text(mafRecord.tumorSeqAllele2()));
        mafRow.put("dbSNP_RS", text(mafRecord.dbSnpRs()));
        mafRow.put("dbSNP_Val_Status", text(mafRecord.dbSnpValStatus()));
        mafRow.put("Tumor_Sample_Barcode", text(mafRecord.tumorSampleBarcode()));
        mafRow.put("Matched_Norm_Sample_Barcode", text(mafRecord.matchedNormSampleBarcode()));
        mafRow.put("Match_Norm_Seq_Allele1", text(mafRecord.matchNormSeqAllele1()));
        mafRow.put("Match_Norm_Seq_Allele2", text(mafRecord.matchNormSeqAllele2()));
        mafRow.put("Tumor_Validation_Allele1", text(mafRecord.tumorValidationAllele1()));
        mafRow.put("Tumor_Validation_Allele2", text(mafRecord.tumorValidationAllele2()));
        mafRow.put("Match_Norm_Validation_Allele1", text(mafRecord.matchNormValidationAllele1()));
        mafRow.put("Match_Norm_Validation_Allele2", text(mafRecord.matchNormValidationAllele2()));
        mafRow.put("Verification_Status", text(mafRecord.verificationStatus()));
        mafRow.put("Validation_Status", text(mafRecord.validationStatus()));
        mafRow.put("Mutation_Status", text(mafRecord.mutationStatus()));
        mafRow.put("Sequencing_Phase", text(mafRecord.sequencingPhase()));
        mafRow.put("Sequence_Source", text(mafRecord.sequenceSource()));
        mafRow.put("Validation_Method", text(mafRecord.validationMethod()));
        mafRow.put("Score", text(mafRecord.score()));
        mafRow.put("BAM_File", text(mafRecord.bamFile()));
        mafRow.put("Sequencer", text(mafRecord.sequencer()));
        // NOTE(review): the header reads "HGVSc_Short" but the value is hgvspShort (the
        // protein-level change); the intended column is presumably "HGVSp_Short". Left
        // unchanged here because MafRecordWriterTest pins the current header - confirm and
        // change both together.
        mafRow.put("HGVSc_Short", text(mafRecord.hgvspShort()));
        mafRow.put("t_alt_count", text(mafRecord.tAltCount()));
        mafRow.put("t_ref_count", text(mafRecord.tRefCount()));
        mafRow.put("n_alt_count", text(mafRecord.nAltCount()));
        mafRow.put("n_ref_count", text(mafRecord.nRefCount()));
        return mafRow;
    }

    /**
     * Renders a field value as cell text; a {@code null} value becomes an empty cell instead
     * of triggering a NullPointerException.
     */
    private static String text(Object value) {
        return value == null ? "" : value.toString();
    }

    /**
     * Serialises one row with {@code composeRow} and writes it, rethrowing I/O failures
     * as unchecked exceptions so callers are not forced to handle {@link IOException}.
     */
    private void writeRow(Iterable<String> row) {
        try {
            writer.write(composeRow(row));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
13 changes: 13 additions & 0 deletions src/main/java/org/cbioportal/file/export/TSVUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package org.cbioportal.file.export;

import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

/**
 * Helpers for composing rows of tab-separated values (TSV).
 */
public class TSVUtil {
    /** Delimiter placed between the cells of a row. */
    public static final String TAB = "\t";

    /**
     * Joins the cells of a row into one TSV line terminated by a newline character.
     *
     * <p>A tab character inside a cell is replaced by the two-character sequence
     * backslash + 't' so it cannot be mistaken for a column delimiter. A {@code null}
     * cell is written as an empty cell rather than causing a NullPointerException.
     *
     * @param row the cells of the row, in column order
     * @return the cells joined by tabs, followed by {@code "\n"}; an empty row yields just {@code "\n"}
     */
    public static String composeRow(Iterable<String> row) {
        return StreamSupport.stream(row.spliterator(), false)
            .map(TSVUtil::escapeCell)
            .collect(Collectors.joining(TAB)) + "\n";
    }

    /** Escapes embedded tabs in a single cell; maps {@code null} to the empty string. */
    private static String escapeCell(String cell) {
        return cell == null ? "" : cell.replace(TAB, "\\t");
    }
}
191 changes: 191 additions & 0 deletions src/main/java/org/cbioportal/file/model/MafRecord.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
package org.cbioportal.file.model;

/**
 * Represents a record in a Mutation Annotation Format (MAF) file.
 *
 * @param hugoSymbol a HUGO gene symbol
 * @param entrezGeneId an Entrez Gene identifier
 * @param center the sequencing center
 * @param ncbiBuild the Genome Reference Consortium Build used by a variant calling software;
 *     it must be "GRCh37" or "GRCh38" for a human, and "GRCm38" for a mouse
 * @param chromosome a chromosome number, e.g., "7"
 * @param startPosition start position of event
 * @param endPosition end position of event
 * @param strand we assume that the mutation is reported for the + strand
 * @param variantClassification translational effect of variant allele,
 *     e.g. Missense_Mutation, Silent, etc.
 * @param variantType variant type, e.g. SNP, DNP, etc.
 * @param referenceAllele the plus strand reference allele at this position
 * @param tumorSeqAllele1 primary data genotype
 * @param tumorSeqAllele2 primary data genotype
 * @param dbSnpRs latest dbSNP rs ID
 * @param dbSnpValStatus dbSNP validation status
 * @param tumorSampleBarcode this is the sample ID; either a TCGA barcode (patient identifier
 *     will be extracted), or for non-TCGA data, a literal SAMPLE_ID as listed in the
 *     clinical data file
 * @param matchedNormSampleBarcode the sample ID for the matched normal sample
 * @param matchNormSeqAllele1 primary data
 * @param matchNormSeqAllele2 primary data
 * @param tumorValidationAllele1 secondary data from orthogonal technology
 * @param tumorValidationAllele2 secondary data from orthogonal technology
 * @param matchNormValidationAllele1 secondary data from orthogonal technology
 * @param matchNormValidationAllele2 secondary data from orthogonal technology
 * @param verificationStatus second pass results from independent attempt using same methods
 *     as primary data source; "Verified", "Unknown" or "NA"
 * @param validationStatus second pass results from orthogonal technology; "Valid", "Invalid",
 *     "Untested", "Inconclusive", "Redacted", "Unknown" or "NA"
 * @param mutationStatus "Somatic" or "Germline" are supported by the UI in Mutations tab;
 *     "None", "LOH" and "Wildtype" will not be loaded; other values will be displayed as text
 * @param sequencingPhase indicates current sequencing phase
 * @param sequenceSource molecular assay type used to produce the analytes used for sequencing
 * @param validationMethod the assay platforms used for the validation call
 * @param score not used
 * @param bamFile not used
 * @param sequencer instrument used to produce primary data
 * @param hgvspShort amino acid change, e.g. p.V600E
 * @param tAltCount variant allele count (tumor)
 * @param tRefCount reference allele count (tumor)
 * @param nAltCount variant allele count (normal)
 * @param nRefCount reference allele count (normal)
 */
public record MafRecord(
    String hugoSymbol,
    String entrezGeneId,
    String center,
    String ncbiBuild,
    String chromosome,
    Integer startPosition,
    Integer endPosition,
    String strand,
    String variantClassification,
    String variantType,
    String referenceAllele,
    String tumorSeqAllele1,
    String tumorSeqAllele2,
    String dbSnpRs,
    String dbSnpValStatus,
    String tumorSampleBarcode,
    String matchedNormSampleBarcode,
    String matchNormSeqAllele1,
    String matchNormSeqAllele2,
    String tumorValidationAllele1,
    String tumorValidationAllele2,
    String matchNormValidationAllele1,
    String matchNormValidationAllele2,
    String verificationStatus,
    String validationStatus,
    String mutationStatus,
    String sequencingPhase,
    String sequenceSource,
    String validationMethod,
    String score,
    String bamFile,
    String sequencer,
    String hgvspShort,
    Integer tAltCount,
    Integer tRefCount,
    Integer nAltCount,
    Integer nRefCount
) {}
66 changes: 66 additions & 0 deletions src/test/java/org/cbioportal/file/export/MafRecordWriterTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package org.cbioportal.file.export;

import org.cbioportal.file.model.MafRecord;
import org.junit.Test;

import java.io.StringWriter;
import java.util.List;

import static org.junit.Assert.assertEquals;

/**
 * Unit test for {@link MafRecordWriter}: checks the header row and the data row produced
 * for a single fully populated record.
 */
public class MafRecordWriterTest {

    // In-memory sink so the written text can be compared as a string.
    private final StringWriter output = new StringWriter();
    private final MafRecordWriter writer = new MafRecordWriter(output);

    /**
     * Writes one record and expects a header line followed by one tab-separated data line,
     * with the columns in the order the writer defines them.
     */
    @Test
    public void testMafRecordWriter() {
        MafRecord mafRecord = new MafRecord(
            "HUGO",
            "12345",
            "center1",
            "hg38",
            "X",
            1000000,
            1000100,
            "+",
            "Missense_Mutation",
            "SNP",
            "T",
            "C",
            "A",
            "DBSNPRS123",
            "byFrequency",
            "SAMPLE_1",
            "SAMPLE_2",
            "A",
            "T",
            "C",
            "G",
            "A",
            "T",
            "Verified",
            "Somatic",
            "Somatic",
            "Phase1",
            "Exome",
            "Sanger",
            "1.1",
            "bam_file",
            "Illumina hiseq 2000",
            "SHRT",
            55,
            33,
            100,
            99
        );

        writer.write(List.of(mafRecord).iterator());

        String expected = """
            Hugo_Symbol\tEntrez_Gene_Id\tCenter\tNCBI_Build\tChromosome\tStart_Position\tEnd_Position\tStrand\tVariant_Classification\tVariant_Type\tReference_Allele\tTumor_Seq_Allele1\tTumor_Seq_Allele2\tdbSNP_RS\tdbSNP_Val_Status\tTumor_Sample_Barcode\tMatched_Norm_Sample_Barcode\tMatch_Norm_Seq_Allele1\tMatch_Norm_Seq_Allele2\tTumor_Validation_Allele1\tTumor_Validation_Allele2\tMatch_Norm_Validation_Allele1\tMatch_Norm_Validation_Allele2\tVerification_Status\tValidation_Status\tMutation_Status\tSequencing_Phase\tSequence_Source\tValidation_Method\tScore\tBAM_File\tSequencer\tHGVSc_Short\tt_alt_count\tt_ref_count\tn_alt_count\tn_ref_count
            HUGO\t12345\tcenter1\thg38\tX\t1000000\t1000100\t+\tMissense_Mutation\tSNP\tT\tC\tA\tDBSNPRS123\tbyFrequency\tSAMPLE_1\tSAMPLE_2\tA\tT\tC\tG\tA\tT\tVerified\tSomatic\tSomatic\tPhase1\tExome\tSanger\t1.1\tbam_file\tIllumina hiseq 2000\tSHRT\t55\t33\t100\t99
            """;
        assertEquals(expected, output.toString());
    }

}

0 comments on commit cdaa1dd

Please sign in to comment.