-
Notifications
You must be signed in to change notification settings - Fork 558
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
352 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
79 changes: 79 additions & 0 deletions
79
src/main/java/org/cbioportal/file/export/MafRecordWriter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
package org.cbioportal.file.export; | ||
|
||
import org.cbioportal.file.model.MafRecord; | ||
|
||
import java.io.IOException; | ||
import java.io.Writer; | ||
import java.util.Iterator; | ||
import java.util.LinkedHashMap; | ||
|
||
import static org.cbioportal.file.export.TSVUtil.composeRow; | ||
|
||
/** | ||
* Writes MAF records to a writer | ||
*/ | ||
public class MafRecordWriter { | ||
private final Writer writer; | ||
|
||
public MafRecordWriter(Writer writer) { | ||
this.writer = writer; | ||
} | ||
|
||
public void write(Iterator<MafRecord> maf) { | ||
int line = 0; | ||
while (maf.hasNext()) { | ||
MafRecord mafRecord = maf.next(); | ||
LinkedHashMap<String, String> mafRow = new LinkedHashMap<>(); | ||
mafRow.put("Hugo_Symbol", mafRecord.hugoSymbol()); | ||
mafRow.put("Entrez_Gene_Id", mafRecord.entrezGeneId()); | ||
mafRow.put("Center", mafRecord.center()); | ||
mafRow.put("NCBI_Build", mafRecord.ncbiBuild()); | ||
mafRow.put("Chromosome", mafRecord.chromosome()); | ||
mafRow.put("Start_Position", mafRecord.startPosition().toString()); | ||
mafRow.put("End_Position", mafRecord.endPosition().toString()); | ||
mafRow.put("Strand", mafRecord.strand()); | ||
mafRow.put("Variant_Classification", mafRecord.variantClassification()); | ||
mafRow.put("Variant_Type", mafRecord.variantType()); | ||
mafRow.put("Reference_Allele", mafRecord.referenceAllele()); | ||
mafRow.put("Tumor_Seq_Allele1", mafRecord.tumorSeqAllele1()); | ||
mafRow.put("Tumor_Seq_Allele2", mafRecord.tumorSeqAllele2()); | ||
mafRow.put("dbSNP_RS", mafRecord.dbSnpRs()); | ||
mafRow.put("dbSNP_Val_Status", mafRecord.dbSnpValStatus()); | ||
mafRow.put("Tumor_Sample_Barcode", mafRecord.tumorSampleBarcode()); | ||
mafRow.put("Matched_Norm_Sample_Barcode", mafRecord.matchedNormSampleBarcode()); | ||
mafRow.put("Match_Norm_Seq_Allele1", mafRecord.matchNormSeqAllele1()); | ||
mafRow.put("Match_Norm_Seq_Allele2", mafRecord.matchNormSeqAllele2()); | ||
mafRow.put("Tumor_Validation_Allele1", mafRecord.tumorValidationAllele1()); | ||
mafRow.put("Tumor_Validation_Allele2", mafRecord.tumorValidationAllele2()); | ||
mafRow.put("Match_Norm_Validation_Allele1", mafRecord.matchNormValidationAllele1()); | ||
mafRow.put("Match_Norm_Validation_Allele2", mafRecord.matchNormValidationAllele2()); | ||
mafRow.put("Verification_Status", mafRecord.verificationStatus()); | ||
mafRow.put("Validation_Status", mafRecord.validationStatus()); | ||
mafRow.put("Mutation_Status", mafRecord.mutationStatus()); | ||
mafRow.put("Sequencing_Phase", mafRecord.sequencingPhase()); | ||
mafRow.put("Sequence_Source", mafRecord.sequenceSource()); | ||
mafRow.put("Validation_Method", mafRecord.validationMethod()); | ||
mafRow.put("Score", mafRecord.score()); | ||
mafRow.put("BAM_File", mafRecord.bamFile()); | ||
mafRow.put("Sequencer", mafRecord.sequencer()); | ||
mafRow.put("HGVSc_Short", mafRecord.hgvspShort()); | ||
mafRow.put("t_alt_count", mafRecord.tAltCount().toString()); | ||
mafRow.put("t_ref_count", mafRecord.tRefCount().toString()); | ||
mafRow.put("n_alt_count", mafRecord.nAltCount().toString()); | ||
mafRow.put("n_ref_count", mafRecord.nRefCount().toString()); | ||
if (line == 0) { | ||
writeRow(mafRow.sequencedKeySet()); | ||
} | ||
writeRow(mafRow.sequencedValues()); | ||
line++; | ||
} | ||
} | ||
|
||
private void writeRow(Iterable<String> row) { | ||
try { | ||
writer.write(composeRow(row)); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package org.cbioportal.file.export; | ||
|
||
import java.util.stream.Collectors; | ||
import java.util.stream.StreamSupport; | ||
|
||
/**
 * Helpers for composing tab-separated values (TSV) output.
 */
public class TSVUtil {
    public static final String TAB = "\t";

    /**
     * Joins the cells of a row with tabs and terminates the line with a line feed.
     *
     * <p>A tab character inside a cell is replaced by the two characters backslash and
     * {@code t}, so it cannot be mistaken for a column separator. A {@code null} cell is
     * written as an empty field. Line breaks inside a cell are not escaped.
     *
     * @param row cells of the row, in column order
     * @return the composed line, always ending with a line feed (an empty row yields just that)
     */
    public static String composeRow(Iterable<String> row) {
        return StreamSupport.stream(row.spliterator(), false)
            .map(TSVUtil::escapeCell)
            .collect(Collectors.joining(TAB)) + "\n";
    }

    /**
     * Returns the cell text that is safe to place between tab separators.
     */
    private static String escapeCell(String cell) {
        if (cell == null) {
            return "";
        }
        return cell.replace(TAB, "\\t");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
package org.cbioportal.file.model; | ||
|
||
/**
 * One record (row) of a Mutation Annotation Format (MAF) file.
 *
 * @param hugoSymbol                 HUGO gene symbol
 * @param entrezGeneId               Entrez Gene identifier
 * @param center                     sequencing center
 * @param ncbiBuild                  Genome Reference Consortium build used by the variant calling
 *                                   software; must be "GRCh37" or "GRCh38" for a human and
 *                                   "GRCm38" for a mouse
 * @param chromosome                 chromosome number, e.g. "7"
 * @param startPosition              start position of the event
 * @param endPosition                end position of the event
 * @param strand                     strand; the mutation is assumed to be reported for the + strand
 * @param variantClassification      translational effect of the variant allele,
 *                                   e.g. Missense_Mutation, Silent
 * @param variantType                variant type, e.g. SNP, DNP
 * @param referenceAllele            plus-strand reference allele at this position
 * @param tumorSeqAllele1            primary data genotype
 * @param tumorSeqAllele2            primary data genotype
 * @param dbSnpRs                    latest dbSNP rs ID
 * @param dbSnpValStatus             dbSNP validation status
 * @param tumorSampleBarcode         the sample ID: either a TCGA barcode (the patient identifier
 *                                   will be extracted) or, for non-TCGA data, a literal SAMPLE_ID
 *                                   as listed in the clinical data file
 * @param matchedNormSampleBarcode   sample ID of the matched normal sample
 * @param matchNormSeqAllele1        primary data
 * @param matchNormSeqAllele2        primary data
 * @param tumorValidationAllele1     secondary data from orthogonal technology
 * @param tumorValidationAllele2     secondary data from orthogonal technology
 * @param matchNormValidationAllele1 secondary data from orthogonal technology
 * @param matchNormValidationAllele2 secondary data from orthogonal technology
 * @param verificationStatus         second-pass result from an independent attempt using the same
 *                                   methods as the primary data source: "Verified", "Unknown"
 *                                   or "NA"
 * @param validationStatus           second-pass result from orthogonal technology: "Valid",
 *                                   "Invalid", "Untested", "Inconclusive", "Redacted", "Unknown"
 *                                   or "NA"
 * @param mutationStatus             "Somatic" or "Germline" are supported by the UI in the
 *                                   Mutations tab; "None", "LOH" and "Wildtype" will not be
 *                                   loaded; other values will be displayed as text
 * @param sequencingPhase            current sequencing phase
 * @param sequenceSource             molecular assay type used to produce the analytes used for
 *                                   sequencing
 * @param validationMethod           assay platforms used for the validation call
 * @param score                      not used
 * @param bamFile                    not used
 * @param sequencer                  instrument used to produce the primary data
 * @param hgvspShort                 amino acid change, e.g. p.V600E
 * @param tAltCount                  variant allele count (tumor)
 * @param tRefCount                  reference allele count (tumor)
 * @param nAltCount                  variant allele count (normal)
 * @param nRefCount                  reference allele count (normal)
 */
public record MafRecord(
    String hugoSymbol,
    String entrezGeneId,
    String center,
    String ncbiBuild,
    String chromosome,
    Integer startPosition,
    Integer endPosition,
    String strand,
    String variantClassification,
    String variantType,
    String referenceAllele,
    String tumorSeqAllele1,
    String tumorSeqAllele2,
    String dbSnpRs,
    String dbSnpValStatus,
    String tumorSampleBarcode,
    String matchedNormSampleBarcode,
    String matchNormSeqAllele1,
    String matchNormSeqAllele2,
    String tumorValidationAllele1,
    String tumorValidationAllele2,
    String matchNormValidationAllele1,
    String matchNormValidationAllele2,
    String verificationStatus,
    String validationStatus,
    String mutationStatus,
    String sequencingPhase,
    String sequenceSource,
    String validationMethod,
    String score,
    String bamFile,
    String sequencer,
    String hgvspShort,
    Integer tAltCount,
    Integer tRefCount,
    Integer nAltCount,
    Integer nRefCount
) {}
66 changes: 66 additions & 0 deletions
66
src/test/java/org/cbioportal/file/export/MafRecordWriterTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package org.cbioportal.file.export; | ||
|
||
import org.cbioportal.file.model.MafRecord; | ||
import org.junit.Test; | ||
|
||
import java.io.StringWriter; | ||
import java.util.List; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
public class MafRecordWriterTest { | ||
|
||
StringWriter output = new StringWriter(); | ||
MafRecordWriter writer = new MafRecordWriter(output); | ||
|
||
@Test | ||
public void testClinicalAttributeDataWriter() { | ||
writer.write(List.of( | ||
new MafRecord( | ||
"HUGO", | ||
"12345", | ||
"center1", | ||
"hg38", | ||
"X", | ||
1000000, | ||
1000100, | ||
"+", | ||
"Missense_Mutation", | ||
"SNP", | ||
"T", | ||
"C", | ||
"A", | ||
"DBSNPRS123", | ||
"byFrequency", | ||
"SAMPLE_1", | ||
"SAMPLE_2", | ||
"A", | ||
"T", | ||
"C", | ||
"G", | ||
"A", | ||
"T", | ||
"Verified", | ||
"Somatic", | ||
"Somatic", | ||
"Phase1", | ||
"Exome", | ||
"Sanger", | ||
"1.1", | ||
"bam_file", | ||
"Illumina hiseq 2000", | ||
"SHRT", | ||
55, | ||
33, | ||
100, | ||
99 | ||
) | ||
).iterator()); | ||
|
||
assertEquals(""" | ||
Hugo_Symbol\tEntrez_Gene_Id\tCenter\tNCBI_Build\tChromosome\tStart_Position\tEnd_Position\tStrand\tVariant_Classification\tVariant_Type\tReference_Allele\tTumor_Seq_Allele1\tTumor_Seq_Allele2\tdbSNP_RS\tdbSNP_Val_Status\tTumor_Sample_Barcode\tMatched_Norm_Sample_Barcode\tMatch_Norm_Seq_Allele1\tMatch_Norm_Seq_Allele2\tTumor_Validation_Allele1\tTumor_Validation_Allele2\tMatch_Norm_Validation_Allele1\tMatch_Norm_Validation_Allele2\tVerification_Status\tValidation_Status\tMutation_Status\tSequencing_Phase\tSequence_Source\tValidation_Method\tScore\tBAM_File\tSequencer\tHGVSc_Short\tt_alt_count\tt_ref_count\tn_alt_count\tn_ref_count | ||
HUGO\t12345\tcenter1\thg38\tX\t1000000\t1000100\t+\tMissense_Mutation\tSNP\tT\tC\tA\tDBSNPRS123\tbyFrequency\tSAMPLE_1\tSAMPLE_2\tA\tT\tC\tG\tA\tT\tVerified\tSomatic\tSomatic\tPhase1\tExome\tSanger\t1.1\tbam_file\tIllumina hiseq 2000\tSHRT\t55\t33\t100\t99 | ||
""", output.toString()); | ||
} | ||
|
||
} |