Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Memory improvement when writing missing positions to pet #7098

Merged 8 commits on Feb 24, 2021.
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefi
try {
final File petOutputFile = new File(outputDirectory, PET_FILETYPE_PREFIX + tableNumberPrefix + sampleName + "." + outputType.toString().toLowerCase());
switch (outputType) {
case TSV:
case TSV:
List<String> petHeader = PetTsvCreator.getHeaders();
petTsvWriter = new SimpleXSVWriter(petOutputFile.toPath(), IngestConstants.SEPARATOR);
petTsvWriter.setHeaderLine(petHeader);
Expand Down Expand Up @@ -157,21 +157,21 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) thro
String state = TSVLineToCreatePet.get(2);

switch (outputType) {
case TSV:
case TSV:
petTsvWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
break;
case TSV2:
petTsv2Writer.addRow(location, sampleId, state);
case TSV2:
petTsv2Writer.addRow(location, sampleId, state);
break;
case ORC:
petOrcWriter.addRow(location, sampleId, state);
petOrcWriter.addRow(location, sampleId, state);
break;
case AVRO:
petAvroWriter.addRow(location, sampleId, state);
petAvroWriter.addRow(location, sampleId, state);
break;
case PARQUET:
petParquetWriter.addRow(location, sampleId, state);
break;
petParquetWriter.addRow(location, sampleId, state);
break;
}
}
}
Expand All @@ -184,33 +184,15 @@ public void writeMissingIntervals(GenomeLocSortedSet intervalArgumentGenomeLocSo
GenomeLocSortedSet uncoveredIntervals = intervalArgumentGenomeLocSortedSet.subtractRegions(coverageLocSortedSet);
logger.info("MISSING_GREP_HERE:" + uncoveredIntervals.coveredSize());
logger.info("MISSING_PERCENTAGE_GREP_HERE:" + (1.0 * uncoveredIntervals.coveredSize()) / intervalArgumentGenomeLocSortedSet.coveredSize());
// for each block of uncovered locations
for (GenomeLoc genomeLoc : uncoveredIntervals) {
final String contig = genomeLoc.getContig();
// write the position to the XSV
for (List<String> TSVLineToCreatePet : PetTsvCreator.createMissingTSV(
// write all positions in this block to the pet output
writeMissingPositions(
SchemaUtils.encodeLocation(contig, genomeLoc.getStart()),
SchemaUtils.encodeLocation(contig, genomeLoc.getEnd()),
sampleId
)) {
long location = Long.parseLong(TSVLineToCreatePet.get(0));
long sampleId = Long.parseLong(TSVLineToCreatePet.get(1));
String state = TSVLineToCreatePet.get(2);

switch (outputType) {
case TSV:
petTsvWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
break;
case ORC:
petOrcWriter.addRow(location, sampleId, state);
break;
case AVRO:
petAvroWriter.addRow(location, sampleId, state);
break;
case PARQUET:
petParquetWriter.addRow(location, sampleId, state);
break;
}
}
);
}
}

Expand Down Expand Up @@ -287,18 +269,33 @@ public List<List<String>> createSpanDelRows(final long start, final long end, fi
return rows;
}

public static List<List<String>> createMissingTSV(long start, long end, String sampleName) {
List<List<String>> rows = new ArrayList<>();

for (long position = start; position <= end; position ++){
public void writeMissingPositions(long start, long end, String sampleName) throws IOException {
for (long position = start; position <= end; position++){
List<String> row = new ArrayList<>();
row.add(String.valueOf(position));
row.add(sampleName);
row.add(GQStateEnum.MISSING.value);
rows.add(row);
}

return rows;
// TODO refactor - this only needs to be done for non-TSV outputTypes
long location = Long.parseLong(row.get(0));
long sampleId = Long.parseLong(row.get(1));
String state = row.get(2);

switch (outputType) {
case TSV:
petTsvWriter.getNewLineBuilder().setRow(row).write();
break;
case ORC:
petOrcWriter.addRow(location, sampleId, state);
break;
case AVRO:
petAvroWriter.addRow(location, sampleId, state);
break;
case PARQUET:
petParquetWriter.addRow(location, sampleId, state);
break;
}
}
}

public static GQStateEnum getGQStateEnum(int GQ){
Expand Down Expand Up @@ -326,22 +323,22 @@ public static Set<GQStateEnum> getGQStateEnumGreaterThan(GQStateEnum s){
Set<GQStateEnum> ret = new HashSet<GQStateEnum>();

switch (s) {
case ZERO:
case ZERO:
ret.add(GQStateEnum.TEN);
ret.add(GQStateEnum.TWENTY);
ret.add(GQStateEnum.THIRTY);
ret.add(GQStateEnum.FORTY);
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case TEN:
case TEN:
ret.add(GQStateEnum.TWENTY);
ret.add(GQStateEnum.THIRTY);
ret.add(GQStateEnum.FORTY);
ret.add(GQStateEnum.FIFTY);
ret.add(GQStateEnum.SIXTY);
break;
case TWENTY:
case TWENTY:
ret.add(GQStateEnum.THIRTY);
ret.add(GQStateEnum.FORTY);
ret.add(GQStateEnum.FIFTY);
Expand All @@ -363,15 +360,15 @@ public static Set<GQStateEnum> getGQStateEnumGreaterThan(GQStateEnum s){

return ret;
}

/**
 * Builds the list of column headers for the pet output.
 *
 * @return the {@code String.valueOf} form of every {@code PetFieldEnum} constant,
 *         in the order returned by {@code PetFieldEnum.values()}
 */
public static List<String> getHeaders() {
    final PetFieldEnum[] fields = PetFieldEnum.values();
    final List<String> headers = new ArrayList<>(fields.length);
    for (final PetFieldEnum field : fields) {
        headers.add(String.valueOf(field));
    }
    return headers;
}

public void closeTool() {
try {
switch (outputType) {
case TSV:
case TSV:
if (petTsvWriter != null) petTsvWriter.close();
break;
case ORC:
Expand Down