From 8e325532c102cefdffb697575059cb124a8fb6fc Mon Sep 17 00:00:00 2001 From: James McFeeters Date: Mon, 19 Dec 2022 14:04:24 -0600 Subject: [PATCH 1/3] Add sample status output --- .../java/org/pankratzlab/kdmatch/KDMatch.java | 23 +++++++++++++++---- .../java/org/pankratzlab/kdmatch/Match.java | 16 +++++++++---- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/pankratzlab/kdmatch/KDMatch.java b/src/main/java/org/pankratzlab/kdmatch/KDMatch.java index 790d57b..325e5fd 100644 --- a/src/main/java/org/pankratzlab/kdmatch/KDMatch.java +++ b/src/main/java/org/pankratzlab/kdmatch/KDMatch.java @@ -2,6 +2,7 @@ import java.io.BufferedReader; import java.io.File; +import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; @@ -18,7 +19,6 @@ import java.util.logging.Logger; import java.util.stream.Collectors; import java.util.stream.Stream; -import java.util.zip.GZIPOutputStream; public class KDMatch { @@ -35,12 +35,12 @@ public class KDMatch { // controls. Note: this portion is multi-threaded within communities (i.e. optimizes within // communities of matches that are connected by at least one control) - private static void run(Path inputFileAnchor, Path inputFileBarns, Path ouputDir, + private static void run(Path inputFileAnchor, Path inputFileBarns, Path outputDir, int initialNumSelect, int finalNumSelect, int threads, Logger log) throws IOException, InterruptedException, ExecutionException { String[] headerA = Files.lines(inputFileAnchor).findFirst().get().toString().trim().split("\t"); String[] headerB = Files.lines(inputFileBarns).findFirst().get().toString().trim().split("\t"); - new File(ouputDir.toString()).mkdirs(); + new File(outputDir.toString()).mkdirs(); if (Arrays.equals(headerA, headerB)) { KDTree kdTree = new KDTree<>(headerA.length - 1);// dimension of the data to be @@ -59,13 +59,16 @@ private static void run(Path inputFileAnchor, Path inputFileBarns, Path ouputDir getSampleStreamFromFile(inputFileAnchor), initialNumSelect) .collect(Collectors.toList()); - String outputBase = ouputDir + File.separator + "test.match.AllowDups.txt.gz"; + String outputBase = outputDir + File.separator + "test.match.AllowDups.txt.gz"; log.info("reporting full baseline selection of " + initialNumSelect + " nearest neighbors to " + outputBase); writeToFile(naiveMatches.stream(), outputBase, headerA, headerB, initialNumSelect); - String outputOpt = ouputDir + File.separator + "test.match.optimized.txt.gz"; + String statusBase = outputDir + File.separator + "test.status.AllowDups.txt"; + writeSampleStatusFile(naiveMatches.stream(), statusBase, initialNumSelect); + + String outputOpt = outputDir + File.separator + "test.match.optimized.txt.gz"; log.info("selecting " + naiveMatches + " optimized nearest neighbors"); @@ -77,6 +80,8 @@ private static void run(Path inputFileAnchor, Path inputFileBarns, Path ouputDir writeToFile(optimizedMatches, outputOpt, headerA, headerB, finalNumSelect); + String statusOptimized = outputDir + File.separator + "test.status.optimized.txt"; + writeSampleStatusFile(optimizedMatches, statusOptimized, finalNumSelect); } } @@ -119,6 +124,14 @@ private static void addHeader(int numToSelect, String[] headerA, String[] header writer.println(header); } + public static void writeSampleStatusFile(Stream matches, String outputFileName, int numToSelect) + throws FileNotFoundException { + PrintWriter writer = new PrintWriter(new FileOutputStream(outputFileName, true)); + String header = "id\tstatus\tmatched_case_id"; + writer.println(header); + matches.flatMap(m -> m.getStatusFileLines(numToSelect)).forEach(writer::println); + } + public static void main(String[] args) { // Assumed that the input files are tab delimited with a header, first column is IDs and the diff --git a/src/main/java/org/pankratzlab/kdmatch/Match.java b/src/main/java/org/pankratzlab/kdmatch/Match.java index ec084c8..1ca5ae9 100644 --- a/src/main/java/org/pankratzlab/kdmatch/Match.java +++ b/src/main/java/org/pankratzlab/kdmatch/Match.java @@ -5,10 +5,10 @@ import java.util.Set; import java.util.StringJoiner; import java.util.stream.Collectors; +import java.util.stream.Stream; /** * Holds a sample, and the potential matches (i.e nearest neighbors) - * */ public class Match { Sample sample; @@ -80,13 +80,13 @@ String getFormattedResults(int numToSelect) { results.add(Double.toString(control.dim[j])); } if (!control.getGroup().equals("")) { - results.add(control.getGroup()); + results.add(control.getGroup()); } else { - results.add("no_group"); + results.add("no_group"); } } else { // TODO untested - results.add("no-match"); + results.add("no-match"); results.add(Double.toString(Double.NaN)); for (int j = 0; j < sample.dim.length; j++) { results.add(Double.toString(Double.NaN)); @@ -96,6 +96,14 @@ String getFormattedResults(int numToSelect) { } results.add(Boolean.toString(hungarian)); return results.toString(); + } + Stream getStatusFileLines(int numToSelect) { + Stream.Builder streamBuilder = Stream.builder(); + streamBuilder.add(String.join("\t", sample.ID, "1", sample.ID)); + for (Sample s : this.matches.subList(0, numToSelect)) { + streamBuilder.add(String.join("\t", s.ID, "0", sample.ID)); + } + return streamBuilder.build(); } } From a1a57b6dfc02b4ff03f31f2e13630325b6e5fdef Mon Sep 17 00:00:00 2001 From: James McFeeters Date: Wed, 21 Dec 2022 15:22:34 -0600 Subject: [PATCH 2/3] Use printWriter in try-with-resources --- src/main/java/org/pankratzlab/kdmatch/KDMatch.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/pankratzlab/kdmatch/KDMatch.java b/src/main/java/org/pankratzlab/kdmatch/KDMatch.java index 325e5fd..bd90f71 100644 --- a/src/main/java/org/pankratzlab/kdmatch/KDMatch.java +++ b/src/main/java/org/pankratzlab/kdmatch/KDMatch.java @@ -124,12 +124,13 @@ private static void addHeader(int numToSelect, String[] headerA, String[] header writer.println(header); } - public static void writeSampleStatusFile(Stream matches, String outputFileName, int numToSelect) - throws FileNotFoundException { - PrintWriter writer = new PrintWriter(new FileOutputStream(outputFileName, true)); - String header = "id\tstatus\tmatched_case_id"; - writer.println(header); - matches.flatMap(m -> m.getStatusFileLines(numToSelect)).forEach(writer::println); + public static void writeSampleStatusFile(Stream matches, String outputFileName, + int numToSelect) throws FileNotFoundException { + try (PrintWriter writer = new PrintWriter(new FileOutputStream(outputFileName, true))) { + String header = "id\tstatus\tmatched_case_id"; + writer.println(header); + matches.flatMap(m -> m.getStatusFileLines(numToSelect)).forEach(writer::println); + } } public static void main(String[] args) { From 99d5b16c215bb550d116c439528c4d5bb1c904fc Mon Sep 17 00:00:00 2001 From: James McFeeters Date: Wed, 21 Dec 2022 15:45:19 -0600 Subject: [PATCH 3/3] =?UTF-8?q?Ensure=20numToSelect=20=E2=89=A4=20actual?= =?UTF-8?q?=20number=20of=20matches?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main/java/org/pankratzlab/kdmatch/Match.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/pankratzlab/kdmatch/Match.java b/src/main/java/org/pankratzlab/kdmatch/Match.java index 1ca5ae9..cdbc3fc 100644 --- a/src/main/java/org/pankratzlab/kdmatch/Match.java +++ b/src/main/java/org/pankratzlab/kdmatch/Match.java @@ -99,6 +99,7 @@ String getFormattedResults(int numToSelect) { } Stream getStatusFileLines(int numToSelect) { + numToSelect = Math.min(numToSelect, this.matches.size()); Stream.Builder streamBuilder = Stream.builder(); streamBuilder.add(String.join("\t", sample.ID, "1", sample.ID)); for (Sample s : this.matches.subList(0, numToSelect)) {