diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java index 6b63c8f70cf..56339b68296 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java @@ -149,6 +149,6 @@ public Object onTraversalSuccess() { @Override public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext) { final byte refAsByte = referenceContext.getBase(); - allelicCountCollector.collectAtLocus(Nucleotide.valueOf(refAsByte), alignmentContext.getBasePileup(), alignmentContext.getLocation(), minimumBaseQuality); + allelicCountCollector.collectAtLocus(Nucleotide.decode(refAsByte), alignmentContext.getBasePileup(), alignmentContext.getLocation(), minimumBaseQuality); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java index 61d0e5a5cdb..3a943bdfcc7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java @@ -202,7 +202,7 @@ private static IntervalList generateBins(final IntervalList preparedIntervalList private static IntervalList filterBinsContainingOnlyNs(final IntervalList unfilteredBins, final ReferenceDataSource reference) { final IntervalList bins = new IntervalList(reference.getSequenceDictionary()); for (final Interval unfilteredBin : unfilteredBins) { - if (!Utils.stream(reference.query(new SimpleInterval(unfilteredBin))).allMatch(b -> b == Nucleotide.N.toBase())) { + if (!Utils.stream(reference.query(new SimpleInterval(unfilteredBin))).allMatch(b -> Nucleotide.decode(b) == Nucleotide.N)) { 
bins.add(unfilteredBin); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollection.java index 6367420b20d..e1ae69e1649 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollection.java @@ -40,8 +40,8 @@ enum AllelicCountTableColumn { final int position = dataLine.getInt(AllelicCountTableColumn.POSITION); final int refReadCount = dataLine.getInt(AllelicCountTableColumn.REF_COUNT); final int altReadCount = dataLine.getInt(AllelicCountTableColumn.ALT_COUNT); - final Nucleotide refNucleotide = Nucleotide.valueOf(dataLine.get(AllelicCountTableColumn.REF_NUCLEOTIDE.name()).getBytes()[0]); - final Nucleotide altNucleotide = Nucleotide.valueOf(dataLine.get(AllelicCountTableColumn.ALT_NUCLEOTIDE.name()).getBytes()[0]); + final Nucleotide refNucleotide = Nucleotide.decode(dataLine.get(AllelicCountTableColumn.REF_NUCLEOTIDE.name()).charAt(0)); + final Nucleotide altNucleotide = Nucleotide.decode(dataLine.get(AllelicCountTableColumn.ALT_NUCLEOTIDE.name()).charAt(0)); final SimpleInterval interval = new SimpleInterval(contig, position, position); return new AllelicCount(interval, refReadCount, altReadCount, refNucleotide, altNucleotide); }; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Nucleotide.java b/src/main/java/org/broadinstitute/hellbender/utils/Nucleotide.java index ac88f8871d2..3c2a220c31d 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Nucleotide.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Nucleotide.java @@ -4,7 +4,7 @@ import java.util.stream.LongStream; /** - * Represents the nucleotide alphabet. + * Represents the nucleotide alphabet with support for IUPAC ambiguity codes. * *
* This enumeration not only contains concrete nucleotides, but also @@ -14,19 +14,106 @@ * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ public enum Nucleotide { - A, C, G, T, N, X, INVALID; - private static final Nucleotide[] baseToValue = new Nucleotide[Byte.MAX_VALUE + 1]; + // Basic nucleotide codes, + // and their one-bit-encoding masks CODE(0xTGCA): + A(0b0001), + C(0b0010), + G(0b0100), + T(0b1000), + + // Extended codes: + // CODE(included nucs) + R(A,G), // Purines + Y(C,T), // Pyrimidines + S(C,G), // Strong nucleotides + W(A,T), // Weak nucleotides + K(G,T), // Keto + M(A,C), // Amino + B(C,G,T), // Not-A (B follows A) + D(A,G,T), // Not-C (D follows C) + H(A,C,T), // Not-G (H follows G) + V(A,C,G), // Not-T (V follows T and U) + N(A,C,G,T), // Any/Unknown + X(); // Invalid. + + // Uracil is considered equivalent to Thymine. + public static final Nucleotide U = T; + + // Convenient long form alternative names for some of the enumeration values: + + // Long form concrete nucleotide names. 
+ @SuppressWarnings("unused") + public static final Nucleotide ADENINE = A; + @SuppressWarnings("unused") + public static final Nucleotide CYTOSINE = C; + @SuppressWarnings("unused") + public static final Nucleotide GUANINE = G; + @SuppressWarnings("unused") + public static final Nucleotide THYMINE = T; + @SuppressWarnings("unused") + public static final Nucleotide URACIL = U; + + // Ambiguous nucleotide groups with proper long form names: + public static final Nucleotide STRONG = S; + public static final Nucleotide WEAK = W; + public static final Nucleotide PURINE = R; + public static final Nucleotide PYRIMIDINE = Y; + @SuppressWarnings("unused") + public static final Nucleotide AMINO = M; + @SuppressWarnings("unused") + public static final Nucleotide KETO = K; + @SuppressWarnings("unused") + public static final Nucleotide ANY = N; + @SuppressWarnings("unused") + public static final Nucleotide UNKNOWN = N; + public static final Nucleotide INVALID = X; + + // actually calling values() is costly (creates a new array every time) and often we do just to find out the + // total number of constants. 
+ private static final int NUMBER_OF_CONSTANTS; + + private static final Nucleotide[] baseToValue; + private static final Nucleotide[] maskToValue; static { - Arrays.fill(baseToValue, INVALID); - baseToValue['a'] = baseToValue['A'] = A; - baseToValue['c'] = baseToValue['C'] = C; - baseToValue['g'] = baseToValue['G'] = G; - baseToValue['t'] = baseToValue['T'] = T; - baseToValue['u'] = baseToValue['U'] = T; - baseToValue['x'] = baseToValue['X'] = X; - baseToValue['n'] = baseToValue['N'] = N; + final Nucleotide[] values = values(); + NUMBER_OF_CONSTANTS = values.length; + baseToValue = new Nucleotide[1 << Byte.SIZE]; + maskToValue = new Nucleotide[16]; + Arrays.fill(baseToValue, X); + for (final Nucleotide nucleotide : values) { + baseToValue[nucleotide.lowerCaseByteEncoding] = baseToValue[nucleotide.upperCaseByteEncoding] = nucleotide; + maskToValue[nucleotide.mask] = nucleotide; + } + baseToValue['u'] = baseToValue['U'] = U; + } + + private final int mask; + private final boolean isConcrete; + private Nucleotide complement; + private Nucleotide transition; + private Nucleotide transversion; + + /** + * Holds the lower-case byte encoding for this nucleotide: the lower-cased first letter of its constant name (so {@code 'x'} for {@link Nucleotide#INVALID}). + */ + private final byte lowerCaseByteEncoding; + + /** + * Holds the upper-case byte encoding for this nucleotide: the upper-cased first letter of its constant name (so {@code 'X'} for {@link Nucleotide#INVALID}). + */ + private final byte upperCaseByteEncoding; + + Nucleotide(final int mask) { + this.mask = mask; + isConcrete = Integer.bitCount(mask & 0b1111) == 1; + lowerCaseByteEncoding = (byte) Character.toLowerCase(name().charAt(0)); + upperCaseByteEncoding = (byte) Character.toUpperCase(name().charAt(0)); + } + + Nucleotide(final Nucleotide ... nucs) { + this(Arrays.stream(nucs).mapToInt(nuc -> nuc.mask).reduce((a, b) -> a | b).orElse(0)); } /** @@ -37,16 +124,18 @@ public enum Nucleotide { *
* The {@link #INVALID} nucleotide does not have an actual base then resulting in an exception. *
- * @throws UnsupportedOperationException if this nucleotide does not have a byte representation such - * as {@link #INVALID}. - * @return a positive byte value. - */ - public byte toBase() { - if (this == INVALID) { - throw new UnsupportedOperationException("the invalid nucleotide does not have a base byte"); - } else { - return (byte) name().charAt(0); - } + * @return a valid byte representation for a nucleotide, {@code 0} for {@link Nucleotide#INVALID}. + */ + public byte encodeAsByte(final boolean upperCase) { + return upperCase ? upperCaseByteEncoding : lowerCaseByteEncoding; + } + + /** + * Returns the nucleotide encoding in a byte using its upper-case representation. + * @return a valid upper-case byte representation for a nucleotide, {@code 0} for {@link Nucleotide#INVALID}. + */ + public byte encodeAsByte() { + return upperCaseByteEncoding; } /** @@ -56,30 +145,198 @@ public byte toBase() { * @return never {@code null}, but {@link #INVALID} if the base code does not * correspond to a valid nucleotide specification. */ - public static Nucleotide valueOf(final byte base) { + public static Nucleotide decode(final byte base) { return baseToValue[Utils.validIndex(base, baseToValue.length)]; } + public static Nucleotide decode(final char base) { + return decode((byte) base); + } + /** * Checks whether the nucleotide refer to a concrete (rather than ambiguous) base. - * @return + * @return {@code true} iff this is a concrete nucleotide. */ public boolean isConcrete() { - return ordinal() < N.ordinal(); + return isConcrete; + } + + /** + * Checks whether the nucleotide refer to an ambiguous base. + * @return {@code true} iff this is an ambiguous nucleotide. + */ + public boolean isAmbiguous() { + return !isConcrete && this != INVALID; + } + + public boolean isValid() { + return this != INVALID; + } + + /** + * Checks whether this nucleotide code encloses all possible nucleotides for another code. + * @param other the other nucleotide to compare to. 
+ * @return {@code true} iff every nucleotide in {@code other} is enclosed in this code. + */ + public boolean includes(final Nucleotide other) { + Utils.nonNull(other); + return other != INVALID && (mask & other.mask) == other.mask; + } + + public boolean includes(final byte b) { + return includes(decode(b)); + } + + public Nucleotide intersect(final Nucleotide other) { + return maskToValue[mask & other.mask]; + } + + /** + * Checks whether two base encodings make reference to the same {@link Nucleotide} + * instance regardless of their case. + *+ * This method is a shorthand for: + *
{@link #decode}(a){@link #same(Nucleotide) same}({@link #decode}(b)). + * + * + *
+ * The order of the inputs is not relevant, therefore {@code same(a, b) == same(b, a)} for any + * given {@code a} and {@code b}. + *
+ *+ * Notice that if either or both input bases make reference to an invalid nucleotide (i.e.
{@link #decode}(x) == {@link #INVALID}), + * this method will return {@code false} even if {@code a == b}. + * + * @param a the first base to compare (however order is not relevant). + * @param b the second base to compare (however order is not relevant). + * @return {@code true} iff {@link #decode}(a).{@link #same(Nucleotide) same}({@link #decode}(b)). + */ + public static boolean same(final byte a, final byte b) { + return baseToValue[a] == baseToValue[b] && baseToValue[a] != INVALID; + } /** - * Helper class to count the number of occurrences of each nucleotide in + * Checks whether this and another {@link Nucleotide} make reference to the same nucleotide(s). + *+ * In contrast with {@link #equals}, this method will return {@code false} if either this + * or the input nucleotide is the {@link #INVALID} enum value. So even
{@link #INVALID}.same({@link #INVALID})+ * will return {@code false}. + * + * + * @param other the other nucleotide. + * @return {@code true} iff this and the input nucleotide make reference to the same nucleotides. + */ + public boolean same(final Nucleotide other) { + return this == other && this != INVALID; + } + + /** + * Returns the complement nucleotide code for this one. + *+ * For ambiguous nucleotide codes, this will return the ambiguous code that encloses the complement of + * each possible nucleotide in this code. + *
+ *+ * The complement of the {@link #INVALID} nucleotide is itself. + *
+ * @return never {@code null}. + */ + public Nucleotide complement() { + if (complement == null) { + final int complementMask = ((mask & A.mask) != 0 ? T.mask : 0) + | ((mask & T.mask) != 0 ? A.mask : 0) + | ((mask & C.mask) != 0 ? G.mask : 0) + | ((mask & G.mask) != 0 ? C.mask : 0); + complement = maskToValue[complementMask]; + } + return complement; + } + + /** + * Returns the complement for a base code. + *+ * When an invalid base is provided this method will return the default encoding for the {@link #INVALID} nucleotide. + *
+ * @param b the input base + * @param upperCase whether to return the uppercase ({@code true}) or the lower case ({@code false}) byte encoding. + * @return the complement of the input. + */ + public static byte complement(final byte b, final boolean upperCase) { + final Nucleotide value = decode(b); + final Nucleotide compl = value.complement(); + return compl.encodeAsByte(upperCase); + } + + /** + * Returns the complement for a base code. + *+ * The case of the output will match the case of the input. + *
+ *+ * When an invalid base is provided this method will return the default encoding for the {@link #INVALID} nucleotide. + *
+ * @param b the input base + * @return the complement of the input. + */ + public static byte complement(final byte b) { + return complement(b, Character.isUpperCase(b)); + } + + /** + * Returns the instance that would include all possible transition mutations from this one. + * @return never {@code null}. + */ + public Nucleotide transition() { + if (transition == null) { + final int transitionMask = ((mask & A.mask) != 0 ? G.mask : 0) + | ((mask & G.mask) != 0 ? A.mask : 0) + | ((mask & C.mask) != 0 ? T.mask : 0) + | ((mask & T.mask) != 0 ? C.mask : 0); + transition = maskToValue[transitionMask]; + } + return transition; + } + + /** + * Returns the instance that would include all possible transversion mutations from this one. + * @return never {@code null}. + */ + public Nucleotide transversion() { + if (transversion == null) { + final int transversionMask = ((mask & PURINE.mask) != 0 ? PYRIMIDINE.mask : 0) + | ((mask & PYRIMIDINE.mask) != 0 ? PURINE.mask : 0); + transversion = maskToValue[transversionMask]; + } + return transversion; + } + + /** + * Transversion mutation toward a strong or a weak base. + *
+ * + * @param strong whether the result should be a strong ({@code S: G, C}) or weak ({@code W: A, T}) nucleotide(s). + * @return nucleotides that may emerged from such a transversion. + */ + public Nucleotide transversion(final boolean strong) { + return transversion().intersect(strong ? STRONG : WEAK); + } + + /** + * Helper class to count the number of occurrences of each nucleotide code in * a sequence. */ public static class Counter { + private final long[] counts; /** * Creates a new counter with all counts set to 0. */ public Counter() { - counts = new long[Nucleotide.values().length]; + counts = new long[NUMBER_OF_CONSTANTS]; } /** @@ -97,7 +354,7 @@ public void add(final Nucleotide nucleotide) { * @throws IllegalArgumentException if {@code base} is {@code negative}. */ public void add(final byte base) { - add(valueOf(base)); + add(decode(base)); } /** @@ -117,7 +374,7 @@ public long get(final Nucleotide nucleotide) { * @throws IllegalArgumentException if {@code bases} are null or * it contains negative values. */ - public void addAll(final byte[] bases) { + public void addAll(final byte ... bases) { Utils.nonNull(bases); for (final byte base : bases) { add(base); @@ -131,8 +388,14 @@ public void clear() { Arrays.fill(counts, 0); } + /** + * Return the total count of all nucleotide constants. 
+ * @return + */ public long sum() { return LongStream.of(counts).sum(); } } + + } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriter.java b/src/main/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriter.java index 71864625631..b81be29cf8f 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriter.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriter.java @@ -315,7 +315,7 @@ private static void checkSequenceBases(final byte[] bases, final int offset, fin final int to = offset + length; for (int i = offset; i < to; i++) { final byte b = bases[i]; - if (Nucleotide.valueOf(b) == Nucleotide.INVALID) { + if (!Nucleotide.decode(b).isValid()) { throw new IllegalArgumentException( "the input sequence contains invalid base calls like: " + StringUtils.escape(""+ (char) b)); } @@ -541,7 +541,7 @@ private void writeDictEntry() { * @param bases array containing the bases to be added. * @return this instance. * @throws IllegalArgumentException if {@bases} is {@code null} or - * the input array contains invalid bases (as assessed by: {@link Nucleotide#valueOf(byte)}). + * the input array contains invalid bases (as assessed by: {@link Nucleotide#decode(byte)}). * @throws IllegalStateException if no sequence was started or the writer is already closed. * @throws IOException if such exception is throw when writing in any of the outputs. */ @@ -560,7 +560,7 @@ public FastaReferenceWriter appendBases(final byte[] bases) * @return this instance. * @throws IllegalArgumentException if {@bases} is {@code null} or * {@code offset} and {@code length} do not entail a valid range in {@code bases} or - * that range in {@base} contain invalid bases (as assessed by: {@link Nucleotide#valueOf(byte)}). + * that range in {@base} contain invalid bases (as assessed by: {@link Nucleotide#decode(byte)}). 
* @throws IllegalStateException if no sequence was started or the writer is already closed. * @throws IOException if such exception is throw when writing in any of the outputs. */ diff --git a/src/test/java/org/broadinstitute/hellbender/utils/NucleotideUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/NucleotideUnitTest.java index a3461a72202..f3fe77f1894 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/NucleotideUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/NucleotideUnitTest.java @@ -19,65 +19,278 @@ public class NucleotideUnitTest { private static final int MAX_RANDOM_SEQ_LENGTH = 100; private static final int NUMBER_OF_RANDOM_SEQUENCES = 10; - @Test - public void testToBase() { - Assert.assertEquals(Nucleotide.A.toBase(), (byte)'A'); - Assert.assertEquals(Nucleotide.C.toBase(), (byte)'C'); - Assert.assertEquals(Nucleotide.G.toBase(), (byte)'G'); - Assert.assertEquals(Nucleotide.N.toBase(), (byte)'N'); - Assert.assertEquals(Nucleotide.T.toBase(), (byte)'T'); - Assert.assertEquals(Nucleotide.X.toBase(), (byte)'X'); + @Test(dataProvider = "values") + public void testEncodeAsByte(final Nucleotide nuc) { + // Will always use the first letter of the constant as the one byt + final char firstLetter = nuc.name().charAt(0); + final byte expectedLowerEncoding = (byte) Character.toLowerCase(firstLetter); + final byte expectedUpperEncoding = (byte) Character.toUpperCase(firstLetter); + Assert.assertEquals(nuc.encodeAsByte(), expectedUpperEncoding); // by default is upper case. 
+ Assert.assertEquals(nuc.encodeAsByte(true), expectedUpperEncoding); + Assert.assertEquals(nuc.encodeAsByte(false), expectedLowerEncoding); } - @Test - public void testIsConcrete() { - for (final Nucleotide nuc : Nucleotide.values()) { - switch (nuc) { - case A: - case C: - case T: - case G: - Assert.assertTrue(nuc.isConcrete()); - break; - default: - Assert.assertFalse(nuc.isConcrete()); - } + + @Test(dataProvider = "values") + public void testIsConcrete(final Nucleotide nuc) { + switch (nuc) { + case A: + case C: + case T: + case G: + Assert.assertTrue(nuc.isConcrete()); + break; + default: + Assert.assertFalse(nuc.isConcrete()); } } - @Test(expectedExceptions = UnsupportedOperationException.class) - public void testToBaseOnInvalid() { - Nucleotide.INVALID.toBase(); + @Test(dataProvider = "values") + public void testIsAmbiguous(final Nucleotide nuc) { + switch (nuc) { + case X: + case A: + case C: + case T: + case G: + Assert.assertFalse(nuc.isAmbiguous()); + break; + default: + Assert.assertTrue(nuc.isAmbiguous()); + } + } + + @Test(dataProvider = "values") + public void testIsValid(final Nucleotide nuc) { + switch (nuc) { + case X: + Assert.assertFalse(nuc.isValid()); + break; + default: + Assert.assertTrue(nuc.isValid()); + } } + @Test - public void testValueOfBase() { + public void testDecode() { for (byte i = 0; i >= 0; i++) { final Nucleotide expected; switch (i) { case 'a': - case 'A': expected = Nucleotide.A; break; + case 'A': + expected = Nucleotide.A; + break; case 'c': - case 'C': expected = Nucleotide.C; break; + case 'C': + expected = Nucleotide.C; + break; case 'g': - case 'G': expected = Nucleotide.G; break; + case 'G': + expected = Nucleotide.G; + break; case 't': case 'T': case 'u': - case 'U': expected = Nucleotide.T; break; + case 'U': + expected = Nucleotide.T; + break; case 'n': - case 'N': expected = Nucleotide.N; break; + case 'N': + expected = Nucleotide.N; + break; case 'x': - case 'X': expected = Nucleotide.X; break; - default : 
expected = Nucleotide.INVALID; + case 'X': + expected = Nucleotide.X; + break; + case 'r': + case 'R': + expected = Nucleotide.R; + break; + case 'b': + case 'B': + expected = Nucleotide.B; + break; + case 'v': + case 'V': + expected = Nucleotide.V; + break; + case 'y': + case 'Y': + expected = Nucleotide.Y; + break; + case 's': + case 'S': + expected = Nucleotide.S; + break; + case 'w': + case 'W': + expected = Nucleotide.W; + break; + case 'k': + case 'K': + expected = Nucleotide.K; + break; + case 'm': + case 'M': + expected = Nucleotide.M; + break; + case 'd': + case 'D': + expected = Nucleotide.D; + break; + case 'h': + case 'H': + expected = Nucleotide.H; + break; + default: + expected = Nucleotide.X; + } + Assert.assertSame(Nucleotide.decode(i), expected, "Failed with base " + i + " returning nucleotide " + Nucleotide.decode(i)); + Assert.assertSame(Nucleotide.decode((char)i), expected, "Failed with base " + i + " returning nucleotide " + Nucleotide.decode((char)i)); + } + } + + @Test(dataProvider = "values") + public void testIncludes(final Nucleotide nuc) { + if (nuc.isConcrete()) { + for (final Nucleotide other : Nucleotide.values()) { + if (other.isConcrete()) { + Assert.assertEquals(nuc.includes(other), nuc == other); + Assert.assertEquals(nuc.includes(other.encodeAsByte()), nuc == other); + Assert.assertEquals(nuc.includes(other.encodeAsByte(false)), nuc == other); + } else { + Assert.assertFalse(nuc.includes(other)); + Assert.assertFalse(nuc.includes(other.encodeAsByte())); + Assert.assertFalse(nuc.includes(other.encodeAsByte(false))); + } + } + } else if (nuc.isAmbiguous()) { + for (final Nucleotide other : Nucleotide.values()) { + final boolean thisA = nuc.includes(Nucleotide.A); + final boolean thisC = nuc.includes(Nucleotide.C); + final boolean thisG = nuc.includes(Nucleotide.G); + final boolean thisT = nuc.includes(Nucleotide.T); + final boolean otherA = other.includes(Nucleotide.A); + final boolean otherC = other.includes(Nucleotide.C); + final 
boolean otherG = other.includes(Nucleotide.G); + final boolean otherT = other.includes(Nucleotide.T); + final boolean includes = other.isValid() && (thisA == otherA || thisA) + && (thisC == otherC || thisC) + && (thisG == otherG || thisG) + && (thisT == otherT || thisT); + Assert.assertEquals(nuc.includes(other), includes, "" + nuc + " " + other); + Assert.assertEquals(nuc.includes(other.encodeAsByte()), includes); + Assert.assertEquals(nuc.includes(other.encodeAsByte(false)), includes); + } + } else { // invalid + for (final Nucleotide other : Nucleotide.values()) { + Assert.assertFalse(nuc.includes(other)); + Assert.assertFalse(nuc.includes(other.encodeAsByte())); + Assert.assertFalse(nuc.includes(other.encodeAsByte(false))); } - Assert.assertSame(Nucleotide.valueOf(i), expected, "Failed with base " + i + " returning nucleotide " + Nucleotide.valueOf(i)); + } + + } + + @Test(dataProvider = "values") + public void testIntersects(final Nucleotide nuc) { + final boolean thisA = nuc.includes(Nucleotide.A); + final boolean thisC = nuc.includes(Nucleotide.C); + final boolean thisG = nuc.includes(Nucleotide.G); + final boolean thisT = nuc.includes(Nucleotide.T); + for (final Nucleotide other : Nucleotide.values()) { + final boolean otherA = other.includes(Nucleotide.A); + final boolean otherC = other.includes(Nucleotide.C); + final boolean otherG = other.includes(Nucleotide.G); + final boolean otherT = other.includes(Nucleotide.T); + final Nucleotide intersect = nuc.intersect(other); + Assert.assertNotNull(intersect); + Assert.assertEquals(intersect.includes(Nucleotide.A), thisA && otherA); + Assert.assertEquals(intersect.includes(Nucleotide.C), thisC && otherC); + Assert.assertEquals(intersect.includes(Nucleotide.G), thisG && otherG); + Assert.assertEquals(intersect.includes(Nucleotide.T), thisT && otherT); + } + } + + @Test(dataProvider = "values") + public void testComplement(final Nucleotide nuc) { + final boolean thisA = nuc.includes(Nucleotide.A); + final boolean 
thisC = nuc.includes(Nucleotide.C); + final boolean thisG = nuc.includes(Nucleotide.G); + final boolean thisT = nuc.includes(Nucleotide.T); + final Nucleotide complement = nuc.complement(); + final boolean compA = complement.includes(Nucleotide.A); + final boolean compC = complement.includes(Nucleotide.C); + final boolean compG = complement.includes(Nucleotide.G); + final boolean compT = complement.includes(Nucleotide.T); + final String errorMessage = "Failure with " + nuc + " result in complement " + complement; + Assert.assertEquals(compA, thisT, errorMessage); + Assert.assertEquals(compT, thisA, errorMessage); + Assert.assertEquals(compC, thisG, errorMessage); + Assert.assertEquals(compG, thisC, errorMessage); + } + + @Test(dataProvider = "values") + public void testTransition(final Nucleotide nuc) { + final boolean thisA = nuc.includes(Nucleotide.A); + final boolean thisC = nuc.includes(Nucleotide.C); + final boolean thisG = nuc.includes(Nucleotide.G); + final boolean thisT = nuc.includes(Nucleotide.T); + final Nucleotide trans = nuc.transition(); + final boolean tranA = trans.includes(Nucleotide.A); + final boolean tranC = trans.includes(Nucleotide.C); + final boolean tranG = trans.includes(Nucleotide.G); + final boolean tranT = trans.includes(Nucleotide.T); + final String errorMessage = "Failure with " + nuc + " result in transition " + trans; + Assert.assertEquals(tranA, thisG, errorMessage); + Assert.assertEquals(tranG, thisA, errorMessage); + Assert.assertEquals(tranC, thisT, errorMessage); + Assert.assertEquals(tranT, thisC, errorMessage); + } + + @Test(dataProvider = "values") + public void testTransversion(final Nucleotide nuc) { + final boolean thisA = nuc.includes(Nucleotide.A); + final boolean thisC = nuc.includes(Nucleotide.C); + final boolean thisG = nuc.includes(Nucleotide.G); + final boolean thisT = nuc.includes(Nucleotide.T); + final Nucleotide trans = nuc.transversion(); + final boolean tranA = trans.includes(Nucleotide.A); + final boolean 
tranC = trans.includes(Nucleotide.C); + final boolean tranG = trans.includes(Nucleotide.G); + final boolean tranT = trans.includes(Nucleotide.T); + final String errorMessage = "Failure with " + nuc + " result in transversion " + trans; + Assert.assertEquals(tranA, thisC || thisT, errorMessage); + Assert.assertEquals(tranG, thisC || thisT, errorMessage); + Assert.assertEquals(tranC, thisA || thisG, errorMessage); + Assert.assertEquals(tranT, thisA || thisG, errorMessage); + final Nucleotide transStrong = nuc.transversion(true); + final Nucleotide transWeak = nuc.transversion(false); + Assert.assertTrue(trans.includes(transStrong) || trans == trans.X || transStrong == trans.X); + Assert.assertTrue(trans.includes(transWeak) || trans == trans.X || transStrong == trans.X); + Assert.assertTrue(transStrong.intersect(transWeak) == trans.X); + Assert.assertEquals(transStrong.includes(Nucleotide.C), tranC); + Assert.assertEquals(transStrong.includes(Nucleotide.G), tranG); + Assert.assertEquals(transWeak.includes(Nucleotide.A), tranA); + Assert.assertEquals(transWeak.includes(Nucleotide.T), tranT); + } + + @Test(dataProvider = "values") + public void testSame(final Nucleotide nuc) { + for (final Nucleotide other : Nucleotide.values()) { + final boolean reallyTheSame = nuc != Nucleotide.INVALID && nuc == other; + Assert.assertEquals(nuc.same(other), reallyTheSame); + Assert.assertEquals(Nucleotide.same(nuc.encodeAsByte(), other.encodeAsByte()), reallyTheSame); + Assert.assertEquals(Nucleotide.same(nuc.encodeAsByte(false), other.encodeAsByte()), reallyTheSame); + Assert.assertEquals(Nucleotide.same(nuc.encodeAsByte(), other.encodeAsByte(false)), reallyTheSame); + Assert.assertEquals(Nucleotide.same(nuc.encodeAsByte(false), other.encodeAsByte(false)), reallyTheSame); } } @Test(expectedExceptions = IllegalArgumentException.class) public void testValueOfNegativeBase() { - Nucleotide.valueOf((byte) -10); + Nucleotide.decode((byte) -10); } @Test @@ -88,13 +301,13 @@ public void 
testNucleotideCounterInit() { } } - @Test(dependsOnMethods = "testValueOfBase", dataProvider = "testSequences") + @Test(dependsOnMethods = "testDecode", dataProvider = "testSequences") public void testAddingOneByOne(final byte[] bases) { final Nucleotide.Counter subject = new Nucleotide.Counter(); final Mapshadow = new HashMap<>(Nucleotide.values().length); for (final byte base : bases) { subject.add(base); - final Nucleotide nuc = Nucleotide.valueOf(base); + final Nucleotide nuc = Nucleotide.decode(base); shadow.put(nuc, shadow.getOrDefault(nuc, 0) + 1); for (final Nucleotide n : Nucleotide.values()) { Assert.assertEquals(subject.get(n), (long) shadow.getOrDefault(n, 0)); @@ -103,12 +316,12 @@ public void testAddingOneByOne(final byte[] bases) { Assert.assertEquals(subject.sum(), shadow.values().stream().mapToLong(l -> l).sum()); } - @Test(dependsOnMethods = "testValueOfBase", dataProvider = "testSequences") + @Test(dependsOnMethods = "testDecode", dataProvider = "testSequences") public void testAddingAllAtOnce(final byte[] bases) { final Nucleotide.Counter subject = new Nucleotide.Counter(); final Map shadow = new HashMap<>(Nucleotide.values().length); for (final byte base : bases) { - final Nucleotide nuc = Nucleotide.valueOf(base); + final Nucleotide nuc = Nucleotide.decode(base); shadow.put(nuc, shadow.getOrDefault(nuc, 0) + 1); } subject.addAll(bases); @@ -128,17 +341,16 @@ public void testAddingAllAtOnceOnANullArray() { @Test(expectedExceptions = IllegalArgumentException.class) public void testAddingAllAtOnceWithNegativeBases() { final Nucleotide.Counter subject = new Nucleotide.Counter(); - subject.addAll(new byte[] { 'a', 'A', -10, 'C' } ); + subject.addAll(new byte[]{'a', 'A', -10, 'C'}); } - - @Test(dependsOnMethods = "testValueOfBase", dataProvider = "testSequences") + @Test(dependsOnMethods = "testDecode", dataProvider = "testSequences") public void testClear(final byte[] bases) { final Nucleotide.Counter subject = new Nucleotide.Counter(); final Map 
shadow = new HashMap<>(Nucleotide.values().length); for (final byte base : bases) { - final Nucleotide nuc = Nucleotide.valueOf(base); + final Nucleotide nuc = Nucleotide.decode(base); shadow.put(nuc, shadow.getOrDefault(nuc, 0) + 1); } subject.addAll(bases); @@ -156,18 +368,27 @@ public void testClear(final byte[] bases) { public Object[][] testSequences() { final List