From 319f1859cebebf7c9b1ed0d84322dbfce41b8f37 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 24 Aug 2018 16:14:43 -0700 Subject: [PATCH] Stable key order for map vectorizers (#88) --- .../stages/impl/feature/OPMapVectorizer.scala | 25 ++- .../impl/feature/SmartTextMapVectorizer.scala | 2 +- .../feature/BinaryMapVectorizerTest.scala | 14 +- .../GeolocationMapVectorizerTest.scala | 153 ++++++++---------- .../feature/IntegralMapVectorizerTest.scala | 120 ++++++-------- .../impl/feature/RealMapVectorizerTest.scala | 134 +++++++-------- .../impl/preparators/SanityCheckerTest.scala | 24 +-- 7 files changed, 214 insertions(+), 258 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala index b41c306bcc..70f03ec484 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizer.scala @@ -84,7 +84,6 @@ abstract class OPMapVectorizer[A, T <: OPMap[A]] val meta = if ($(trackNulls)) makeVectorMetaWithNullIndicators(allKeys) else makeVectorMetadata(allKeys) setMetadata(meta.toMetadata) - val args = OPMapVectorizerModelArgs( allKeys = allKeys, fillByKey = fillByKey(dataset), @@ -124,14 +123,14 @@ class IntegralMapVectorizer[T <: OPMap[Long]](uid: String = UID[IntegralMapVecto def setFillWithMode(shouldFill: Boolean): this.type = set(withConstant, !shouldFill) override def fillByKey(dataset: Dataset[Seq[T#Value]]): Seq[Map[String, Double]] = { - val size = getInputFeatures().length - val cleanedData = dataset.map(_.map( - cleanMap(_, shouldCleanKey = $(cleanKeys), shouldCleanValue = shouldCleanValues) - )) - if ($(withConstant)) Seq.empty else { + val size = getInputFeatures().length val modeAggr = SequenceAggregators.ModeSeqMapLong(size = size) + val shouldCleanKeys = $(cleanKeys) + val cleanedData = dataset.map(_.map( + cleanMap(_, shouldCleanKey = shouldCleanKeys, shouldCleanValue = shouldCleanValues) + )) cleanedData.select(modeAggr.toColumn).first() }.map(convertFn) } @@ -203,8 +202,6 @@ class TextMapHashingVectorizer[T <: OPMap[String]] def setHashSpaceStrategy(v: HashSpaceStrategy): this.type = set(hashSpaceStrategy, v.entryName) def getHashSpaceStrategy: HashSpaceStrategy = HashSpaceStrategy.withNameInsensitive($(hashSpaceStrategy)) - def getFillByKey(dataset: Dataset[Seq[T#Value]]): Seq[Map[String, Double]] = Seq.empty - def makeModel(args: OPMapVectorizerModelArgs, operationName: String, uid: String): OPMapVectorizerModel[String, T] = new TextMapHashingVectorizerModel[T]( args = args.copy(shouldCleanValues = $(cleanText)), @@ -230,14 +227,14 @@ class RealMapVectorizer[T <: OPMap[Double]](uid: String = UID[RealMapVectorizer[ def setFillWithMean(shouldFill: Boolean): this.type = set(withConstant, !shouldFill) override def fillByKey(dataset: Dataset[Seq[T#Value]]): Seq[Map[String, Double]] = { - val size = getInputFeatures().length - val cleanedData = dataset.map(_.map( - cleanMap(_, shouldCleanKey = $(cleanKeys), shouldCleanValue = shouldCleanValues) - )) - if ($(withConstant)) Seq.empty else { + val size = getInputFeatures().length val meanAggr = SequenceAggregators.MeanSeqMapDouble(size = size) + val shouldCleanKeys = $(cleanKeys) + val cleanedData = dataset.map(_.map( + cleanMap(_, shouldCleanKey = shouldCleanKeys, shouldCleanValue = shouldCleanValues) + )) cleanedData.select(meanAggr.toColumn).first() } } @@ -282,7 +279,7 @@ trait MapVectorizerFuns[A, T <: OPMap[A]] extends VectorizerDefaults with MapPiv in.map(_.map(kb => filterKeys(kb, shouldCleanKey = shouldCleanKeys, shouldCleanValue = shouldCleanValues).keySet)) .select(sumAggr.toColumn) .first() - .map(_.toSeq) + .map(_.toSeq.sorted) } protected def makeVectorMetadata(allKeys: Seq[Seq[String]]): OpVectorMetadata = { diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala index 80b09d00ea..f9b79cbd59 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala @@ -121,7 +121,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]] def makeSmartTextMapVectorizerModelArgs(aggregatedStats: Array[TextMapStats]): SmartTextMapVectorizerModelArgs = { val maxCard = $(maxCardinality) - val minSup = ${minSupport} + val minSup = $(minSupport) val shouldCleanKeys = $(cleanKeys) val shouldCleanValues = $(cleanText) val shouldTrackNulls = $(trackNulls) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala index 780d3c6dad..c4e79457b9 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala @@ -55,9 +55,9 @@ class BinaryMapVectorizerTest val estimator = new BinaryMapVectorizer().setTrackNulls(false).setCleanKeys(true).setInput(m1, m2) - val expectedResult: Seq[OPVector] = Seq( + val expectedResult = Seq( Vectors.sparse(6, Array(1), Array(1.0)), - Vectors.sparse(6, Array(4, 5), Array(1.0, 1.0)), + Vectors.sparse(6, Array(3, 4), Array(1.0, 1.0)), Vectors.sparse(6, Array(), Array()) ).map(_.toOPVector) @@ -68,7 +68,7 @@ class BinaryMapVectorizerTest val expectedMeta = TestOpVectorMetadataBuilder( estimator, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), - m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "X")) + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Z")) ) transformed.collect(vector) shouldBe expectedResult @@ -82,8 +82,8 @@ class BinaryMapVectorizerTest val transformed = estimator.setTrackNulls(true).fit(inputData).transform(inputData) val vector = estimator.getOutput() val expected = Array( - Vectors.sparse(12, Array(2, 5, 9, 11), Array(1.0, 1.0, 1.0, 1.0)), - Vectors.sparse(12, Array(1, 3, 7, 8, 10), Array(1.0, 1.0, 1.0, 1.0, 1.0)), + Vectors.sparse(12, Array(2, 5, 7, 9), Array(1.0, 1.0, 1.0, 1.0)), + Vectors.sparse(12, Array(1, 3, 6, 8, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) @@ -93,9 +93,9 @@ class BinaryMapVectorizerTest m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(nullIndicatorValue, "A"), IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), IndColWithGroup(None, "C"), IndColWithGroup(nullIndicatorValue, "C")), - m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z"), + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y"), - IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X")) + IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z")) ) transformed.collect(vector) shouldBe expected diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala index 866806055f..9d542fc057 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala @@ -31,21 +31,22 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.TestOpVectorColumnType.{IndCol, IndColWithGroup} -import com.salesforce.op.test.{TestFeatureBuilder, TestOpVectorMetadataBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import com.salesforce.op.stages.base.sequence.SequenceModel +import com.salesforce.op.test.TestOpVectorColumnType.IndColWithGroup +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestOpVectorMetadataBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.RichStructType._ +import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.FlatSpec @RunWith(classOf[JUnitRunner]) -class GeolocationMapVectorizerTest extends FlatSpec with TestSparkContext { +class GeolocationMapVectorizerTest + extends OpEstimatorSpec[OPVector, SequenceModel[GeolocationMap, OPVector], GeolocationMapVectorizer] { - lazy val (data, m1, m2) = TestFeatureBuilder("m1", "m2", + val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( (Map("a" -> Seq(32.4, -100.2, 3.0), "b" -> Seq(33.8, -108.7, 2.0)), Map("z" -> Seq(45.0, -105.5, 4.0))), (Map("c" -> Seq(33.8, -108.7, 2.0)), Map("y" -> Seq(42.5, -95.4, 4.0), "x" -> Seq(40.4, -116.3, 2.0))), @@ -53,77 +54,66 @@ class GeolocationMapVectorizerTest extends FlatSpec with TestSparkContext { ).map(v => (GeolocationMap(v._1), GeolocationMap(v._2))) ) - val baseVectorizer = new GeolocationMapVectorizer().setInput(m1, m2).setCleanKeys(true) + val estimator = new GeolocationMapVectorizer().setInput(m1, m2).setTrackNulls(false).setCleanKeys(true) + + val expectedResult = Seq( + Vectors.sparse(18, Array(0, 1, 2, 3, 4, 5, 15, 16, 17), + Array(32.4, -100.2, 3.0, 33.8, -108.7, 2.0, 45.0, -105.5, 4.0)), + Vectors.sparse(18, Array(6, 7, 8, 9, 10, 11, 12, 13, 14), + Array(33.8, -108.7, 2.0, 40.4, -116.3, 2.0, 42.5, -95.4, 4.0)), + Vectors.sparse(18, Array(), Array()) + ).map(_.toOPVector) + val expectedMeta = TestOpVectorMetadataBuilder( - baseVectorizer, + estimator, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "A"), IndColWithGroup(None, "A"), IndColWithGroup(None, "B"), IndColWithGroup(None, "B"), IndColWithGroup(None, "B"), IndColWithGroup(None, "C"), IndColWithGroup(None, "C"), IndColWithGroup(None, "C")), - m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(None, "Z"), IndColWithGroup(None, "Z"), + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y"), - IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), IndColWithGroup(None, "X")) + IndColWithGroup(None, "Z"), IndColWithGroup(None, "Z"), IndColWithGroup(None, "Z")) ) val nullIndicatorValue = Some(OpVectorColumnMetadata.NullString) val expectedMetaTrackNulls = TestOpVectorMetadataBuilder( - baseVectorizer, + estimator, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "A"), IndColWithGroup(None, "A"), IndColWithGroup(nullIndicatorValue, "A"), IndColWithGroup(None, "B"), IndColWithGroup(None, "B"), IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), IndColWithGroup(None, "C"), IndColWithGroup(None, "C"), IndColWithGroup(None, "C"), IndColWithGroup(nullIndicatorValue, "C")), - m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(None, "Z"), IndColWithGroup(None, "Z"), - IndColWithGroup(nullIndicatorValue, "Z"), + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), + IndColWithGroup(nullIndicatorValue, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y"), - IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), - IndColWithGroup(nullIndicatorValue, "X")) + IndColWithGroup(None, "Z"), IndColWithGroup(None, "Z"), IndColWithGroup(None, "Z"), + IndColWithGroup(nullIndicatorValue, "Z")) ) - Spec[GeolocationMapVectorizer] should "take an array of features as input and return a single vector feature" in { - val vectorizer = new GeolocationMapVectorizer().setInput(m1, m2).setCleanKeys(true) - val vector = vectorizer.getOutput() - - vector.name shouldBe vectorizer.getOutputFeatureName - vector.parents should contain theSameElementsAs Array(m1, m2) - vector.originStage shouldBe vectorizer - vector.typeName shouldBe FeatureType.typeName[OPVector] - vector.isResponse shouldBe false - } - it should "return a model that correctly transforms the data" in { - val vectorizer = baseVectorizer.setTrackNulls(false).fit(data) - val transformed = vectorizer.transform(data) + val vectorizer = estimator.fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() - val expected = Array( - Vectors.sparse(18, Array(0, 1, 2, 3, 4, 5, 9, 10, 11), - Array(32.4, -100.2, 3.0, 33.8, -108.7, 2.0, 45.0, -105.5, 4.0)), - Vectors.sparse(18, Array(6, 7, 8, 12, 13, 14, 15, 16, 17), - Array(33.8, -108.7, 2.0, 42.5, -95.4, 4.0, 40.4, -116.3, 2.0)), - Vectors.sparse(18, Array(), Array()) - ).map(_.toOPVector) - - transformed.collect(vector) shouldBe expected + transformed.collect(vector) shouldBe expectedResult transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta } it should "track nulls" in { - val vectorizer = baseVectorizer.setTrackNulls(true).fit(data) - val transformed = vectorizer.transform(data) + val vectorizer = estimator.setTrackNulls(true).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( - Vectors.sparse(24, Array(0, 1, 2, 4, 5, 6, 11, 12, 13, 14, 19, 23), - Array(32.4, -100.2, 3.0, 33.8, -108.7, 2.0, 1.0, 45.0, -105.5, 4.0, 1.0, 1.0)), - Vectors.sparse(24, Array(3, 7, 8, 9, 10, 15, 16, 17, 18, 20, 21, 22), - Array(1.0, 1.0, 33.8, -108.7, 2.0, 1.0, 42.5, -95.4, 4.0, 40.4, -116.3, 2.0)), + Vectors.sparse(24, Array(0, 1, 2, 4, 5, 6, 11, 15, 19, 20, 21, 22), + Array(32.4, -100.2, 3.0, 33.8, -108.7, 2.0, 1.0, 1.0, 1.0, 45.0, -105.5, 4.0)), + Vectors.sparse(24, Array(3, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18, 23), + Array(1.0, 1.0, 33.8, -108.7, 2.0, 40.4, -116.3, 2.0, 42.5, -95.4, 4.0, 1.0)), Vectors.sparse(24, Array(3, 7, 11, 15, 19, 23), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) - transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -131,17 +121,15 @@ class GeolocationMapVectorizerTest extends FlatSpec with TestSparkContext { } it should "use the correct fill value for missing keys" in { - val vectorizer = baseVectorizer.setTrackNulls(false) - .setDefaultValue(Geolocation(6.0, 6.0, GeolocationAccuracy.Zip)).fit(data) - val transformed = vectorizer.transform(data) + val vectorizer = estimator.setTrackNulls(false) + .setDefaultValue(Geolocation(6.0, 6.0, GeolocationAccuracy.Zip)).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( - Vectors.dense(Array(32.4, -100.2, 3.0, 33.8, -108.7, 2.0, 6.0, 6.0, 6.0, 45.0, -105.5, 4.0, 6.0, 6.0, 6.0, - 6.0, 6.0, 6.0)), - Vectors.dense(Array(6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 33.8, -108.7, 2.0, 6.0, 6.0, 6.0, 42.5, -95.4, 4.0, - 40.4, -116.3, 2.0)), - Vectors.dense(Array.fill(18)(6.0)) - ).map(_.toOPVector) + Array(32.4, -100.2, 3.0, 33.8, -108.7, 2.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 45.0, -105.5, 4.0), + Array(6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 33.8, -108.7, 2.0, 40.4, -116.3, 2.0, 42.5, -95.4, 4.0, 6.0, 6.0, 6.0), + Array.fill(18)(6.0) + ).map(v => Vectors.dense(v).toOPVector) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta @@ -150,17 +138,17 @@ class GeolocationMapVectorizerTest extends FlatSpec with TestSparkContext { } it should "track nulls with missing keys" in { - val vectorizer = baseVectorizer.setTrackNulls(true) - .setDefaultValue(Geolocation(6.0, 6.0, GeolocationAccuracy.Zip)).fit(data) - val transformed = vectorizer.transform(data) + val vectorizer = estimator.setTrackNulls(true) + .setDefaultValue(Geolocation(6.0, 6.0, GeolocationAccuracy.Zip)).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( - Vectors.dense(Array(32.4, -100.2, 3.0, 0.0, 33.8, -108.7, 2.0, 0.0, 6.0, 6.0, 6.0, 1.0, 45.0, -105.5, 4.0, 0.0, - 6.0, 6.0, 6.0, 1.0, 6.0, 6.0, 6.0, 1.0)), - Vectors.dense(Array(6.0, 6.0, 6.0, 1.0, 6.0, 6.0, 6.0, 1.0, 33.8, -108.7, 2.0, 0.0, 6.0, 6.0, 6.0, 1.0, - 42.5, -95.4, 4.0, 0.0, 40.4, -116.3, 2.0, 0.0)), - Vectors.dense((0 until 6).flatMap(k => Seq.fill(3)(6.0) :+ 1.0).toArray) - ).map(_.toOPVector) + Array(32.4, -100.2, 3.0, 0.0, 33.8, -108.7, 2.0, 0.0, 6.0, 6.0, 6.0, 1.0, 6.0, 6.0, 6.0, 1.0, 6.0, 6.0, 6.0, 1.0, + 45.0, -105.5, 4.0, 0.0), + Array(6.0, 6.0, 6.0, 1.0, 6.0, 6.0, 6.0, 1.0, 33.8, -108.7, 2.0, 0.0, 40.4, -116.3, 2.0, 0.0, 42.5, -95.4, 4.0, + 0.0, 6.0, 6.0, 6.0, 1.0), + (0 until 6).flatMap(k => Seq.fill(3)(6.0) :+ 1.0).toArray + ).map(v => Vectors.dense(v).toOPVector) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls @@ -170,8 +158,8 @@ class GeolocationMapVectorizerTest extends FlatSpec with TestSparkContext { it should "correctly whitelist keys" in { val vectorizer = new GeolocationMapVectorizer().setCleanKeys(true).setTrackNulls(false) - .setInput(m1, m2).setWhiteListKeys(Array("a", "b", "z")).fit(data) - val transformed = vectorizer.transform(data) + .setInput(m1, m2).setWhiteListKeys(Array("a", "b", "z")).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Vectors.dense(Array(32.4, -100.2, 3.0, 33.8, -108.7, 2.0, 45.0, -105.5, 4.0)), @@ -193,8 +181,8 @@ class GeolocationMapVectorizerTest extends FlatSpec with TestSparkContext { it should "track nulls with whitelist keys" in { val vectorizer = new GeolocationMapVectorizer().setCleanKeys(true).setTrackNulls(true) - .setInput(m1, m2).setWhiteListKeys(Array("a", "b", "z")).fit(data) - val transformed = vectorizer.transform(data) + .setInput(m1, m2).setWhiteListKeys(Array("a", "b", "z")).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Vectors.dense(Array(32.4, -100.2, 3.0, 0.0, 33.8, -108.7, 2.0, 0.0, 45.0, -105.5, 4.0, 0.0)), @@ -219,20 +207,20 @@ class GeolocationMapVectorizerTest extends FlatSpec with TestSparkContext { it should "correctly backlist keys" in { val vectorizer = new GeolocationMapVectorizer().setInput(m1, m2).setCleanKeys(true).setTrackNulls(false) - .setBlackListKeys(Array("a", "z")).fit(data) - val transformed = vectorizer.transform(data) + .setBlackListKeys(Array("a", "z")).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Vectors.sparse(12, Array(0, 1, 2), Array(33.8, -108.7, 2.0)), - Vectors.dense(Array(0.0, 0.0, 0.0, 33.8, -108.7, 2.0, 42.5, -95.4, 4.0, 40.4, -116.3, 2.0)), + Vectors.dense(Array(0.0, 0.0, 0.0, 33.8, -108.7, 2.0, 40.4, -116.3, 2.0, 42.5, -95.4, 4.0)), Vectors.sparse(12, Array(), Array()) ).map(_.toOPVector) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(None, "B"), IndColWithGroup(None, "B"), IndColWithGroup(None, "C"), IndColWithGroup(None, "C"), IndColWithGroup(None, "C")), - m2 -> List(IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y"), - IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), IndColWithGroup(None, "X")) + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), + IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y")) ) transformed.collect(vector) shouldBe expected @@ -243,12 +231,12 @@ class GeolocationMapVectorizerTest extends FlatSpec with TestSparkContext { it should "track nulls with backlist keys" in { val vectorizer = new GeolocationMapVectorizer().setInput(m1, m2).setCleanKeys(true).setTrackNulls(true) - .setBlackListKeys(Array("a", "z")).fit(data) - val transformed = vectorizer.transform(data) + .setBlackListKeys(Array("a", "z")).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Vectors.sparse(16, Array(0, 1, 2, 7, 11, 15), Array(33.8, -108.7, 2.0, 1.0, 1.0, 1.0)), - Vectors.dense(Array(0.0, 0.0, 0.0, 1.0, 33.8, -108.7, 2.0, 0.0, 42.5, -95.4, 4.0, 0.0, 40.4, -116.3, 2.0, 0.0)), + Vectors.dense(Array(0.0, 0.0, 0.0, 1.0, 33.8, -108.7, 2.0, 0.0, 40.4, -116.3, 2.0, 0.0, 42.5, -95.4, 4.0, 0.0)), Vectors.sparse(16, Array(3, 7, 11, 15), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val expectedMeta = TestOpVectorMetadataBuilder( @@ -257,10 +245,10 @@ class GeolocationMapVectorizerTest extends FlatSpec with TestSparkContext { IndColWithGroup(nullIndicatorValue, "B"), IndColWithGroup(None, "C"), IndColWithGroup(None, "C"), IndColWithGroup(None, "C"), IndColWithGroup(nullIndicatorValue, "C")), - m2 -> List(IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y"), - IndColWithGroup(nullIndicatorValue, "Y"), - IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), - IndColWithGroup(nullIndicatorValue, "X")) + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), IndColWithGroup(None, "X"), + IndColWithGroup(nullIndicatorValue, "X"), + IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Y"), + IndColWithGroup(nullIndicatorValue, "Y")) ) transformed.collect(vector) shouldBe expected @@ -270,15 +258,16 @@ class GeolocationMapVectorizerTest extends FlatSpec with TestSparkContext { } it should "have a working shortcut function" in { - val vectorizer = new GeolocationMapVectorizer().setInput(m1, m2).setCleanKeys(true).fit(data) - val transformed = vectorizer.transform(data) + val vectorizer = new GeolocationMapVectorizer().setInput(m1, m2).setCleanKeys(true).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expectedOutput = transformed.collect() // Now using the shortcut val res = m1.vectorize(cleanKeys = TransmogrifierDefaults.CleanKeys, others = Array(m2)) - val actualOutput = res.originStage.asInstanceOf[GeolocationMapVectorizer] - .fit(data).transform(data).collect() + res.originStage shouldBe a[GeolocationMapVectorizer] + val actualOutput = res.originStage.asInstanceOf[GeolocationMapVectorizer].fit(inputData) + .transform(inputData).collect() actualOutput.zip(expectedOutput).forall(f => f._1 == f._2) shouldBe true } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala index 3ae7837bb0..2f54a056be 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala @@ -31,21 +31,22 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.TestOpVectorColumnType.{IndCol, IndColWithGroup} -import com.salesforce.op.test.{TestFeatureBuilder, TestOpVectorMetadataBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import com.salesforce.op.stages.base.sequence.SequenceModel +import com.salesforce.op.test.TestOpVectorColumnType.IndColWithGroup +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestOpVectorMetadataBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.RichStructType._ +import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class IntegralMapVectorizerTest extends FlatSpec with TestSparkContext { +class IntegralMapVectorizerTest + extends OpEstimatorSpec[OPVector, SequenceModel[IntegralMap, OPVector], IntegralMapVectorizer[IntegralMap]] { - lazy val (data, m1, m2) = TestFeatureBuilder("m1", "m2", + val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( (Map("a" -> 1L, "b" -> 5L), Map("z" -> 10L)), (Map("c" -> 11L), Map("y" -> 3L, "x" -> 0L)), @@ -53,69 +54,53 @@ class IntegralMapVectorizerTest extends FlatSpec with TestSparkContext { ).map(v => v._1.toIntegralMap -> v._2.toIntegralMap) ) - val nullIndicatorValue = Some(OpVectorColumnMetadata.NullString) + val estimator = new IntegralMapVectorizer().setTrackNulls(false).setCleanKeys(true).setInput(m1, m2) - val baseVectorizer = new IntegralMapVectorizer().setInput(m1, m2).setCleanKeys(true) + val expectedResult = Seq( + Vectors.dense(Array(1.0, 5.0, 0.0, 0.0, 0.0, 10.0)), + Vectors.sparse(6, Array(2, 4), Array(11.0, 3.0)), + Vectors.sparse(6, Array(), Array()) + ).map(_.toOPVector) val expectedMeta = TestOpVectorMetadataBuilder( - baseVectorizer, - m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B"), - IndColWithGroup(None, "C")), - m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "X")) + estimator, + m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Z")) ) + val nullIndicatorValue = Some(OpVectorColumnMetadata.NullString) val expectedMetaTrackNulls = TestOpVectorMetadataBuilder( - baseVectorizer, + estimator, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(nullIndicatorValue, "A"), IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), IndColWithGroup(None, "C"), IndColWithGroup(nullIndicatorValue, "C")), - m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z"), + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y"), - IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X")) + IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z")) ) - Spec[IntegralMapVectorizer[_]] should "take an array of features as input and return a single vector feature" in { - val vector = baseVectorizer.getOutput() + it should "return a model that correctly transforms the data and produces metadata" in { + val vector = estimator.getOutput() + val transformed = model.transform(inputData) - vector.name shouldBe baseVectorizer.getOutputFeatureName - vector.parents should contain theSameElementsAs Array(m1, m2) - vector.originStage shouldBe baseVectorizer - vector.typeName shouldBe FeatureType.typeName[OPVector] - vector.isResponse shouldBe false - } - - it should "return a model that correctly transforms the data" in { - val vectorizer = baseVectorizer.setTrackNulls(false).fit(data) - - val vector = vectorizer.getOutput() - val transformed = vectorizer.transform(data) - - val expected = Array( - Vectors.dense(Array(1.0, 5.0, 0.0, 10.0, 0.0, 0.0)), - Vectors.sparse(6, Array(2, 4), Array(11.0, 3.0)), - Vectors.sparse(6, Array(), Array()) - ).map(_.toOPVector) - - - transformed.collect(vector) shouldBe expected - transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta - val vectorMetadata = vectorizer.getMetadata() - OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta + transformed.collect(vector) shouldBe expectedResult + transformed.schema.toOpVectorMetadata(estimator.getOutputFeatureName) shouldEqual expectedMeta + val vectorMetadata = estimator.getMetadata() + OpVectorMetadata(estimator.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta } it should "track nulls" in { - val vectorizer = baseVectorizer.setTrackNulls(true).fit(data) + val vectorizer = estimator.setTrackNulls(true).fit(inputData) val vector = vectorizer.getOutput() - val transformed = vectorizer.transform(data) + val transformed = vectorizer.transform(inputData) val expected = Array( - Vectors.sparse(12, Array(0, 2, 5, 6, 9, 11), Array(1.0, 5.0, 1.0, 10.0, 1.0, 1.0)), - Vectors.sparse(12, Array(1, 3, 4, 7, 8), Array(1.0, 1.0, 11.0, 1.0, 3.0)), + Vectors.sparse(12, Array(0, 2, 5, 7, 9, 10), Array(1.0, 5.0, 1.0, 1.0, 1.0, 10.0)), + Vectors.sparse(12, Array(1, 3, 4, 8, 11), Array(1.0, 1.0, 11.0, 3.0, 1.0)), Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) - transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -123,13 +108,13 @@ class IntegralMapVectorizerTest extends FlatSpec with TestSparkContext { } it should "use the correct fill value for missing keys" in { - val vectorizer = baseVectorizer.setDefaultValue(100).setTrackNulls(false).fit(data) + val vectorizer = estimator.setDefaultValue(100).setTrackNulls(false).fit(inputData) val vector = vectorizer.getOutput() - val transformed = vectorizer.transform(data) + val transformed = vectorizer.transform(inputData) val expected = Array( - Vectors.dense(Array(1.0, 5.0, 100.0, 10.0, 100.0, 100.0)), - Vectors.dense(Array(100.0, 100.0, 11.0, 100.0, 3.0, 0.0)), + Vectors.dense(Array(1.0, 5.0, 100.0, 100.0, 100.0, 10.0)), + Vectors.dense(Array(100.0, 100.0, 11.0, 0.0, 3.0, 100.0)), Vectors.dense(Array(100.0, 100.0, 100.0, 100.0, 100.0, 100.0)) ).map(_.toOPVector) @@ -140,17 +125,16 @@ class IntegralMapVectorizerTest extends FlatSpec with TestSparkContext { } it should "track nulls with the correct fill value for missing keys" in { - val vectorizer = baseVectorizer.setDefaultValue(100).setTrackNulls(true).fit(data) + val vectorizer = estimator.setDefaultValue(100).setTrackNulls(true).fit(inputData) val vector = vectorizer.getOutput() - val transformed = vectorizer.transform(data) + val transformed = vectorizer.transform(inputData) val expected = Array( - Vectors.sparse(12, Array(0, 2, 4, 5, 6, 8, 9, 10, 11), Array(1.0, 5.0, 100.0, 1.0, 10.0, 100.0, 1.0, 100.0, 1.0)), - Vectors.sparse(12, Array(0, 1, 2, 3, 4, 6, 7, 8), Array(100.0, 1.0, 100.0, 1.0, 11.0, 100.0, 1.0, 3.0)), + Vectors.dense(Array(1.0, 0.0, 5.0, 0.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 10.0, 0.0)), + Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 11.0, 0.0, 0.0, 0.0, 3.0, 0.0, 100.0, 1.0)), Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0)) ).map(_.toOPVector) - transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -160,9 +144,9 @@ class IntegralMapVectorizerTest extends FlatSpec with TestSparkContext { it should "correctly whitelist keys" in { val vectorizer = new IntegralMapVectorizer[IntegralMap]().setInput(m1, m2).setCleanKeys(true).setTrackNulls(false) - .setWhiteListKeys(Array("a", "b", "z")).fit(data) + .setWhiteListKeys(Array("a", "b", "z")).fit(inputData) val vector = vectorizer.getOutput() - val transformed = vectorizer.transform(data) + val transformed = vectorizer.transform(inputData) val expected = Array( Vectors.dense(Array(1.0, 5.0, 10.0)), @@ -184,9 +168,9 @@ class IntegralMapVectorizerTest extends FlatSpec with TestSparkContext { it should "correctly track nulls with whitelist keys" in { val vectorizer = new IntegralMapVectorizer[IntegralMap]().setInput(m1, m2).setCleanKeys(true).setTrackNulls(true) - .setWhiteListKeys(Array("a", "b", "z")).fit(data) + .setWhiteListKeys(Array("a", "b", "z")).fit(inputData) val vector = vectorizer.getOutput() - val transformed = vectorizer.transform(data) + val transformed = vectorizer.transform(inputData) val expected = Array( Vectors.sparse(6, Array(0, 2, 4), Array(1.0, 5.0, 10.0)), @@ -208,19 +192,19 @@ class IntegralMapVectorizerTest extends FlatSpec with TestSparkContext { it should "correctly backlist keys" in { val vectorizer = new IntegralMapVectorizer[IntegralMap]() - .setInput(m1, m2).setCleanKeys(true).setTrackNulls(false).setBlackListKeys(Array("a", "z")).fit(data) + .setInput(m1, m2).setCleanKeys(true).setTrackNulls(false).setBlackListKeys(Array("a", "z")).fit(inputData) val vector = vectorizer.getOutput() - val transformed = vectorizer.transform(data) + val transformed = vectorizer.transform(inputData) val expected = Array( Vectors.sparse(4, Array(0), Array(5.0)), - Vectors.dense(Array(0.0, 11.0, 3.0, 0.0)), + Vectors.dense(Array(0.0, 11.0, 0.0, 3.0)), Vectors.sparse(4, Array(), Array()) ).map(_.toOPVector) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), - m2 -> List(IndColWithGroup(None, "Y"), IndColWithGroup(None, "X")) + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "Y")) ) transformed.collect(vector) shouldBe expected @@ -231,21 +215,21 @@ class IntegralMapVectorizerTest extends FlatSpec with TestSparkContext { it should "track nulls with backlist keys" in { val vectorizer = new IntegralMapVectorizer[IntegralMap]() - .setInput(m1, m2).setCleanKeys(true).setTrackNulls(true).setBlackListKeys(Array("a", "z")).fit(data) + .setInput(m1, m2).setCleanKeys(true).setTrackNulls(true).setBlackListKeys(Array("a", "z")).fit(inputData) val vector = vectorizer.getOutput() - val transformed = vectorizer.transform(data) + val transformed = vectorizer.transform(inputData) val expected = Array( Vectors.sparse(8, Array(0, 3, 5, 7), Array(5.0, 1.0, 1.0, 1.0)), - Vectors.sparse(8, Array(1, 2, 4, 6), Array(1.0, 11.0, 3.0, 0.0)), + Vectors.sparse(8, Array(1, 2, 6), Array(1.0, 11.0, 3.0)), Vectors.sparse(8, Array(1, 3, 5, 7), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), IndColWithGroup(None, "C"), IndColWithGroup(nullIndicatorValue, "C")), - m2 -> List(IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y"), - IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X")) + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X"), + IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y")) ) transformed.collect(vector) shouldBe expected diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala index f84e93314f..f334370509 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala @@ -31,21 +31,22 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op.features.types._ -import com.salesforce.op.test.TestOpVectorColumnType.{IndCol, IndColWithGroup} -import com.salesforce.op.test.{TestFeatureBuilder, TestOpVectorMetadataBuilder, TestSparkContext} -import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import com.salesforce.op.stages.base.sequence.SequenceModel +import com.salesforce.op.test.TestOpVectorColumnType.IndColWithGroup +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestOpVectorMetadataBuilder} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.RichStructType._ +import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class RealMapVectorizerTest extends FlatSpec with TestSparkContext { +class RealMapVectorizerTest + extends OpEstimatorSpec[OPVector, SequenceModel[RealMap, OPVector], RealMapVectorizer[RealMap]] { - lazy val (data, m1, m2) = TestFeatureBuilder("m1", "m2", + val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( (Map("a" -> 1.0, "b" -> 5.0), Map("z" -> 10.0)), (Map("c" -> 11.0), Map("y" -> 3.0, "x" -> 0.0)), @@ -53,7 +54,7 @@ class RealMapVectorizerTest extends FlatSpec with TestSparkContext { ).map(v => v._1.toRealMap -> v._2.toRealMap) ) - lazy val (meanData, f1, f2) = TestFeatureBuilder("f1", "f2", + val (meanData, f1, f2) = TestFeatureBuilder("f1", "f2", Seq( (Map("a" -> 1.0, "b" -> 5.0), Map("y" -> 4.0, "x" -> 0.0, "z" -> 10.0)), (Map("a" -> -3.0, "b" -> 3.0, "c" -> 11.0), Map("y" -> 3.0, "x" -> 0.0)), @@ -62,60 +63,49 @@ class RealMapVectorizerTest extends FlatSpec with TestSparkContext { ).map(v => v._1.toRealMap -> v._2.toRealMap) ) - val baseVectorizer = new RealMapVectorizer[RealMap]().setInput(m1, m2).setCleanKeys(true) + val estimator = new RealMapVectorizer[RealMap]().setInput(m1, m2).setTrackNulls(false).setCleanKeys(true) + + val expectedResult = Seq( + Vectors.dense(Array(1.0, 5.0, 0.0, 0.0, 0.0, 10.0)), + Vectors.sparse(6, Array(2, 4), Array(11.0, 3.0)), + Vectors.sparse(6, Array(), Array()) + ).map(_.toOPVector) val expectedMeta = TestOpVectorMetadataBuilder( - baseVectorizer, + estimator, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), - m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "X")) + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Z")) ) val nullIndicatorValue = Some(OpVectorColumnMetadata.NullString) - val expectedMetaTrackNulls = TestOpVectorMetadataBuilder( - baseVectorizer, + estimator, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(nullIndicatorValue, "A"), IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), IndColWithGroup(None, "C"), IndColWithGroup(nullIndicatorValue, "C")), - m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z"), + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y"), - IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X")) + IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z")) ) - Spec[RealMapVectorizer[_]] should "take an array of features as input and return a single vector feature" in { - val vectorizer = new RealMapVectorizer[RealMap]().setCleanKeys(true).setInput(m1, m2) + it should "return a model that correctly transforms the data and produces metadata" in { + val vectorizer = estimator.setDefaultValue(0.0).setTrackNulls(false).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() - vector.name shouldBe vectorizer.getOutputFeatureName - vector.parents should contain theSameElementsAs Array(m1, m2) - vector.originStage shouldBe vectorizer - vector.typeName shouldBe FeatureType.typeName[OPVector] - vector.isResponse shouldBe false - } - - it should "return a model that correctly transforms the data" in { - val vectorizer = baseVectorizer.setDefaultValue(0.0).setTrackNulls(false).fit(data) - val transformed = vectorizer.transform(data) - val vector = vectorizer.getOutput() - val expected = Array( - Vectors.dense(Array(1.0, 5.0, 0.0, 10.0, 0.0, 0.0)), - Vectors.sparse(6, Array(2, 4), Array(11.0, 3.0)), - Vectors.sparse(6, Array(), Array()) - ).map(_.toOPVector) - - transformed.collect(vector) shouldBe expected + transformed.collect(vector) shouldBe expectedResult transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta } it should "track nulls" in { - val vectorizer = baseVectorizer.setDefaultValue(0.0).setTrackNulls(true).fit(data) - val transformed = vectorizer.transform(data) + val vectorizer = estimator.setDefaultValue(0.0).setTrackNulls(true).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( - Vectors.sparse(12, Array(0, 2, 5, 6, 9, 11), Array(1.0, 5.0, 1.0, 10.0, 1.0, 1.0)), - Vectors.sparse(12, Array(1, 3, 4, 7, 8), Array(1.0, 1.0, 11.0, 1.0, 3.0)), + Vectors.sparse(12, Array(0, 2, 5, 7, 9, 10), Array(1.0, 5.0, 1.0, 1.0, 1.0, 10.0)), + Vectors.sparse(12, Array(1, 3, 4, 8, 11), Array(1.0, 1.0, 11.0, 3.0, 1.0)), Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) @@ -126,12 +116,12 @@ class RealMapVectorizerTest extends FlatSpec with TestSparkContext { } it should "use the correct fill value for missing keys" in { - val vectorizer = baseVectorizer.setDefaultValue(100).setTrackNulls(false).fit(data) - val transformed = vectorizer.transform(data) + val vectorizer = estimator.setDefaultValue(100).setTrackNulls(false).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( - Vectors.dense(Array(1.0, 5.0, 100.0, 10.0, 100.0, 100.0)), - Vectors.dense(Array(100.0, 100.0, 11.0, 100.0, 3.0, 0.0)), + Vectors.dense(Array(1.0, 5.0, 100.0, 100.0, 100.0, 10.0)), + Vectors.dense(Array(100.0, 100.0, 11.0, 0.0, 3.0, 100.0)), Vectors.dense(Array(100.0, 100.0, 100.0, 100.0, 100.0, 100.0)) ).map(_.toOPVector) @@ -142,17 +132,16 @@ class RealMapVectorizerTest extends FlatSpec with TestSparkContext { } it should "track nulls with fill value for missing keys" in { - val vectorizer = baseVectorizer.setDefaultValue(100).setTrackNulls(true).fit(data) + val vectorizer = estimator.setDefaultValue(100).setTrackNulls(true).fit(inputData) val vector = vectorizer.getOutput() - val transformed = vectorizer.transform(data) + val transformed = vectorizer.transform(inputData) val expected = Array( - Vectors.sparse(12, Array(0, 2, 4, 5, 6, 8, 9, 10, 11), Array(1.0, 5.0, 100.0, 1.0, 10.0, 100.0, 1.0, 100.0, 1.0)), - Vectors.sparse(12, Array(0, 1, 2, 3, 4, 6, 7, 8), Array(100.0, 1.0, 100.0, 1.0, 11.0, 100.0, 1.0, 3.0)), + Vectors.dense(Array(1.0, 0.0, 5.0, 0.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 10.0, 0.0)), + Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 11.0, 0.0, 0.0, 0.0, 3.0, 0.0, 100.0, 1.0)), Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0)) ).map(_.toOPVector) - transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -161,8 +150,8 @@ class RealMapVectorizerTest extends FlatSpec with TestSparkContext { it should "correctly whitelist keys" in { val vectorizer = new RealMapVectorizer[RealMap]().setInput(m1, m2).setDefaultValue(0.0) - .setCleanKeys(true).setWhiteListKeys(Array("a", "b", "z")).setTrackNulls(false).fit(data) - val transformed = vectorizer.transform(data) + .setCleanKeys(true).setWhiteListKeys(Array("a", "b", "z")).setTrackNulls(false).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Vectors.dense(Array(1.0, 5.0, 10.0)), @@ -182,9 +171,9 @@ class RealMapVectorizerTest extends FlatSpec with TestSparkContext { it should "track nulls with whitelist keys" in { val vectorizer = new RealMapVectorizer[RealMap]().setInput(m1, m2).setDefaultValue(0.0) - .setCleanKeys(true).setWhiteListKeys(Array("a", "b", "z")).setTrackNulls(true).fit(data) + .setCleanKeys(true).setWhiteListKeys(Array("a", "b", "z")).setTrackNulls(true).fit(inputData) val vector = vectorizer.getOutput() - val transformed = vectorizer.transform(data) + val transformed = vectorizer.transform(inputData) val expected = Array( Vectors.sparse(6, Array(0, 2, 4), Array(1.0, 5.0, 10.0)), @@ -206,18 +195,18 @@ class RealMapVectorizerTest extends FlatSpec with TestSparkContext { it should "correctly backlist keys" in { val vectorizer = new RealMapVectorizer[RealMap]().setInput(m1, m2).setDefaultValue(0.0) - .setCleanKeys(true).setBlackListKeys(Array("a", "z")).setTrackNulls(false).fit(data) - val transformed = vectorizer.transform(data) + .setCleanKeys(true).setBlackListKeys(Array("a", "z")).setTrackNulls(false).fit(inputData) + val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val expected = Array( Vectors.sparse(4, Array(0), Array(5.0)), - Vectors.dense(Array(0.0, 11.0, 3.0, 0.0)), + Vectors.dense(Array(0.0, 11.0, 0.0, 3.0)), Vectors.sparse(4, Array(), Array()) ).map(_.toOPVector) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), - m2 -> List(IndColWithGroup(None, "Y"), IndColWithGroup(None, "X")) + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "Y")) ) transformed.collect(vector) shouldBe expected @@ -226,24 +215,23 @@ class RealMapVectorizerTest extends FlatSpec with TestSparkContext { OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta } - it should "track nulls with backlist keys" in { val vectorizer = new RealMapVectorizer[RealMap]().setInput(m1, m2).setDefaultValue(0.0) - .setCleanKeys(true).setBlackListKeys(Array("a", "z")).setTrackNulls(true).fit(data) + .setCleanKeys(true).setBlackListKeys(Array("a", "z")).setTrackNulls(true).fit(inputData) val vector = vectorizer.getOutput() - val transformed = vectorizer.transform(data) + val transformed = vectorizer.transform(inputData) val expected = Array( Vectors.sparse(8, Array(0, 3, 5, 7), Array(5.0, 1.0, 1.0, 1.0)), - Vectors.sparse(8, Array(1, 2, 4, 6), Array(1.0, 11.0, 3.0, 0.0)), + Vectors.sparse(8, Array(1, 2, 6), Array(1.0, 11.0, 3.0)), Vectors.sparse(8, Array(1, 3, 5, 7), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), IndColWithGroup(None, "C"), IndColWithGroup(nullIndicatorValue, "C")), - m2 -> List(IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y"), - IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X")) + m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X"), + IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y")) ) transformed.collect(vector) shouldBe expected @@ -258,19 +246,18 @@ class RealMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(meanData) val vector = vectorizer.getOutput() val expected = Array( - Vectors.dense(1.0, 5.0, 11.0, 4.0, 0.0, 10.0), - Vectors.dense(-3.0, 3.0, 11.0, 3.0, 0.0, 15.0 / 2), - Vectors.dense(-1.0, 4.0, 11.0, 1.0, 0.0, 5.0), - Vectors.dense(-1.0, 4.0, 11.0, 8.0 / 3, 0.0, 15.0 / 2) + Vectors.dense(1.0, 5.0, 11.0, 0.0, 4.0, 10.0), + Vectors.dense(-3.0, 3.0, 11.0, 0.0, 3.0, 15.0 / 2), + Vectors.dense(-1.0, 4.0, 11.0, 0.0, 1.0, 5.0), + Vectors.dense(-1.0, 4.0, 11.0, 0.0, 8.0 / 3, 15.0 / 2) ).map(_.toOPVector) transformed.collect(vector) shouldBe expected - // TODO: Order of columns needed to be changed here compared to earlier tests - why does this happen? val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, f1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), - f2 -> List(IndColWithGroup(None, "Y"), IndColWithGroup(None, "X"), IndColWithGroup(None, "Z")) + f2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Z")) ) transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -283,22 +270,21 @@ class RealMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(meanData) val vector = vectorizer.getOutput() val expected = Array( - Vectors.sparse(12, Array(0, 2, 4, 5, 6, 10), Array(1.0, 5.0, 11.0, 1.0, 4.0, 10.0)), - Vectors.sparse(12, Array(0, 2, 4, 6, 10, 11), Array(-3.0, 3.0, 11.0, 3.0, 15.0 / 2, 1.0)), - Vectors.dense(-1.0, 1.0, 4.0, 1.0, 11.0, 1.0, 1.0, 0.0, 0.0, 0.0, 5.0, 0.0), - Vectors.dense(-1.0, 1.0, 4.0, 1.0, 11.0, 1.0, 8.0 / 3, 1.0, 0.0, 1.0, 15.0 / 2, 1.0) + Vectors.sparse(12, Array(0, 2, 4, 5, 8, 10), Array(1.0, 5.0, 11.0, 1.0, 4.0, 10.0)), + Vectors.sparse(12, Array(0, 2, 4, 8, 10, 11), Array(-3.0, 3.0, 11.0, 3.0, 15.0 / 2, 1.0)), + Vectors.dense(-1.0, 1.0, 4.0, 1.0, 11.0, 1.0, 0.0, 0.0, 1.0, 0.0, 5.0, 0.0), + Vectors.dense(-1.0, 1.0, 4.0, 1.0, 11.0, 1.0, 0.0, 1.0, 8.0 / 3, 1.0, 15.0 / 2, 1.0) ).map(_.toOPVector) transformed.collect(vector) shouldBe expected - // TODO: Order of columns needed to be changed here compared to earlier tests - why does this happen? val expectedMetaTrackNulls = TestOpVectorMetadataBuilder( vectorizer, f1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(nullIndicatorValue, "A"), IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), IndColWithGroup(None, "C"), IndColWithGroup(nullIndicatorValue, "C")), - f2 -> List(IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y"), - IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X"), + f2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X"), + IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y"), IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z")) ) transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala index 0e0aef791d..8efdf37403 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/preparators/SanityCheckerTest.scala @@ -487,12 +487,12 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP val transformed = new OpWorkflow().setResultFeatures(vectorized, checkedFeatures).transform(textData) - val featuresToDrop = Seq("textMap_color_NullIndicatorValue_512") + val featuresToDrop = Seq("textMap_color_NullIndicatorValue_513") val expectedFeatNames = (0 until 512).map(i => "textMap_" + i.toString) ++ - Seq("textMap_color_NullIndicatorValue_512", "textMap_fruit_NullIndicatorValue_513", - "textMap_beverage_NullIndicatorValue_514") - val featuresWithCorr = Seq("textMap_color_NullIndicatorValue_512", "textMap_fruit_NullIndicatorValue_513", - "textMap_beverage_NullIndicatorValue_514" + Seq("textMap_beverage_NullIndicatorValue_512", "textMap_color_NullIndicatorValue_513", + "textMap_fruit_NullIndicatorValue_514") + val featuresWithCorr = Seq("textMap_beverage_NullIndicatorValue_512", "textMap_color_NullIndicatorValue_513", + "textMap_fruit_NullIndicatorValue_514" ) val featuresWithNaNCorr = Seq.empty[String] @@ -535,7 +535,7 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP } // TODO: Not sure if we should do this test since it may not fail if spark settings are changed - it should "fail (due to a Kyro buffer overflow) when calculating a large (5k x 5k) correlation matrix " in { + it should "fail (due to a Kryo buffer overflow) when calculating a large (5k x 5k) correlation matrix " in { val numHashes = 5000 val vectorized = textMap.vectorize( @@ -595,14 +595,14 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP val featuresToDrop = Seq.empty[String] val totalFeatures = (0 until numHashes).map(i => "textMap_" + i.toString) ++ - Seq("textMap_color_NullIndicatorValue_" + numHashes.toString, - "textMap_fruit_NullIndicatorValue_" + (numHashes + 1).toString, - "textMap_beverage_NullIndicatorValue_" + (numHashes + 2).toString) + Seq("textMap_beverage_NullIndicatorValue_" + numHashes.toString, + "textMap_color_NullIndicatorValue_" + (numHashes + 1).toString, + "textMap_fruit_NullIndicatorValue_" + (numHashes + 2).toString) val featuresWithCorr = Seq("textMap_8", "textMap_89", "textMap_294", "textMap_706", "textMap_971", "textMap_1364", "textMap_1633", "textMap_2382", "textMap_2527", "textMap_3159", "textMap_3491", - "textMap_3804", "textMap_color_NullIndicatorValue_" + numHashes.toString, - "textMap_fruit_NullIndicatorValue_" + (numHashes + 1).toString, - "textMap_beverage_NullIndicatorValue_" + (numHashes + 2).toString) + "textMap_3804", "textMap_beverage_NullIndicatorValue_" + numHashes.toString, + "textMap_color_NullIndicatorValue_" + (numHashes + 1).toString, + "textMap_fruit_NullIndicatorValue_" + (numHashes + 2).toString) val featuresWithNaNCorr = totalFeatures.filterNot(featuresWithCorr.contains)