updated string indexer enum to match spark (#93)

salesforce · Aug 27, 2018 · d6e677d · d6e677d
1 parent 921336a
commit d6e677d
Show file tree

Hide file tree

Showing 3 changed files with 85 additions and 19 deletions.
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OpStringIndexer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OpStringIndexer.scala
@@ -32,6 +32,7 @@ package com.salesforce.op.stages.impl.feature
 
 import com.salesforce.op.UID
 import com.salesforce.op.features.types._
+import com.salesforce.op.stages.impl.feature.{StringIndexerHandleInvalid => Inv}
 import com.salesforce.op.stages.sparkwrappers.specific.OpEstimatorWrapper
 import enumeratum._
 import org.apache.spark.ml.feature.{StringIndexer, StringIndexerModel}
@@ -62,6 +63,8 @@ class OpStringIndexer[T <: Text]
    * @return this stage
    */
   def setHandleInvalid(value: StringIndexerHandleInvalid): this.type = {
+    assert(Seq(Inv.Skip, Inv.Error, Inv.Keep).contains(value),
+      "OpStringIndexer only supports Skip, Error, and Keep for handle invalid")
     getSparkMlStage().get.setHandleInvalid(value.entryName.toLowerCase)
     this
   }
@@ -73,5 +76,6 @@ object StringIndexerHandleInvalid extends Enum[StringIndexerHandleInvalid] {
   val values = findValues
   case object Skip extends StringIndexerHandleInvalid
   case object Error extends StringIndexerHandleInvalid
+  case object Keep extends StringIndexerHandleInvalid
   case object NoFilter extends StringIndexerHandleInvalid
 }
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpStringIndexerNoFilterTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpStringIndexerNoFilterTest.scala
@@ -38,8 +38,8 @@ import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
 import com.salesforce.op.utils.spark.RichDataset._
 import org.apache.spark.ml.feature.StringIndexerModel
 import org.junit.runner.RunWith
+import org.scalatest.FlatSpec
 import org.scalatest.junit.JUnitRunner
-import org.scalatest.{Assertions, FlatSpec, Matchers}
 
 
 @RunWith(classOf[JUnitRunner])
@@ -90,22 +90,4 @@ class OpStringIndexerNoFilterTest extends FlatSpec with TestSparkContext {
 
     indices shouldBe expectedNew
   }
-
-  Spec[OpStringIndexer[_]] should "correctly index a text column" in {
-    val stringIndexer = new OpStringIndexer[Text]().setInput(txtF)
-    val indices = stringIndexer.fit(ds).transform(ds).collect(stringIndexer.getOutput())
-
-    indices shouldBe expected
-  }
-
-  it should "correctly deinxed a numeric column" in {
-    val indexedStage = new OpStringIndexer[Text]().setInput(txtF)
-    val indexed = indexedStage.getOutput()
-    val indices = indexedStage.fit(ds).transform(ds)
-    val deindexedStage = new OpIndexToString().setInput(indexed)
-    val deindexed = deindexedStage.getOutput()
-    val deindexedData = deindexedStage.transform(indices).collect(deindexed)
-    deindexedData shouldBe txtData
-  }
-
 }
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpStringIndexerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpStringIndexerTest.scala
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+package com.salesforce.op.stages.impl.feature
+
+import com.salesforce.op.features.types._
+import com.salesforce.op.features.types.Text
+import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
+import org.junit.runner.RunWith
+import org.scalatest.FlatSpec
+import org.scalatest.junit.JUnitRunner
+import com.salesforce.op.utils.spark.RichDataset._
+
+@RunWith(classOf[JUnitRunner])
+class OpStringIndexerTest extends FlatSpec with TestSparkContext{
+
+  val txtData = Seq("a", "b", "c", "a", "a", "c").map(_.toText)
+  val (ds, txtF) = TestFeatureBuilder(txtData)
+  val expected = Array(0.0, 2.0, 1.0, 0.0, 0.0, 1.0).map(_.toRealNN)
+
+
+  Spec[OpStringIndexer[_]] should "correctly set the wrapped spark stage params" in {
+    val indexer = new OpStringIndexer[Text]()
+    indexer.setHandleInvalid(StringIndexerHandleInvalid.Skip)
+    indexer.getSparkMlStage().get.getHandleInvalid shouldBe StringIndexerHandleInvalid.Skip.entryName.toLowerCase
+    indexer.setHandleInvalid(StringIndexerHandleInvalid.Error)
+    indexer.getSparkMlStage().get.getHandleInvalid shouldBe StringIndexerHandleInvalid.Error.entryName.toLowerCase
+    indexer.setHandleInvalid(StringIndexerHandleInvalid.Keep)
+    indexer.getSparkMlStage().get.getHandleInvalid shouldBe StringIndexerHandleInvalid.Keep.entryName.toLowerCase
+  }
+
+  it should "throw an error if you try to set noFilter as the indexer" in {
+    val indexer = new OpStringIndexer[Text]()
+    intercept[AssertionError](indexer.setHandleInvalid(StringIndexerHandleInvalid.NoFilter))
+  }
+
+  it should "correctly index a text column" in {
+    val stringIndexer = new OpStringIndexer[Text]().setInput(txtF)
+    val indices = stringIndexer.fit(ds).transform(ds).collect(stringIndexer.getOutput())
+
+    indices shouldBe expected
+  }
+
+  it should "correctly deinxed a numeric column" in {
+    val indexedStage = new OpStringIndexer[Text]().setInput(txtF)
+    val indexed = indexedStage.getOutput()
+    val indices = indexedStage.fit(ds).transform(ds)
+    val deindexedStage = new OpIndexToString().setInput(indexed)
+    val deindexed = deindexedStage.getOutput()
+    val deindexedData = deindexedStage.transform(indices).collect(deindexed)
+    deindexedData shouldBe txtData
+  }
+
+}