Skip to content

Commit

Permalink
updated string indexer enum to match spark (#93)
Browse files Browse the repository at this point in the history
  • Loading branch information
leahmcguire authored and tovbinm committed Aug 27, 2018
1 parent 921336a commit d6e677d
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.feature.{StringIndexerHandleInvalid => Inv}
import com.salesforce.op.stages.sparkwrappers.specific.OpEstimatorWrapper
import enumeratum._
import org.apache.spark.ml.feature.{StringIndexer, StringIndexerModel}
Expand Down Expand Up @@ -62,6 +63,8 @@ class OpStringIndexer[T <: Text]
* @return this stage
*/
def setHandleInvalid(value: StringIndexerHandleInvalid): this.type = {
assert(Seq(Inv.Skip, Inv.Error, Inv.Keep).contains(value),
"OpStringIndexer only supports Skip, Error, and Keep for handle invalid")
getSparkMlStage().get.setHandleInvalid(value.entryName.toLowerCase)
this
}
Expand All @@ -73,5 +76,6 @@ object StringIndexerHandleInvalid extends Enum[StringIndexerHandleInvalid] {
val values = findValues
case object Skip extends StringIndexerHandleInvalid
case object Error extends StringIndexerHandleInvalid
case object Keep extends StringIndexerHandleInvalid
case object NoFilter extends StringIndexerHandleInvalid
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.feature.StringIndexerModel
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Assertions, FlatSpec, Matchers}


@RunWith(classOf[JUnitRunner])
Expand Down Expand Up @@ -90,22 +90,4 @@ class OpStringIndexerNoFilterTest extends FlatSpec with TestSparkContext {

indices shouldBe expectedNew
}

Spec[OpStringIndexer[_]] should "correctly index a text column" in {
val stringIndexer = new OpStringIndexer[Text]().setInput(txtF)
val indices = stringIndexer.fit(ds).transform(ds).collect(stringIndexer.getOutput())

indices shouldBe expected
}

it should "correctly deinxed a numeric column" in {
val indexedStage = new OpStringIndexer[Text]().setInput(txtF)
val indexed = indexedStage.getOutput()
val indices = indexedStage.fit(ds).transform(ds)
val deindexedStage = new OpIndexToString().setInput(indexed)
val deindexed = deindexedStage.getOutput()
val deindexedData = deindexedStage.transform(indices).collect(deindexed)
deindexedData shouldBe txtData
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright (c) 2017, Salesforce.com, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.features.types._
import com.salesforce.op.features.types.Text
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner
import com.salesforce.op.utils.spark.RichDataset._

@RunWith(classOf[JUnitRunner])
class OpStringIndexerTest extends FlatSpec with TestSparkContext{

val txtData = Seq("a", "b", "c", "a", "a", "c").map(_.toText)
val (ds, txtF) = TestFeatureBuilder(txtData)
val expected = Array(0.0, 2.0, 1.0, 0.0, 0.0, 1.0).map(_.toRealNN)


Spec[OpStringIndexer[_]] should "correctly set the wrapped spark stage params" in {
val indexer = new OpStringIndexer[Text]()
indexer.setHandleInvalid(StringIndexerHandleInvalid.Skip)
indexer.getSparkMlStage().get.getHandleInvalid shouldBe StringIndexerHandleInvalid.Skip.entryName.toLowerCase
indexer.setHandleInvalid(StringIndexerHandleInvalid.Error)
indexer.getSparkMlStage().get.getHandleInvalid shouldBe StringIndexerHandleInvalid.Error.entryName.toLowerCase
indexer.setHandleInvalid(StringIndexerHandleInvalid.Keep)
indexer.getSparkMlStage().get.getHandleInvalid shouldBe StringIndexerHandleInvalid.Keep.entryName.toLowerCase
}

it should "throw an error if you try to set noFilter as the indexer" in {
val indexer = new OpStringIndexer[Text]()
intercept[AssertionError](indexer.setHandleInvalid(StringIndexerHandleInvalid.NoFilter))
}

it should "correctly index a text column" in {
val stringIndexer = new OpStringIndexer[Text]().setInput(txtF)
val indices = stringIndexer.fit(ds).transform(ds).collect(stringIndexer.getOutput())

indices shouldBe expected
}

it should "correctly deinxed a numeric column" in {
val indexedStage = new OpStringIndexer[Text]().setInput(txtF)
val indexed = indexedStage.getOutput()
val indices = indexedStage.fit(ds).transform(ds)
val deindexedStage = new OpIndexToString().setInput(indexed)
val deindexed = deindexedStage.getOutput()
val deindexedData = deindexedStage.transform(indices).collect(deindexed)
deindexedData shouldBe txtData
}

}

0 comments on commit d6e677d

Please sign in to comment.