From a51212aedac6644b9a8fce2c74406dc94b53c492 Mon Sep 17 00:00:00 2001
From: Leah McGuire <lmcguire@salesforce.com>
Date: Fri, 24 Jan 2020 09:06:04 -0800
Subject: [PATCH] allow result features to be removed by raw feature filter
 (#458)

---
 .../scala/com/salesforce/op/OpWorkflow.scala  | 52 ++++++++++++++-----
 .../impl/feature/SmartTextVectorizer.scala    |  3 +-
 .../com/salesforce/op/OpWorkflowTest.scala    | 12 +++--
 3 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/core/src/main/scala/com/salesforce/op/OpWorkflow.scala b/core/src/main/scala/com/salesforce/op/OpWorkflow.scala
index d1ebb22ddf..27459eb4ca 100644
--- a/core/src/main/scala/com/salesforce/op/OpWorkflow.scala
+++ b/core/src/main/scala/com/salesforce/op/OpWorkflow.scala
@@ -41,6 +41,7 @@ import com.salesforce.op.utils.reflection.ReflectionUtils
 import com.salesforce.op.utils.spark.RichDataset._
 import com.salesforce.op.utils.stages.FitStagesUtil
 import com.salesforce.op.utils.stages.FitStagesUtil.{CutDAG, FittedDAG, Layer, StagesDAG}
+import enumeratum.{Enum, EnumEntry}
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{Estimator, Transformer}
 import org.apache.spark.sql.{DataFrame, SparkSession}
@@ -61,6 +62,9 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
   // raw feature filter stage which can be used in place of a reader
   private[op] var rawFeatureFilter: Option[RawFeatureFilter[_]] = None
 
+  // result feature retention policy
+  private[op] var resultFeaturePolicy: ResultFeatureRetention = ResultFeatureRetention.Strict
+
   /**
    * Set stage and reader parameters from OpWorkflowParams object for run
    *
@@ -78,7 +82,7 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
    *
    * By setting the final features the stages used to
    * generate them can be traced back through the parent features and origin stages.
-   * The input is an tuple of features to support leaf feature generation (multiple endpoints in feature generation).
+   * The input is a tuple of features to support leaf feature generation (multiple endpoints in feature generation).
    *
    * @param features Final features generated by the workflow
    */
@@ -104,6 +108,7 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
     this
   }
 
+
   /**
    * Will set the blacklisted features variable and if list is non-empty it will
    * @param features list of features to blacklist
@@ -112,17 +117,25 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
   private[op] def setBlacklist(features: Array[OPFeature], distributions: Seq[FeatureDistribution]): Unit = {
     // TODO: Figure out a way to keep track of raw features that aren't explicitly blacklisted, but can't be used
     // TODO: because they're inputs into an explicitly blacklisted feature. Eg. "height" in ModelInsightsTest
+
+    def finalResultFeaturesCheck(resultFeatures: Array[OPFeature], blacklisted: List[OPFeature]): Unit = {
+      if (resultFeaturePolicy == ResultFeatureRetention.Strict) {
+        resultFeatures.foreach{ f => if (blacklisted.contains(f)) {
+          throw new IllegalArgumentException(s"Blacklist of features (${blacklisted.map(_.name).mkString(", ")})" +
+            s" from RawFeatureFilter contained the result feature ${f.name}") } }
+      } else if (resultFeaturePolicy == ResultFeatureRetention.AtLeastOne) {
+        if (resultFeatures.forall(blacklisted.contains)) throw new IllegalArgumentException(s"Blacklist of features" +
+          s" (${blacklisted.map(_.name).mkString(", ")}) from RawFeatureFilter removed all result features")
+      } else throw new IllegalArgumentException(s"result feature retention policy $resultFeaturePolicy not supported")
+    }
+
     blacklistedFeatures = features
     if (blacklistedFeatures.nonEmpty) {
       val allBlacklisted: MList[OPFeature] = MList(getBlacklist(): _*)
       val allUpdated: MList[OPFeature] = MList.empty
 
       val initialResultFeatures = getResultFeatures()
-      initialResultFeatures
-        .foreach{ f => if (allBlacklisted.contains(f)) throw new IllegalArgumentException(
-          s"Blacklist of features (${allBlacklisted.map(_.name).mkString(", ")})" +
-            s" from RawFeatureFilter contained the result feature ${f.name}" )
-        }
+      finalResultFeaturesCheck(initialResultFeatures, allBlacklisted.toList)
 
       val initialStages = getStages() // ordered by DAG so dont need to recompute DAG
       // for each stage remove anything blacklisted from the inputs and update any changed input features
@@ -138,16 +151,15 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
         Try(stg.setInputFeatureArray(inputsChanged).setOutputFeatureName(oldOutput.name).getOutput()) match {
           case Success(out) => allUpdated += out
           case Failure(e) =>
-            if (initialResultFeatures.contains(oldOutput)) throw new RuntimeException(
-              s"Blacklist of features (${allBlacklisted.map(_.name).mkString(", ")}) \n" +
-                s" created by RawFeatureFilter contained features critical to the creation of required result" +
-                s" feature (${oldOutput.name}) though the path: \n ${oldOutput.prettyParentStages} \n", e)
-            else allBlacklisted += oldOutput
+            log.info(s"Issue updating inputs for stage $stg: $e")
+            allBlacklisted += oldOutput
+            finalResultFeaturesCheck(initialResultFeatures, allBlacklisted.toList)
         }
       }
 
       // Update the whole DAG with the blacklisted features expunged
       val updatedResultFeatures = initialResultFeatures
+        .filterNot(allBlacklisted.contains)
         .map{ f => allUpdated.find(u => u.sameOrigin(f)).getOrElse(f) }
       setResultFeatures(updatedResultFeatures: _*)
     }
@@ -339,6 +351,9 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
         val fittedStgs = fitStages(data = rawData, stagesToFit = stages, persistEveryKStages)
         val newResultFtrs = resultFeatures.map(_.copyWithNewStages(fittedStgs))
         fittedStgs -> newResultFtrs
+      } else if (rawFeatureFilter.nonEmpty) {
+        generateRawData()
+        stages -> resultFeatures
       } else {
         stages -> resultFeatures
       }
@@ -534,8 +549,10 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
     protectedJSFeatures: Array[OPFeature] = Array.empty,
     textBinsFormula: (Summary, Int) => Int = RawFeatureFilter.textBinsFormula,
     timePeriod: Option[TimePeriod] = None,
-    minScoringRows: Int = RawFeatureFilter.minScoringRowsDefault
+    minScoringRows: Int = RawFeatureFilter.minScoringRowsDefault,
+    resultFeatureRetentionPolicy: ResultFeatureRetention = ResultFeatureRetention.Strict
   ): this.type = {
+    resultFeaturePolicy = resultFeatureRetentionPolicy
     val training = trainingReader.orElse(reader).map(_.asInstanceOf[Reader[T]])
     require(training.nonEmpty, "Reader for training data must be provided either in withRawFeatureFilter or directly" +
       "as the reader for the workflow")
@@ -563,3 +580,14 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
   // scalastyle:on
 
 }
+
+/**
+ * Methods of vectorizing text (eg. to be chosen by statistics computed in SmartTextVectorizer)
+ */
+sealed trait ResultFeatureRetention extends EnumEntry with Serializable
+
+object ResultFeatureRetention extends Enum[ResultFeatureRetention] {
+  val values = findValues
+  case object Strict extends ResultFeatureRetention
+  case object AtLeastOne extends ResultFeatureRetention
+}
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
index d75e42bf36..ea5235fe44 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
@@ -179,7 +179,8 @@ object SmartTextVectorizer {
  * @param valueCounts  counts of feature values
  * @param lengthCounts counts of token lengths
  */
-private[op] case class TextStats(
+private[op] case class TextStats
+(
   valueCounts: Map[String, Long],
   lengthCounts: Map[Int, Long]
 ) extends JsonLike {
diff --git a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala
index 3a51c1b1f7..2b8d88e63d 100644
--- a/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala
+++ b/core/src/test/scala/com/salesforce/op/OpWorkflowTest.scala
@@ -206,9 +206,6 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest {
   }
 
   it should "throw an error when it is not possible to remove blacklisted features" in {
-    val fv = Seq(age, gender, height, weight, description, boarded, stringMap, numericMap, booleanMap).transmogrify()
-    val survivedNum = survived.occurs()
-    val pred = BinaryClassificationModelSelector().setInput(survivedNum, fv).getOutput()
     val wf = new OpWorkflow()
       .setResultFeatures(whyNotNormed)
       .withRawFeatureFilter(Option(dataReader), None)
@@ -219,6 +216,15 @@ class OpWorkflowTest extends FlatSpec with PassengerSparkFixtureTest {
     error.getMessage.contains("creation of required result feature (height-weight_4-stagesApplied_Real")
   }
 
+  it should "allow the removal of some final features with the retention policy is set to allow it" in {
+    val wf = new OpWorkflow()
+      .setResultFeatures(whyNotNormed, weight)
+      .withRawFeatureFilter(Option(dataReader), None, resultFeatureRetentionPolicy = ResultFeatureRetention.AtLeastOne)
+
+    wf.setBlacklist(Array(age, gender, height, description, stringMap, numericMap), Seq.empty)
+    wf.getResultFeatures().map(_.name) shouldEqual Seq(weight).map(_.name)
+  }
+
   it should "be able to compute a partial dataset in both workflow and workflow model" in {
     val fields =
       List(KeyFieldName, height.name, weight.name, heightNormed.name, density.name,