Remove duplicate features using sanity checker feature to feature cor…

…relations (#476)
salesforce · May 19, 2020 · d222438 · d222438
1 parent 24cdbc4
commit d222438
Show file tree

Hide file tree

Showing 9 changed files with 239 additions and 84 deletions.
diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala
@@ -638,7 +638,7 @@ case object ModelInsights {
               derivedFeatureGroup = h.grouping,
               derivedFeatureValue = h.indicatorValue,
               excluded = Option(s.dropped.contains(h.columnName)),
-              corr = getCorr(s.correlationsWLabel, h.columnName),
+              corr = getCorr(s.correlations, h.columnName),
               cramersV = catGroupIndex.map(i => s.categoricalStats(i).cramersV),
               mutualInformation = catGroupIndex.map(i => s.categoricalStats(i).mutualInfo),
               pointwiseMutualInformation = (catGroupIndex, catIndexWithinGroup) match {
@@ -747,11 +747,7 @@ case object ModelInsights {
     if (index >= 0) values.mapValues(_ (index)) else Map.empty
 
   private def getCorr(corr: Correlations, name: String): Option[Double] = {
-    getIfExists(corr.featuresIn.indexOf(name), corr.values).orElse {
-      val j = corr.nanCorrs.indexOf(name)
-      if (j >= 0) Option(Double.NaN)
-      else None
-    }
+    getIfExists(corr.featuresIn.indexOf(name), corr.valuesWithLabel)
   }
 
   private[op] def descaleLRContrib(

diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala
@@ -474,6 +474,7 @@ trait RichNumericFeature {
       sampleUpperLimit: Int = SanityChecker.SampleUpperLimit,
       maxCorrelation: Double = SanityChecker.MaxCorrelation,
       minCorrelation: Double = SanityChecker.MinCorrelation,
+      maxFeatureCorr: Double = SanityChecker.MaxFeatureCorr,
       maxCramersV: Double = SanityChecker.MaxCramersV,
       correlationType: CorrelationType = SanityChecker.CorrelationTypeDefault,
       minVariance: Double = SanityChecker.MinVariance,
@@ -494,6 +495,7 @@ trait RichNumericFeature {
         .setSampleUpperLimit(sampleUpperLimit)
         .setMaxCorrelation(maxCorrelation)
         .setMinCorrelation(minCorrelation)
+        .setMaxFeatureCorrelation(maxFeatureCorr)
         .setMaxCramersV(maxCramersV)
         .setCorrelationType(correlationType)
         .setMinVariance(minVariance)

diff --git a/.../src/main/scala/com/salesforce/op/stages/impl/preparators/DerivedFeatureFilterUtils.scala b/.../src/main/scala/com/salesforce/op/stages/impl/preparators/DerivedFeatureFilterUtils.scala
@@ -36,6 +36,7 @@ import com.salesforce.op.utils.spark.RichMetadata._
 import org.apache.log4j.Level
 import org.apache.spark.ml.linalg.{Vectors => NewVectors}
 import org.apache.spark.ml.param.{BooleanParam, DoubleParam, Param, Params}
+import org.apache.spark.mllib.linalg.Matrix
 import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
 import org.apache.spark.sql.types.Metadata
 
@@ -98,7 +99,8 @@ object DerivedFeatureFilterUtils {
     labelNameAndIndex: Option[(String, Int)] = None,
     corrsWithLabel: Array[Double] = Array.empty,
     corrIndices: Array[Int] = Array.empty,
-    categoricalStats: Array[CategoricalGroupStats] = Array.empty
+    categoricalStats: Array[CategoricalGroupStats] = Array.empty,
+    corrMatrix: Option[Matrix] = None
   ): Array[ColumnStatistics] = {
     // precompute all statistics to avoid rebuilding the vectors every time
     val means = statsSummary.mean
@@ -108,7 +110,7 @@ object DerivedFeatureFilterUtils {
     val variances = statsSummary.variance
     val cramersVMap = categoricalStats.flatMap(f => f.categoricalFeatures.map(c => c -> f.cramersV))
       .toMap[String, Double]
-    val numCorrIndices = corrIndices.length
+    val featureCorrs = corrMatrix.map(_.rowIter.map(_.toArray).toArray)
 
     def maxByParent(seq: Seq[(String, Double)]) = seq.groupBy(_._1).map { case (k, v) =>
       // Filter out the NaNs because max(3.4, NaN) = NaN, and we still want the keep the largest correlation
@@ -174,6 +176,10 @@ object DerivedFeatureFilterUtils {
             case ind => Option(corrsWithLabel(ind))
           },
           cramersV = cramersVMap.get(name),
+          featureCorrs = corrIndices.indexOf(i) match {
+            case -1 => Seq.empty
+            case ind => featureCorrs.getOrElse(Array.empty).map(_.apply(ind)).dropRight(1) // drop label corr
+          },
           parentCorr = getParentValue(col, corrParent, corrParentNoKeys),
           parentCramersV = getParentValue(col, cramersVParent, cramersVParentNoKeys),
           maxRuleConfidences = maxRuleConfMap.getOrElse(name, Seq.empty),
@@ -195,6 +201,7 @@ object DerivedFeatureFilterUtils {
           variance = variances(labelColumnIndex),
           corrLabel = None,
           cramersV = None,
+          featureCorrs = Seq.empty,
           parentCorr = None,
           parentCramersV = None,
           maxRuleConfidences = Seq.empty,
@@ -214,6 +221,7 @@ object DerivedFeatureFilterUtils {
    * @param minVariance            Min variance for dropping features
    * @param minCorrelation         Min correlation with label for dropping features
    * @param maxCorrelation         Max correlation with label for dropping features
+   * @param maxFeatureCorr         Max correlation between features for dropping the later features
    * @param maxCramersV            Max Cramer's V for dropping categorical features
    * @param maxRuleConfidence      Max allowed confidence of association rules for dropping features
    * @param minRequiredRuleSupport Threshold for association rule
@@ -229,6 +237,7 @@ object DerivedFeatureFilterUtils {
     minVariance: Double,
     minCorrelation: Double = 0.0,
     maxCorrelation: Double = 1.0,
+    maxFeatureCorr: Double = 1.0,
     maxCramersV: Double = 1.0,
     maxRuleConfidence: Double = 1.0,
     minRequiredRuleSupport: Double = 1.0,
@@ -256,6 +265,7 @@ object DerivedFeatureFilterUtils {
         minVariance = minVariance,
         minCorrelation = minCorrelation,
         maxCorrelation = maxCorrelation,
+        maxFeatureCorr = maxFeatureCorr,
         maxCramersV = maxCramersV,
         maxRuleConfidence = maxRuleConfidence,
         minRequiredRuleSupport = minRequiredRuleSupport,
@@ -314,6 +324,7 @@ private[op] case class ColumnStatistics
   cramersV: Option[Double],
   parentCorr: Option[Double],
   parentCramersV: Option[Double],
+  featureCorrs: Seq[Double],
   // Need to be able to hold up to two maxRuleConfidences or supports for the case of nullIndicator columns coming
   // from non-categorical features (since they will correspond to a 2x2 contingency matrix)
   maxRuleConfidences: Seq[Double],
@@ -341,6 +352,7 @@ private[op] case class ColumnStatistics
     minVariance: Double,
     maxCorrelation: Double,
     minCorrelation: Double,
+    maxFeatureCorr: Double,
     maxCramersV: Double,
     maxRuleConfidence: Double,
     minRequiredRuleSupport: Double,
@@ -361,6 +373,10 @@ private[op] case class ColumnStatistics
         corrLabel.filter(Math.abs(_) > maxCorrelation).map(corr =>
           s"correlation $corr higher than max correlation $maxCorrelation"
         ),
+        column.flatMap{ case cl => featureCorrs.take(cl.index).find(Math.abs(_) > maxFeatureCorr).map(corr =>
+          s"this feature has correlations $corr with another feature higher than max feature-feature" +
+            s" correlation $maxFeatureCorr")
+        },
         cramersV.filter(_ > maxCramersV).map(cv =>
           s"Cramer's V $cv higher than max Cramer's V $maxCramersV"
         ),

diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala b/core/src/main/scala/com/salesforce/op/stages/impl/preparators/SanityChecker.scala
@@ -94,15 +94,24 @@ trait SanityCheckerParams extends DerivedFeatureFilterParams {
   final val maxCorrelation = new DoubleParam(
     parent = this, name = "maxCorrelation",
     doc = "Maximum correlation (absolute value) allowed between a feature in the feature vector and the label",
-    isValid = ParamValidators.inRange(lowerBound = 0.0, upperBound = 1.0, lowerInclusive = true, upperInclusive = true)
+    isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
   )
   def setMaxCorrelation(value: Double): this.type = set(maxCorrelation, value)
   def getMaxCorrelation: Double = $(maxCorrelation)
 
+  final val maxFeatureCorrelation = new DoubleParam(
+    parent = this, name = "maxFeatureCorr",
+    doc = "Maximum correlation (absolute value) allowed between two features. When this value is exceeded the second" +
+      " feature in the correlated pair will be dropped.",
+    isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
+  )
+  def setMaxFeatureCorrelation(value: Double): this.type = set(maxFeatureCorrelation, value)
+  def getMaxFeatureCorrelation: Double = $(maxFeatureCorrelation)
+
   final val minCorrelation = new DoubleParam(
     parent = this, name = "minCorrelation",
     doc = "Minimum correlation (absolute value) allowed between a feature in the feature vector and the label",
-    isValid = ParamValidators.inRange(lowerBound = 0.0, upperBound = 1.0, lowerInclusive = true, upperInclusive = true)
+    isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
   )
   def setMinCorrelation(value: Double): this.type = set(minCorrelation, value)
   def getMinCorrelation: Double = $(minCorrelation)
@@ -190,6 +199,7 @@ trait SanityCheckerParams extends DerivedFeatureFilterParams {
     sampleUpperLimit -> SanityChecker.SampleUpperLimit,
     maxCorrelation -> SanityChecker.MaxCorrelation,
     minCorrelation -> SanityChecker.MinCorrelation,
+    maxFeatureCorrelation -> SanityChecker.MaxFeatureCorr,
     minVariance -> SanityChecker.MinVariance,
     maxCramersV -> SanityChecker.MaxCramersV,
     removeBadFeatures -> SanityChecker.RemoveBadFeatures,
@@ -280,7 +290,7 @@ class SanityChecker(uid: String = UID[SanityChecker])
               val colsCleaned = cols.map(_._2).filterNot(c => repeats.contains(c.index))
               (group, colsCleaned.map(_.makeColName()), colsCleaned.map(_.index),
                 colsCleaned.exists(_.hasParentOfSubType[MultiPickList]))
-          }
+            }
 
         colIndicesByGrouping.map {
           case (group, colNames, valueIndices, isMpl) =>
@@ -443,13 +453,13 @@ class SanityChecker(uid: String = UID[SanityChecker])
     else ((0 until featureSize + 1).toArray, vectorRows)
     val numCorrIndices = corrIndices.length
 
-    // TODO: We are still calculating the full correlation matrix when featureLabelCorrOnly is false, but are not
-    // TODO: storing it anywhere. If we want access to the inter-feature correlations then need to refactor this a bit.
-    val corrsWithLabel = if ($(featureLabelCorrOnly)) {
-      OpStatistics.computeCorrelationsWithLabel(vectorRowsForCorr, colStats, count)
+    val (corrMatrix, corrsWithLabel) = if ($(featureLabelCorrOnly)) {
+      None -> OpStatistics.computeCorrelationsWithLabel(vectorRowsForCorr, colStats, count)
+    }
+    else {
+      val matrix = Statistics.corr(vectorRowsForCorr, getCorrelationType.sparkName)
+      Option(matrix) -> matrix.rowIter.map(_.apply(numCorrIndices - 1)).toArray
     }
-    else Statistics.corr(vectorRowsForCorr, getCorrelationType.sparkName).rowIter
-      .map(_.apply(numCorrIndices - 1)).toArray
 
     // Only calculate this if the label is categorical, so ignore if user has flagged label as not categorical
     val categoricalStats =
@@ -467,7 +477,8 @@ class SanityChecker(uid: String = UID[SanityChecker])
       Option((in1.name, featureSize)), // label column goes at end of vector
       corrsWithLabel,
       corrIndices,
-      categoricalStats
+      categoricalStats,
+      corrMatrix
     )
     stats.foreach { stat => logInfo(stat.toString) }
 
@@ -478,6 +489,7 @@ class SanityChecker(uid: String = UID[SanityChecker])
         $(minVariance),
         $(minCorrelation),
         $(maxCorrelation),
+        $(maxFeatureCorrelation),
         $(maxCramersV),
         $(maxRuleConfidence),
         $(minRequiredRuleSupport),
@@ -542,6 +554,7 @@ object SanityChecker {
   val SampleLowerLimit = 1E3.toInt
   val SampleUpperLimit = 1E6.toInt
   val MaxCorrelation = 0.95
+  val MaxFeatureCorr = 0.99
   val MinCorrelation = 0.0
   val MinVariance = 1E-5
   val MaxCramersV = 0.95