Skip to content

Commit

Permalink
Remove duplicate features using sanity checker feature to feature cor…
Browse files Browse the repository at this point in the history
…relations (#476)
  • Loading branch information
leahmcguire authored May 19, 2020
1 parent 24cdbc4 commit d222438
Show file tree
Hide file tree
Showing 9 changed files with 239 additions and 84 deletions.
8 changes: 2 additions & 6 deletions core/src/main/scala/com/salesforce/op/ModelInsights.scala
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@ case object ModelInsights {
derivedFeatureGroup = h.grouping,
derivedFeatureValue = h.indicatorValue,
excluded = Option(s.dropped.contains(h.columnName)),
corr = getCorr(s.correlationsWLabel, h.columnName),
corr = getCorr(s.correlations, h.columnName),
cramersV = catGroupIndex.map(i => s.categoricalStats(i).cramersV),
mutualInformation = catGroupIndex.map(i => s.categoricalStats(i).mutualInfo),
pointwiseMutualInformation = (catGroupIndex, catIndexWithinGroup) match {
Expand Down Expand Up @@ -747,11 +747,7 @@ case object ModelInsights {
if (index >= 0) values.mapValues(_ (index)) else Map.empty

private def getCorr(corr: Correlations, name: String): Option[Double] = {
getIfExists(corr.featuresIn.indexOf(name), corr.values).orElse {
val j = corr.nanCorrs.indexOf(name)
if (j >= 0) Option(Double.NaN)
else None
}
getIfExists(corr.featuresIn.indexOf(name), corr.valuesWithLabel)
}

private[op] def descaleLRContrib(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,7 @@ trait RichNumericFeature {
sampleUpperLimit: Int = SanityChecker.SampleUpperLimit,
maxCorrelation: Double = SanityChecker.MaxCorrelation,
minCorrelation: Double = SanityChecker.MinCorrelation,
maxFeatureCorr: Double = SanityChecker.MaxFeatureCorr,
maxCramersV: Double = SanityChecker.MaxCramersV,
correlationType: CorrelationType = SanityChecker.CorrelationTypeDefault,
minVariance: Double = SanityChecker.MinVariance,
Expand All @@ -494,6 +495,7 @@ trait RichNumericFeature {
.setSampleUpperLimit(sampleUpperLimit)
.setMaxCorrelation(maxCorrelation)
.setMinCorrelation(minCorrelation)
.setMaxFeatureCorrelation(maxFeatureCorr)
.setMaxCramersV(maxCramersV)
.setCorrelationType(correlationType)
.setMinVariance(minVariance)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.log4j.Level
import org.apache.spark.ml.linalg.{Vectors => NewVectors}
import org.apache.spark.ml.param.{BooleanParam, DoubleParam, Param, Params}
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
import org.apache.spark.sql.types.Metadata

Expand Down Expand Up @@ -98,7 +99,8 @@ object DerivedFeatureFilterUtils {
labelNameAndIndex: Option[(String, Int)] = None,
corrsWithLabel: Array[Double] = Array.empty,
corrIndices: Array[Int] = Array.empty,
categoricalStats: Array[CategoricalGroupStats] = Array.empty
categoricalStats: Array[CategoricalGroupStats] = Array.empty,
corrMatrix: Option[Matrix] = None
): Array[ColumnStatistics] = {
// precompute all statistics to avoid rebuilding the vectors every time
val means = statsSummary.mean
Expand All @@ -108,7 +110,7 @@ object DerivedFeatureFilterUtils {
val variances = statsSummary.variance
val cramersVMap = categoricalStats.flatMap(f => f.categoricalFeatures.map(c => c -> f.cramersV))
.toMap[String, Double]
val numCorrIndices = corrIndices.length
val featureCorrs = corrMatrix.map(_.rowIter.map(_.toArray).toArray)

def maxByParent(seq: Seq[(String, Double)]) = seq.groupBy(_._1).map { case (k, v) =>
// Filter out the NaNs because max(3.4, NaN) = NaN, and we still want the keep the largest correlation
Expand Down Expand Up @@ -174,6 +176,10 @@ object DerivedFeatureFilterUtils {
case ind => Option(corrsWithLabel(ind))
},
cramersV = cramersVMap.get(name),
featureCorrs = corrIndices.indexOf(i) match {
case -1 => Seq.empty
case ind => featureCorrs.getOrElse(Array.empty).map(_.apply(ind)).dropRight(1) // drop label corr
},
parentCorr = getParentValue(col, corrParent, corrParentNoKeys),
parentCramersV = getParentValue(col, cramersVParent, cramersVParentNoKeys),
maxRuleConfidences = maxRuleConfMap.getOrElse(name, Seq.empty),
Expand All @@ -195,6 +201,7 @@ object DerivedFeatureFilterUtils {
variance = variances(labelColumnIndex),
corrLabel = None,
cramersV = None,
featureCorrs = Seq.empty,
parentCorr = None,
parentCramersV = None,
maxRuleConfidences = Seq.empty,
Expand All @@ -214,6 +221,7 @@ object DerivedFeatureFilterUtils {
* @param minVariance Min variance for dropping features
* @param minCorrelation Min correlation with label for dropping features
* @param maxCorrelation Max correlation with label for dropping features
* @param maxFeatureCorr Max correlation between features for dropping the later features
* @param maxCramersV Max Cramer's V for dropping categorical features
* @param maxRuleConfidence Max allowed confidence of association rules for dropping features
* @param minRequiredRuleSupport Threshold for association rule
Expand All @@ -229,6 +237,7 @@ object DerivedFeatureFilterUtils {
minVariance: Double,
minCorrelation: Double = 0.0,
maxCorrelation: Double = 1.0,
maxFeatureCorr: Double = 1.0,
maxCramersV: Double = 1.0,
maxRuleConfidence: Double = 1.0,
minRequiredRuleSupport: Double = 1.0,
Expand Down Expand Up @@ -256,6 +265,7 @@ object DerivedFeatureFilterUtils {
minVariance = minVariance,
minCorrelation = minCorrelation,
maxCorrelation = maxCorrelation,
maxFeatureCorr = maxFeatureCorr,
maxCramersV = maxCramersV,
maxRuleConfidence = maxRuleConfidence,
minRequiredRuleSupport = minRequiredRuleSupport,
Expand Down Expand Up @@ -314,6 +324,7 @@ private[op] case class ColumnStatistics
cramersV: Option[Double],
parentCorr: Option[Double],
parentCramersV: Option[Double],
featureCorrs: Seq[Double],
// Need to be able to hold up to two maxRuleConfidences or supports for the case of nullIndicator columns coming
// from non-categorical features (since they will correspond to a 2x2 contingency matrix)
maxRuleConfidences: Seq[Double],
Expand Down Expand Up @@ -341,6 +352,7 @@ private[op] case class ColumnStatistics
minVariance: Double,
maxCorrelation: Double,
minCorrelation: Double,
maxFeatureCorr: Double,
maxCramersV: Double,
maxRuleConfidence: Double,
minRequiredRuleSupport: Double,
Expand All @@ -361,6 +373,10 @@ private[op] case class ColumnStatistics
corrLabel.filter(Math.abs(_) > maxCorrelation).map(corr =>
s"correlation $corr higher than max correlation $maxCorrelation"
),
column.flatMap{ case cl => featureCorrs.take(cl.index).find(Math.abs(_) > maxFeatureCorr).map(corr =>
s"this feature has correlations $corr with another feature higher than max feature-feature" +
s" correlation $maxFeatureCorr")
},
cramersV.filter(_ > maxCramersV).map(cv =>
s"Cramer's V $cv higher than max Cramer's V $maxCramersV"
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,24 @@ trait SanityCheckerParams extends DerivedFeatureFilterParams {
final val maxCorrelation = new DoubleParam(
parent = this, name = "maxCorrelation",
doc = "Maximum correlation (absolute value) allowed between a feature in the feature vector and the label",
isValid = ParamValidators.inRange(lowerBound = 0.0, upperBound = 1.0, lowerInclusive = true, upperInclusive = true)
isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
)
def setMaxCorrelation(value: Double): this.type = set(maxCorrelation, value)
def getMaxCorrelation: Double = $(maxCorrelation)

final val maxFeatureCorrelation = new DoubleParam(
parent = this, name = "maxFeatureCorr",
doc = "Maximum correlation (absolute value) allowed between two features. When this value is exceeded the second" +
" feature in the correlated pair will be dropped.",
isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
)
def setMaxFeatureCorrelation(value: Double): this.type = set(maxFeatureCorrelation, value)
def getMaxFeatureCorrelation: Double = $(maxFeatureCorrelation)

final val minCorrelation = new DoubleParam(
parent = this, name = "minCorrelation",
doc = "Minimum correlation (absolute value) allowed between a feature in the feature vector and the label",
isValid = ParamValidators.inRange(lowerBound = 0.0, upperBound = 1.0, lowerInclusive = true, upperInclusive = true)
isValid = ParamValidators.inRange(lowerBound = -0.1, upperBound = 1.1, lowerInclusive = true, upperInclusive = true)
)
def setMinCorrelation(value: Double): this.type = set(minCorrelation, value)
def getMinCorrelation: Double = $(minCorrelation)
Expand Down Expand Up @@ -190,6 +199,7 @@ trait SanityCheckerParams extends DerivedFeatureFilterParams {
sampleUpperLimit -> SanityChecker.SampleUpperLimit,
maxCorrelation -> SanityChecker.MaxCorrelation,
minCorrelation -> SanityChecker.MinCorrelation,
maxFeatureCorrelation -> SanityChecker.MaxFeatureCorr,
minVariance -> SanityChecker.MinVariance,
maxCramersV -> SanityChecker.MaxCramersV,
removeBadFeatures -> SanityChecker.RemoveBadFeatures,
Expand Down Expand Up @@ -280,7 +290,7 @@ class SanityChecker(uid: String = UID[SanityChecker])
val colsCleaned = cols.map(_._2).filterNot(c => repeats.contains(c.index))
(group, colsCleaned.map(_.makeColName()), colsCleaned.map(_.index),
colsCleaned.exists(_.hasParentOfSubType[MultiPickList]))
}
}

colIndicesByGrouping.map {
case (group, colNames, valueIndices, isMpl) =>
Expand Down Expand Up @@ -443,13 +453,13 @@ class SanityChecker(uid: String = UID[SanityChecker])
else ((0 until featureSize + 1).toArray, vectorRows)
val numCorrIndices = corrIndices.length

// TODO: We are still calculating the full correlation matrix when featureLabelCorrOnly is false, but are not
// TODO: storing it anywhere. If we want access to the inter-feature correlations then need to refactor this a bit.
val corrsWithLabel = if ($(featureLabelCorrOnly)) {
OpStatistics.computeCorrelationsWithLabel(vectorRowsForCorr, colStats, count)
val (corrMatrix, corrsWithLabel) = if ($(featureLabelCorrOnly)) {
None -> OpStatistics.computeCorrelationsWithLabel(vectorRowsForCorr, colStats, count)
}
else {
val matrix = Statistics.corr(vectorRowsForCorr, getCorrelationType.sparkName)
Option(matrix) -> matrix.rowIter.map(_.apply(numCorrIndices - 1)).toArray
}
else Statistics.corr(vectorRowsForCorr, getCorrelationType.sparkName).rowIter
.map(_.apply(numCorrIndices - 1)).toArray

// Only calculate this if the label is categorical, so ignore if user has flagged label as not categorical
val categoricalStats =
Expand All @@ -467,7 +477,8 @@ class SanityChecker(uid: String = UID[SanityChecker])
Option((in1.name, featureSize)), // label column goes at end of vector
corrsWithLabel,
corrIndices,
categoricalStats
categoricalStats,
corrMatrix
)
stats.foreach { stat => logInfo(stat.toString) }

Expand All @@ -478,6 +489,7 @@ class SanityChecker(uid: String = UID[SanityChecker])
$(minVariance),
$(minCorrelation),
$(maxCorrelation),
$(maxFeatureCorrelation),
$(maxCramersV),
$(maxRuleConfidence),
$(minRequiredRuleSupport),
Expand Down Expand Up @@ -542,6 +554,7 @@ object SanityChecker {
val SampleLowerLimit = 1E3.toInt
val SampleUpperLimit = 1E6.toInt
val MaxCorrelation = 0.95
val MaxFeatureCorr = 0.99
val MinCorrelation = 0.0
val MinVariance = 1E-5
val MaxCramersV = 0.95
Expand Down
Loading

0 comments on commit d222438

Please sign in to comment.