Skip to content

Commit

Permalink
Unary estimator for detecting names and transforming to gender (#445)
Browse files Browse the repository at this point in the history
* Added unary estimator for detecting names in text fields and looking up likely gender in government data

* Added custom map return type, NameStats, for said unary estimator
  • Loading branch information
MWYang authored Jan 14, 2020
1 parent a6aceb6 commit ac83ad7
Show file tree
Hide file tree
Showing 30 changed files with 674,935 additions and 2 deletions.
105,601 changes: 105,601 additions & 0 deletions core/src/main/resources/GenderDictionary_USandUK.csv

Large diffs are not rendered by default.

568,127 changes: 568,127 additions & 0 deletions core/src/main/resources/Names_JRC_Combined.txt

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,15 @@ trait RichTextFeature {
toLowercase: Boolean = TextTokenizer.ToLowercase
): FeatureLike[Binary] =
f.transformWith(new SubstringTransformer[T, T2]().setToLowercase(toLowercase), f2)

/**
* Check if feature is actual human names, and if so, return related gender information (English language only)
*
* @param threshold optional, fraction of rows containing names before processing (default = 0.50)
* @return NameStats, a custom map that will be empty if no name was found
*/
def identifyIfHumanName(threshold: Double = 0.50): FeatureLike[NameStats] =
new HumanNameDetector[T]().setThreshold(threshold).setInput(f).getOutput()
}

implicit class RichPhoneFeature(val f: FeatureLike[Phone]) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/*
* Copyright (c) 2017, Salesforce.com, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types.NameStats.GenderValue
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.{UnaryEstimator, UnaryModel}
import com.salesforce.op.stages.impl.MetadataLike
import com.salesforce.op.utils.spark.RichDataset._
import com.salesforce.op.utils.stages._
import com.twitter.algebird.Operators._
import org.apache.spark.sql._
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

import scala.reflect.runtime.universe.TypeTag

/**
* Unary estimator for identifying whether a single Text column is a name or not. If the column does appear to be a
* name, a custom map will be returned that contains the guessed gender for each entry (gender detection only supported
* for English at the moment). If the column does not appear to be a name, then the output will be an empty map.
* @param uid uid for instance
* @param operationName unique name of the operation this stage performs
* @param tti type tag for input
* @param ttiv type tag for input value
* @tparam T the FeatureType (subtype of Text) to operate over
*/
class HumanNameDetector[T <: Text]
(
uid: String = UID[HumanNameDetector[T]],
operationName: String = "humanNameDetect"
)
(
implicit tti: TypeTag[T],
override val ttiv: TypeTag[T#Value]
) extends UnaryEstimator[T, NameStats](
uid = uid,
operationName = operationName
) with NameDetectFun[T] {

def fitFn(dataset: Dataset[T#Value]): HumanNameDetectorModel[T] = {
implicit val (nameDetectStatsEnc, nameDetectStatsMonoid) = (NameDetectStats.kryo, NameDetectStats.monoid)
val mapFun: T#Value => NameDetectStats = makeMapFunction(dataset.sparkSession)
val aggResults: NameDetectStats = dataset.map(mapFun).reduce(_ + _)
val treatAsName = computeTreatAsName(aggResults)

val newMetadata = HumanNameDetectorMetadata(
treatAsName, aggResults.dictCheckResult.value, aggResults.genderResultsByStrategy
).toMetadata()
val metaDataBuilder = new MetadataBuilder().withMetadata(getMetadata()).withMetadata(newMetadata)
setMetadata(metaDataBuilder.build())

val orderedGenderDetectStrategies =
if (treatAsName) orderGenderStrategies(aggResults) else Seq.empty[GenderDetectStrategy]
new HumanNameDetectorModel[T](uid, operationName, treatAsName, orderedGenderDetectStrategies)
}
}

class HumanNameDetectorModel[T <: Text]
(
override val uid: String,
operationName: String,
val treatAsName: Boolean,
val orderedGenderDetectStrategies: Seq[GenderDetectStrategy] = Seq.empty[GenderDetectStrategy]
)(implicit tti: TypeTag[T])
extends UnaryModel[T, NameStats](operationName, uid) with NameDetectFun[T] {

import NameStats.GenderValue.GenderNA
import NameStats.Key._
def transformFn: T => NameStats = (input: T) => {
val tokens = preProcess(input)
if (treatAsName) {
require(orderedGenderDetectStrategies.nonEmpty, "There must be a gender extraction strategy if treating as name.")
// Could figure out how to use a broadcast variable for the gender dictionary within a unary transformer
val genders: Seq[GenderValue] = orderedGenderDetectStrategies map {
identifyGender(input.value, tokens, _, NameDetectUtils.DefaultGenderDictionary)
}
val gender = genders.find(_ != GenderNA).getOrElse(GenderNA)
val map: Map[String, String] = Map(
IsName.toString -> true.toString,
OriginalValue.toString -> input.value.getOrElse(""),
Gender.toString -> gender.toString
)
NameStats(map)
}
else NameStats(Map.empty[String, String])
}
}

case class HumanNameDetectorMetadata
(
treatAsName: Boolean,
predictedNameProb: Double,
genderResultsByStrategy: Map[String, GenderStats]
) extends MetadataLike {
import HumanNameDetectorMetadata._

override def toMetadata(): Metadata = {
val metaDataBuilder = new MetadataBuilder()
metaDataBuilder.putBoolean(TreatAsNameKey, treatAsName)
metaDataBuilder.putDouble(PredictedNameProbKey, predictedNameProb)
val genderResultsMetaDataBuilder = new MetadataBuilder()
genderResultsByStrategy foreach { case (strategyString, stats) =>
genderResultsMetaDataBuilder.putDoubleArray(strategyString, Array(stats.numMale, stats.numFemale, stats.numOther))
}
metaDataBuilder.putMetadata(GenderResultsByStrategyKey, genderResultsMetaDataBuilder.build())
metaDataBuilder.build()
}

override def toMetadata(skipUnsupported: Boolean): Metadata = toMetadata()
}

case object HumanNameDetectorMetadata {
val TreatAsNameKey = "treatAsName"
val PredictedNameProbKey = "predictedNameProb"
val GenderResultsByStrategyKey = "genderResultsByStrategy"

def fromMetadata(metadata: Metadata): HumanNameDetectorMetadata = {
val genderResultsMetadata = metadata.getMetadata(GenderResultsByStrategyKey)
val genderResultsByStrategy: Map[String, GenderStats] = {
NameDetectUtils.GenderDetectStrategies map { strategy: GenderDetectStrategy =>
val strategyString = strategy.toString
val resultsArray = genderResultsMetadata.getDoubleArray(strategyString)
require(resultsArray.length == 3,
"There must be exactly three values for each gender detection strategy: numMale, numFemale, and numOther.")
strategyString -> GenderStats(
numMale = resultsArray(0).toInt, numFemale = resultsArray(1).toInt, numOther = resultsArray(2).toInt
)
} toMap
}
HumanNameDetectorMetadata(
metadata.getBoolean(TreatAsNameKey),
metadata.getDouble(PredictedNameProbKey),
genderResultsByStrategy
)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,10 @@ private[op] case object Transmogrifier {
val (f, other) = castAs[StreetMap](g) // TODO make Street specific transformer
f.vectorize(topK = TopK, minSupport = MinSupport, cleanText = CleanText, cleanKeys = CleanKeys,
others = other, trackNulls = TrackNulls, maxPctCardinality = MaxPercentCardinality)
case t if t =:= weakTypeOf[NameStats] =>
val (f, other) = castAs[NameStats](g)
f.vectorize(topK = TopK, minSupport = MinSupport, cleanText = CleanText, cleanKeys = CleanKeys,
others = other, trackNulls = TrackNulls, maxPctCardinality = MaxPercentCardinality)
case t if t =:= weakTypeOf[GeolocationMap] =>
val (f, other) = castAs[GeolocationMap](g)
f.vectorize(cleanKeys = CleanKeys, others = other, trackNulls = TrackNulls)
Expand Down
Loading

0 comments on commit ac83ad7

Please sign in to comment.