From 52585b6bc58785c7a3427591277a1490ece0d69f Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Fri, 15 Nov 2019 13:30:21 -0800 Subject: [PATCH 1/7] Remove cardinality computation (#438) --- .../op/filters/FeatureDistribution.scala | 27 +++---------------- .../com/salesforce/op/ModelInsightsTest.scala | 15 +++-------- .../op/filters/FeatureDistributionTest.scala | 7 ++--- .../op/filters/FiltersTestData.scala | 12 ++++----- 4 files changed, 16 insertions(+), 45 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala index 7619d25308..1221bbe3c1 100644 --- a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala +++ b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala @@ -64,7 +64,6 @@ case class FeatureDistribution distribution: Array[Double], summaryInfo: Array[Double], moments: Option[Moments] = None, - cardEstimate: Option[TextStats] = None, `type`: FeatureDistributionType = FeatureDistributionType.Training ) extends FeatureDistributionLike { @@ -110,10 +109,9 @@ case class FeatureDistribution val combinedSummaryInfo = if (summaryInfo.length > fd.summaryInfo.length) summaryInfo else fd.summaryInfo val combinedMoments = moments + fd.moments - val combinedCard = cardEstimate + fd.cardEstimate FeatureDistribution(name, key, count + fd.count, nulls + fd.nulls, combinedDist, - combinedSummaryInfo, combinedMoments, combinedCard, `type`) + combinedSummaryInfo, combinedMoments, `type`) } /** @@ -174,14 +172,14 @@ case class FeatureDistribution } override def equals(that: Any): Boolean = that match { - case FeatureDistribution(`name`, `key`, `count`, `nulls`, d, s, m, c, `type`) => + case FeatureDistribution(`name`, `key`, `count`, `nulls`, d, s, m, `type`) => distribution.deep == d.deep && summaryInfo.deep == s.deep && - moments == m && cardEstimate == c + moments == m case _ => false } override def hashCode(): Int = Objects.hashCode(name, key, count, nulls, distribution, - summaryInfo, moments, cardEstimate, `type`) + summaryInfo, moments, `type`) } object FeatureDistribution { @@ -240,7 +238,6 @@ object FeatureDistribution { .getOrElse(1L -> (Array(summary.min, summary.max, summary.sum, summary.count) -> new Array[Double](bins))) val moments = value.map(momentsValues) - val cardEstimate = value.map(cardinalityValues) FeatureDistribution( name = name, @@ -250,7 +247,6 @@ object FeatureDistribution { summaryInfo = summaryInfo, distribution = distribution, moments = moments, - cardEstimate = cardEstimate, `type` = `type` ) } @@ -269,21 +265,6 @@ object FeatureDistribution { MomentsGroup.sum(population.map(x => Moments(x))) } - /** - * Function to track frequency of the first $(MaxCardinality) unique values - * (number for numeric features, token for text features) - * - * @param values values to track distribution / frequency - * @return TextStats object containing a Map from a value to its frequency (histogram) - */ - private def cardinalityValues(values: ProcessedSeq): TextStats = { - val population = values match { - case Left(seq) => seq - case Right(seq) => seq.map(_.toString) - } - TextStats(population.groupBy(identity).map{case (key, value) => (key, value.size)}) - } - /** * Function to put data into histogram of counts * diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 7331b91dd5..abc0b906e3 100644 --- 
a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -166,16 +166,15 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou return Array(descaledsmallCoeff, originalsmallCoeff, descaledbigCoeff, orginalbigCoeff) } - def getFeatureMomentsAndCard(inputModel: FeatureLike[Prediction], - DF: DataFrame): (Map[String, Moments], Map[String, TextStats]) = { + def getFeatureMoments(inputModel: FeatureLike[Prediction], + DF: DataFrame): Map[String, Moments] = { lazy val workFlow = new OpWorkflow().setResultFeatures(inputModel).setInputDataset(DF) lazy val dummyReader = workFlow.getReader() lazy val workFlowRFF = workFlow.withRawFeatureFilter(Some(dummyReader), None) lazy val model = workFlowRFF.train() val insights = model.modelInsights(inputModel) val featureMoments = insights.features.map(f => f.featureName -> f.distributions.head.moments.get).toMap - val featureCardinality = insights.features.map(f => f.featureName -> f.distributions.head.cardEstimate.get).toMap - return (featureMoments, featureCardinality) + return featureMoments } val params = new OpParams() @@ -783,7 +782,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val df = linRegDF._3 val meanTol = 0.01 val varTol = 0.01 - val (moments, cardinality) = getFeatureMomentsAndCard(standardizedLinpred, linRegDF._3) + val moments = getFeatureMoments(standardizedLinpred, linRegDF._3) // Go through each feature and check that the mean, variance, and unique counts match the data moments.foreach { case (featureName, value) => { @@ -794,12 +793,6 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou math.abs((value.variance - expectedVariance) / expectedVariance) < varTol shouldBe true } } - - cardinality.foreach { case (featureName, value) => { - val actualUniques = df.select(featureName).as[Double].collect().toSet - value.valueCounts.keySet.map(_.toDouble).subsetOf(actualUniques) shouldBe true - } - } } it should "return correct insights when a model combiner equal is used as the final feature" in { diff --git a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala index cc8778c278..cc7b8584d0 100644 --- a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala @@ -75,9 +75,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi distribs(3).distribution.sum shouldBe 0 distribs(4).distribution.sum shouldBe 3 distribs(4).summaryInfo.length shouldBe bins - distribs(2).cardEstimate.get shouldBe TextStats(Map("male" -> 1, "female" -> 1)) distribs(2).moments.get shouldBe Moments(2, 5.0, 2.0, 0.0, 2.0) - distribs(4).cardEstimate.get shouldBe TextStats(Map("5.0" -> 1, "1.0" -> 1, "3.0" -> 1)) distribs(4).moments.get shouldBe Moments(3, 3.0, 8.0, 0.0, 32.0) } @@ -202,8 +200,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi it should "marshall to/from json" in { val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty) val fd2 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), - Array.empty, Some(Moments(1.0)), Some(TextStats(Map("foo" -> 1, "bar" ->2))), - FeatureDistributionType.Scoring) + Array.empty, Some(Moments(1.0)), FeatureDistributionType.Scoring) val json = 
FeatureDistribution.toJson(Array(fd1, fd2)) FeatureDistribution.fromJson(json) match { case Success(r) => r shouldBe Seq(fd1, fd2) @@ -213,7 +210,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi it should "marshall to/from json with default vector args" in { val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), - Array.empty, None, None, FeatureDistributionType.Scoring) + Array.empty, None, FeatureDistributionType.Scoring) val fd2 = FeatureDistribution("A", Some("X"), 20, 20, Array(2, 8, 0, 0, 12), Array.empty) val json = """[{"name":"A","count":10,"nulls":1,"distribution":[1.0,4.0,0.0,0.0,6.0],"type":"Scoring"}, diff --git a/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala index 44c9a67871..c667453fb3 100644 --- a/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala +++ b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala @@ -47,16 +47,16 @@ trait FiltersTestData { protected val scoreSummaries = Seq( FeatureDistribution("A", None, 10, 8, Array(1, 4, 0, 0, 6), Array.empty, - None, None, FeatureDistributionType.Scoring), + None, FeatureDistributionType.Scoring), FeatureDistribution("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty, - None, None, FeatureDistributionType.Scoring), + None, FeatureDistributionType.Scoring), FeatureDistribution("C", Some("1"), 10, 1, Array(0, 0, 10, 10, 0), - Array.empty, None, None, FeatureDistributionType.Scoring), + Array.empty, None, FeatureDistributionType.Scoring), FeatureDistribution("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), - Array.empty, None, None, FeatureDistributionType.Scoring), + Array.empty, None, FeatureDistributionType.Scoring), FeatureDistribution("D", Some("1"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty, - None, None, FeatureDistributionType.Scoring), + None, FeatureDistributionType.Scoring), FeatureDistribution("D", Some("2"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty, - None, None, FeatureDistributionType.Scoring) + None, FeatureDistributionType.Scoring) ) } From e45073db415076c2fabb3bdeab8cd1bc07c6d7f0 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 15 Nov 2019 13:37:10 -0800 Subject: [PATCH 2/7] Disable GitHub actions --- .github/workflows/main.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index 19068f8345..0000000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: CI - -on: - push: - branches: - - master - pull_request: - branches: - - master - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - name: Set up JDK 1.8 - uses: actions/setup-java@v1 - with: - java-version: 1.8 - - name: Build - run: ./gradlew scalaStyle reportScoverage - - name: Build Helloworld - run: cd helloworld && ./gradlew scalaStyle test From 977848135a9e0144bd57adac8b70a515e6a658ef Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Tue, 3 Dec 2019 12:15:19 -0800 Subject: [PATCH 3/7] Added Chinese and Korean examples to TextTokenizerTest (#442) --- .../impl/feature/TextTokenizerTest.scala | 101 ++++++++++++++++-- 1 file changed, 95 insertions(+), 6 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala index f469dbd617..ab7acc92a3 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala @@ -43,25 +43,33 @@ import org.scalatest.junit.JUnitRunner class TextTokenizerTest extends OpTransformerSpec[TextList, TextTokenizer[Text]] { // scalastyle:off - val (inputData, english, japanese, french) = TestFeatureBuilder( + val (inputData, english, japanese, french, chinese, korean) = TestFeatureBuilder( Seq( ("I've got a lovely bunch of coconuts".toText, "古池や蛙飛び込む水の音".toText, - "Première détection d’une atmosphère autour d’une exoplanète de la taille de la Terre".toText + "Première détection d’une atmosphère autour d’une exoplanète de la taille de la Terre".toText, + "外面的大氣層依緯度成不同的區與帶,在彼此的交界處有湍流和風暴作用著".toText, + "외곽 대기는 위도에 따라 몇가지의 띠들로 눈에 띄게 구분되는데, 서로 상호작용하는 경계선을 따라 발생하는 난류와 폭풍에 의한 것이다".toText ), ("There they are, all standing in a row".toText, "地磁気発生の謎に迫る地球内部の環境、再現実験".toText, - "Les deux commissions, créées respectivement en juin 2016 et janvier 2017".toText + "Les deux commissions, créées respectivement en juin 2016 et janvier 2017".toText, + "理論模型顯示如果木星的質量比現在更大,而不是僅有目前的質量,它將會繼續收縮".toText, + "상층부 대기의 네온은 질량비로 차지하는데".toText ), ("Big ones, small ones, some as big as your head".toText, "初めまして私はケビンです".toText, - "Il publie sa théorie de la relativité restreinte en 1905".toText + "Il publie sa théorie de la relativité restreinte en 1905".toText, + "假設它確實存在,它可能因為現存的熱液態金屬氫與地函混合的對流而萎縮,並且熔融在行星內部的較上層".toText, + "금속성 수소층 위에는 수소로 이루어진 투명한 안쪽 대기가 자리잡고 있다".toText ), ("Big ones, small

<h1> ones</h1>, some as big as your head".toText,
        "初めまして私はケビンです,<h1>初めまして私はケビンです</h1>".toText,
-        "Il <h2 class=\"a\">publie sa théorie de la relativité restreinte en 1905".toText
+        "Il <h2 class=\"a\">publie sa théorie de la relativité restreinte en 1905".toText,
+        "<div>在南半球有一個外觀與大紅斑類似，但較小的大氣特徵出現</div>".toText,
+        "목성의 중심으로부터 목성반경 <b>지점에서</b> 자기권과 태양풍의 <a href=\"www.google.com\">상호작용으로</a> 활꼴 충격파가 발생한다".toText
       ),
-      ("".toText, Text.empty, Text.empty)
+      ("".toText, Text.empty, Text.empty, Text.empty, Text.empty)
     )
   )
   // scalastyle:on
@@ -125,6 +133,7 @@ class TextTokenizerTest extends OpTransformerSpec[TextList, TextTokenizer[Text]]
       List("h2", "clas", "a", "publ", "theo", "relativit", "restreint", "1905").toTextList,
       TextList.empty
     )
+
    val expectedHtml = {
      val copy = expected.toList.toArray
      copy(3) = List("publ", "theo", "relativit", "restreint", "1905").toTextList
      copy
    }
    // scalastyle:on
  }

+  trait Chinese {
+    // scalastyle:off
+    val expectedHtml = Array(
+      "外面的大氣層依緯度成不同的區與帶，在彼此的交界處有湍流和風暴作用著",
+      "理論模型顯示如果木星的質量比現在更大，而不是僅有目前的質量，它將會繼續收縮",
+      "假設它確實存在，它可能因為現存的熱液態金屬氫與地函混合的對流而萎縮，並且熔融在行星內部的較上層",
+      "在南半球有一個外觀與大紅斑類似，但較小的大氣特徵出現",
+      ""
+    ).map(_.sliding(2, 1).filterNot(_.contains("，")).toList.toTextList)
+
+    val expected = {
+      val copy = expectedHtml.clone()
+      copy(3) = (Seq("div") ++ expectedHtml(3).value ++ Seq("div")).toTextList
+      copy
+    }
+    // scalastyle:on
+  }
+
+  trait Korean {
+    // scalastyle:off
+    val expectedHtml = Array(
+      "외곽 대기는 위도에 따라 몇가지의 띠들로 눈에 띄게 구분되는데, 서로 상호작용하는 경계선을 따라 발생하는 난류와 폭풍에 의한 것이다",
+      "상층부 대기의 네온은 질량비로 차지하는데",
+      "금속성 수소층 위에는 수소로 이루어진 투명한 안쪽 대기가 자리잡고 있다",
+      "목성의 중심으로부터 목성반경 지점에서 자기권과 태양풍의 상호작용으로 활꼴 충격파가 발생한다",
+      ""
+    ).map(_.sliding(2, 1).filterNot(s => s.contains(" ") || s.contains(",")).toList.toTextList)
+
+    println(expectedHtml.toList)
+
+    val expected = {
+      val copy = expectedHtml.clone()
+      copy(3) = List("목성", "성의", "중심", "심으", "으로", "로부", "부터", "목성", "성반", "반경", "b", "지점", "점에",
+        "에서", "b", "자기", "기권", "권과", "태양", "양풍", "풍의", "href", "www.google.com", "상호", "호작", "작용",
+        "용으", "으로", "활꼴", "충격", "격파", "파가", "발생", "생한", "한다").toTextList
+      copy
+    }
+    // scalastyle:on
+  }
+
  it should "tokenize text correctly [English]" in new English {
    assertTextTokenizer(
      input = english, expected = expected,
@@ -151,6 +200,19 @@ class TextTokenizerTest extends OpTransformerSpec[TextList, TextTokenizer[Text]]
      tokenizer = new TextTokenizer[Text]().setDefaultLanguage(Language.French)
    )
  }
+  it should "tokenize text correctly [Chinese (Simplified)]" in new Chinese {
+    assertTextTokenizer(
+      input = chinese, expected = expected,
+      tokenizer = new TextTokenizer[Text]().setDefaultLanguage(Language.SimplifiedChinese)
+    )
+  }
+  it should "tokenize text correctly [Korean]" in new Korean {
+    assertTextTokenizer(
+      input = korean, expected = expected,
+      tokenizer = new TextTokenizer[Text]().setDefaultLanguage(Language.Korean)
+    )
+  }
+
  it should "strip html tags and tokenize text correctly [English]" in new English {
    assertTextTokenizer(
      input = english, expected = expectedHtml,
@@ -172,6 +234,21 @@ class TextTokenizerTest extends OpTransformerSpec[TextList, TextTokenizer[Text]]
        .setDefaultLanguage(Language.French)
    )
  }
+  it should "strip html tags and tokenize text correctly [Chinese (Simplified)]" in new Chinese {
+    assertTextTokenizer(
+      input = chinese, expected = expectedHtml,
+      tokenizer = new TextTokenizer[Text](analyzer = TextTokenizer.AnalyzerHtmlStrip)
+        .setDefaultLanguage(Language.SimplifiedChinese)
+    )
+  }
+  it should "strip html tags and tokenize text correctly [Korean]" in new Korean {
+    assertTextTokenizer(
+      input = korean, expected = expectedHtml,
+      tokenizer = new TextTokenizer[Text](analyzer = TextTokenizer.AnalyzerHtmlStrip)
+        .setDefaultLanguage(Language.Korean)
+    )
+  }
+
  it should "auto detect languages and tokenize accordingly [English]" in new English {
    assertTextTokenizer(
      input = english, expected = expected,
@@ -190,6 +267,18 @@ class TextTokenizerTest extends OpTransformerSpec[TextList, TextTokenizer[Text]]
      tokenizer = new TextTokenizer[Text]().setAutoDetectLanguage(true)
    )
  }
+  it should "auto detect languages and tokenize accordingly [Chinese (Simplified)]" in new Chinese {
+    assertTextTokenizer(
+      input = chinese, expected = expected,
+      tokenizer = new TextTokenizer[Text]().setAutoDetectLanguage(true)
+    )
+  }
+  it should "auto detect languages and tokenize accordingly [Korean]" in new Korean {
+    assertTextTokenizer(
+      input = korean, expected = expected,
+      tokenizer = new TextTokenizer[Text]().setAutoDetectLanguage(true)
+    )
+  }
  it should "work as a shortcut" in {
    val tokenized = english.tokenize()
    assertTextTokenizer(

From 8c6110bc5e72481b6276603462f9fa8aa649c3bb Mon Sep 17 00:00:00 2001
From: Matthew Tovbin
Date: Thu, 5 Dec 2019 21:39:15 -0800
Subject: [PATCH 4/7] Update documentation

Fixes https://github.com/salesforce/TransmogrifAI/issues/425
---
 .../main/scala/com/salesforce/op/dsl/RichNumericFeature.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala
index 62bd04644c..c4cfa6f224 100644
--- a/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala
+++ b/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala
@@ -215,14 +215,14 @@ trait RichNumericFeature {
    f.transformWith(new SqrtTransformer[I]())

  /**
-   * Square root transformer
+   * Log transformer
   * @return transformed feature
   */
  def log(base: Double): FeatureLike[Real] =
    f.transformWith(new LogTransformer[I](base = base))

  /**
-   * Square root transformer
+   * Power transformer
   * @return transformed feature
   */
  def power(power: Double): FeatureLike[Real] =

From 3d51d5defcd1005bf255b12fac55003e1c942f11 Mon Sep 17 00:00:00 2001
From: Winters Lu
Date: Wed, 18 Dec 2019 10:48:35 -0800
Subject: [PATCH 5/7] Skip serializing cardinality estimates in FeatureDistributions (#447)

---
 .../op/filters/FeatureDistribution.scala | 36 +++++++++++++++----
 .../com/salesforce/op/ModelInsightsTest.scala | 28 ++++++++-------
 .../op/filters/FeatureDistributionTest.scala | 28 +++++++++++++--
 .../op/filters/FiltersTestData.scala | 12 +++----
 4 files changed, 78 insertions(+), 26 deletions(-)

diff --git a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
index 1221bbe3c1..20e4da6511 100644
--- a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
+++ b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
@@ -40,7 +40,7 @@ import com.twitter.algebird._
 import com.twitter.algebird.Operators._
 import org.apache.spark.mllib.feature.HashingTF
 import org.json4s.jackson.Serialization
-import org.json4s.{DefaultFormats, Formats}
+import org.json4s.{DefaultFormats, FieldSerializer, Formats}

 import scala.util.Try

@@ -64,6 +64,7 @@ case class FeatureDistribution
   distribution: Array[Double],
   summaryInfo: Array[Double],
   moments: Option[Moments] = None,
+  cardEstimate: Option[TextStats] = None,
   `type`: FeatureDistributionType = FeatureDistributionType.Training
 ) extends FeatureDistributionLike {

@@ -109,9 +110,10 @@ case class FeatureDistribution
     val combinedSummaryInfo = if (summaryInfo.length > fd.summaryInfo.length) summaryInfo else fd.summaryInfo
     val
combinedMoments = moments + fd.moments + val combinedCard = cardEstimate + fd.cardEstimate FeatureDistribution(name, key, count + fd.count, nulls + fd.nulls, combinedDist, - combinedSummaryInfo, combinedMoments, `type`) + combinedSummaryInfo, combinedMoments, combinedCard, `type`) } /** @@ -172,14 +174,14 @@ case class FeatureDistribution } override def equals(that: Any): Boolean = that match { - case FeatureDistribution(`name`, `key`, `count`, `nulls`, d, s, m, `type`) => + case FeatureDistribution(`name`, `key`, `count`, `nulls`, d, s, m, c, `type`) => distribution.deep == d.deep && summaryInfo.deep == s.deep && - moments == m + moments == m && cardEstimate == c case _ => false } override def hashCode(): Int = Objects.hashCode(name, key, count, nulls, distribution, - summaryInfo, moments, `type`) + summaryInfo, moments, cardEstimate, `type`) } object FeatureDistribution { @@ -190,8 +192,13 @@ object FeatureDistribution { override def plus(l: FeatureDistribution, r: FeatureDistribution): FeatureDistribution = l.reduce(r) } + val FeatureDistributionSerializer = FieldSerializer[FeatureDistribution]( + FieldSerializer.ignore("cardEstimate") + ) + implicit val formats: Formats = DefaultFormats + - EnumEntrySerializer.json4s[FeatureDistributionType](FeatureDistributionType) + EnumEntrySerializer.json4s[FeatureDistributionType](FeatureDistributionType) + + FeatureDistributionSerializer /** * Feature distributions to json @@ -238,6 +245,7 @@ object FeatureDistribution { .getOrElse(1L -> (Array(summary.min, summary.max, summary.sum, summary.count) -> new Array[Double](bins))) val moments = value.map(momentsValues) + val cardEstimate = value.map(cardinalityValues) FeatureDistribution( name = name, @@ -247,6 +255,7 @@ object FeatureDistribution { summaryInfo = summaryInfo, distribution = distribution, moments = moments, + cardEstimate = cardEstimate, `type` = `type` ) } @@ -265,6 +274,21 @@ object FeatureDistribution { MomentsGroup.sum(population.map(x => Moments(x))) } + /** + * Function to track frequency of the first $(MaxCardinality) unique values + * (number for numeric features, token for text features) + * + * @param values values to track distribution / frequency + * @return TextStats object containing a Map from a value to its frequency (histogram) + */ + private def cardinalityValues(values: ProcessedSeq): TextStats = { + TextStats(countStringValues(values.left.getOrElse(values.right.get))) + } + + private def countStringValues[T](seq: Seq[T]): Map[String, Int] = { + seq.groupBy(identity).map { case (k, valSeq) => k.toString -> valSeq.size } + } + /** * Function to put data into histogram of counts * diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index abc0b906e3..d1fa503188 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -31,30 +31,28 @@ package com.salesforce.op import com.salesforce.op.evaluators._ -import com.salesforce.op.features.types._ +import com.salesforce.op.features.types.{Real, _} import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike} import com.salesforce.op.filters._ import com.salesforce.op.stages.impl.classification._ +import com.salesforce.op.stages.impl.feature.{CombinationStrategy, TextStats} import com.salesforce.op.stages.impl.preparators._ import com.salesforce.op.stages.impl.regression.{OpLinearRegression, OpXGBoostRegressor, RegressionModelSelector} 
import com.salesforce.op.stages.impl.selector.ModelSelectorNames.EstimatorType -import com.salesforce.op.stages.impl.selector.{SelectedModelCombiner, SelectedCombinerModel, SelectedModel} import com.salesforce.op.stages.impl.selector.ValidationType._ +import com.salesforce.op.stages.impl.selector.{SelectedCombinerModel, SelectedModel, SelectedModelCombiner} import com.salesforce.op.stages.impl.tuning.{DataCutter, DataSplitter} import com.salesforce.op.test.{PassengerSparkFixtureTest, TestFeatureBuilder} import com.salesforce.op.testkit.RandomReal import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import com.twitter.algebird.Moments import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tuning.ParamGridBuilder +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions._ import org.junit.runner.RunWith -import com.salesforce.op.features.types.Real -import com.salesforce.op.stages.impl.feature.{CombinationStrategy, TextStats} -import com.twitter.algebird.Moments -import org.apache.spark.sql.{DataFrame, Dataset} -import org.scalactic.Equality import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner -import org.apache.spark.sql.functions._ import scala.util.{Failure, Success} @@ -166,15 +164,16 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou return Array(descaledsmallCoeff, originalsmallCoeff, descaledbigCoeff, orginalbigCoeff) } - def getFeatureMoments(inputModel: FeatureLike[Prediction], - DF: DataFrame): Map[String, Moments] = { + def getFeatureMomentsAndCard(inputModel: FeatureLike[Prediction], + DF: DataFrame): (Map[String, Moments], Map[String, TextStats]) = { lazy val workFlow = new OpWorkflow().setResultFeatures(inputModel).setInputDataset(DF) lazy val dummyReader = workFlow.getReader() lazy val workFlowRFF = workFlow.withRawFeatureFilter(Some(dummyReader), None) lazy val model = workFlowRFF.train() val insights = model.modelInsights(inputModel) val featureMoments = insights.features.map(f => f.featureName -> f.distributions.head.moments.get).toMap - return featureMoments + val featureCardinality = insights.features.map(f => f.featureName -> f.distributions.head.cardEstimate.get).toMap + featureMoments -> featureCardinality } val params = new OpParams() @@ -782,7 +781,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val df = linRegDF._3 val meanTol = 0.01 val varTol = 0.01 - val moments = getFeatureMoments(standardizedLinpred, linRegDF._3) + val (moments, cardinality) = getFeatureMomentsAndCard(standardizedLinpred, linRegDF._3) // Go through each feature and check that the mean, variance, and unique counts match the data moments.foreach { case (featureName, value) => { @@ -793,6 +792,11 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou math.abs((value.variance - expectedVariance) / expectedVariance) < varTol shouldBe true } } + + cardinality.foreach { case (featureName, value) => + val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet + actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble) + } } it should "return correct insights when a model combiner equal is used as the final feature" in { diff --git a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala index cc7b8584d0..086284e20b 100644 --- 
a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala @@ -34,7 +34,10 @@ import com.salesforce.op.features.{FeatureDistributionType, TransientFeature} import com.salesforce.op.stages.impl.feature.TextStats import com.salesforce.op.test.PassengerSparkFixtureTest import com.salesforce.op.testkit.RandomText +import com.salesforce.op.utils.json.EnumEntrySerializer import com.twitter.algebird.Moments +import org.json4s.DefaultFormats +import org.json4s.jackson.Serialization import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @@ -75,7 +78,9 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi distribs(3).distribution.sum shouldBe 0 distribs(4).distribution.sum shouldBe 3 distribs(4).summaryInfo.length shouldBe bins + distribs(2).cardEstimate.get shouldBe TextStats(Map("male" -> 1, "female" -> 1)) distribs(2).moments.get shouldBe Moments(2, 5.0, 2.0, 0.0, 2.0) + distribs(4).cardEstimate.get shouldBe TextStats(Map("5.0" -> 1, "1.0" -> 1, "3.0" -> 1)) distribs(4).moments.get shouldBe Moments(3, 3.0, 8.0, 0.0, 32.0) } @@ -200,7 +205,8 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi it should "marshall to/from json" in { val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty) val fd2 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), - Array.empty, Some(Moments(1.0)), FeatureDistributionType.Scoring) + Array.empty, Some(Moments(1.0)), Option.empty, + FeatureDistributionType.Scoring) val json = FeatureDistribution.toJson(Array(fd1, fd2)) FeatureDistribution.fromJson(json) match { case Success(r) => r shouldBe Seq(fd1, fd2) @@ -210,7 +216,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi it should "marshall to/from json with default vector args" in { val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), - Array.empty, None, FeatureDistributionType.Scoring) + Array.empty, None, None, FeatureDistributionType.Scoring) val fd2 = FeatureDistribution("A", Some("X"), 20, 20, Array(2, 8, 0, 0, 12), Array.empty) val json = """[{"name":"A","count":10,"nulls":1,"distribution":[1.0,4.0,0.0,0.0,6.0],"type":"Scoring"}, @@ -238,4 +244,22 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi intercept[IllegalArgumentException](fd1.jsDivergence(fd1.copy(name = "boo"))) should have message "requirement failed: Name must match to compare or combine feature distributions: A != boo" } + + it should "not serialize cardEstimate field" in { + val cardEstimate = "cardEstimate" + val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), + Array.empty, Some(Moments(1.0)), Some(TextStats(Map("foo" -> 1, "bar" ->2))), + FeatureDistributionType.Scoring) + val featureDistributions = Seq(fd1, fd1.copy(cardEstimate = None)) + + FeatureDistribution.toJson(featureDistributions) shouldNot include (cardEstimate) + + // deserialization from json with and without cardEstimate works + val jsonWithCardEstimate = Serialization.write(featureDistributions)(DefaultFormats + + EnumEntrySerializer.json4s[FeatureDistributionType](FeatureDistributionType)) + jsonWithCardEstimate should fullyMatch regex Seq(cardEstimate).mkString(".*", ".*", ".*") + jsonWithCardEstimate shouldNot fullyMatch regex Seq.fill(2)(cardEstimate).mkString(".*", ".*", ".*") + + FeatureDistribution.fromJson(jsonWithCardEstimate) 
shouldBe Success(featureDistributions)
  }
}
diff --git a/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala
index c667453fb3..44c9a67871 100644
--- a/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala
+++ b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala
@@ -47,16 +47,16 @@ trait FiltersTestData {
   protected val scoreSummaries = Seq(
     FeatureDistribution("A", None, 10, 8, Array(1, 4, 0, 0, 6), Array.empty,
-      None, FeatureDistributionType.Scoring),
+      None, None, FeatureDistributionType.Scoring),
     FeatureDistribution("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty,
-      None, FeatureDistributionType.Scoring),
+      None, None, FeatureDistributionType.Scoring),
     FeatureDistribution("C", Some("1"), 10, 1, Array(0, 0, 10, 10, 0),
-      Array.empty, None, FeatureDistributionType.Scoring),
+      Array.empty, None, None, FeatureDistributionType.Scoring),
     FeatureDistribution("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12),
-      Array.empty, None, FeatureDistributionType.Scoring),
+      Array.empty, None, None, FeatureDistributionType.Scoring),
     FeatureDistribution("D", Some("1"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty,
-      None, FeatureDistributionType.Scoring),
+      None, None, FeatureDistributionType.Scoring),
     FeatureDistribution("D", Some("2"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty,
-      None, FeatureDistributionType.Scoring)
+      None, None, FeatureDistributionType.Scoring)
   )
 }

From bad14d53fffaa850d0103800628e9aeab66a4a1f Mon Sep 17 00:00:00 2001
From: Matthew Tovbin
Date: Tue, 24 Dec 2019 08:18:53 -0800
Subject: [PATCH 6/7] Remove extra >

---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 07e102ccfd..59cc8a4799 100644
--- a/pom.xml
+++ b/pom.xml
@@ -256,7 +256,7 @@
       <artifactId>spark-sql_2.11</artifactId>
       <version>2.3.2</version>
       <scope>compile</scope>
-    </dependency>>
+    </dependency>
     <dependency>
       <groupId>org.apache.avro</groupId>
       <artifactId>avro</artifactId>

From a6aceb60624c9db50ffc92c135e303e4f6a9c0c9 Mon Sep 17 00:00:00 2001
From: Kevin Moore
Date: Tue, 7 Jan 2020 16:37:01 -0800
Subject: [PATCH 7/7] Add support for ignoring text that looks like IDs in SmartTextVectorizer (#448)

---
 .../op/filters/FeatureDistribution.scala | 6 +-
 .../impl/feature/SmartTextMapVectorizer.scala | 3 +-
 .../impl/feature/SmartTextVectorizer.scala | 145 +++++++++++++-----
 .../op/filters/FeatureDistributionTest.scala | 6 +-
 .../feature/SmartTextMapVectorizerTest.scala | 18 +--
 .../feature/SmartTextVectorizerTest.scala | 82 +++++++++-
 .../stages/OpPipelineStageReaderWriter.scala | 1 +
 .../feature/TextVectorizationMethod.scala | 46 ++++++
 8 files changed, 242 insertions(+), 65 deletions(-)
 create mode 100644 features/src/main/scala/com/salesforce/op/stages/impl/feature/TextVectorizationMethod.scala

diff --git a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
index 20e4da6511..42b17f760e 100644
--- a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
+++ b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
@@ -282,11 +282,11 @@ object FeatureDistribution {
    * @return TextStats object containing a Map from a value to its frequency (histogram)
    */
   private def cardinalityValues(values: ProcessedSeq): TextStats = {
-    TextStats(countStringValues(values.left.getOrElse(values.right.get)))
+    TextStats(countStringValues(values.left.getOrElse(values.right.get)), Map.empty)
   }

-  private def countStringValues[T](seq: Seq[T]): Map[String, Int] = {
-    seq.groupBy(identity).map { case (k,
valSeq) => k.toString -> valSeq.size } + private def countStringValues[T](seq: Seq[T]): Map[String, Long] = { + seq.groupBy(identity).map { case (k, valSeq) => k.toString -> valSeq.size.toLong } } /** diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala index 99e82fc00d..f149f5abba 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala @@ -72,7 +72,8 @@ class SmartTextMapVectorizer[T <: OPMap[String]] textMap: T#Value, shouldCleanKeys: Boolean, shouldCleanValues: Boolean ): TextMapStats = { val keyValueCounts = textMap.map{ case (k, v) => - cleanTextFn(k, shouldCleanKeys) -> TextStats(Map(cleanTextFn(v, shouldCleanValues) -> 1)) + cleanTextFn(k, shouldCleanKeys) -> + TextStats(Map(cleanTextFn(v, shouldCleanValues) -> 1), Map(cleanTextFn(v, shouldCleanValues).length -> 1)) } TextMapStats(keyValueCounts) } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala index ca46a52b56..e95939d249 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala @@ -62,7 +62,7 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( extends SequenceEstimator[T, OPVector](operationName = "smartTxtVec", uid = uid) with PivotParams with CleanTextFun with SaveOthersParams with TrackNullsParam with MinSupportParam with TextTokenizerParams with TrackTextLenParam - with HashingVectorizerParams with HashingFun with OneHotFun with MaxCardinalityParams { + with HashingVectorizerParams with HashingFun with OneHotFun with MaxCardinalityParams with MinLengthStdDevParams { private implicit val textStatsSeqEnc: Encoder[Array[TextStats]] = ExpressionEncoder[Array[TextStats]]() @@ -81,23 +81,28 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( require(!dataset.isEmpty, "Input dataset cannot be empty") val maxCard = $(maxCardinality) + val minLenStdDev = $(minLengthStdDev) val shouldCleanText = $(cleanText) implicit val testStatsMonoid: Semigroup[TextStats] = TextStats.monoid(maxCard) val valueStats: Dataset[Array[TextStats]] = dataset.map(_.map(computeTextStats(_, shouldCleanText)).toArray) val aggregatedStats: Array[TextStats] = valueStats.reduce(_ + _) - val (isCategorical, topValues) = aggregatedStats.map { stats => - val isCategorical = stats.valueCounts.size <= maxCard + val (vectorizationMethods, topValues) = aggregatedStats.map { stats => + val vecMethod: TextVectorizationMethod = stats match { + case _ if stats.valueCounts.size <= maxCard => TextVectorizationMethod.Pivot + case _ if stats.lengthStdDev <= minLenStdDev => TextVectorizationMethod.Ignore + case _ => TextVectorizationMethod.Hash + } val topValues = stats.valueCounts .filter { case (_, count) => count >= $(minSupport) } .toSeq.sortBy(v => -v._2 -> v._1) .take($(topK)).map(_._1) - isCategorical -> topValues + (vecMethod, topValues) }.unzip val smartTextParams = SmartTextVectorizerModelArgs( - isCategorical = isCategorical, + vectorizationMethods = vectorizationMethods, topValues = topValues, shouldCleanText = shouldCleanText, shouldTrackNulls = $(trackNulls), @@ -117,36 
+122,40 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( } private def computeTextStats(text: T#Value, shouldCleanText: Boolean): TextStats = { - val valueCounts = text match { - case Some(v) => Map(cleanTextFn(v, shouldCleanText) -> 1) - case None => Map.empty[String, Int] + val (valueCounts, lengthCounts) = text match { + case Some(v) => (Map(cleanTextFn(v, shouldCleanText) -> 1L), Map(cleanTextFn(v, shouldCleanText).length -> 1L)) + case None => (Map.empty[String, Long], Map.empty[Int, Long]) } - TextStats(valueCounts) + TextStats(valueCounts, lengthCounts) } private def makeVectorMetadata(smartTextParams: SmartTextVectorizerModelArgs): OpVectorMetadata = { - require(inN.length == smartTextParams.isCategorical.length) + require(inN.length == smartTextParams.vectorizationMethods.length) - val (categoricalFeatures, textFeatures) = - SmartTextVectorizer.partition[TransientFeature](inN, smartTextParams.isCategorical) + val groups = inN.toArray.zip(smartTextParams.vectorizationMethods).groupBy(_._2) + val textToPivot = groups.getOrElse(TextVectorizationMethod.Pivot, Array.empty).map(_._1) + val textToIgnore = groups.getOrElse(TextVectorizationMethod.Ignore, Array.empty).map(_._1) + val textToHash = groups.getOrElse(TextVectorizationMethod.Hash, Array.empty).map(_._1) + val allTextFeatures = textToHash ++ textToIgnore // build metadata describing output val shouldTrackNulls = $(trackNulls) val shouldTrackLen = $(trackTextLen) val unseen = Option($(unseenName)) - val categoricalColumns = if (categoricalFeatures.nonEmpty) { - makeVectorColumnMetadata(shouldTrackNulls, unseen, smartTextParams.categoricalTopValues, categoricalFeatures) + val categoricalColumns = if (textToPivot.nonEmpty) { + makeVectorColumnMetadata(shouldTrackNulls, unseen, smartTextParams.categoricalTopValues, textToPivot) } else Array.empty[OpVectorColumnMetadata] - val textColumns = if (textFeatures.nonEmpty) { + + val textColumns = if (allTextFeatures.nonEmpty) { if (shouldTrackLen) { - makeVectorColumnMetadata(textFeatures, makeHashingParams()) ++ - textFeatures.map(_.toColumnMetaData(descriptorValue = OpVectorColumnMetadata.TextLenString)) ++ - textFeatures.map(_.toColumnMetaData(isNull = true)) + makeVectorColumnMetadata(textToHash, makeHashingParams()) ++ + allTextFeatures.map(_.toColumnMetaData(descriptorValue = OpVectorColumnMetadata.TextLenString)) ++ + allTextFeatures.map(_.toColumnMetaData(isNull = true)) } else { - makeVectorColumnMetadata(textFeatures, makeHashingParams()) ++ - textFeatures.map(_.toColumnMetaData(isNull = true)) + makeVectorColumnMetadata(textToHash, makeHashingParams()) ++ + allTextFeatures.map(_.toColumnMetaData(isNull = true)) } } else Array.empty[OpVectorColumnMetadata] @@ -156,53 +165,84 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( } object SmartTextVectorizer { - val MaxCardinality = 100 + val MaxCardinality: Int = 100 + val MinTextLengthStdDev: Double = 0 private[op] def partition[T: ClassTag](input: Array[T], condition: Array[Boolean]): (Array[T], Array[T]) = { val all = input.zip(condition) - (all.collect { case (item, true) => item }.toSeq.toArray, all.collect { case (item, false) => item }.toSeq.toArray) + (all.collect { case (item, true) => item }, all.collect { case (item, false) => item }) } } /** * Summary statistics of a text feature * - * @param valueCounts counts of feature values + * @param valueCounts counts of feature values + * @param lengthCounts counts of token lengths */ -private[op] case class 
TextStats(valueCounts: Map[String, Int]) extends JsonLike
+private[op] case class TextStats(
+  valueCounts: Map[String, Long],
+  lengthCounts: Map[Int, Long]
+) extends JsonLike {
+
+  val lengthSize = lengthCounts.values.sum
+  val lengthMean: Double = lengthCounts.foldLeft(0.0)((acc, el) => acc + el._1 * el._2) / lengthSize
+  val lengthVariance: Double = lengthCounts.foldLeft(0.0)(
+    (acc, el) => acc + el._2 * (el._1 - lengthMean) * (el._1 - lengthMean)
+  )
+  val lengthStdDev: Double = math.sqrt(lengthVariance / lengthSize)
+}

 private[op] object TextStats {
+  /**
+   * Helper function to add two maps subject to a max cardinality restriction on the number of unique values
+   *
+   * @param totalMap Current accumulated map
+   * @param mapToAdd Additional map to add to the accumulated one
+   * @param maxCardinality Maximum number of unique keys to keep track of (stop counting once this is hit)
+   * @tparam T Type parameter for the keys
+   * @return Newly accumulated map subject to the key cardinality constraints
+   */
+  def additionHelper[T](totalMap: Map[T, Long], mapToAdd: Map[T, Long], maxCardinality: Int): Map[T, Long] = {
+    if (totalMap.size > maxCardinality) totalMap
+    else if (mapToAdd.size > maxCardinality) mapToAdd
+    else totalMap + mapToAdd
+  }
+
   def monoid(maxCardinality: Int): Monoid[TextStats] = new Monoid[TextStats] {
     override def plus(l: TextStats, r: TextStats): TextStats = {
-      if (l.valueCounts.size > maxCardinality) l
-      else if (r.valueCounts.size > maxCardinality) r
-      else TextStats(l.valueCounts + r.valueCounts)
+      val newValueCounts = additionHelper(l.valueCounts, r.valueCounts, maxCardinality)
+      val newLengthCounts = additionHelper(l.lengthCounts, r.lengthCounts, maxCardinality)
+      TextStats(newValueCounts, newLengthCounts)
     }
     override def zero: TextStats = TextStats.empty
   }

-  def empty: TextStats = TextStats(Map.empty)
+  def empty: TextStats = TextStats(Map.empty, Map.empty)
 }

 /**
  * Arguments for [[SmartTextVectorizerModel]]
  *
- * @param isCategorical is feature a categorical or not
- * @param topValues top values to each feature
- * @param shouldCleanText should clean text value
- * @param shouldTrackNulls should track nulls
- * @param hashingParams hashing function params
+ * @param vectorizationMethods method to use for text vectorization (either pivot, hashing, or ignoring)
+ * @param topValues top values to each feature
+ * @param shouldCleanText should clean text value
+ * @param shouldTrackNulls should track nulls
+ * @param hashingParams hashing function params
  */
 case class SmartTextVectorizerModelArgs
 (
-  isCategorical: Array[Boolean],
+  vectorizationMethods: Array[TextVectorizationMethod],
   topValues: Array[Seq[String]],
   shouldCleanText: Boolean,
   shouldTrackNulls: Boolean,
   hashingParams: HashingFunctionParams
 ) extends JsonLike {
-  def categoricalTopValues: Array[Seq[String]] =
-    topValues.zip(isCategorical).collect { case (top, true) => top }
+  def categoricalTopValues: Array[Seq[String]] = {
+    topValues.zip(vectorizationMethods.map(_ == TextVectorizationMethod.Pivot)).collect { case (top, true) => top }
+  }
 }

 final class SmartTextVectorizerModel[T <: Text] private[op]
@@ -222,19 +262,28 @@ final class SmartTextVectorizerModel[T <: Text] private[op]
       shouldTrackNulls = args.shouldTrackNulls
     )
     (row: Seq[Text]) => {
-      val (rowCategorical, rowText) = SmartTextVectorizer.partition[Text](row.toArray, args.isCategorical)
-      val categoricalVector: OPVector = categoricalPivotFn(rowCategorical)
-      val textTokens: Seq[TextList] = rowText.map(tokenize(_).tokens)
+      val groups = row.toArray.zip(args.vectorizationMethods).groupBy(_._2)
+      val textToPivot = groups.getOrElse(TextVectorizationMethod.Pivot, Array.empty).map(_._1)
+      val textToIgnore = groups.getOrElse(TextVectorizationMethod.Ignore, Array.empty).map(_._1)
+      val textToHash = groups.getOrElse(TextVectorizationMethod.Hash, Array.empty).map(_._1)
+
+      val categoricalVector: OPVector = categoricalPivotFn(textToPivot)
+      val textTokens: Seq[TextList] = textToHash.map(tokenize(_).tokens)
+      val ignorableTextTokens: Seq[TextList] = textToIgnore.map(tokenize(_).tokens)
       val textVector: OPVector = hash[TextList](textTokens, getTextTransientFeatures, args.hashingParams)
-      val textNullIndicatorsVector = if (args.shouldTrackNulls) getNullIndicatorsVector(textTokens) else OPVector.empty
-      val textLenVector = if ($(trackTextLen)) getLenVector(textTokens) else OPVector.empty
+      val textNullIndicatorsVector = if (args.shouldTrackNulls) {
+        getNullIndicatorsVector(textTokens ++ ignorableTextTokens)
+      } else OPVector.empty
+      val textLenVector = if ($(trackTextLen)) getLenVector(textTokens ++ ignorableTextTokens) else OPVector.empty

       categoricalVector.combine(textVector, textLenVector, textNullIndicatorsVector)
     }
   }

   private def getTextTransientFeatures: Array[TransientFeature] =
-    SmartTextVectorizer.partition[TransientFeature](getTransientFeatures(), args.isCategorical)._2
+    getTransientFeatures().zip(args.vectorizationMethods).collect {
+      case (tf, method) if method != TextVectorizationMethod.Pivot => tf
+    }

   private def getNullIndicatorsVector(textTokens: Seq[TextList]): OPVector = {
     val nullIndicators = textTokens.map { tokens =>
@@ -261,3 +310,15 @@ trait MaxCardinalityParams extends Params {
   final def getMaxCardinality: Int = $(maxCardinality)
   setDefault(maxCardinality -> SmartTextVectorizer.MaxCardinality)
 }
+
+trait MinLengthStdDevParams extends Params {
+  final val minLengthStdDev = new DoubleParam(
+    parent = this, name = "minLengthStdDev",
+    doc = "minimum standard deviation of the lengths of tokens in a text field for it to be hashed instead " +
+      "of ignored",
+    isValid = ParamValidators.inRange(lowerBound = 0, upperBound = 100)
+  )
+  final def setMinLengthStdDev(v: Double): this.type = set(minLengthStdDev, v)
+  final def getMinLengthStdDev: Double = $(minLengthStdDev)
+
setDefault(minLengthStdDev -> SmartTextVectorizer.MinTextLengthStdDev) +} diff --git a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala index 086284e20b..888e1de836 100644 --- a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala @@ -78,9 +78,9 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi distribs(3).distribution.sum shouldBe 0 distribs(4).distribution.sum shouldBe 3 distribs(4).summaryInfo.length shouldBe bins - distribs(2).cardEstimate.get shouldBe TextStats(Map("male" -> 1, "female" -> 1)) + distribs(2).cardEstimate.get shouldBe TextStats(Map("male" -> 1, "female" -> 1), Map.empty) distribs(2).moments.get shouldBe Moments(2, 5.0, 2.0, 0.0, 2.0) - distribs(4).cardEstimate.get shouldBe TextStats(Map("5.0" -> 1, "1.0" -> 1, "3.0" -> 1)) + distribs(4).cardEstimate.get shouldBe TextStats(Map("5.0" -> 1, "1.0" -> 1, "3.0" -> 1), Map.empty) distribs(4).moments.get shouldBe Moments(3, 3.0, 8.0, 0.0, 32.0) } @@ -248,7 +248,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi it should "not serialize cardEstimate field" in { val cardEstimate = "cardEstimate" val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), - Array.empty, Some(Moments(1.0)), Some(TextStats(Map("foo" -> 1, "bar" ->2))), + Array.empty, Some(Moments(1.0)), Some(TextStats(Map("foo" -> 1, "bar" ->2), Map.empty)), FeatureDistributionType.Scoring) val featureDistributions = Seq(fd1, fd1.copy(cardEstimate = None)) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index bbf2cb8035..5584ea1565 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -94,22 +94,22 @@ class SmartTextMapVectorizerTest Spec[TextMapStats] should "provide a proper semigroup" in { val data = Seq( TextMapStats(Map( - "f1" -> TextStats(Map("hello" -> 2, "world" -> 1)), - "f2" -> TextStats(Map("hello" -> 2, "ocean" -> 2)), - "f3" -> TextStats(Map("foo" -> 1)) + "f1" -> TextStats(Map("hello" -> 2, "world" -> 1), Map(5 -> 3)), + "f2" -> TextStats(Map("hello" -> 2, "ocean" -> 2), Map(5 -> 4)), + "f3" -> TextStats(Map("foo" -> 1), Map(3 -> 1)) )), TextMapStats(Map( - "f1" -> TextStats(Map("hello" -> 1)), - "f2" -> TextStats(Map("ocean" -> 1, "other" -> 5)) + "f1" -> TextStats(Map("hello" -> 1), Map(5 -> 1)), + "f2" -> TextStats(Map("ocean" -> 1, "other" -> 5), Map(5 -> 6)) )), TextMapStats(Map( - "f2" -> TextStats(Map("other" -> 1)) + "f2" -> TextStats(Map("other" -> 1), Map(5 -> 1)) )) ) TextMapStats.monoid(2).sumOption(data) shouldBe Some(TextMapStats(Map( - "f1" -> TextStats(Map("hello" -> 3, "world" -> 1)), - "f2" -> TextStats(Map("hello" -> 2, "ocean" -> 3, "other" -> 5)), - "f3" -> TextStats(Map("foo" -> 1)) + "f1" -> TextStats(Map("hello" -> 3, "world" -> 1), Map(5 -> 4)), + "f2" -> TextStats(Map("hello" -> 2, "ocean" -> 3, "other" -> 5), Map(5 -> 11)), + "f3" -> TextStats(Map("foo" -> 1), Map(3 -> 1)) ))) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala 
b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala index 30143977c1..09b124a5ad 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala @@ -34,6 +34,7 @@ import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.sequence.SequenceModel import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import com.salesforce.op.testkit.{RandomReal, RandomText} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.linalg.Vectors @@ -59,7 +60,6 @@ class SmartTextVectorizerTest .setTopK(2).setPrependFeatureName(false) .setHashSpaceStrategy(HashSpaceStrategy.Shared) .setInput(f1, f2) - val expectedResult = Seq( Vectors.sparse(9, Array(0, 4, 6), Array(1.0, 1.0, 1.0)), Vectors.sparse(9, Array(0, 8), Array(1.0, 1.0)), @@ -68,6 +68,31 @@ class SmartTextVectorizerTest Vectors.sparse(9, Array(3, 8), Array(1.0, 1.0)) ).map(_.toOPVector) + /* + Generate some more complicated input data to check things a little closer. There are four text fields with + different token distributions: + + country: Uniformly distributed from a larger list of ~few hundred countries, should be hashed + categoricalText: Uniformly distributed from a small list of choices, should be pivoted (also has fixed lengths, + so serves as a test that the categorical check happens before the token length variance check) + textId: Uniformly distributed high cardinality Ids with fixed lengths, should be ignored + text: Uniformly distributed unicode strings with lengths ranging from 0-100, should be hashed + */ + val countryData: Seq[Text] = RandomText.countries.withProbabilityOfEmpty(0.2).limit(1000) + val categoricalTextData: Seq[Text] = RandomText.textFromDomain(domain = List("A", "B", "C", "D", "E", "F")) + .withProbabilityOfEmpty(0.2).limit(1000) + // Generate List containing elements like 040231, 040232, ... 
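+  // (Illustrative aside, not in the original patch: "%06d".format(40230 + 1) yields "040231",
+  // so every generated ID is zero-padded to exactly 6 characters. Zero variation in length is
+  // the signal the new minLengthStdDev check keys on to classify this field as ignorable.)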
+ val textIdData: Seq[Text] = RandomText.textFromDomain( + domain = (1 to 1000).map(x => "%06d".format(40230 + x)).toList + ).withProbabilityOfEmpty(0.2).limit(1000) + val textData: Seq[Text] = RandomText.strings(minLen = 0, maxLen = 100).withProbabilityOfEmpty(0.2).limit(1000) + val generatedData: Seq[(Text, Text, Text, Text)] = + countryData.zip(categoricalTextData).zip(textIdData).zip(textData).map { + case (((co, ca), id), te) => (co, ca, id, te) + } + val (rawDF, rawCountry, rawCategorical, rawTextId, rawText) = TestFeatureBuilder( + "country", "categorical", "textId", "text", generatedData) + it should "detect one categorical and one non-categorical text feature" in { val smartVectorized = new SmartTextVectorizer() .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(false) @@ -170,6 +195,40 @@ class SmartTextVectorizerTest regular shouldBe shortcut } + it should "detect and ignore fields that looks like machine-generated IDs by having a low token length variance" in { + val topKCategorial = 3 + val hashSize = 5 + + val smartVectorized = new SmartTextVectorizer() + .setMaxCardinality(10).setNumFeatures(hashSize).setMinSupport(10).setTopK(topKCategorial).setMinLengthStdDev(1.0) + .setAutoDetectLanguage(false).setMinTokenLength(1).setToLowercase(false) + .setTrackNulls(true).setTrackTextLen(true) + .setInput(rawCountry, rawCategorical, rawTextId, rawText).getOutput() + + val transformed = new OpWorkflow().setResultFeatures(smartVectorized).transform(rawDF) + val result = transformed.collect(smartVectorized) + + /* + Feature vector should have 16 components, corresponding to two hashed text fields, one categorical field, and + one ignored text field. + + Hashed text: (5 hash buckets + 1 length + 1 null indicator) = 7 elements + Categorical: (3 topK + 1 other + 1 null indicator) = 5 elements + Ignored text: (1 length + 1 null indicator) = 2 elements + */ + val featureVectorSize = 2 * (hashSize + 2) + (topKCategorial + 2) + 2 + val firstRes = result.head + firstRes.v.size shouldBe featureVectorSize + + val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) + meta.columns.length shouldBe featureVectorSize + meta.columns.slice(0, 5).forall(_.grouping.contains("categorical")) + meta.columns.slice(5, 10).forall(_.grouping.contains("country")) + meta.columns.slice(10, 15).forall(_.grouping.contains("text")) + meta.columns.slice(15, 18).forall(_.descriptorValue.contains(OpVectorColumnMetadata.TextLenString)) + meta.columns.slice(18, 21).forall(_.indicatorValue.contains(OpVectorColumnMetadata.NullString)) + } + it should "fail with an error" in { val emptyDF = inputData.filter(inputData("text1") === "").toDF() @@ -375,16 +434,25 @@ class SmartTextVectorizerTest } Spec[TextStats] should "aggregate correctly" in { - val l1 = TextStats(Map("hello" -> 1, "world" -> 2)) - val r1 = TextStats(Map("hello" -> 1, "world" -> 1)) - val expected1 = TextStats(Map("hello" -> 2, "world" -> 3)) + val l1 = TextStats(Map("hello" -> 1, "world" -> 2), Map(5 -> 3)) + val r1 = TextStats(Map("hello" -> 1, "world" -> 1), Map(5 -> 2)) + val expected1 = TextStats(Map("hello" -> 2, "world" -> 3), Map(5 -> 5)) - val l2 = TextStats(Map("hello" -> 1, "world" -> 2, "ocean" -> 3)) - val r2 = TextStats(Map("hello" -> 1)) - val expected2 = TextStats(Map("hello" -> 1, "world" -> 2, "ocean" -> 3)) + val l2 = TextStats(Map("hello" -> 1, "world" -> 2, "ocean" -> 3), Map(5 -> 6)) + val r2 = TextStats(Map("hello" -> 1), Map(5 -> 1)) + val expected2 = TextStats(Map("hello" -> 1, 
"world" -> 2, "ocean" -> 3), Map(5 -> 7)) TextStats.monoid(2).plus(l1, r1) shouldBe expected1 TextStats.monoid(2).plus(l2, r2) shouldBe expected2 } + it should "compute correct statistics on the length distributions" in { + val ts = TextStats(Map("hello" -> 2, "joe" -> 2, "woof" -> 1), Map(3 -> 2, 4 -> 1, 5 -> 2)) + + ts.lengthSize shouldBe 5 + ts.lengthMean shouldBe 4.0 + ts.lengthVariance shouldBe 4.0 + ts.lengthStdDev shouldBe 2.0 / math.sqrt(5.0) + } + } diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReaderWriter.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReaderWriter.scala index ef63f5d50c..a23bb3b72e 100644 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReaderWriter.scala +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReaderWriter.scala @@ -211,6 +211,7 @@ trait OpPipelineStageReadWriteFormats { EnumEntrySerializer.json4s[AnyValueTypes](AnyValueTypes) + EnumEntrySerializer.json4s[HashAlgorithm](HashAlgorithm) + EnumEntrySerializer.json4s[HashSpaceStrategy](HashSpaceStrategy) + + EnumEntrySerializer.json4s[TextVectorizationMethod](TextVectorizationMethod) + EnumEntrySerializer.json4s[ScalingType](ScalingType) + EnumEntrySerializer.json4s[TimePeriod](TimePeriod) + EnumEntrySerializer.json4s[FeatureDistributionType](FeatureDistributionType) + diff --git a/features/src/main/scala/com/salesforce/op/stages/impl/feature/TextVectorizationMethod.scala b/features/src/main/scala/com/salesforce/op/stages/impl/feature/TextVectorizationMethod.scala new file mode 100644 index 0000000000..f1ade68105 --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/stages/impl/feature/TextVectorizationMethod.scala @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import enumeratum._ + + +/** + * Methods of vectorizing text (eg. 
to be chosen based on statistics computed in SmartTextVectorizer)
+ */
+sealed trait TextVectorizationMethod extends EnumEntry with Serializable
+
+object TextVectorizationMethod extends Enum[TextVectorizationMethod] {
+  val values = findValues
+  case object Pivot extends TextVectorizationMethod
+  case object Hash extends TextVectorizationMethod
+  case object Ignore extends TextVectorizationMethod
+}
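
Below is a minimal, self-contained Scala sketch (an editorial illustration, not part of the patch series) of the decision rule PATCH 7 adds to SmartTextVectorizer.fitFn. LengthStats and decide are hypothetical stand-ins for the patched TextStats and the match on the two thresholds; the defaults mirror SmartTextVectorizer.MaxCardinality (100) and SmartTextVectorizer.MinTextLengthStdDev (0).

import scala.math.sqrt

// Simplified stand-in for the patched TextStats: a value histogram plus a histogram of value lengths.
final case class LengthStats(valueCounts: Map[String, Long], lengthCounts: Map[Int, Long]) {
  val lengthSize: Long = lengthCounts.values.sum
  val lengthMean: Double = lengthCounts.foldLeft(0.0) { case (acc, (len, n)) => acc + len.toDouble * n } / lengthSize
  // As in the patch, this accumulator holds the raw sum of squared deviations;
  // the division by the count happens inside the square root below.
  val lengthVariance: Double =
    lengthCounts.foldLeft(0.0) { case (acc, (len, n)) => acc + n * (len - lengthMean) * (len - lengthMean) }
  val lengthStdDev: Double = sqrt(lengthVariance / lengthSize)
}

sealed trait Method
case object Pivot extends Method
case object Hash extends Method
case object Ignore extends Method

// Mirrors the match added to SmartTextVectorizer.fitFn: pivot low-cardinality fields,
// ignore high-cardinality fields whose value lengths barely vary (ID-like), hash the rest.
def decide(stats: LengthStats, maxCardinality: Int = 100, minLengthStdDev: Double = 0.0): Method =
  if (stats.valueCounts.size <= maxCardinality) Pivot
  else if (stats.lengthStdDev <= minLengthStdDev) Ignore
  else Hash

// Fixed-width IDs ("040231", "040232", ...) give lengthStdDev == 0, so with minLengthStdDev = 1.0
// (the value used in the new SmartTextVectorizerTest) the field is classified as ignorable.
val ids = (1 to 500).map(i => "%06d".format(40230 + i))
val idStats = LengthStats(
  ids.groupBy(identity).map { case (k, v) => k -> v.size.toLong },
  ids.groupBy(_.length).map { case (k, v) => k -> v.size.toLong }
)
assert(decide(idStats, maxCardinality = 100, minLengthStdDev = 1.0) == Ignore)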