From 52585b6bc58785c7a3427591277a1490ece0d69f Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Fri, 15 Nov 2019 13:30:21 -0800 Subject: [PATCH 1/7] Remove cardinality computation (#438) --- .../op/filters/FeatureDistribution.scala | 27 +++---------------- .../com/salesforce/op/ModelInsightsTest.scala | 15 +++-------- .../op/filters/FeatureDistributionTest.scala | 7 ++--- .../op/filters/FiltersTestData.scala | 12 ++++----- 4 files changed, 16 insertions(+), 45 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala index 7619d25308..1221bbe3c1 100644 --- a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala +++ b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala @@ -64,7 +64,6 @@ case class FeatureDistribution distribution: Array[Double], summaryInfo: Array[Double], moments: Option[Moments] = None, - cardEstimate: Option[TextStats] = None, `type`: FeatureDistributionType = FeatureDistributionType.Training ) extends FeatureDistributionLike { @@ -110,10 +109,9 @@ case class FeatureDistribution val combinedSummaryInfo = if (summaryInfo.length > fd.summaryInfo.length) summaryInfo else fd.summaryInfo val combinedMoments = moments + fd.moments - val combinedCard = cardEstimate + fd.cardEstimate FeatureDistribution(name, key, count + fd.count, nulls + fd.nulls, combinedDist, - combinedSummaryInfo, combinedMoments, combinedCard, `type`) + combinedSummaryInfo, combinedMoments, `type`) } /** @@ -174,14 +172,14 @@ case class FeatureDistribution } override def equals(that: Any): Boolean = that match { - case FeatureDistribution(`name`, `key`, `count`, `nulls`, d, s, m, c, `type`) => + case FeatureDistribution(`name`, `key`, `count`, `nulls`, d, s, m, `type`) => distribution.deep == d.deep && summaryInfo.deep == s.deep && - moments == m && cardEstimate == c + moments == m case _ => false } override def hashCode(): Int = Objects.hashCode(name, key, count, nulls, distribution, - summaryInfo, moments, cardEstimate, `type`) + summaryInfo, moments, `type`) } object FeatureDistribution { @@ -240,7 +238,6 @@ object FeatureDistribution { .getOrElse(1L -> (Array(summary.min, summary.max, summary.sum, summary.count) -> new Array[Double](bins))) val moments = value.map(momentsValues) - val cardEstimate = value.map(cardinalityValues) FeatureDistribution( name = name, @@ -250,7 +247,6 @@ object FeatureDistribution { summaryInfo = summaryInfo, distribution = distribution, moments = moments, - cardEstimate = cardEstimate, `type` = `type` ) } @@ -269,21 +265,6 @@ object FeatureDistribution { MomentsGroup.sum(population.map(x => Moments(x))) } - /** - * Function to track frequency of the first $(MaxCardinality) unique values - * (number for numeric features, token for text features) - * - * @param values values to track distribution / frequency - * @return TextStats object containing a Map from a value to its frequency (histogram) - */ - private def cardinalityValues(values: ProcessedSeq): TextStats = { - val population = values match { - case Left(seq) => seq - case Right(seq) => seq.map(_.toString) - } - TextStats(population.groupBy(identity).map{case (key, value) => (key, value.size)}) - } - /** * Function to put data into histogram of counts * diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 7331b91dd5..abc0b906e3 100644 --- 
a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -166,16 +166,15 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou return Array(descaledsmallCoeff, originalsmallCoeff, descaledbigCoeff, orginalbigCoeff) } - def getFeatureMomentsAndCard(inputModel: FeatureLike[Prediction], - DF: DataFrame): (Map[String, Moments], Map[String, TextStats]) = { + def getFeatureMoments(inputModel: FeatureLike[Prediction], + DF: DataFrame): Map[String, Moments] = { lazy val workFlow = new OpWorkflow().setResultFeatures(inputModel).setInputDataset(DF) lazy val dummyReader = workFlow.getReader() lazy val workFlowRFF = workFlow.withRawFeatureFilter(Some(dummyReader), None) lazy val model = workFlowRFF.train() val insights = model.modelInsights(inputModel) val featureMoments = insights.features.map(f => f.featureName -> f.distributions.head.moments.get).toMap - val featureCardinality = insights.features.map(f => f.featureName -> f.distributions.head.cardEstimate.get).toMap - return (featureMoments, featureCardinality) + return featureMoments } val params = new OpParams() @@ -783,7 +782,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val df = linRegDF._3 val meanTol = 0.01 val varTol = 0.01 - val (moments, cardinality) = getFeatureMomentsAndCard(standardizedLinpred, linRegDF._3) + val moments = getFeatureMoments(standardizedLinpred, linRegDF._3) // Go through each feature and check that the mean, variance, and unique counts match the data moments.foreach { case (featureName, value) => { @@ -794,12 +793,6 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou math.abs((value.variance - expectedVariance) / expectedVariance) < varTol shouldBe true } } - - cardinality.foreach { case (featureName, value) => { - val actualUniques = df.select(featureName).as[Double].collect().toSet - value.valueCounts.keySet.map(_.toDouble).subsetOf(actualUniques) shouldBe true - } - } } it should "return correct insights when a model combiner equal is used as the final feature" in { diff --git a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala index cc8778c278..cc7b8584d0 100644 --- a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala @@ -75,9 +75,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi distribs(3).distribution.sum shouldBe 0 distribs(4).distribution.sum shouldBe 3 distribs(4).summaryInfo.length shouldBe bins - distribs(2).cardEstimate.get shouldBe TextStats(Map("male" -> 1, "female" -> 1)) distribs(2).moments.get shouldBe Moments(2, 5.0, 2.0, 0.0, 2.0) - distribs(4).cardEstimate.get shouldBe TextStats(Map("5.0" -> 1, "1.0" -> 1, "3.0" -> 1)) distribs(4).moments.get shouldBe Moments(3, 3.0, 8.0, 0.0, 32.0) } @@ -202,8 +200,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi it should "marshall to/from json" in { val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty) val fd2 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), - Array.empty, Some(Moments(1.0)), Some(TextStats(Map("foo" -> 1, "bar" ->2))), - FeatureDistributionType.Scoring) + Array.empty, Some(Moments(1.0)), FeatureDistributionType.Scoring) val json = 
FeatureDistribution.toJson(Array(fd1, fd2)) FeatureDistribution.fromJson(json) match { case Success(r) => r shouldBe Seq(fd1, fd2) @@ -213,7 +210,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi it should "marshall to/from json with default vector args" in { val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), - Array.empty, None, None, FeatureDistributionType.Scoring) + Array.empty, None, FeatureDistributionType.Scoring) val fd2 = FeatureDistribution("A", Some("X"), 20, 20, Array(2, 8, 0, 0, 12), Array.empty) val json = """[{"name":"A","count":10,"nulls":1,"distribution":[1.0,4.0,0.0,0.0,6.0],"type":"Scoring"}, diff --git a/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala index 44c9a67871..c667453fb3 100644 --- a/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala +++ b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala @@ -47,16 +47,16 @@ trait FiltersTestData { protected val scoreSummaries = Seq( FeatureDistribution("A", None, 10, 8, Array(1, 4, 0, 0, 6), Array.empty, - None, None, FeatureDistributionType.Scoring), + None, FeatureDistributionType.Scoring), FeatureDistribution("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty, - None, None, FeatureDistributionType.Scoring), + None, FeatureDistributionType.Scoring), FeatureDistribution("C", Some("1"), 10, 1, Array(0, 0, 10, 10, 0), - Array.empty, None, None, FeatureDistributionType.Scoring), + Array.empty, None, FeatureDistributionType.Scoring), FeatureDistribution("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), - Array.empty, None, None, FeatureDistributionType.Scoring), + Array.empty, None, FeatureDistributionType.Scoring), FeatureDistribution("D", Some("1"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty, - None, None, FeatureDistributionType.Scoring), + None, FeatureDistributionType.Scoring), FeatureDistribution("D", Some("2"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty, - None, None, FeatureDistributionType.Scoring) + None, FeatureDistributionType.Scoring) ) } From e45073db415076c2fabb3bdeab8cd1bc07c6d7f0 Mon Sep 17 00:00:00 2001 From: Matthew Tovbin Date: Fri, 15 Nov 2019 13:37:10 -0800 Subject: [PATCH 2/7] Disable GitHub actions --- .github/workflows/main.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index 19068f8345..0000000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: CI - -on: - push: - branches: - - master - pull_request: - branches: - - master - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - name: Set up JDK 1.8 - uses: actions/setup-java@v1 - with: - java-version: 1.8 - - name: Build - run: ./gradlew scalaStyle reportScoverage - - name: Build Helloworld - run: cd helloworld && ./gradlew scalaStyle test From 977848135a9e0144bd57adac8b70a515e6a658ef Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Tue, 3 Dec 2019 12:15:19 -0800 Subject: [PATCH 3/7] Added Chinese and Korean examples to TextTokenizerTest (#442) --- .../impl/feature/TextTokenizerTest.scala | 101 ++++++++++++++++-- 1 file changed, 95 insertions(+), 6 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala index f469dbd617..ab7acc92a3 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTokenizerTest.scala @@ -43,25 +43,33 @@ import org.scalatest.junit.JUnitRunner class TextTokenizerTest extends OpTransformerSpec[TextList, TextTokenizer[Text]] { // scalastyle:off - val (inputData, english, japanese, french) = TestFeatureBuilder( + val (inputData, english, japanese, french, chinese, korean) = TestFeatureBuilder( Seq( ("I've got a lovely bunch of coconuts".toText, "古池や蛙飛び込む水の音".toText, - "Première détection d’une atmosphère autour d’une exoplanète de la taille de la Terre".toText + "Première détection d’une atmosphère autour d’une exoplanète de la taille de la Terre".toText, + "外面的大氣層依緯度成不同的區與帶,在彼此的交界處有湍流和風暴作用著".toText, + "외곽 대기는 위도에 따라 몇가지의 띠들로 눈에 띄게 구분되는데, 서로 상호작용하는 경계선을 따라 발생하는 난류와 폭풍에 의한 것이다".toText ), ("There they are, all standing in a row".toText, "地磁気発生の謎に迫る地球内部の環境、再現実験".toText, - "Les deux commissions, créées respectivement en juin 2016 et janvier 2017".toText + "Les deux commissions, créées respectivement en juin 2016 et janvier 2017".toText, + "理論模型顯示如果木星的質量比現在更大,而不是僅有目前的質量,它將會繼續收縮".toText, + "상층부 대기의 네온은 질량비로 차지하는데".toText ), ("Big ones, small ones, some as big as your head".toText, "初めまして私はケビンです".toText, - "Il publie sa théorie de la relativité restreinte en 1905".toText + "Il publie sa théorie de la relativité restreinte en 1905".toText, + "假設它確實存在,它可能因為現存的熱液態金屬氫與地函混合的對流而萎縮,並且熔融在行星內部的較上層".toText, + "금속성 수소층 위에는 수소로 이루어진 투명한 안쪽 대기가 자리잡고 있다".toText ), ("Big ones, small

<h1> ones</h1>, some as big as your head".toText,
        "初めまして私はケビンです,<h1>初めまして私はケビンです</h1>".toText,
-        "Il <h2 class=\"a\">publie sa théorie de la relativité restreinte en 1905".toText
+        "Il <h2 class=\"a\">publie sa théorie de la relativité restreinte en 1905".toText,
+        "<div>在南半球有一個外觀與大紅斑類似，但較小的大氣特徵出現</div>".toText,
+        "목성의 중심으로부터 목성반경 <b>지점에서</b> 자기권과 태양풍의 <a href=\"www.google.com\">상호작용으로</a> 활꼴 충격파가 발생한다".toText
       ),
-      ("".toText, Text.empty, Text.empty)
+      ("".toText, Text.empty, Text.empty, Text.empty, Text.empty)
     )
   )
   // scalastyle:on
@@ -125,6 +133,7 @@ class TextTokenizerTest extends OpTransformerSpec[TextList, TextTokenizer[Text]]
       List("h2", "clas", "a", "publ", "theo", "relativit", "restreint", "1905").toTextList,
       TextList.empty
     )
+
    val expectedHtml = {
      val copy = expected.toList.toArray
      copy(3) = List("publ", "theo", "relativit", "restreint", "1905").toTextList
      copy
    }
    // scalastyle:on
  }

+  trait Chinese {
+    // scalastyle:off
+    val expectedHtml = Array(
+      "外面的大氣層依緯度成不同的區與帶，在彼此的交界處有湍流和風暴作用著",
+      "理論模型顯示如果木星的質量比現在更大，而不是僅有目前的質量，它將會繼續收縮",
+      "假設它確實存在，它可能因為現存的熱液態金屬氫與地函混合的對流而萎縮，並且熔融在行星內部的較上層",
+      "在南半球有一個外觀與大紅斑類似，但較小的大氣特徵出現",
+      ""
+    ).map(_.sliding(2, 1).filterNot(_.contains("，")).toList.toTextList)
+
+    val expected = {
+      val copy = expectedHtml.clone()
+      copy(3) = (Seq("div") ++ expectedHtml(3).value ++ Seq("div")).toTextList
+      copy
+    }
+    // scalastyle:on
+  }
+
+  trait Korean {
+    // scalastyle:off
+    val expectedHtml = Array(
+      "외곽 대기는 위도에 따라 몇가지의 띠들로 눈에 띄게 구분되는데, 서로 상호작용하는 경계선을 따라 발생하는 난류와 폭풍에 의한 것이다",
+      "상층부 대기의 네온은 질량비로 차지하는데",
+      "금속성 수소층 위에는 수소로 이루어진 투명한 안쪽 대기가 자리잡고 있다",
+      "목성의 중심으로부터 목성반경 지점에서 자기권과 태양풍의 상호작용으로 활꼴 충격파가 발생한다",
+      ""
+    ).map(_.sliding(2, 1).filterNot(s => s.contains(" ") || s.contains(",")).toList.toTextList)
+
+    println(expectedHtml.toList)
+
+    val expected = {
+      val copy = expectedHtml.clone()
+      copy(3) = List("목성", "성의", "중심", "심으", "으로", "로부", "부터", "목성", "성반", "반경", "b", "지점", "점에",
+        "에서", "b", "자기", "기권", "권과", "태양", "양풍", "풍의", "href", "www.google.com", "상호", "호작", "작용",
+        "용으", "으로", "활꼴", "충격", "격파", "파가", "발생", "생한", "한다").toTextList
+      copy
+    }
+    // scalastyle:on
+  }
+
  it should "tokenize text correctly [English]" in new English {
    assertTextTokenizer(
      input = english, expected = expected,
@@ -151,6 +200,19 @@ class TextTokenizerTest extends OpTransformerSpec[TextList, TextTokenizer[Text]]
      tokenizer = new TextTokenizer[Text]().setDefaultLanguage(Language.French)
    )
  }
+  it should "tokenize text correctly [Chinese (Simplified)]" in new Chinese {
+    assertTextTokenizer(
+      input = chinese, expected = expected,
+      tokenizer = new TextTokenizer[Text]().setDefaultLanguage(Language.SimplifiedChinese)
+    )
+  }
+  it should "tokenize text correctly [Korean]" in new Korean {
+    assertTextTokenizer(
+      input = korean, expected = expected,
+      tokenizer = new TextTokenizer[Text]().setDefaultLanguage(Language.Korean)
+    )
+  }
+
  it should "strip html tags and tokenize text correctly [English]" in new English {
    assertTextTokenizer(
      input = english, expected = expectedHtml,
@@ -172,6 +234,21 @@ class TextTokenizerTest extends OpTransformerSpec[TextList, TextTokenizer[Text]]
        .setDefaultLanguage(Language.French)
    )
  }
+  it should "strip html tags and tokenize text correctly [Chinese (Simplified)]" in new Chinese {
+    assertTextTokenizer(
+      input = chinese, expected = expectedHtml,
+      tokenizer = new TextTokenizer[Text](analyzer = TextTokenizer.AnalyzerHtmlStrip)
+        .setDefaultLanguage(Language.SimplifiedChinese)
+    )
+  }
+  it should "strip html tags and tokenize text correctly [Korean]" in new Korean {
+    assertTextTokenizer(
+      input = korean, expected = expectedHtml,
+      tokenizer = new TextTokenizer[Text](analyzer = TextTokenizer.AnalyzerHtmlStrip)
+        .setDefaultLanguage(Language.Korean)
+    )
+  }
+
  it should "auto detect languages and tokenize accordingly [English]" in new English {
    assertTextTokenizer(
      input = english, expected = expected,
@@ -190,6 +267,18 @@ class TextTokenizerTest extends OpTransformerSpec[TextList, TextTokenizer[Text]]
      tokenizer = new TextTokenizer[Text]().setAutoDetectLanguage(true)
    )
  }
+  it should "auto detect languages and tokenize accordingly [Chinese (Simplified)]" in new Chinese {
+    assertTextTokenizer(
+      input = chinese, expected = expected,
+      tokenizer = new TextTokenizer[Text]().setAutoDetectLanguage(true)
+    )
+  }
+  it should "auto detect languages and tokenize accordingly [Korean]" in new Korean {
+    assertTextTokenizer(
+      input = korean, expected = expected,
+      tokenizer = new TextTokenizer[Text]().setAutoDetectLanguage(true)
+    )
+  }
  it should "work as a shortcut" in {
    val tokenized = english.tokenize()
    assertTextTokenizer(

From 8c6110bc5e72481b6276603462f9fa8aa649c3bb Mon Sep 17 00:00:00 2001
From: Matthew Tovbin
Date: Thu, 5 Dec 2019 21:39:15 -0800
Subject: [PATCH 4/7] Update documentation

Fixes https://github.com/salesforce/TransmogrifAI/issues/425
---
 .../main/scala/com/salesforce/op/dsl/RichNumericFeature.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala
index 62bd04644c..c4cfa6f224 100644
--- a/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala
+++ b/core/src/main/scala/com/salesforce/op/dsl/RichNumericFeature.scala
@@ -215,14 +215,14 @@ trait RichNumericFeature {
    f.transformWith(new SqrtTransformer[I]())

  /**
-   * Square root transformer
+   * Log transformer
   * @return transformed feature
   */
  def log(base: Double): FeatureLike[Real] =
    f.transformWith(new LogTransformer[I](base = base))

  /**
-   * Square root transformer
+   * Power transformer
   * @return transformed feature
   */
  def power(power: Double): FeatureLike[Real] =

From 3d51d5defcd1005bf255b12fac55003e1c942f11 Mon Sep 17 00:00:00 2001
From: Winters Lu
Date: Wed, 18 Dec 2019 10:48:35 -0800
Subject: [PATCH 5/7] Skip serializing cardinality estimates in FeatureDistributions (#447)

---
 .../op/filters/FeatureDistribution.scala | 36 +++++++++++++++----
 .../com/salesforce/op/ModelInsightsTest.scala | 28 ++++++++-------
 .../op/filters/FeatureDistributionTest.scala | 28 +++++++++++++--
 .../op/filters/FiltersTestData.scala | 12 +++----
 4 files changed, 78 insertions(+), 26 deletions(-)

diff --git a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
index 1221bbe3c1..20e4da6511 100644
--- a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
+++ b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
@@ -40,7 +40,7 @@ import com.twitter.algebird._
 import com.twitter.algebird.Operators._
 import org.apache.spark.mllib.feature.HashingTF
 import org.json4s.jackson.Serialization
-import org.json4s.{DefaultFormats, Formats}
+import org.json4s.{DefaultFormats, FieldSerializer, Formats}

 import scala.util.Try

@@ -64,6 +64,7 @@ case class FeatureDistribution
   distribution: Array[Double],
   summaryInfo: Array[Double],
   moments: Option[Moments] = None,
+  cardEstimate: Option[TextStats] = None,
   `type`: FeatureDistributionType = FeatureDistributionType.Training
 ) extends FeatureDistributionLike {

@@ -109,9 +110,10 @@ case class FeatureDistribution
     val combinedSummaryInfo = if (summaryInfo.length > fd.summaryInfo.length) summaryInfo else fd.summaryInfo
     val
combinedMoments = moments + fd.moments + val combinedCard = cardEstimate + fd.cardEstimate FeatureDistribution(name, key, count + fd.count, nulls + fd.nulls, combinedDist, - combinedSummaryInfo, combinedMoments, `type`) + combinedSummaryInfo, combinedMoments, combinedCard, `type`) } /** @@ -172,14 +174,14 @@ case class FeatureDistribution } override def equals(that: Any): Boolean = that match { - case FeatureDistribution(`name`, `key`, `count`, `nulls`, d, s, m, `type`) => + case FeatureDistribution(`name`, `key`, `count`, `nulls`, d, s, m, c, `type`) => distribution.deep == d.deep && summaryInfo.deep == s.deep && - moments == m + moments == m && cardEstimate == c case _ => false } override def hashCode(): Int = Objects.hashCode(name, key, count, nulls, distribution, - summaryInfo, moments, `type`) + summaryInfo, moments, cardEstimate, `type`) } object FeatureDistribution { @@ -190,8 +192,13 @@ object FeatureDistribution { override def plus(l: FeatureDistribution, r: FeatureDistribution): FeatureDistribution = l.reduce(r) } + val FeatureDistributionSerializer = FieldSerializer[FeatureDistribution]( + FieldSerializer.ignore("cardEstimate") + ) + implicit val formats: Formats = DefaultFormats + - EnumEntrySerializer.json4s[FeatureDistributionType](FeatureDistributionType) + EnumEntrySerializer.json4s[FeatureDistributionType](FeatureDistributionType) + + FeatureDistributionSerializer /** * Feature distributions to json @@ -238,6 +245,7 @@ object FeatureDistribution { .getOrElse(1L -> (Array(summary.min, summary.max, summary.sum, summary.count) -> new Array[Double](bins))) val moments = value.map(momentsValues) + val cardEstimate = value.map(cardinalityValues) FeatureDistribution( name = name, @@ -247,6 +255,7 @@ object FeatureDistribution { summaryInfo = summaryInfo, distribution = distribution, moments = moments, + cardEstimate = cardEstimate, `type` = `type` ) } @@ -265,6 +274,21 @@ object FeatureDistribution { MomentsGroup.sum(population.map(x => Moments(x))) } + /** + * Function to track frequency of the first $(MaxCardinality) unique values + * (number for numeric features, token for text features) + * + * @param values values to track distribution / frequency + * @return TextStats object containing a Map from a value to its frequency (histogram) + */ + private def cardinalityValues(values: ProcessedSeq): TextStats = { + TextStats(countStringValues(values.left.getOrElse(values.right.get))) + } + + private def countStringValues[T](seq: Seq[T]): Map[String, Int] = { + seq.groupBy(identity).map { case (k, valSeq) => k.toString -> valSeq.size } + } + /** * Function to put data into histogram of counts * diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index abc0b906e3..d1fa503188 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -31,30 +31,28 @@ package com.salesforce.op import com.salesforce.op.evaluators._ -import com.salesforce.op.features.types._ +import com.salesforce.op.features.types.{Real, _} import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike} import com.salesforce.op.filters._ import com.salesforce.op.stages.impl.classification._ +import com.salesforce.op.stages.impl.feature.{CombinationStrategy, TextStats} import com.salesforce.op.stages.impl.preparators._ import com.salesforce.op.stages.impl.regression.{OpLinearRegression, OpXGBoostRegressor, RegressionModelSelector} 
import com.salesforce.op.stages.impl.selector.ModelSelectorNames.EstimatorType -import com.salesforce.op.stages.impl.selector.{SelectedModelCombiner, SelectedCombinerModel, SelectedModel} import com.salesforce.op.stages.impl.selector.ValidationType._ +import com.salesforce.op.stages.impl.selector.{SelectedCombinerModel, SelectedModel, SelectedModelCombiner} import com.salesforce.op.stages.impl.tuning.{DataCutter, DataSplitter} import com.salesforce.op.test.{PassengerSparkFixtureTest, TestFeatureBuilder} import com.salesforce.op.testkit.RandomReal import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} +import com.twitter.algebird.Moments import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tuning.ParamGridBuilder +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions._ import org.junit.runner.RunWith -import com.salesforce.op.features.types.Real -import com.salesforce.op.stages.impl.feature.{CombinationStrategy, TextStats} -import com.twitter.algebird.Moments -import org.apache.spark.sql.{DataFrame, Dataset} -import org.scalactic.Equality import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner -import org.apache.spark.sql.functions._ import scala.util.{Failure, Success} @@ -166,15 +164,16 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou return Array(descaledsmallCoeff, originalsmallCoeff, descaledbigCoeff, orginalbigCoeff) } - def getFeatureMoments(inputModel: FeatureLike[Prediction], - DF: DataFrame): Map[String, Moments] = { + def getFeatureMomentsAndCard(inputModel: FeatureLike[Prediction], + DF: DataFrame): (Map[String, Moments], Map[String, TextStats]) = { lazy val workFlow = new OpWorkflow().setResultFeatures(inputModel).setInputDataset(DF) lazy val dummyReader = workFlow.getReader() lazy val workFlowRFF = workFlow.withRawFeatureFilter(Some(dummyReader), None) lazy val model = workFlowRFF.train() val insights = model.modelInsights(inputModel) val featureMoments = insights.features.map(f => f.featureName -> f.distributions.head.moments.get).toMap - return featureMoments + val featureCardinality = insights.features.map(f => f.featureName -> f.distributions.head.cardEstimate.get).toMap + featureMoments -> featureCardinality } val params = new OpParams() @@ -782,7 +781,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou val df = linRegDF._3 val meanTol = 0.01 val varTol = 0.01 - val moments = getFeatureMoments(standardizedLinpred, linRegDF._3) + val (moments, cardinality) = getFeatureMomentsAndCard(standardizedLinpred, linRegDF._3) // Go through each feature and check that the mean, variance, and unique counts match the data moments.foreach { case (featureName, value) => { @@ -793,6 +792,11 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou math.abs((value.variance - expectedVariance) / expectedVariance) < varTol shouldBe true } } + + cardinality.foreach { case (featureName, value) => + val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet + actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble) + } } it should "return correct insights when a model combiner equal is used as the final feature" in { diff --git a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala index cc7b8584d0..086284e20b 100644 --- 
a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala @@ -34,7 +34,10 @@ import com.salesforce.op.features.{FeatureDistributionType, TransientFeature} import com.salesforce.op.stages.impl.feature.TextStats import com.salesforce.op.test.PassengerSparkFixtureTest import com.salesforce.op.testkit.RandomText +import com.salesforce.op.utils.json.EnumEntrySerializer import com.twitter.algebird.Moments +import org.json4s.DefaultFormats +import org.json4s.jackson.Serialization import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @@ -75,7 +78,9 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi distribs(3).distribution.sum shouldBe 0 distribs(4).distribution.sum shouldBe 3 distribs(4).summaryInfo.length shouldBe bins + distribs(2).cardEstimate.get shouldBe TextStats(Map("male" -> 1, "female" -> 1)) distribs(2).moments.get shouldBe Moments(2, 5.0, 2.0, 0.0, 2.0) + distribs(4).cardEstimate.get shouldBe TextStats(Map("5.0" -> 1, "1.0" -> 1, "3.0" -> 1)) distribs(4).moments.get shouldBe Moments(3, 3.0, 8.0, 0.0, 32.0) } @@ -200,7 +205,8 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi it should "marshall to/from json" in { val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty) val fd2 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), - Array.empty, Some(Moments(1.0)), FeatureDistributionType.Scoring) + Array.empty, Some(Moments(1.0)), Option.empty, + FeatureDistributionType.Scoring) val json = FeatureDistribution.toJson(Array(fd1, fd2)) FeatureDistribution.fromJson(json) match { case Success(r) => r shouldBe Seq(fd1, fd2) @@ -210,7 +216,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi it should "marshall to/from json with default vector args" in { val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), - Array.empty, None, FeatureDistributionType.Scoring) + Array.empty, None, None, FeatureDistributionType.Scoring) val fd2 = FeatureDistribution("A", Some("X"), 20, 20, Array(2, 8, 0, 0, 12), Array.empty) val json = """[{"name":"A","count":10,"nulls":1,"distribution":[1.0,4.0,0.0,0.0,6.0],"type":"Scoring"}, @@ -238,4 +244,22 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi intercept[IllegalArgumentException](fd1.jsDivergence(fd1.copy(name = "boo"))) should have message "requirement failed: Name must match to compare or combine feature distributions: A != boo" } + + it should "not serialize cardEstimate field" in { + val cardEstimate = "cardEstimate" + val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), + Array.empty, Some(Moments(1.0)), Some(TextStats(Map("foo" -> 1, "bar" ->2))), + FeatureDistributionType.Scoring) + val featureDistributions = Seq(fd1, fd1.copy(cardEstimate = None)) + + FeatureDistribution.toJson(featureDistributions) shouldNot include (cardEstimate) + + // deserialization from json with and without cardEstimate works + val jsonWithCardEstimate = Serialization.write(featureDistributions)(DefaultFormats + + EnumEntrySerializer.json4s[FeatureDistributionType](FeatureDistributionType)) + jsonWithCardEstimate should fullyMatch regex Seq(cardEstimate).mkString(".*", ".*", ".*") + jsonWithCardEstimate shouldNot fullyMatch regex Seq.fill(2)(cardEstimate).mkString(".*", ".*", ".*") + + FeatureDistribution.fromJson(jsonWithCardEstimate) 
shouldBe Success(featureDistributions)
  }
}
diff --git a/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala
index c667453fb3..44c9a67871 100644
--- a/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala
+++ b/core/src/test/scala/com/salesforce/op/filters/FiltersTestData.scala
@@ -47,16 +47,16 @@ trait FiltersTestData {
   protected val scoreSummaries = Seq(
     FeatureDistribution("A", None, 10, 8, Array(1, 4, 0, 0, 6), Array.empty,
-      None, FeatureDistributionType.Scoring),
+      None, None, FeatureDistributionType.Scoring),
     FeatureDistribution("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty,
-      None, FeatureDistributionType.Scoring),
+      None, None, FeatureDistributionType.Scoring),
     FeatureDistribution("C", Some("1"), 10, 1, Array(0, 0, 10, 10, 0),
-      Array.empty, None, FeatureDistributionType.Scoring),
+      Array.empty, None, None, FeatureDistributionType.Scoring),
     FeatureDistribution("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12),
-      Array.empty, None, FeatureDistributionType.Scoring),
+      Array.empty, None, None, FeatureDistributionType.Scoring),
     FeatureDistribution("D", Some("1"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty,
-      None, FeatureDistributionType.Scoring),
+      None, None, FeatureDistributionType.Scoring),
     FeatureDistribution("D", Some("2"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty,
-      None, FeatureDistributionType.Scoring)
+      None, None, FeatureDistributionType.Scoring)
   )
 }

From bad14d53fffaa850d0103800628e9aeab66a4a1f Mon Sep 17 00:00:00 2001
From: Matthew Tovbin
Date: Tue, 24 Dec 2019 08:18:53 -0800
Subject: [PATCH 6/7] Remove extra >

---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 07e102ccfd..59cc8a4799 100644
--- a/pom.xml
+++ b/pom.xml
@@ -256,7 +256,7 @@
       <artifactId>spark-sql_2.11</artifactId>
       <version>2.3.2</version>
       <scope>compile</scope>
-    </dependency>>
+    </dependency>
     <dependency>
       <groupId>org.apache.avro</groupId>
       <artifactId>avro</artifactId>

From a6aceb60624c9db50ffc92c135e303e4f6a9c0c9 Mon Sep 17 00:00:00 2001
From: Kevin Moore
Date: Tue, 7 Jan 2020 16:37:01 -0800
Subject: [PATCH 7/7] Add support for ignoring text that looks like IDs in SmartTextVectorizer (#448)

---
 .../op/filters/FeatureDistribution.scala | 6 +-
 .../impl/feature/SmartTextMapVectorizer.scala | 3 +-
 .../impl/feature/SmartTextVectorizer.scala | 145 +++++++++++++-----
 .../op/filters/FeatureDistributionTest.scala | 6 +-
 .../feature/SmartTextMapVectorizerTest.scala | 18 +--
 .../feature/SmartTextVectorizerTest.scala | 82 +++++++++-
 .../stages/OpPipelineStageReaderWriter.scala | 1 +
 .../feature/TextVectorizationMethod.scala | 46 ++++++
 8 files changed, 242 insertions(+), 65 deletions(-)
 create mode 100644 features/src/main/scala/com/salesforce/op/stages/impl/feature/TextVectorizationMethod.scala

diff --git a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
index 20e4da6511..42b17f760e 100644
--- a/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
+++ b/core/src/main/scala/com/salesforce/op/filters/FeatureDistribution.scala
@@ -282,11 +282,11 @@ object FeatureDistribution {
    * @return TextStats object containing a Map from a value to its frequency (histogram)
    */
   private def cardinalityValues(values: ProcessedSeq): TextStats = {
-    TextStats(countStringValues(values.left.getOrElse(values.right.get)))
+    TextStats(countStringValues(values.left.getOrElse(values.right.get)), Map.empty)
   }

-  private def countStringValues[T](seq: Seq[T]): Map[String, Int] = {
-    seq.groupBy(identity).map { case (k,
valSeq) => k.toString -> valSeq.size } + private def countStringValues[T](seq: Seq[T]): Map[String, Long] = { + seq.groupBy(identity).map { case (k, valSeq) => k.toString -> valSeq.size.toLong } } /** diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala index 99e82fc00d..f149f5abba 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala @@ -72,7 +72,8 @@ class SmartTextMapVectorizer[T <: OPMap[String]] textMap: T#Value, shouldCleanKeys: Boolean, shouldCleanValues: Boolean ): TextMapStats = { val keyValueCounts = textMap.map{ case (k, v) => - cleanTextFn(k, shouldCleanKeys) -> TextStats(Map(cleanTextFn(v, shouldCleanValues) -> 1)) + cleanTextFn(k, shouldCleanKeys) -> + TextStats(Map(cleanTextFn(v, shouldCleanValues) -> 1), Map(cleanTextFn(v, shouldCleanValues).length -> 1)) } TextMapStats(keyValueCounts) } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala index ca46a52b56..e95939d249 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala @@ -62,7 +62,7 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( extends SequenceEstimator[T, OPVector](operationName = "smartTxtVec", uid = uid) with PivotParams with CleanTextFun with SaveOthersParams with TrackNullsParam with MinSupportParam with TextTokenizerParams with TrackTextLenParam - with HashingVectorizerParams with HashingFun with OneHotFun with MaxCardinalityParams { + with HashingVectorizerParams with HashingFun with OneHotFun with MaxCardinalityParams with MinLengthStdDevParams { private implicit val textStatsSeqEnc: Encoder[Array[TextStats]] = ExpressionEncoder[Array[TextStats]]() @@ -81,23 +81,28 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( require(!dataset.isEmpty, "Input dataset cannot be empty") val maxCard = $(maxCardinality) + val minLenStdDev = $(minLengthStdDev) val shouldCleanText = $(cleanText) implicit val testStatsMonoid: Semigroup[TextStats] = TextStats.monoid(maxCard) val valueStats: Dataset[Array[TextStats]] = dataset.map(_.map(computeTextStats(_, shouldCleanText)).toArray) val aggregatedStats: Array[TextStats] = valueStats.reduce(_ + _) - val (isCategorical, topValues) = aggregatedStats.map { stats => - val isCategorical = stats.valueCounts.size <= maxCard + val (vectorizationMethods, topValues) = aggregatedStats.map { stats => + val vecMethod: TextVectorizationMethod = stats match { + case _ if stats.valueCounts.size <= maxCard => TextVectorizationMethod.Pivot + case _ if stats.lengthStdDev <= minLenStdDev => TextVectorizationMethod.Ignore + case _ => TextVectorizationMethod.Hash + } val topValues = stats.valueCounts .filter { case (_, count) => count >= $(minSupport) } .toSeq.sortBy(v => -v._2 -> v._1) .take($(topK)).map(_._1) - isCategorical -> topValues + (vecMethod, topValues) }.unzip val smartTextParams = SmartTextVectorizerModelArgs( - isCategorical = isCategorical, + vectorizationMethods = vectorizationMethods, topValues = topValues, shouldCleanText = shouldCleanText, shouldTrackNulls = $(trackNulls), @@ -117,36 
+122,40 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( } private def computeTextStats(text: T#Value, shouldCleanText: Boolean): TextStats = { - val valueCounts = text match { - case Some(v) => Map(cleanTextFn(v, shouldCleanText) -> 1) - case None => Map.empty[String, Int] + val (valueCounts, lengthCounts) = text match { + case Some(v) => (Map(cleanTextFn(v, shouldCleanText) -> 1L), Map(cleanTextFn(v, shouldCleanText).length -> 1L)) + case None => (Map.empty[String, Long], Map.empty[Int, Long]) } - TextStats(valueCounts) + TextStats(valueCounts, lengthCounts) } private def makeVectorMetadata(smartTextParams: SmartTextVectorizerModelArgs): OpVectorMetadata = { - require(inN.length == smartTextParams.isCategorical.length) + require(inN.length == smartTextParams.vectorizationMethods.length) - val (categoricalFeatures, textFeatures) = - SmartTextVectorizer.partition[TransientFeature](inN, smartTextParams.isCategorical) + val groups = inN.toArray.zip(smartTextParams.vectorizationMethods).groupBy(_._2) + val textToPivot = groups.getOrElse(TextVectorizationMethod.Pivot, Array.empty).map(_._1) + val textToIgnore = groups.getOrElse(TextVectorizationMethod.Ignore, Array.empty).map(_._1) + val textToHash = groups.getOrElse(TextVectorizationMethod.Hash, Array.empty).map(_._1) + val allTextFeatures = textToHash ++ textToIgnore // build metadata describing output val shouldTrackNulls = $(trackNulls) val shouldTrackLen = $(trackTextLen) val unseen = Option($(unseenName)) - val categoricalColumns = if (categoricalFeatures.nonEmpty) { - makeVectorColumnMetadata(shouldTrackNulls, unseen, smartTextParams.categoricalTopValues, categoricalFeatures) + val categoricalColumns = if (textToPivot.nonEmpty) { + makeVectorColumnMetadata(shouldTrackNulls, unseen, smartTextParams.categoricalTopValues, textToPivot) } else Array.empty[OpVectorColumnMetadata] - val textColumns = if (textFeatures.nonEmpty) { + + val textColumns = if (allTextFeatures.nonEmpty) { if (shouldTrackLen) { - makeVectorColumnMetadata(textFeatures, makeHashingParams()) ++ - textFeatures.map(_.toColumnMetaData(descriptorValue = OpVectorColumnMetadata.TextLenString)) ++ - textFeatures.map(_.toColumnMetaData(isNull = true)) + makeVectorColumnMetadata(textToHash, makeHashingParams()) ++ + allTextFeatures.map(_.toColumnMetaData(descriptorValue = OpVectorColumnMetadata.TextLenString)) ++ + allTextFeatures.map(_.toColumnMetaData(isNull = true)) } else { - makeVectorColumnMetadata(textFeatures, makeHashingParams()) ++ - textFeatures.map(_.toColumnMetaData(isNull = true)) + makeVectorColumnMetadata(textToHash, makeHashingParams()) ++ + allTextFeatures.map(_.toColumnMetaData(isNull = true)) } } else Array.empty[OpVectorColumnMetadata] @@ -156,53 +165,84 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( } object SmartTextVectorizer { - val MaxCardinality = 100 + val MaxCardinality: Int = 100 + val MinTextLengthStdDev: Double = 0 private[op] def partition[T: ClassTag](input: Array[T], condition: Array[Boolean]): (Array[T], Array[T]) = { val all = input.zip(condition) - (all.collect { case (item, true) => item }.toSeq.toArray, all.collect { case (item, false) => item }.toSeq.toArray) + (all.collect { case (item, true) => item }, all.collect { case (item, false) => item }) } } /** * Summary statistics of a text feature * - * @param valueCounts counts of feature values + * @param valueCounts counts of feature values + * @param lengthCounts counts of token lengths */ -private[op] case class 
TextStats(valueCounts: Map[String, Int]) extends JsonLike
+private[op] case class TextStats(
+  valueCounts: Map[String, Long],
+  lengthCounts: Map[Int, Long]
+) extends JsonLike {
+
+  val lengthSize = lengthCounts.values.sum
+  val lengthMean: Double = lengthCounts.foldLeft(0.0)((acc, el) => acc + el._1 * el._2) / lengthSize
+  val lengthVariance: Double = lengthCounts.foldLeft(0.0)(
+    (acc, el) => acc + el._2 * (el._1 - lengthMean) * (el._1 - lengthMean)
+  )
+  val lengthStdDev: Double = math.sqrt(lengthVariance / lengthSize)
+}

 private[op] object TextStats {
+  /**
+   * Helper function to add two maps subject to a max cardinality restriction on the number of unique values
+   *
+   * @param totalMap Current accumulated map
+   * @param mapToAdd Additional map to add to the accumulated one
+   * @param maxCardinality Maximum number of unique keys to keep track of (stop counting once this is hit)
+   * @tparam T Type parameter for the keys
+   * @return Newly accumulated map subject to the key cardinality constraints
+   */
+  def additionHelper[T](totalMap: Map[T, Long], mapToAdd: Map[T, Long], maxCardinality: Int): Map[T, Long] = {
+    if (totalMap.size > maxCardinality) totalMap
+    else if (mapToAdd.size > maxCardinality) mapToAdd
+    else totalMap + mapToAdd
+  }
+
   def monoid(maxCardinality: Int): Monoid[TextStats] = new Monoid[TextStats] {
     override def plus(l: TextStats, r: TextStats): TextStats = {
-      if (l.valueCounts.size > maxCardinality) l
-      else if (r.valueCounts.size > maxCardinality) r
-      else TextStats(l.valueCounts + r.valueCounts)
+      val newValueCounts = additionHelper(l.valueCounts, r.valueCounts, maxCardinality)
+      val newLengthCounts = additionHelper(l.lengthCounts, r.lengthCounts, maxCardinality)
+      TextStats(newValueCounts, newLengthCounts)
     }
     override def zero: TextStats = TextStats.empty
   }

-  def empty: TextStats = TextStats(Map.empty)
+  def empty: TextStats = TextStats(Map.empty, Map.empty)
 }

 /**
  * Arguments for [[SmartTextVectorizerModel]]
  *
- * @param isCategorical is feature a categorical or not
- * @param topValues top values to each feature
- * @param shouldCleanText should clean text value
- * @param shouldTrackNulls should track nulls
- * @param hashingParams hashing function params
+ * @param vectorizationMethods method to use for text vectorization (either pivot, hashing, or ignoring)
+ * @param topValues top values to each feature
+ * @param shouldCleanText should clean text value
+ * @param shouldTrackNulls should track nulls
+ * @param hashingParams hashing function params
  */
 case class SmartTextVectorizerModelArgs
 (
-  isCategorical: Array[Boolean],
+  vectorizationMethods: Array[TextVectorizationMethod],
   topValues: Array[Seq[String]],
   shouldCleanText: Boolean,
   shouldTrackNulls: Boolean,
   hashingParams: HashingFunctionParams
 ) extends JsonLike {
-  def categoricalTopValues: Array[Seq[String]] =
-    topValues.zip(isCategorical).collect { case (top, true) => top }
+  def categoricalTopValues: Array[Seq[String]] = {
+    topValues.zip(vectorizationMethods.map(_ == TextVectorizationMethod.Pivot)).collect { case (top, true) => top }
+  }
 }

 final class SmartTextVectorizerModel[T <: Text] private[op]
@@ -222,19 +262,28 @@ final class SmartTextVectorizerModel[T <: Text] private[op]
       shouldTrackNulls = args.shouldTrackNulls
     )
     (row: Seq[Text]) => {
-      val (rowCategorical, rowText) = SmartTextVectorizer.partition[Text](row.toArray, args.isCategorical)
-      val categoricalVector: OPVector = categoricalPivotFn(rowCategorical)
-      val textTokens: Seq[TextList] = rowText.map(tokenize(_).tokens)
+      val groups = row.toArray.zip(args.vectorizationMethods).groupBy(_._2)
+      val textToPivot = groups.getOrElse(TextVectorizationMethod.Pivot, Array.empty).map(_._1)
+      val textToIgnore = groups.getOrElse(TextVectorizationMethod.Ignore, Array.empty).map(_._1)
+      val textToHash = groups.getOrElse(TextVectorizationMethod.Hash, Array.empty).map(_._1)
+
+      val categoricalVector: OPVector = categoricalPivotFn(textToPivot)
+      val textTokens: Seq[TextList] = textToHash.map(tokenize(_).tokens)
+      val ignorableTextTokens: Seq[TextList] = textToIgnore.map(tokenize(_).tokens)
       val textVector: OPVector = hash[TextList](textTokens, getTextTransientFeatures, args.hashingParams)
-      val textNullIndicatorsVector = if (args.shouldTrackNulls) getNullIndicatorsVector(textTokens) else OPVector.empty
-      val textLenVector = if ($(trackTextLen)) getLenVector(textTokens) else OPVector.empty
+      val textNullIndicatorsVector = if (args.shouldTrackNulls) {
+        getNullIndicatorsVector(textTokens ++ ignorableTextTokens)
+      } else OPVector.empty
+      val textLenVector = if ($(trackTextLen)) getLenVector(textTokens ++ ignorableTextTokens) else OPVector.empty

       categoricalVector.combine(textVector, textLenVector, textNullIndicatorsVector)
     }
   }

   private def getTextTransientFeatures: Array[TransientFeature] =
-    SmartTextVectorizer.partition[TransientFeature](getTransientFeatures(), args.isCategorical)._2
+    getTransientFeatures().zip(args.vectorizationMethods).collect {
+      case (tf, method) if method != TextVectorizationMethod.Pivot => tf
+    }

   private def getNullIndicatorsVector(textTokens: Seq[TextList]): OPVector = {
     val nullIndicators = textTokens.map { tokens =>
@@ -261,3 +310,15 @@ trait MaxCardinalityParams extends Params {
   final def getMaxCardinality: Int = $(maxCardinality)
   setDefault(maxCardinality -> SmartTextVectorizer.MaxCardinality)
 }
+
+trait MinLengthStdDevParams extends Params {
+  final val minLengthStdDev = new DoubleParam(
+    parent = this, name = "minLengthStdDev",
+    doc = "minimum standard deviation of the lengths of tokens in a text field for it to be hashed instead " +
+      "of ignored",
+    isValid = ParamValidators.inRange(lowerBound = 0, upperBound = 100)
+  )
+  final def setMinLengthStdDev(v: Double): this.type = set(minLengthStdDev, v)
+  final def getMinLengthStdDev: Double = $(minLengthStdDev)
+
setDefault(minLengthStdDev -> SmartTextVectorizer.MinTextLengthStdDev) +} diff --git a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala index 086284e20b..888e1de836 100644 --- a/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala +++ b/core/src/test/scala/com/salesforce/op/filters/FeatureDistributionTest.scala @@ -78,9 +78,9 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi distribs(3).distribution.sum shouldBe 0 distribs(4).distribution.sum shouldBe 3 distribs(4).summaryInfo.length shouldBe bins - distribs(2).cardEstimate.get shouldBe TextStats(Map("male" -> 1, "female" -> 1)) + distribs(2).cardEstimate.get shouldBe TextStats(Map("male" -> 1, "female" -> 1), Map.empty) distribs(2).moments.get shouldBe Moments(2, 5.0, 2.0, 0.0, 2.0) - distribs(4).cardEstimate.get shouldBe TextStats(Map("5.0" -> 1, "1.0" -> 1, "3.0" -> 1)) + distribs(4).cardEstimate.get shouldBe TextStats(Map("5.0" -> 1, "1.0" -> 1, "3.0" -> 1), Map.empty) distribs(4).moments.get shouldBe Moments(3, 3.0, 8.0, 0.0, 32.0) } @@ -248,7 +248,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi it should "not serialize cardEstimate field" in { val cardEstimate = "cardEstimate" val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), - Array.empty, Some(Moments(1.0)), Some(TextStats(Map("foo" -> 1, "bar" ->2))), + Array.empty, Some(Moments(1.0)), Some(TextStats(Map("foo" -> 1, "bar" ->2), Map.empty)), FeatureDistributionType.Scoring) val featureDistributions = Seq(fd1, fd1.copy(cardEstimate = None)) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index bbf2cb8035..5584ea1565 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -94,22 +94,22 @@ class SmartTextMapVectorizerTest Spec[TextMapStats] should "provide a proper semigroup" in { val data = Seq( TextMapStats(Map( - "f1" -> TextStats(Map("hello" -> 2, "world" -> 1)), - "f2" -> TextStats(Map("hello" -> 2, "ocean" -> 2)), - "f3" -> TextStats(Map("foo" -> 1)) + "f1" -> TextStats(Map("hello" -> 2, "world" -> 1), Map(5 -> 3)), + "f2" -> TextStats(Map("hello" -> 2, "ocean" -> 2), Map(5 -> 4)), + "f3" -> TextStats(Map("foo" -> 1), Map(3 -> 1)) )), TextMapStats(Map( - "f1" -> TextStats(Map("hello" -> 1)), - "f2" -> TextStats(Map("ocean" -> 1, "other" -> 5)) + "f1" -> TextStats(Map("hello" -> 1), Map(5 -> 1)), + "f2" -> TextStats(Map("ocean" -> 1, "other" -> 5), Map(5 -> 6)) )), TextMapStats(Map( - "f2" -> TextStats(Map("other" -> 1)) + "f2" -> TextStats(Map("other" -> 1), Map(5 -> 1)) )) ) TextMapStats.monoid(2).sumOption(data) shouldBe Some(TextMapStats(Map( - "f1" -> TextStats(Map("hello" -> 3, "world" -> 1)), - "f2" -> TextStats(Map("hello" -> 2, "ocean" -> 3, "other" -> 5)), - "f3" -> TextStats(Map("foo" -> 1)) + "f1" -> TextStats(Map("hello" -> 3, "world" -> 1), Map(5 -> 4)), + "f2" -> TextStats(Map("hello" -> 2, "ocean" -> 3, "other" -> 5), Map(5 -> 11)), + "f3" -> TextStats(Map("foo" -> 1), Map(3 -> 1)) ))) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala 
b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala index 30143977c1..09b124a5ad 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala @@ -34,6 +34,7 @@ import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.sequence.SequenceModel import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} +import com.salesforce.op.testkit.{RandomReal, RandomText} import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import org.apache.spark.ml.linalg.Vectors @@ -59,7 +60,6 @@ class SmartTextVectorizerTest .setTopK(2).setPrependFeatureName(false) .setHashSpaceStrategy(HashSpaceStrategy.Shared) .setInput(f1, f2) - val expectedResult = Seq( Vectors.sparse(9, Array(0, 4, 6), Array(1.0, 1.0, 1.0)), Vectors.sparse(9, Array(0, 8), Array(1.0, 1.0)), @@ -68,6 +68,31 @@ class SmartTextVectorizerTest Vectors.sparse(9, Array(3, 8), Array(1.0, 1.0)) ).map(_.toOPVector) + /* + Generate some more complicated input data to check things a little closer. There are four text fields with + different token distributions: + + country: Uniformly distributed from a larger list of ~few hundred countries, should be hashed + categoricalText: Uniformly distributed from a small list of choices, should be pivoted (also has fixed lengths, + so serves as a test that the categorical check happens before the token length variance check) + textId: Uniformly distributed high cardinality Ids with fixed lengths, should be ignored + text: Uniformly distributed unicode strings with lengths ranging from 0-100, should be hashed + */ + val countryData: Seq[Text] = RandomText.countries.withProbabilityOfEmpty(0.2).limit(1000) + val categoricalTextData: Seq[Text] = RandomText.textFromDomain(domain = List("A", "B", "C", "D", "E", "F")) + .withProbabilityOfEmpty(0.2).limit(1000) + // Generate List containing elements like 040231, 040232, ... 
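+  // (Illustrative aside, not in the original patch: "%06d".format(40230 + 1) yields "040231",
+  // so every generated ID is zero-padded to exactly 6 characters. Zero variation in length is
+  // the signal the new minLengthStdDev check keys on to classify this field as ignorable.)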
+ val textIdData: Seq[Text] = RandomText.textFromDomain( + domain = (1 to 1000).map(x => "%06d".format(40230 + x)).toList + ).withProbabilityOfEmpty(0.2).limit(1000) + val textData: Seq[Text] = RandomText.strings(minLen = 0, maxLen = 100).withProbabilityOfEmpty(0.2).limit(1000) + val generatedData: Seq[(Text, Text, Text, Text)] = + countryData.zip(categoricalTextData).zip(textIdData).zip(textData).map { + case (((co, ca), id), te) => (co, ca, id, te) + } + val (rawDF, rawCountry, rawCategorical, rawTextId, rawText) = TestFeatureBuilder( + "country", "categorical", "textId", "text", generatedData) + it should "detect one categorical and one non-categorical text feature" in { val smartVectorized = new SmartTextVectorizer() .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(false) @@ -170,6 +195,40 @@ class SmartTextVectorizerTest regular shouldBe shortcut } + it should "detect and ignore fields that looks like machine-generated IDs by having a low token length variance" in { + val topKCategorial = 3 + val hashSize = 5 + + val smartVectorized = new SmartTextVectorizer() + .setMaxCardinality(10).setNumFeatures(hashSize).setMinSupport(10).setTopK(topKCategorial).setMinLengthStdDev(1.0) + .setAutoDetectLanguage(false).setMinTokenLength(1).setToLowercase(false) + .setTrackNulls(true).setTrackTextLen(true) + .setInput(rawCountry, rawCategorical, rawTextId, rawText).getOutput() + + val transformed = new OpWorkflow().setResultFeatures(smartVectorized).transform(rawDF) + val result = transformed.collect(smartVectorized) + + /* + Feature vector should have 16 components, corresponding to two hashed text fields, one categorical field, and + one ignored text field. + + Hashed text: (5 hash buckets + 1 length + 1 null indicator) = 7 elements + Categorical: (3 topK + 1 other + 1 null indicator) = 5 elements + Ignored text: (1 length + 1 null indicator) = 2 elements + */ + val featureVectorSize = 2 * (hashSize + 2) + (topKCategorial + 2) + 2 + val firstRes = result.head + firstRes.v.size shouldBe featureVectorSize + + val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) + meta.columns.length shouldBe featureVectorSize + meta.columns.slice(0, 5).forall(_.grouping.contains("categorical")) + meta.columns.slice(5, 10).forall(_.grouping.contains("country")) + meta.columns.slice(10, 15).forall(_.grouping.contains("text")) + meta.columns.slice(15, 18).forall(_.descriptorValue.contains(OpVectorColumnMetadata.TextLenString)) + meta.columns.slice(18, 21).forall(_.indicatorValue.contains(OpVectorColumnMetadata.NullString)) + } + it should "fail with an error" in { val emptyDF = inputData.filter(inputData("text1") === "").toDF() @@ -375,16 +434,25 @@ class SmartTextVectorizerTest } Spec[TextStats] should "aggregate correctly" in { - val l1 = TextStats(Map("hello" -> 1, "world" -> 2)) - val r1 = TextStats(Map("hello" -> 1, "world" -> 1)) - val expected1 = TextStats(Map("hello" -> 2, "world" -> 3)) + val l1 = TextStats(Map("hello" -> 1, "world" -> 2), Map(5 -> 3)) + val r1 = TextStats(Map("hello" -> 1, "world" -> 1), Map(5 -> 2)) + val expected1 = TextStats(Map("hello" -> 2, "world" -> 3), Map(5 -> 5)) - val l2 = TextStats(Map("hello" -> 1, "world" -> 2, "ocean" -> 3)) - val r2 = TextStats(Map("hello" -> 1)) - val expected2 = TextStats(Map("hello" -> 1, "world" -> 2, "ocean" -> 3)) + val l2 = TextStats(Map("hello" -> 1, "world" -> 2, "ocean" -> 3), Map(5 -> 6)) + val r2 = TextStats(Map("hello" -> 1), Map(5 -> 1)) + val expected2 = TextStats(Map("hello" -> 1, 
"world" -> 2, "ocean" -> 3), Map(5 -> 7)) TextStats.monoid(2).plus(l1, r1) shouldBe expected1 TextStats.monoid(2).plus(l2, r2) shouldBe expected2 } + it should "compute correct statistics on the length distributions" in { + val ts = TextStats(Map("hello" -> 2, "joe" -> 2, "woof" -> 1), Map(3 -> 2, 4 -> 1, 5 -> 2)) + + ts.lengthSize shouldBe 5 + ts.lengthMean shouldBe 4.0 + ts.lengthVariance shouldBe 4.0 + ts.lengthStdDev shouldBe 2.0 / math.sqrt(5.0) + } + } diff --git a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReaderWriter.scala b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReaderWriter.scala index ef63f5d50c..a23bb3b72e 100644 --- a/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReaderWriter.scala +++ b/features/src/main/scala/com/salesforce/op/stages/OpPipelineStageReaderWriter.scala @@ -211,6 +211,7 @@ trait OpPipelineStageReadWriteFormats { EnumEntrySerializer.json4s[AnyValueTypes](AnyValueTypes) + EnumEntrySerializer.json4s[HashAlgorithm](HashAlgorithm) + EnumEntrySerializer.json4s[HashSpaceStrategy](HashSpaceStrategy) + + EnumEntrySerializer.json4s[TextVectorizationMethod](TextVectorizationMethod) + EnumEntrySerializer.json4s[ScalingType](ScalingType) + EnumEntrySerializer.json4s[TimePeriod](TimePeriod) + EnumEntrySerializer.json4s[FeatureDistributionType](FeatureDistributionType) + diff --git a/features/src/main/scala/com/salesforce/op/stages/impl/feature/TextVectorizationMethod.scala b/features/src/main/scala/com/salesforce/op/stages/impl/feature/TextVectorizationMethod.scala new file mode 100644 index 0000000000..f1ade68105 --- /dev/null +++ b/features/src/main/scala/com/salesforce/op/stages/impl/feature/TextVectorizationMethod.scala @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op.stages.impl.feature + +import enumeratum._ + + +/** + * Methods of vectorizing text (eg. 
to be chosen based on statistics computed in SmartTextVectorizer)
+ */
+sealed trait TextVectorizationMethod extends EnumEntry with Serializable
+
+object TextVectorizationMethod extends Enum[TextVectorizationMethod] {
+  val values = findValues
+  case object Pivot extends TextVectorizationMethod
+  case object Hash extends TextVectorizationMethod
+  case object Ignore extends TextVectorizationMethod
+}
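
Below is a minimal, self-contained Scala sketch (an editorial illustration, not part of the patch series) of the decision rule PATCH 7 adds to SmartTextVectorizer.fitFn. LengthStats and decide are hypothetical stand-ins for the patched TextStats and the match on the two thresholds; the defaults mirror SmartTextVectorizer.MaxCardinality (100) and SmartTextVectorizer.MinTextLengthStdDev (0).

import scala.math.sqrt

// Simplified stand-in for the patched TextStats: a value histogram plus a histogram of value lengths.
final case class LengthStats(valueCounts: Map[String, Long], lengthCounts: Map[Int, Long]) {
  val lengthSize: Long = lengthCounts.values.sum
  val lengthMean: Double = lengthCounts.foldLeft(0.0) { case (acc, (len, n)) => acc + len.toDouble * n } / lengthSize
  // As in the patch, this accumulator holds the raw sum of squared deviations;
  // the division by the count happens inside the square root below.
  val lengthVariance: Double =
    lengthCounts.foldLeft(0.0) { case (acc, (len, n)) => acc + n * (len - lengthMean) * (len - lengthMean) }
  val lengthStdDev: Double = sqrt(lengthVariance / lengthSize)
}

sealed trait Method
case object Pivot extends Method
case object Hash extends Method
case object Ignore extends Method

// Mirrors the match added to SmartTextVectorizer.fitFn: pivot low-cardinality fields,
// ignore high-cardinality fields whose value lengths barely vary (ID-like), hash the rest.
def decide(stats: LengthStats, maxCardinality: Int = 100, minLengthStdDev: Double = 0.0): Method =
  if (stats.valueCounts.size <= maxCardinality) Pivot
  else if (stats.lengthStdDev <= minLengthStdDev) Ignore
  else Hash

// Fixed-width IDs ("040231", "040232", ...) give lengthStdDev == 0, so with minLengthStdDev = 1.0
// (the value used in the new SmartTextVectorizerTest) the field is classified as ignorable.
val ids = (1 to 500).map(i => "%06d".format(40230 + i))
val idStats = LengthStats(
  ids.groupBy(identity).map { case (k, v) => k -> v.size.toLong },
  ids.groupBy(_.length).map { case (k, v) => k -> v.size.toLong }
)
assert(decide(idStats, maxCardinality = 100, minLengthStdDev = 1.0) == Ignore)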