Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve test coverage for VectorsCombiner and make vector aggregator efficient #168

Merged
merged 6 commits into from
Nov 1, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import com.salesforce.op.stages.AllowLabelAsInput
import com.salesforce.op.stages.base.binary.{BinaryEstimator, BinaryModel}
import com.salesforce.op.utils.spark.OpVectorColumnMetadata
import com.salesforce.op.utils.spark.RichDataset._
import com.salesforce.op.utils.spark.RichVector._
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.Metadata

Expand Down Expand Up @@ -163,7 +164,7 @@ final class DecisionTreeNumericMapBucketizerModel[I2 <: OPMap[_]] private[op]
input = cleanedInputMap.get(k)
)
}
VectorsCombiner.combine(vectors).toOPVector
combine(vectors).toOPVector
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import com.salesforce.op.features.types._
import com.salesforce.op.stages.OpPipelineStageBase
import com.salesforce.op.stages.base.sequence.SequenceTransformer
import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
import com.salesforce.op.utils.spark.RichVector._
import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.ml.param._
import org.apache.spark.mllib.feature.HashingTF
Expand Down Expand Up @@ -265,7 +266,7 @@ private[op] trait HashingFun {
fNameHashesWithInputs.map { case (featureNameHash, el) =>
hasher.transform(prepare[T](el, params.hashWithIndex, params.prependFeatureName, featureNameHash)).asML
}
VectorsCombiner.combine(hashedVecs).toOPVector
combine(hashedVecs).toOPVector
}
}
}
Expand Down Expand Up @@ -379,7 +380,7 @@ private[op] trait MapHashingFun extends HashingFun {
prepare[TextList](el, params.hashWithIndex, params.prependFeatureName, featureNameHash)
).asML
})
VectorsCombiner.combine(hashedVecs.flatten).toOPVector
combine(hashedVecs.flatten).toOPVector
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,8 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op]
val textVector = hash(rowTextTokenized, keysText, args.hashingParams)
val textNullIndicatorsVector =
if (args.shouldTrackNulls) Seq(getNullIndicatorsVector(keysText, rowTextTokenized)) else Nil
VectorsCombiner.combineOP(Seq(categoricalVector, textVector) ++ textNullIndicatorsVector)

categoricalVector.combine(textVector, textNullIndicatorsVector: _*)
}

private def getNullIndicatorsVector(keysSeq: Seq[Seq[String]], inputs: Seq[Map[String, TextList]]): OPVector = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ final class SmartTextVectorizerModel[T <: Text] private[op]
val textVector: OPVector = hash[TextList](textTokens, getTextTransientFeatures, args.hashingParams)
val textNullIndicatorsVector = if (args.shouldTrackNulls) Seq(getNullIndicatorsVector(textTokens)) else Seq.empty

VectorsCombiner.combineOP(Seq(categoricalVector, textVector) ++ textNullIndicatorsVector)
categoricalVector.combine(textVector, textNullIndicatorsVector: _*)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,42 +81,9 @@ class VectorsCombiner(uid: String = UID[VectorsCombiner])

final class VectorsCombinerModel private[op] (operationName: String, uid: String)
extends SequenceModel[OPVector, OPVector](operationName = operationName, uid = uid) {
def transformFn: Seq[OPVector] => OPVector = VectorsCombiner.combineOP
}

case object VectorsCombiner {

/**
* Combine multiple OP vectors into one
*
* @param vectors input vectors
* @return result vector
*/
def combineOP(vectors: Seq[OPVector]): OPVector = {
new OPVector(combine(vectors.view.map(_.value)))
}

/**
* Combine multiple vectors into one
*
* @param vectors input vectors
* @return result vector
*/
def combine(vectors: Seq[Vector]): Vector = {
val indices = ArrayBuffer.empty[Int]
val values = ArrayBuffer.empty[Double]

val size = vectors.foldLeft(0)((size, vector) => {
vector.foreachActive { case (i, v) =>
if (v != 0.0) {
indices += size + i
values += v
}
}
size + vector.size
})
Vectors.sparse(size, indices.toArray, values.toArray).compressed
/**
 * Function that concatenates a sequence of vectors into a single vector:
 * an empty sequence yields the empty vector, a single vector is returned as is,
 * and two or more vectors are combined in their original order.
 */
def transformFn: Seq[OPVector] => OPVector = {
  case Seq() => OPVector.empty
  case Seq(only) => only
  case first +: second +: rest => first.combine(second, rest: _*)
}

}

Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ class SmartTextVectorizerTest
val textRes = transformed.collect(textVectorized)
assertNominal(fieldText, Array.fill(textRes.head.value.size)(false), textRes)
val (smart, expected) = result.map { case (smartVector, categoricalVector, textVector, nullVector) =>
val combined = VectorsCombiner.combineOP(Seq(categoricalVector, textVector, nullVector))
val combined = categoricalVector.combine(textVector, nullVector)
smartVector -> combined
}.unzip

Expand Down Expand Up @@ -139,7 +139,7 @@ class SmartTextVectorizerTest
val textRes = transformed.collect(textVectorized)
assertNominal(fieldText, Array.fill(textRes.head.value.size)(false), textRes)
val (smart, expected) = result.map { case (smartVector, textVector, nullVector) =>
val combined = VectorsCombiner.combineOP(Seq(textVector, nullVector))
val combined = textVector.combine(nullVector)
smartVector -> combined
}.unzip

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,31 +31,42 @@
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types.Text
import com.salesforce.op.features.{FeatureLike, TransientFeature}
import com.salesforce.op.test.PassengerSparkFixtureTest
import com.salesforce.op.features.TransientFeature
import com.salesforce.op.features.types.{Text, _}
import com.salesforce.op.stages.base.sequence.SequenceModel
import com.salesforce.op.test.{OpEstimatorSpec, PassengerSparkFixtureTest, TestFeatureBuilder}
import com.salesforce.op.testkit.{RandomReal, RandomVector}
import com.salesforce.op.utils.spark.OpVectorMetadata
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.ml.linalg.Vectors
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner
import com.salesforce.op.utils.spark.RichMetadata._


@RunWith(classOf[JUnitRunner])
class VectorsCombinerTest extends FlatSpec with PassengerSparkFixtureTest {
class VectorsCombinerTest
extends OpEstimatorSpec[OPVector, SequenceModel[OPVector, OPVector], VectorsCombiner]
with PassengerSparkFixtureTest {

val vectors = Seq(
Vectors.sparse(4, Array(0, 3), Array(1.0, 1.0)),
Vectors.dense(Array(2.0, 3.0, 4.0)),
Vectors.sparse(4, Array(1), Array(777.0))
)
val expected = Vectors.sparse(11, Array(0, 3, 4, 5, 6, 8), Array(1.0, 1.0, 2.0, 3.0, 4.0, 777.0))
override def specName: String = classOf[VectorsCombiner].getSimpleName

Spec[VectorsCombiner] should "combine vectors correctly" in {
val combined = VectorsCombiner.combine(vectors)
assert(combined.compressed == combined, "combined is expected to be compressed")
combined shouldBe expected
}
// Test fixture: each row is a pair of vectors, the left one feeding feature f1 and the right one feature f2
val (inputData, f1, f2) = TestFeatureBuilder(Seq(
Vectors.sparse(4, Array(0, 3), Array(1.0, 1.0)).toOPVector ->
Vectors.sparse(4, Array(0, 3), Array(2.0, 3.0)).toOPVector,
Vectors.dense(Array(2.0, 3.0, 4.0)).toOPVector ->
Vectors.dense(Array(12.0, 13.0, 14.0)).toOPVector,
// Purposely added some very large sparse vectors to verify the efficiency
Vectors.sparse(100000000, Array(1), Array(777.0)).toOPVector ->
Vectors.sparse(500000000, Array(0), Array(888.0)).toOPVector
))

// Stage under test, wired to combine f1 followed by f2
val estimator = new VectorsCombiner().setInput(f1, f2)

// Expected output per row: the f1 vector followed by the f2 vector, i.e. the combined size is the sum
// of both sizes and the indices of the second vector are shifted by the size of the first one
val expectedResult = Seq(
Vectors.sparse(8, Array(0, 3, 4, 7), Array(1.0, 1.0, 2.0, 3.0)).toOPVector,
Vectors.dense(Array(2.0, 3.0, 4.0, 12.0, 13.0, 14.0)).toOPVector,
Vectors.sparse(600000000, Array(1, 100000000), Array(777.0, 888.0)).toOPVector
)

it should "combine metadata correctly" in {
val vector = Seq(height, description, stringMap).transmogrify()
Expand All @@ -69,12 +80,11 @@ class VectorsCombinerTest extends FlatSpec with PassengerSparkFixtureTest {
}

it should "create metadata correctly" in {
val descVect = description.map[Text]{
t =>
Text(t.value match {
case Some(text) => "this is dumb " + text
case None => "some STUFF to tokenize"
})
val descVect = description.map[Text] { t =>
Text(t.value match {
case Some(text) => "this is dumb " + text
case None => "some STUFF to tokenize"
})
}.tokenize().tf(numTerms = 5)
val vector = Seq(height, stringMap, descVect).transmogrify()
val Seq(inputs1, inputs2, inputs3) = vector.parents
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ object MonoidAggregatorDefaults {

val aggregator = weakTypeOf[O] match {
// Vector
case wt if wt =:= weakTypeOf[OPVector] => UnionVector
case wt if wt =:= weakTypeOf[OPVector] => CombineVector

// Lists
case wt if wt =:= weakTypeOf[TextList] => ConcatTextList
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,29 @@ package com.salesforce.op.aggregators

import com.salesforce.op.features.types._
import com.twitter.algebird._
import com.salesforce.op.utils.spark.RichVector._
import org.apache.spark.ml.linalg.{Vector, Vectors}

import scala.reflect.runtime.universe._

/**
* Aggregator that concatenates Vector data into a single combined vector
*/
case object UnionVector
case object CombineVector
extends MonoidAggregator[Event[OPVector], Vector, OPVector]
with AggregatorDefaults[OPVector] {
implicit val ttag = weakTypeTag[OPVector]
val ftFactory = FeatureTypeFactory[OPVector]()
val monoid: Monoid[Vector] = Monoid.from(Vectors.zeros(0))((v1: Vector, v2: Vector) =>
Vectors.dense(v1.toArray ++ v2.toArray)
)
val monoid: Monoid[Vector] = Monoid.from(Vectors.zeros(0))(_ combine _)
}

/**
 * Aggregator that gives the element-wise sum of Vector data
 *
 * The monoid zero is the empty (size 0) vector. It acts as the identity element:
 * adding an empty vector to any vector returns that vector unchanged, so that the
 * monoid laws hold even though element-wise addition requires equal sizes.
 * Adding two non-empty vectors of different sizes still fails with
 * an [[IllegalArgumentException]].
 */
case object SumVector
  extends MonoidAggregator[Event[OPVector], Vector, OPVector]
    with AggregatorDefaults[OPVector] {
  implicit val ttag = weakTypeTag[OPVector]
  val ftFactory = FeatureTypeFactory[OPVector]()
  val monoid: Monoid[Vector] = Monoid.from(Vectors.zeros(0)) { (v1: Vector, v2: Vector) =>
    // The zero element has size 0, so it cannot go through the size-checked `+`
    if (v1.size == 0) v2
    else if (v2.size == 0) v1
    else v1 + v2
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

package com.salesforce.op.features.types

import com.salesforce.op.utils.spark.RichVector._
import org.apache.spark.ml.linalg._

/**
Expand All @@ -39,8 +40,46 @@ import org.apache.spark.ml.linalg._
*/
class OPVector(val value: Vector) extends OPCollection {
type Value = Vector

/** Whether the wrapped vector has size zero */
final def isEmpty: Boolean = value.size == 0

/**
 * Sum of this vector and another one
 *
 * @param that vector to add to this one
 * @throws IllegalArgumentException if the vectors have different sizes
 * @return a new vector holding the addition result
 */
def +(that: OPVector): OPVector = {
  val sum: Vector = value + that.value
  sum.toOPVector
}

/**
 * Difference between this vector and another one
 *
 * @param that vector to subtract from this one
 * @throws IllegalArgumentException if the vectors have different sizes
 * @return a new vector holding the subtraction result
 */
def -(that: OPVector): OPVector = {
  val difference: Vector = value - that.value
  difference.toOPVector
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As long as you are adding addition and subtraction, why not the dot product and cross product as well?


/**
 * Dot product of this vector with another one
 *
 * @param that the other vector
 * @throws IllegalArgumentException if the vectors have different sizes
 * @return the dot product value
 */
def dot(that: OPVector): Double = value.dot(that.value)

/**
 * Combine this vector with one or more other vectors into a single vector
 *
 * @param that  the vector placed right after this one
 * @param other any further vectors, appended in the given order
 * @return the combined vector
 */
def combine(that: OPVector, other: OPVector*): OPVector = {
  val rest: Seq[Vector] = other.map(_.value)
  value.combine(that.value, rest: _*).toOPVector
}
}

object OPVector {
def apply(value: Vector): OPVector = new OPVector(value)
def empty: OPVector = FeatureTypeDefaults.OPVector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
package com.salesforce.op.utils.spark

import breeze.linalg.{DenseVector => BreezeDenseVector, SparseVector => BreezeSparseVector, Vector => BreezeVector}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.collection.mutable.ArrayBuffer

/**
* [[org.apache.spark.ml.linalg.Vector]] enrichment functions
Expand Down Expand Up @@ -64,6 +66,30 @@ object RichVector {
toSpark(res)
}

/**
 * Compute the dot product of this vector with another one
 *
 * @param that the other vector, which must be of the same size as this one
 * @throws IllegalArgumentException if the vectors have different sizes
 * @return the dot product value
 */
def dot(that: Vector): Double = {
  val (thisSize, thatSize) = (v.size, that.size)
  require(thisSize == thatSize,
    s"Vectors must have the same length: a.length == b.length ($thisSize != $thatSize)"
  )
  v.toBreeze.dot(that.toBreeze)
}

/**
 * Combine this vector with one or more other vectors into a single vector
 *
 * @param that  the vector placed right after this one
 * @param other any further vectors, appended in the given order
 * @return the combined vector
 */
def combine(that: Vector, other: Vector*): Vector = {
  val all: Seq[Vector] = Seq(v, that) ++ other
  // Fully qualified on purpose: it distinguishes the enclosing object from this implicit class
  com.salesforce.op.utils.spark.RichVector.combine(all)
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you need the full import here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is to distinguish the `RichVector` object from the implicit class of the same name.


/**
* Convert to [[breeze.linalg.Vector]]
*
Expand All @@ -85,4 +111,26 @@ object RichVector {

}

/**
 * Combine multiple vectors into one by placing them one after another
 *
 * Only the non-zero active values of the inputs are visited, so very large sparse vectors
 * are combined without being densified. The indices of every vector are shifted by the total
 * size of the vectors preceding it. The result is returned in its compressed form.
 *
 * @param vectors input vectors, combined in the given order; may be empty
 * @throws IllegalArgumentException if the total size of all vectors does not fit into an Int
 * @return result vector whose size is the sum of all input sizes
 */
def combine(vectors: Seq[Vector]): Vector = {
  val indices = ArrayBuffer.empty[Int]
  val values = ArrayBuffer.empty[Double]

  val totalSize = vectors.foldLeft(0) { (offset, vector) =>
    // Sizes and indices are Ints: fail fast instead of silently overflowing into negative indices
    require(vector.size <= Int.MaxValue - offset,
      s"Combined vector size exceeds Int.MaxValue ($offset + ${vector.size})"
    )
    vector.foreachActive { (i, value) =>
      // Explicitly stored zeros are dropped to keep the result sparse
      if (value != 0.0) {
        indices += offset + i
        values += value
      }
    }
    offset + vector.size
  }
  Vectors.sparse(totalSize, indices.toArray, values.toArray).compressed
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -519,10 +519,20 @@ class MonoidAggregatorDefaultsTest extends FlatSpec with TestCommon {
assertDefaultAggr(multiPickListMapTestSeq, expectedRes)
}

Spec(UnionVector.getClass) should "work" in {
Spec(CombineVector.getClass) should "work" in {
assertDefaultAggr(vectorTestSeq, Vectors.dense(Array(0.1, 0.2, 1.0, 0.2)))
}

// Element-wise sum: (0.1 + 1.0 + 0.2, 0.2 - 1.5 + 0.0) = (1.3, -1.3)
Spec(SumVector.getClass) should "work" in {
val vectors = Seq(Array(0.1, 0.2), Array(1.0, -1.5), Array(0.2, 0.0)).map(Vectors.dense(_).toOPVector)
assertAggr(SumVector, vectors, Vectors.dense(Array(1.3, -1.3)))
}
// Summing vectors of sizes 2 and 1 must be rejected; the expected value passed in is irrelevant
it should "error on vectors of invalid sizes" in {
val vectors = Seq(Array(0.1, 0.2), Array(1.0)).map(Vectors.dense(_).toOPVector)
intercept[IllegalArgumentException](assertAggr(SumVector, vectors, Vectors.zeros(0))).getMessage shouldBe
"requirement failed: Vectors must have same length: x.length == y.length (1 != 2)"
}

Spec[CustomMonoidAggregator[_]] should "work" in {
val customAgg = new CustomMonoidAggregator[Real](zero = None, associativeFn = (r1, r2) => (r1 -> r2).map(_ + _))
assertAggr(customAgg, realTestSeq, Option(doubleBase.flatten.sum))
Expand Down
Loading