Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup Helloworld examples #230

Merged
merged 1 commit on Feb 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 0 additions & 24 deletions helloworld/src/main/avro/Iris.avsc

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -34,35 +34,19 @@ import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._

trait BostonFeatures extends Serializable {

val rowId = FeatureBuilder.Integral[BostonHouse].extract(_.rowId.toIntegral).asPredictor

val crim = FeatureBuilder.RealNN[BostonHouse].extract(_.crim.toRealNN).asPredictor

val zn = FeatureBuilder.RealNN[BostonHouse].extract(_.zn.toRealNN).asPredictor

val indus = FeatureBuilder.RealNN[BostonHouse].extract(_.indus.toRealNN).asPredictor

val chas = FeatureBuilder.PickList[BostonHouse].extract(x => Option(x.chas).toPickList).asPredictor

val nox = FeatureBuilder.RealNN[BostonHouse].extract(_.nox.toRealNN).asPredictor

val rm = FeatureBuilder.RealNN[BostonHouse].extract(_.rm.toRealNN).asPredictor

val age = FeatureBuilder.RealNN[BostonHouse].extract(_.age.toRealNN).asPredictor

val dis = FeatureBuilder.RealNN[BostonHouse].extract(_.dis.toRealNN).asPredictor

val rad = FeatureBuilder.Integral[BostonHouse].extract(_.rad.toIntegral).asPredictor

val tax = FeatureBuilder.RealNN[BostonHouse].extract(_.tax.toRealNN).asPredictor

val ptratio = FeatureBuilder.RealNN[BostonHouse].extract(_.ptratio.toRealNN).asPredictor

val b = FeatureBuilder.RealNN[BostonHouse].extract(_.b.toRealNN).asPredictor

val lstat = FeatureBuilder.RealNN[BostonHouse].extract(_.lstat.toRealNN).asPredictor

val medv = FeatureBuilder.RealNN[BostonHouse].extract(_.medv.toRealNN).asResponse

}

This file was deleted.

33 changes: 14 additions & 19 deletions helloworld/src/main/scala/com/salesforce/hw/boston/OpBoston.scala
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ import com.salesforce.op.readers.CustomReader
import com.salesforce.op.stages.impl.regression.RegressionModelSelector
import com.salesforce.op.stages.impl.regression.RegressionModelsToTry._
import com.salesforce.op.stages.impl.tuning.DataSplitter
import com.salesforce.op.utils.kryo.OpKryoRegistrator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

Expand All @@ -45,52 +44,48 @@ import org.apache.spark.sql.{Dataset, SparkSession}
*/
object OpBoston extends OpAppWithRunner with BostonFeatures {

override def kryoRegistrator: Class[_ <: OpKryoRegistrator] = classOf[BostonKryoRegistrator]

////////////////////////////////////////////////////////////////////////////////
// READERS DEFINITION
/////////////////////////////////////////////////////////////////////////////////

val randomSeed = 112233L
val randomSeed = 42L

def customRead(path: Option[String], spark: SparkSession): RDD[BostonHouse] = {
require(path.isDefined, "The path is not set")
val myFile = spark.sparkContext.textFile(path.get)
def customRead(path: String)(implicit spark: SparkSession): RDD[BostonHouse] = {
val myFile = spark.sparkContext.textFile(path)

myFile.filter(_.nonEmpty).zipWithIndex.map { case (x, number) =>
myFile.filter(_.nonEmpty).zipWithIndex.map { case (x, id) =>
val words = x.replaceAll("\\s+", " ").replaceAll(s"^\\s+(?m)", "").replaceAll(s"(?m)\\s+$$", "").split(" ")
BostonHouse(number.toInt, words(0).toDouble, words(1).toDouble, words(2).toDouble, words(3), words(4).toDouble,
BostonHouse(id.toInt, words(0).toDouble, words(1).toDouble, words(2).toDouble, words(3), words(4).toDouble,
words(5).toDouble, words(6).toDouble, words(7).toDouble, words(8).toInt, words(9).toDouble,
words(10).toDouble, words(11).toDouble, words(12).toDouble, words(13).toDouble)
}
}

val trainingReader = new CustomReader[BostonHouse](key = _.rowId.toString) {
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = {
val Array(train, _) = customRead(Some(getFinalReadPath(params)), spark).randomSplit(weights = Array(0.9, 0.1),
seed = randomSeed)
Left(train)
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = Left {
val Array(train, _) = customRead(getFinalReadPath(params)).randomSplit(weights = Array(0.9, 0.1), randomSeed)
train
}
}

val scoringReader = new CustomReader[BostonHouse](key = _.rowId.toString) {
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = {
val Array(_, test) = customRead(Some(getFinalReadPath(params)), spark).randomSplit(weights = Array(0.9, 0.1),
seed = randomSeed)
Left(test)
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = Left {
val Array(_, test) = customRead(getFinalReadPath(params)).randomSplit(weights = Array(0.9, 0.1), randomSeed)
test
}
}


////////////////////////////////////////////////////////////////////////////////
// WORKFLOW DEFINITION
/////////////////////////////////////////////////////////////////////////////////

val houseFeatures = Seq(crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat).transmogrify()

val splitter = DataSplitter(seed = randomSeed)

val prediction = RegressionModelSelector
.withCrossValidation(
dataSplitter = Some(DataSplitter(seed = randomSeed)), seed = randomSeed,
dataSplitter = Some(splitter), seed = randomSeed,
modelTypesToUse = Seq(OpGBTRegressor, OpRandomForestRegressor)
).setInput(medv, houseFeatures).getOutput()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,11 @@

package com.salesforce.hw.iris

import com.esotericsoftware.kryo.Kryo
import com.salesforce.op.utils.kryo.OpKryoRegistrator

class IrisKryoRegistrator extends OpKryoRegistrator {

override def registerCustomClasses(kryo: Kryo): Unit = {
doAvroRegistration[com.salesforce.hw.iris.Iris](kryo)
}

}
case class Iris
(
sepalLength: Double,
sepalWidth: Double,
petalLength: Double,
petalWidth: Double,
irisClass: String
)
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,9 @@ import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._

trait IrisFeatures extends Serializable {
val id = FeatureBuilder.Integral[Iris].extract(_.getID.toIntegral).asPredictor
val sepalLength = FeatureBuilder.Real[Iris].extract(_.getSepalLength.toReal).asPredictor
val sepalWidth = FeatureBuilder.Real[Iris].extract(_.getSepalWidth.toReal).asPredictor
val petalLength = FeatureBuilder.Real[Iris].extract(_.getPetalLength.toReal).asPredictor
val petalWidth = FeatureBuilder.Real[Iris].extract(_.getPetalWidth.toReal).asPredictor
val irisClass = FeatureBuilder.Text[Iris].extract(_.getClass$.toText).asResponse
val sepalLength = FeatureBuilder.Real[Iris].extract(_.sepalLength.toReal).asPredictor
val sepalWidth = FeatureBuilder.Real[Iris].extract(_.sepalWidth.toReal).asPredictor
val petalLength = FeatureBuilder.Real[Iris].extract(_.petalLength.toReal).asPredictor
val petalWidth = FeatureBuilder.Real[Iris].extract(_.petalWidth.toReal).asPredictor
val irisClass = FeatureBuilder.Text[Iris].extract(_.irisClass.toText).asResponse
}
23 changes: 6 additions & 17 deletions helloworld/src/main/scala/com/salesforce/hw/iris/OpIris.scala
Original file line number Diff line number Diff line change
Expand Up @@ -32,37 +32,24 @@ package com.salesforce.hw.iris

import com.salesforce.op._
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.CustomReader
import com.salesforce.op.readers.DataReaders
import com.salesforce.op.stages.impl.classification.MultiClassificationModelSelector
import com.salesforce.op.stages.impl.tuning.DataCutter
import com.salesforce.op.utils.kryo.OpKryoRegistrator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.Encoders

/**
* TransmogrifAI MultiClass Classification example on the Iris Dataset
*/
object OpIris extends OpAppWithRunner with IrisFeatures {

override def kryoRegistrator: Class[_ <: OpKryoRegistrator] = classOf[IrisKryoRegistrator]
implicit val irisEncoder = Encoders.product[Iris]

////////////////////////////////////////////////////////////////////////////////
// READER DEFINITIONS
/////////////////////////////////////////////////////////////////////////////////

val randomSeed = 42

val irisReader = new CustomReader[Iris](key = _.getID.toString){
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Iris], Dataset[Iris]] = {
val path = getFinalReadPath(params)
val myFile = spark.sparkContext.textFile(path)

Left(myFile.filter(_.nonEmpty).zipWithIndex.map { case (x, id) =>
val Array(sepalLength, sepalWidth, petalLength, petalWidth, klass) = x.split(",")
new Iris(id.toInt, sepalLength.toDouble, sepalWidth.toDouble, petalLength.toDouble, petalWidth.toDouble, klass)
})
}
}
val irisReader = DataReaders.Simple.csvCase[Iris]()

////////////////////////////////////////////////////////////////////////////////
// WORKFLOW DEFINITION
Expand All @@ -72,6 +59,8 @@ object OpIris extends OpAppWithRunner with IrisFeatures {

val features = Seq(sepalLength, sepalWidth, petalLength, petalWidth).transmogrify()

val randomSeed = 42L

val cutter = DataCutter(reserveTestFraction = 0.2, seed = randomSeed)

val prediction = MultiClassificationModelSelector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ object OpTitanic extends OpAppWithRunner with TitanicFeatures {
// READER DEFINITION
/////////////////////////////////////////////////////////////////////////////////

val randomSeed = 112233L
val randomSeed = 42L
val simpleReader = DataReaders.Simple.csv[Passenger](
schema = Passenger.getClassSchema.toString, key = _.getPassengerId.toString
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,27 +34,15 @@ import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._

trait TitanicFeatures extends Serializable {

val survived = FeatureBuilder.RealNN[Passenger].extract(_.getSurvived.toDouble.toRealNN).asResponse

val pClass = FeatureBuilder.PickList[Passenger].extract(d => Option(d.getPclass).map(_.toString).toPickList).asPredictor // scalastyle:off

val name = FeatureBuilder.Text[Passenger].extract(d => Option(d.getName).toText).asPredictor

val sex = FeatureBuilder.PickList[Passenger].extract(d => Option(d.getSex).toPickList).asPredictor

val age = FeatureBuilder.Real[Passenger].extract(d => Option(Double.unbox(d.getAge)).toReal).asPredictor

val sibSp = FeatureBuilder.PickList[Passenger].extract(d => Option(d.getSibSp).map(_.toString).toPickList).asPredictor

val parch = FeatureBuilder.PickList[Passenger].extract(d => Option(d.getParch).map(_.toString).toPickList).asPredictor

val ticket = FeatureBuilder.PickList[Passenger].extract(d => Option(d.getTicket).toPickList).asPredictor

val fare = FeatureBuilder.Real[Passenger].extract(d => Option(Double.unbox(d.getFare)).toReal).asPredictor

val cabin = FeatureBuilder.PickList[Passenger].extract(d => Option(d.getCabin).toPickList).asPredictor

val embarked = FeatureBuilder.PickList[Passenger].extract(d => Option(d.getEmbarked).toPickList).asPredictor

}