More refactoring, #8 #15
vsuthichai committed Aug 3, 2016
1 parent e4bf1e7 commit c5e152a
Showing 30 changed files with 336 additions and 198 deletions.
10 changes: 10 additions & 0 deletions core/src/main/scala/com/eharmony/spotz/Preamble.scala
@@ -1,12 +1,22 @@
package com.eharmony.spotz

import com.eharmony.spotz.optimizer.SamplerFunction

import scala.math.Ordering
import scala.language.implicitConversions

/**
* @author vsuthichai
*/
object Preamble {
// type SamplerFunctionOrIterable = SamplerFunction[_] with Iterable[_]

type neg[A] = A => Nothing
type union[T, U] = neg[neg[T] with neg[U]]
type doubleNeg[A] = neg[neg[A]]

type SamplerFunctionOrIterable = union[SamplerFunction[_], Iterable[_]]

implicit object PointLossOrdering extends Ordering[(Point, Double)] {
override def compare(x: (Point, Double), y: (Point, Double)): Int = {
if (x._2 > y._2) 1
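A note on the union-type encoding added to Preamble above: neg, union, and doubleNeg are the usual Curry-Howard trick for expressing "A or B" as a bound the compiler can check. A minimal sketch of how SamplerFunctionOrIterable might be used as an evidence bound follows; the describe helper and UnionTypeSketch object are hypothetical, not part of the commit.

import com.eharmony.spotz.Preamble._
import com.eharmony.spotz.optimizer.SamplerFunction

object UnionTypeSketch {
  // Hypothetical helper: compiles only when T is a SamplerFunction or an Iterable.
  def describe[T](value: T)(implicit ev: doubleNeg[T] <:< SamplerFunctionOrIterable): String =
    value match {
      case s: SamplerFunction[_] => s"sampler function: $s"
      case i: Iterable[_]        => s"iterable with ${i.size} elements"
    }

  describe(Seq(1, 2, 3))      // compiles: Seq is an Iterable
  // describe(42)             // rejected: Int is neither a SamplerFunction nor an Iterable
}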
@@ -9,14 +9,14 @@ import scala.reflect.ClassTag
* @author vsuthichai
*/
trait BackendFunctions {
def bestRandomPoint[P, L](startIndex: Long,
batchSize: Long,
objective: Objective[P, L],
space: RandomSpace[P],
reducer: ((P, L), (P, L)) => (P, L)): (P, L)
protected def bestRandomPoint[P, L](startIndex: Long,
batchSize: Long,
objective: Objective[P, L],
space: RandomSpace[P],
reducer: ((P, L), (P, L)) => (P, L)): (P, L)

def bestPointAndLoss[P, L](gridPoints: Seq[P],
objective: Objective[P, L],
reducer: ((P, L), (P, L)) => (P, L))
(implicit c: ClassTag[P], p: ClassTag[L]): (P, L)
protected def bestPointAndLoss[P, L](gridPoints: Seq[P],
objective: Objective[P, L],
reducer: ((P, L), (P, L)) => (P, L))
(implicit c: ClassTag[P], p: ClassTag[L]): (P, L)
}
@@ -9,11 +9,11 @@ import scala.reflect.ClassTag
* @author vsuthichai
*/
trait ParallelFunctions extends BackendFunctions {
override def bestRandomPoint[P, L](startIndex: Long,
batchSize: Long,
objective: Objective[P, L],
space: RandomSpace[P],
reducer: ((P, L), (P, L)) => (P, L)): (P, L) = {
protected override def bestRandomPoint[P, L](startIndex: Long,
batchSize: Long,
objective: Objective[P, L],
space: RandomSpace[P],
reducer: ((P, L), (P, L)) => (P, L)): (P, L) = {
val pointsAndLosses = (startIndex to (startIndex + batchSize)).par.map { trial =>
val rngModifiedSpace = space.setSeed(space.seed + trial)
val point = rngModifiedSpace.sample
@@ -23,10 +23,10 @@ trait ParallelFunctions extends BackendFunctions {
pointsAndLosses.reduce(reducer)
}

override def bestPointAndLoss[P, L](gridPoints: Seq[P],
objective: Objective[P, L],
reducer: ((P, L), (P, L)) => (P, L))
(implicit c: ClassTag[P], p: ClassTag[L]): (P, L) = {
protected override def bestPointAndLoss[P, L](gridPoints: Seq[P],
objective: Objective[P, L],
reducer: ((P, L), (P, L)) => (P, L))
(implicit c: ClassTag[P], p: ClassTag[L]): (P, L) = {
gridPoints.par.map(point => (point, objective(point))).reduce(reducer)
}
}
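Because bestRandomPoint and bestPointAndLoss are now protected, they can only be driven from inside a subtype. A rough sketch of what that looks like; the ParallelSketch object, the toy objective, and the reducer are illustrative only, and Objective's apply signature (apply(point) returning the loss) is an assumption.

import com.eharmony.spotz.backend.ParallelFunctions
import com.eharmony.spotz.objective.Objective

// Illustrative driver: the protected helpers are reachable only from a subtype.
object ParallelSketch extends ParallelFunctions {
  def run(): (Int, Double) = {
    // Assumed Objective contract: apply(point) returns the loss.
    val objective = new Objective[Int, Double] {
      override def apply(point: Int): Double = (point - 3.0) * (point - 3.0)
    }
    // Keep whichever (point, loss) pair has the smaller loss.
    val minReducer: ((Int, Double), (Int, Double)) => (Int, Double) =
      (a, b) => if (a._2 <= b._2) a else b

    bestPointAndLoss(Seq(1, 2, 3, 4, 5), objective, minReducer)  // expected (3, 0.0)
  }
}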
@@ -37,7 +37,7 @@ trait SparkFunctions extends BackendFunctions {
* generated
* @return the best point with the best loss
*/
override def bestRandomPoint[P, L](startIndex: Long,
protected[backend] override def bestRandomPoint[P, L](startIndex: Long,
batchSize: Long,
objective: Objective[P, L],
space: RandomSpace[P],
@@ -58,7 +58,7 @@
pointAndLossRDD.reduce(reducer)
}

override def bestPointAndLoss[P, L](gridPoints: Seq[P],
protected[backend] override def bestPointAndLoss[P, L](gridPoints: Seq[P],
objective: Objective[P, L],
reducer: ((P, L), (P, L)) => (P, L))
(implicit c: ClassTag[P], p: ClassTag[L]): (P, L) = {
@@ -5,16 +5,14 @@ import scala.util.Random
/**
* @author vsuthichai
*/
trait SamplerFunction[T] extends Serializable {
def apply(rng: Random): T
}

trait RandomSampler[T] extends SamplerFunction[T]
trait SamplerFunction[T] extends Serializable

abstract class Uniform[T](lb: T, ub: T) extends RandomSampler[T] {
abstract class RandomSampler[T] extends SamplerFunction[T] {
def apply(rng: Random): T
}

abstract class Uniform[T](lb: T, ub: T) extends RandomSampler[T]

case class UniformDouble(lb: Double, ub: Double) extends Uniform[Double](lb, ub) {
override def apply(rng: Random): Double = lb + ((ub - lb) * rng.nextDouble)
}
@@ -23,7 +21,13 @@ case class UniformInt(lb: Int, ub: Int) extends Uniform[Int](lb, ub) {
override def apply(rng: Random): Int = lb + rng.nextInt(ub - lb)
}

case class IterableSampler[T](iterable: Iterable[T]) extends RandomSampler[T] {
case class RandomChoice[T](iterable: Iterable[T]) extends RandomSampler[T] {
val values = iterable.toSeq

override def apply(rng: Random): T = values(rng.nextInt(values.length))
}

case class Grid[T](iterable: Iterable[T]) extends RandomSampler[T] {
val values = iterable.toSeq

override def apply(rng: Random): T = values(rng.nextInt(values.length))
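The renamed samplers are plain functions of a scala.util.Random, so they can be exercised directly. A quick sketch, assuming the sampler case classes live in com.eharmony.spotz.optimizer alongside SamplerFunction (the parameter names are arbitrary):

import scala.util.Random
import com.eharmony.spotz.optimizer.{RandomChoice, UniformDouble, UniformInt}

object SamplerSketch extends App {
  val rng = new Random(42)

  val learningRate = UniformDouble(0.001, 0.1)(rng)           // a Double drawn from [0.001, 0.1)
  val maxDepth     = UniformInt(2, 10)(rng)                    // an Int drawn from [2, 10)
  val kernel       = RandomChoice(Seq("rbf", "linear"))(rng)   // one of the listed choices

  println(s"$learningRate $maxDepth $kernel")
}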
36 changes: 25 additions & 11 deletions core/src/main/scala/com/eharmony/spotz/optimizer/Optimizer.scala
@@ -8,17 +8,20 @@ import scala.reflect.ClassTag
/**
* @author vsuthichai
*/
trait Optimizer[P, L, -S <: Space[P], +R <: OptimizerResult[P, L]] extends Serializable {
def minimize(objective: Objective[P, L], space: S)(implicit c: ClassTag[P], p: ClassTag[L]): R
def maximize(objective: Objective[P, L], space: S)(implicit c: ClassTag[P], p: ClassTag[L]): R
trait Optimizer[P, L, +R] extends Serializable {
def minimize(objective: Objective[P, L])(implicit c: ClassTag[P], p: ClassTag[L]): R
def maximize(objective: Objective[P, L])(implicit c: ClassTag[P], p: ClassTag[L]): R
}

/*
trait AbstractOptimizer[P, L, -S <: Space[P], +R <: OptimizerResult[P, L]] extends Optimizer[P, L, S, R] {
type Reducer[T] = (T, T) => T
implicit val ord: Ordering[(P, L)]
def min(p1: (P,L), p2: (P,L))(implicit ord: Ordering[(P,L)]): (P,L) = ord.min(p1, p2)
def max(p1: (P,L), p2: (P,L))(implicit ord: Ordering[(P,L)]): (P,L) = ord.max(p1, p2)
protected def min(p1: (P,L), p2: (P,L))(implicit ord: Ordering[(P,L)]): (P,L) = ord.min(p1, p2)
protected def max(p1: (P,L), p2: (P,L))(implicit ord: Ordering[(P,L)]): (P,L) = ord.max(p1, p2)
protected def optimize(objective: Objective[P, L], space: S, reducer: Reducer[(P, L)])
(implicit c: ClassTag[P], p: ClassTag[L]): R
override def minimize(objective: Objective[P, L], space: S)(implicit c: ClassTag[P], p: ClassTag[L]): R = {
optimize(objective, space, min)
@@ -27,14 +30,25 @@ trait AbstractOptimizer[P, L, -S <: Space[P], +R <: OptimizerResult[P, L]] exten
override def maximize(objective: Objective[P, L], space: S)(implicit c: ClassTag[P], p: ClassTag[L]): R = {
optimize(objective, space, max)
}
}
*/

protected def optimize(objective: Objective[P, L], space: S, reducer: Reducer[(P, L)])
trait AbstractOptimizer[P, L, R <: OptimizerResult[P, L]] extends Optimizer[P, L, R] {
type Reducer[T] = (T, T) => T
implicit val ord: Ordering[(P, L)]

protected def min(p1: (P,L), p2: (P,L))(implicit ord: Ordering[(P,L)]): (P,L) = ord.min(p1, p2)
protected def max(p1: (P,L), p2: (P,L))(implicit ord: Ordering[(P,L)]): (P,L) = ord.max(p1, p2)
protected def optimize(objective: Objective[P, L], reducer: Reducer[(P, L)])
(implicit c: ClassTag[P], p: ClassTag[L]): R
}

abstract class OptimizerResult[P, L](bestPoint: P, bestLoss: L)
override def minimize(objective: Objective[P, L])(implicit c: ClassTag[P], p: ClassTag[L]): R = {
optimize(objective, min)
}

trait Space[P] extends Serializable {
def sample: P
def sample(howMany: Int): Iterable[P] = Seq.fill(howMany)(sample)
override def maximize(objective: Objective[P, L])(implicit c: ClassTag[P], p: ClassTag[L]): R = {
optimize(objective, max)
}
}

abstract class OptimizerResult[P, L](bestPoint: P, bestLoss: L)
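To see what the slimmed-down contract asks of an implementor, here is a deliberately trivial optimizer sketched against the new AbstractOptimizer. Everything named here is hypothetical, and Objective's apply signature is assumed; the point is just the three obligations: an Ordering[(P, L)], a result type, and a protected optimize.

import com.eharmony.spotz.objective.Objective
import com.eharmony.spotz.optimizer.{AbstractOptimizer, OptimizerResult}

import scala.reflect.ClassTag

// Hypothetical result type carrying just the winning pair.
class SingleResult[P, L](bestPoint: P, bestLoss: L)
  extends OptimizerResult[P, L](bestPoint, bestLoss)

// Hypothetical optimizer that evaluates exactly one fixed point.
class SinglePointOptimizer[P, L](point: P)(implicit val ord: Ordering[(P, L)])
  extends AbstractOptimizer[P, L, SingleResult[P, L]] {

  override protected def optimize(objective: Objective[P, L], reducer: Reducer[(P, L)])
                                 (implicit c: ClassTag[P], p: ClassTag[L]): SingleResult[P, L] =
    new SingleResult(point, objective(point))
}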
@@ -0,0 +1,13 @@
package com.eharmony.spotz.optimizer

/**
* Created by vsuthichai on 8/2/16.
*/
object OptimizerConstants {
val RANDOM_SEARCH = "random"
val GRID_SEARCH = "grid"


val SPARK_BACKEND = "spark"
val THREADS_BACKEND = "threads"
}
9 changes: 9 additions & 0 deletions core/src/main/scala/com/eharmony/spotz/optimizer/Space.scala
@@ -0,0 +1,9 @@
package com.eharmony.spotz.optimizer

/**
* @author vsuthichai
*/
trait Space[P] extends Serializable {
def sample: P
def sample(howMany: Int): Iterable[P] = Seq.fill(howMany)(sample)
}
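The Space trait extracted into its own file above is small; any source of points will do. A hypothetical in-memory implementation, just to show the contract:

import scala.util.Random
import com.eharmony.spotz.optimizer.Space

// Hypothetical Space backed by a fixed pool of candidate points.
class SeqSpace[P](points: IndexedSeq[P], seed: Long = 0L) extends Space[P] {
  private val rng = new Random(seed)
  override def sample: P = points(rng.nextInt(points.length))
}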
@@ -14,6 +14,9 @@ sealed trait StopStrategy extends Serializable {
def shouldStop(trialsSoFar: Long, timeSinceFirstTrial: Duration): Boolean
}

// TODO
case class StopContext[P, L](foo: Any)

class MaxTrialsStop(maxTrials: Long) extends StopStrategy {
assert(maxTrials > 0, "Must specify greater than 0 trials.")
override def getMaxTrials: Long = maxTrials
@@ -38,13 +41,22 @@ class MaxTrialsOrMaxDurationStop(maxTrials: Long, maxDuration: Duration) extends
}
}

object OptimizerFinishes extends StopStrategy {
override def shouldStop(trialsSoFar: Long, durationSinceFirstTrial: Duration): Boolean = false
}

class StopStrategyPredicate[P, L](f: (StopContext[P, L]) => Boolean) {
def shouldStop(stopContext: StopContext[P, L]) = f(stopContext)
}

/**
* Companion factory object to instantiate various stop strategies.
*/
* Companion factory object to instantiate various stop strategies.
*/
object StopStrategy {
def stopAfterMaxDuration(maxDuration: Duration): TimedStop = new TimedStop(maxDuration)
def stopAfterMaxTrials(maxTrials: Long): MaxTrialsStop = new MaxTrialsStop(maxTrials)
def stopAfterMaxTrialsOrMaxDuration(maxTrials: Long, maxDuration: Duration): MaxTrialsOrMaxDurationStop = {
def stopAfterMaxDuration(maxDuration: Duration): StopStrategy = new TimedStop(maxDuration)
def stopAfterMaxTrials(maxTrials: Long): StopStrategy = new MaxTrialsStop(maxTrials)
def stopAfterMaxTrialsOrMaxDuration(maxTrials: Long, maxDuration: Duration): StopStrategy = {
new MaxTrialsOrMaxDurationStop(maxTrials, maxDuration)
}
def stopWhenOptimizerFinishes: StopStrategy = OptimizerFinishes
}
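Note that the factory methods now return the base StopStrategy type rather than the concrete classes. A short usage sketch, with the StopStrategy package assumed and arbitrary bounds:

import org.joda.time.Duration
import com.eharmony.spotz.optimizer.StopStrategy  // package assumed

object StopStrategySketch extends App {
  val stop: StopStrategy =
    StopStrategy.stopAfterMaxTrialsOrMaxDuration(10000, Duration.standardMinutes(30))

  // Neither bound has been reached yet, so this should report false.
  println(stop.shouldStop(2500, Duration.standardMinutes(5)))

  // Never stops on its own; the optimizer decides when it is done.
  val never: StopStrategy = StopStrategy.stopWhenOptimizerFinishes
  println(never.shouldStop(Long.MaxValue, Duration.standardDays(365)))  // false
}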
@@ -3,6 +3,7 @@ package com.eharmony.spotz.optimizer.grid
import com.eharmony.spotz.backend.{BackendFunctions, ParallelFunctions, SparkFunctions}
import com.eharmony.spotz.objective.Objective
import com.eharmony.spotz.optimizer.AbstractOptimizer
import com.eharmony.spotz.util.{DurationUtils, Logger}
import org.apache.spark.SparkContext
import org.joda.time.{DateTime, Duration}

@@ -14,16 +15,27 @@ import scala.reflect.ClassTag
* @author vsuthichai
*/
abstract class GridSearch[P, L]
(trialBatchSize: Int = 100)
(implicit val ord: Ordering[(P, L)])
extends AbstractOptimizer[P, L, GridSpace[P], GridSearchResult[P, L]]
(paramSpace: Map[String, Iterable[_]], trialBatchSize: Int)
(implicit ord: Ordering[(P, L)], factory: Map[String, _] => P)
extends AbstractOptimizer[P, L, GridSearchResult[P, L]]
with BackendFunctions {

override def optimize(objective: Objective[P, L],
space: GridSpace[P],
reducer: Reducer[(P, L)])
(implicit c: ClassTag[P], p: ClassTag[L]): GridSearchResult[P, L] = {
val LOG = Logger[this.type]()

def minimize(objective: Objective[P, L], space: Map[String, Iterable[_]])
(implicit c: ClassTag[P], p: ClassTag[L]): GridSearchResult[P, L] = {
optimize(objective, min)
}

def maximize(objective: Objective[P, L], space: Map[String, Iterable[_]])
(implicit c: ClassTag[P], p: ClassTag[L]): GridSearchResult[P, L] = {
optimize(objective, max)
}

override protected def optimize(objective: Objective[P, L],
reducer: Reducer[(P, L)])
(implicit c: ClassTag[P], p: ClassTag[L]): GridSearchResult[P, L] = {
val space = new GridSpace[P](paramSpace)
val startTime = DateTime.now()
val firstPoint = space.sample
val firstLoss = objective(firstPoint)
@@ -33,26 +45,31 @@ abstract class GridSearch[P, L]
}

@tailrec
private[this] def gridSearch(objective: Objective[P, L], space: GridSpace[P], reducer: Reducer[(P, L)],
startTime: DateTime, bestPointSoFar: P, bestLossSoFar: L, trialsSoFar: Long)
(implicit c: ClassTag[P], p: ClassTag[L]): GridSearchResult[P, L] = {
private def gridSearch(objective: Objective[P, L], space: GridSpace[P], reducer: Reducer[(P, L)],
startTime: DateTime, bestPointSoFar: P, bestLossSoFar: L, trialsSoFar: Long)
(implicit c: ClassTag[P], p: ClassTag[L]): GridSearchResult[P, L] = {

val endTime = DateTime.now()
val elapsedTime = new Duration(startTime, endTime)

LOG.info(s"Best point and loss after $trialsSoFar trials and ${DurationUtils.format(elapsedTime)} : $bestPointSoFar loss: $bestLossSoFar")

space.isExhausted match {
case true =>
GridSearchResult(bestPointSoFar, bestLossSoFar, startTime, endTime, trialsSoFar, elapsedTime)
case false =>
val points = space.sample(trialBatchSize)
val (bestPoint, bestLoss) = reducer((bestPointSoFar, bestLossSoFar), bestPointAndLoss(points.toSeq, objective, reducer))
gridSearch(objective, space, reducer, startTime, bestPoint, bestLoss, trialsSoFar)

gridSearch(objective, space, reducer, startTime, bestPoint, bestLoss, trialsSoFar + points.size)
}
}
}

class ParGridSearch[P, L](trialBatchSize: Int = 100)(implicit ord: Ordering[(P, L)])
extends GridSearch[P, L](trialBatchSize)(ord) with ParallelFunctions
class ParGridSearch[P, L](paramSpace: Map[String, Iterable[_]], trialBatchSize: Int = 100000)
(implicit val ord: Ordering[(P, L)], factory: Map[String, _] => P)
extends GridSearch[P, L](paramSpace, trialBatchSize)(ord, factory) with ParallelFunctions

class SparkGridSearch[P, L](@transient val sc: SparkContext, trialBatchSize: Int = 100)(implicit ord: Ordering[(P, L)])
extends GridSearch[P, L](trialBatchSize)(ord) with SparkFunctions
class SparkGridSearch[P, L](@transient val sc: SparkContext, paramSpace: Map[String, Iterable[_]], trialBatchSize: Int = 100000)
(implicit val ord: Ordering[(P, L)], factory: Map[String, _] => P)
extends GridSearch[P, L](paramSpace, trialBatchSize)(ord, factory) with SparkFunctions
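With the parameter space moved into the constructor and the point factory taken implicitly, driving the grid search end to end might look roughly like this. The Params type, its factory, and Objective's apply signature are assumptions, and the diff does not show how GridSpace consumes the factory; the real Preamble may supply its own Point factory instead.

import com.eharmony.spotz.objective.Objective
import com.eharmony.spotz.optimizer.grid.ParGridSearch

object GridSearchSketch extends App {
  // Hypothetical point type and factory.
  case class Params(alpha: Double, lambda: Double)

  implicit val factory: Map[String, _] => Params =
    m => Params(m("alpha").asInstanceOf[Double], m("lambda").asInstanceOf[Double])

  // Order candidate (point, loss) pairs by loss.
  implicit val ord: Ordering[(Params, Double)] = Ordering.by[(Params, Double), Double](_._2)

  // Assumed Objective contract: apply(point) returns the loss.
  val objective = new Objective[Params, Double] {
    override def apply(p: Params): Double = math.pow(p.alpha - 0.05, 2) + p.lambda
  }

  val paramSpace: Map[String, Iterable[_]] = Map(
    "alpha"  -> Seq(0.01, 0.05, 0.1),
    "lambda" -> Seq(0.0, 0.5, 1.0)
  )

  val result = new ParGridSearch[Params, Double](paramSpace).minimize(objective)
  println(result)
}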
@@ -1,7 +1,7 @@
package com.eharmony.spotz.optimizer.grid

import com.eharmony.spotz.optimizer.OptimizerResult
import org.joda.time.format.PeriodFormatterBuilder
import com.eharmony.spotz.util.DurationUtils
import org.joda.time.{DateTime, Duration}

/**
@@ -17,15 +17,7 @@ case class GridSearchResult[P, L](
extends OptimizerResult[P, L](bestPoint, bestLoss) {

override def toString = {
val formatter = new PeriodFormatterBuilder()
.appendDays().appendSuffix("d")
.appendHours().appendSuffix("h")
.appendMinutes().appendSuffix("m")
.appendSeconds().appendSuffix("s")
.appendMillis().appendSuffix("ms")
.toFormatter

s"GridSearchResult(bestPoint=$bestPoint, bestLoss=$bestLoss, " +
s"totalTrials=$totalTrials, duration=${formatter.print(elapsedTime.toPeriod)}"
s"totalTrials=$totalTrials, duration=${DurationUtils.format(elapsedTime)}"
}
}
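DurationUtils.format itself is not shown in this diff; a plausible implementation, reusing the PeriodFormatterBuilder chain that this commit removes from GridSearchResult, would be:

import org.joda.time.Duration
import org.joda.time.format.PeriodFormatterBuilder

object DurationUtils {
  private val formatter = new PeriodFormatterBuilder()
    .appendDays().appendSuffix("d")
    .appendHours().appendSuffix("h")
    .appendMinutes().appendSuffix("m")
    .appendSeconds().appendSuffix("s")
    .appendMillis().appendSuffix("ms")
    .toFormatter

  // Renders an elapsed Joda Duration using the d/h/m/s/ms suffixes above.
  def format(elapsed: Duration): String = formatter.print(elapsed.toPeriod)
}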