[FEATURE] Developer Tools #59

Open · wants to merge 5 commits into base: master
2 changes: 2 additions & 0 deletions README.md
@@ -244,4 +244,6 @@ registerKerasImageUDF("my_keras_inception_udf", InceptionV3(weights="imagenet"),
### Estimator

## Releases:
* 0.2.x Features
  - Developer tools
* 0.1.0 initial release
249 changes: 249 additions & 0 deletions bin/totgen.sh
@@ -0,0 +1,249 @@
#!/bin/bash

#######################################################
# Create the list of necessary environment variables
scala_major_ver=2.11
package_name="sparkdl"
######################################################

set -eu

# The project root directory (the parent of this script's directory).
_bsd_="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.."

function log_info { >&2 echo "$(tput setaf 6)INFO: $@$(tput sgr0)"; }
function quit_with { >&2 echo "$(tput setaf 1)ERROR: $@$(tput sgr0)"; exit 1; }

[[ -n "${SPARK_HOME:-}" ]] || \
quit_with "must provide Spark home"

host_spark_home="${SPARK_HOME:-}"

# Generate required classpaths
[[ -x "${_bsd_}/sbt" ]] || \
quit_with "cannot locate runnable project sbt executable"

sbt_path_root="${_bsd_}/.sbt.paths"
function tr_classpath { perl -pe "s@~@$HOME@g" "${sbt_path_root}/$@"; }
function mk_classpath { local IFS=':'; echo "$*"; }

# Spark packages are cached in local ivy cache
(cd "${_bsd_}"
#./sbt genClasspath assembly
#./sbt genClasspath spPackage
./sbt genClasspath spDist
cd "${sbt_path_root}"
rm -f SPARK_PACKAGE_PYREQ && touch $_
for spkg in $(cat SBT_SPARK_PACKAGE_CLASSPATH | tr ':' '\n'); do
log_info "[py-deps]: ${spkg}"
printf "\n# BEGIN: $(basename $spkg)\n" >> SPARK_PACKAGE_PYREQ
spkg_pyreq="$(jar -tf "${spkg}" | grep -Ei 'requirements.txt' || echo 'none')"
if [[ "none" != "${spkg_pyreq}" ]]; then
unzip -p "${spkg}" "${spkg_pyreq}" >> SPARK_PACKAGE_PYREQ
else
log_info "didn't detect requirements.txt"
fi
printf "\n# END: $(basename $spkg)\n" >> SPARK_PACKAGE_PYREQ
done
perl -pe "s@${HOME}@~@g" SBT_SPARK_PACKAGE_CLASSPATH > SPARK_PACKAGE_CLASSPATH
perl -pe "s@${HOME}@~@g" SBT_RUNTIME_CLASSPATH > SPARK_RUNTIME_EXTRA_CLASSPATH
rm -f SBT_*_CLASSPATH
)
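# At this point "${sbt_path_root}" contains:
#   SPARK_PACKAGE_PYREQ            - requirements.txt entries extracted from the Spark package jars
#   SPARK_PACKAGE_CLASSPATH        - Spark package jars, with ${HOME} abbreviated to ~
#   SPARK_RUNTIME_EXTRA_CLASSPATH  - extra runtime classpath, with ${HOME} abbreviated to ~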

# Find the locally built Spark package zip, if any (the assembly jar lookup is kept commented out)
_sbt_target_path="${_bsd_}/target/scala-${scala_major_ver}"
#assembly_jar="$(find ${_sbt_target_path} -name "*-assembly*.jar" -type f | uniq)"
spkg_zip="$(find ${_sbt_target_path}/.. -name "*${scala_major_ver}.zip" -type f | uniq)"

# Set up python paths
_proj_pypath="${_bsd_}"/python
_spark_pypath="${SPARK_HOME}"/python:"$(find "${SPARK_HOME}"/python/lib/ -name 'py4j-*-src.zip' -type f | uniq)"
_spark_pkg_path="$(tr_classpath SPARK_PACKAGE_CLASSPATH)"
_spark_pkg_pyreq="${sbt_path_root}/SPARK_PACKAGE_PYREQ"

# Note: spark-submit expects comma-separated lists for --jars/--py-files (classpaths use colons)
#_spark_pkg_submit_common="${assembly_jar},${_spark_pkg_path}"
#_spark_pkg_submit_common="${_spark_pkg_path}"
_spark_pkg_submit_common="${spkg_zip},${_spark_pkg_path}"

_submit_py_files="${_spark_pkg_submit_common}"
_submit_jars="${_spark_pkg_submit_common}"

log_info "[spark submit] --jars ${_submit_jars}"
log_info "[spark submit] --py-files ${_submit_py_files}"

# Provide the required python and jar files for the Spark testing cluster
# Create individual scripts

#######################################################
# PySpark
#######################################################

SCPT="${_bsd_}/.EXEC_SCRIPT"

function SCPT_BEGIN {
rm -f "${SCPT}" && touch $_ && chmod +x $_
cat << '_SCPT_HEADER_EOF_' >> "${SCPT}"
#!/usr/bin/env bash
##%%----
## Generated automatically
##%%----

_bsd_="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

function quit_with { >&2 echo "ERROR: $@"; exit 1; }
function check_vars {
for _varname in ${@}; do
local _var="$(eval "echo \$${_varname}")"
[[ -n "${_var}" ]] || quit_with "${_varname} not defined"
done
}

_SCPT_HEADER_EOF_

cat << _SCPT_HEADER_VAR_EOF_ >> "${SCPT}"
##%%----
# Global variables

spark_pkg_path="${_spark_pkg_path}"

##%%----
_SCPT_HEADER_VAR_EOF_
}

function SCPT_PYSPARK_BODY {
cat << _SCPT_LOCAL_VAR_EOF_ >> "${SCPT}"
##%%----
# Local variables

_proj_pypath="${_proj_pypath}"
_spark_pypath="${_spark_pypath}"
_spark_pkg_pypath="${_spark_pkg_path}"
_submit_jars="${_submit_jars}"
_submit_py_files="${_submit_py_files}"

##%%----
_SCPT_LOCAL_VAR_EOF_

cat << '_EXEC_SCRIPT_EOF_' >> "${SCPT}"
check_vars _py _ipy _pyspark

_local_pypath="$(${_py} -c 'import site; print(site.USER_SITE)')"

export PYSPARK_PYTHON=${_py}
export PYSPARK_DRIVER_PYTHON=${_ipy}
export PYSPARK_DRIVER_PYTHON_OPTS="-i --simple-prompt --pprint"
# We should only be using the assembly to ensure consistency
# REPL-based development can always reload individual pieces
export PYTHONPATH="${_proj_pypath}:${_local_pypath}:${_spark_pypath}:${_spark_pkg_pypath}"
#export PYTHONPATH="${_local_pypath}:${_spark_pypath}:${_spark_pkg_pypath}"

exec "${_pyspark}" \
--master "local[4]" \
--conf spark.app.name="[drgscl]::pyspark" \
--conf spark.eventLog.enabled=false \
--conf spark.driver.memory=10g \
--conf spark.executor.memory=10g \
--py-files "${_submit_py_files}" \
--jars "${_submit_jars}" \
--verbose \
"$@"

_EXEC_SCRIPT_EOF_
}


function SCPT_SPARK_SHELL_BODY {
cat << '_EXEC_SCRIPT_EOF_' >> "${SCPT}"
check_vars _spark_shell
check_vars _submit_jars

exec "${_spark_shell}" \
--master "local[4]" \
--conf spark.app.name="[drgscl]::spark-shell" \
--conf spark.eventLog.enabled=false \
--conf spark.driver.memory=10g \
--conf spark.executor.memory=10g \
--jars "${_submit_jars}" \
--verbose \
"$@"

_EXEC_SCRIPT_EOF_
}

#######################################################
# Documentation
#######################################################

function gen_jekyll {
SCPT_BEGIN
cat << _SCPT_LOCAL_VAR_EOF_ >> "${SCPT}"
##%%----
# Local variables

this_package="${package_name}"
_proj_pypath="${_proj_pypath}"
_spark_pypath="${_spark_pypath}"
_spark_pkg_pypath="${_spark_pkg_path}"
_spark_pkg_prereqs="${_spark_pkg_pyreq}"
export SPARK_HOME="${host_spark_home}"
_py="$(which python)"
_pip="$(which pip)"

##%%----
_SCPT_LOCAL_VAR_EOF_

cat << '_EXEC_SCRIPT_EOF_' >> "${SCPT}"
check_vars _py _pip
_local_pypath="$(${_py} -c 'import site; print(site.USER_SITE)')"
export PYTHONPATH="${_local_pypath}:${_proj_pypath}:${_spark_pypath}:${_spark_pkg_pypath}"

pip install --user -r "${_spark_pkg_prereqs}"

(cd ${_bsd_}/python && sphinx-apidoc -f -o docs ${this_package})

pushd "${_bsd_}/docs"
jekyll "$@"
popd

_EXEC_SCRIPT_EOF_

mv "${SCPT}" ${_bsd_}/.jekyll
}

function gen_py2_spark_shell {
SCPT_BEGIN
cat << _SCPT_VAR_EOF_ >> "${SCPT}"
_py="$(which python2)"
_ipy="$(which ipython2)"
_pyspark="${SPARK_HOME}"/bin/pyspark
_SCPT_VAR_EOF_
SCPT_PYSPARK_BODY
mv "${SCPT}" ${_bsd_}/.py2.spark.shell
}

function gen_py3_spark_shell {
SCPT_BEGIN
cat << _SCPT_VAR_EOF_ >> "${SCPT}"
_py="$(which python3)"
_ipy="$(which ipython3)"
_pyspark="${SPARK_HOME}"/bin/pyspark
_SCPT_VAR_EOF_
SCPT_PYSPARK_BODY
mv "${SCPT}" ${_bsd_}/.py3.spark.shell
}

function gen_spark_shell {
SCPT_BEGIN
cat << _SCPT_VAR_EOF_ >> "${SCPT}"
_spark_shell="${SPARK_HOME}"/bin/spark-shell
_submit_jars="${_submit_jars}"
_SCPT_VAR_EOF_
SCPT_SPARK_SHELL_BODY
mv "${SCPT}" ${_bsd_}/.spark.shell
}

gen_py2_spark_shell
gen_py3_spark_shell
gen_spark_shell
gen_jekyll
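A minimal usage sketch for the generated wrappers, assuming `SPARK_HOME` points at a local Spark installation and the project's `./sbt` launcher is present (the script writes `.py2.spark.shell`, `.py3.spark.shell`, `.spark.shell`, and `.jekyll` into the repository root):

```bash
# Generate the wrapper scripts (runs ./sbt genClasspath spDist under the hood)
SPARK_HOME=/path/to/spark ./bin/totgen.sh

# Launch an IPython-driven PySpark session with the package's jars and py-files attached
./.py3.spark.shell

# Build the docs; extra arguments are passed straight to jekyll
./.jekyll build
```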
16 changes: 3 additions & 13 deletions build.sbt
@@ -1,16 +1,6 @@
// Your sbt build file. Guides on how to write one can be found at
// http://www.scala-sbt.org/0.13/docs/index.html

val sparkVer = sys.props.getOrElse("spark.version", "2.1.1")
val sparkBranch = sparkVer.substring(0, 3)
val defaultScalaVer = sparkBranch match {
case "2.0" => "2.11.8"
case "2.1" => "2.11.8"
case "2.2" => "2.11.8"
case _ => throw new IllegalArgumentException(s"Unsupported Spark version: $sparkVer.")
}
val scalaVer = sys.props.getOrElse("scala.version", defaultScalaVer)
val scalaMajorVersion = scalaVer.substring(0, scalaVer.indexOf(".", scalaVer.indexOf(".") + 1))
import libdeps.LibVers._

sparkVersion := sparkVer

@@ -19,7 +9,7 @@ scalaVersion := scalaVer
spName := "databricks/spark-deep-learning"

// Don't forget to set the version
version := s"0.1.0-spark$sparkBranch"
version := s"0.2.0-spark$sparkBranch"

// All Spark Packages need a license
licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
@@ -34,7 +24,7 @@ sparkComponents ++= Seq("mllib-local", "mllib", "sql")

// add any Spark Package dependencies using spDependencies.
// e.g. spDependencies += "databricks/spark-avro:0.1"
spDependencies += s"databricks/tensorframes:0.2.9-s_${scalaMajorVersion}"
spDependencies += s"databricks/tensorframes:0.2.9-s_${scalaMajorVer}"

// These versions are ancient, but they cross-compile around scala 2.10 and 2.11.
// Update them when dropping support for scala 2.10
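A hedged sketch of building against a specific Spark version; the `spark.version` system property comes from the pre-refactor `build.sbt`, so whether it is still honored after the move to `libdeps.LibVers` is an assumption:

```bash
# Build the Spark package distribution for Spark 2.2 (sketch; assumes ./sbt forwards -D properties to the JVM)
./sbt -Dspark.version=2.2.0 spDist
```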
58 changes: 58 additions & 0 deletions docker/Dockerfile
@@ -0,0 +1,58 @@
FROM ubuntu:16.04

# Pick up some TF dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
libfreetype6-dev \
libpng12-dev \
libzmq3-dev \
pkg-config \
python \
python-dev \
rsync \
software-properties-common \
unzip \
openjdk-8-jdk \
openjdk-8-jre-headless \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

RUN rm /bin/sh && ln -s /bin/bash /bin/sh

# Please don't set DEBIAN_FRONTEND.
# Checkout "Why is DEBIAN_FRONTEND=noninteractive discouraged in Dockerfiles?"
# from https://docs.docker.com/engine/faq/ for details.
# ENV DEBIAN_FRONTEND noninteractive

RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py

RUN pip --no-cache-dir install \
Pillow \
h5py \
numpy \
pandas \
scipy \
sklearn \
ipython \
pyspark

# Install a nightly TensorFlow CPU build
RUN pip --no-cache-dir install \
'http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.head-cp27-cp27mu-linux_x86_64.whl'

ENV SPARK_WORKER_PORT 8888

EXPOSE 4040 6066 7077 8080 8888

# Avoid the default Docker behavior of mapping our IP address to an unreachable host name

RUN mkdir -p /phi9t
COPY entrypoint.sh /phi9t/.

WORKDIR /workspace

ENTRYPOINT ["/phi9t/entrypoint.sh"]
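A minimal build-and-inspect sketch; the image tag `phi9t/spark-dev` is hypothetical:

```bash
# Build the development image from the docker/ directory
docker build -t phi9t/spark-dev docker/

# List the exposed ports baked into the image (4040 6066 7077 8080 8888)
docker inspect --format '{{json .Config.ExposedPorts}}' phi9t/spark-dev
```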
11 changes: 11 additions & 0 deletions docker/entrypoint.sh
@@ -0,0 +1,11 @@
#!/bin/bash

IP_ADDR="$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }')"
echo "Container IP Address: $IP_ADDR"
#export MASTER="spark://${IP_ADDR}:7077"
export SPARK_LOCAL_IP="${IP_ADDR}"
export SPARK_PUBLIC_DNS="${IP_ADDR}"

umount /etc/hosts

exec ipython -i "$@"
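A hedged example of running the container; `--privileged` is assumed to be needed because the entrypoint calls `umount /etc/hosts`, and the image tag matches the hypothetical one used above:

```bash
# Start an interactive IPython session inside the container, publishing the Spark UI and worker ports
docker run -it --privileged \
  -p 4040:4040 -p 7077:7077 -p 8888:8888 \
  phi9t/spark-dev
```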
14 changes: 14 additions & 0 deletions linter.sh
@@ -0,0 +1,14 @@
#!/bin/bash

_bsd_="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

if [[ $# -gt 0 ]]; then
target_files=(${@})
else
target_files=($(git diff --name-only upstream/master HEAD))
fi

echo "${target_files[@]}"
pushd "${_bsd_}"
exec prospector --profile ${_bsd_}/prospector.yaml "${target_files[@]}"
popd
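A usage sketch, assuming a `prospector.yaml` profile exists at the repository root as the script expects; the file path below is illustrative:

```bash
# Lint specific files
./linter.sh python/sparkdl/__init__.py

# With no arguments, lint the files changed relative to upstream/master
./linter.sh
```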