From bf6f9942f3d8d610e1a4bc4509d76e7b8b608e14 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 11 Oct 2017 16:11:24 +0800 Subject: [PATCH 01/29] 1. Support NLP non-distribued training 2. Introduce Kafka to avoid broadcast huge tranning data --- .../estimators/tf_text_file_estimator.py | 312 ++++++++++++++++++ python/sparkdl/param/shared_params.py | 99 +++++- python/sparkdl/tf_fun.py | 90 +++++ python/sparkdl/transformers/named_text.py | 134 ++++++++ python/sparkdl/transformers/tf_image.py | 2 +- python/sparkdl/transformers/tf_text.py | 91 +++++ python/sparkdl/transformers/utils.py | 2 + python/tests/Test.py | 30 ++ python/tests/Test2.py | 22 ++ python/tests/resources/text/sample.txt | 4 + python/tests/transformers/tf_text_test.py | 126 +++++++ 11 files changed, 910 insertions(+), 2 deletions(-) create mode 100644 python/sparkdl/estimators/tf_text_file_estimator.py create mode 100644 python/sparkdl/tf_fun.py create mode 100644 python/sparkdl/transformers/named_text.py create mode 100644 python/sparkdl/transformers/tf_text.py create mode 100644 python/tests/Test.py create mode 100644 python/tests/Test2.py create mode 100644 python/tests/resources/text/sample.txt create mode 100644 python/tests/transformers/tf_text_test.py diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py new file mode 100644 index 00000000..278ab8e5 --- /dev/null +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -0,0 +1,312 @@ +# +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# pylint: disable=protected-access +from __future__ import absolute_import, division, print_function + +import logging +import threading +import time +import os +import shutil + +import cPickle as pickle + +from kafka import KafkaConsumer +from kafka import KafkaProducer +from pyspark.ml import Estimator + +from sparkdl.param import ( + keyword_only, HasLabelCol, HasInputCol, HasOutputCol) +from sparkdl.param.shared_params import KafkaParam, FitParam, MapFnParam +import sparkdl.utils.jvmapi as JVMAPI + +__all__ = ['TFTextFileEstimator'] + +logger = logging.getLogger('sparkdl') + + +class TFTextFileEstimator(Estimator, HasInputCol, HasOutputCol, HasLabelCol, KafkaParam, FitParam, MapFnParam): + """ + Build a Estimator from tensorflow or keras when backend is tensorflow. + + First,assume we have data in dataframe like following. + + .. code-block:: python + documentDF = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + transformer = TFTextTransformer( + inputCol=input_col, + outputCol=output_col) + + df = transformer.transform(documentDF) + + TFTextTransformer will transform text column to `output_col`, which is 2-D array. + + Then we create a tensorflow function. + + .. 
code-block:: python + def map_fun(_read_data, **args): + import tensorflow as tf + EMBEDDING_SIZE = args["embedding_size"] + feature = args['feature'] + label = args['label'] + params = args['params']['fitParam'] + SEQUENCE_LENGTH = 64 + + def feed_dict(batch): + # Convert from dict of named arrays to two numpy arrays of the proper type + features = [] + for i in batch: + features.append(i['sentence_matrix']) + + # print("{} {}".format(feature, features)) + return features + + encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + _read_data is a data generator. args provide hyper parameteres configured in this estimator. + + here is how to use _read_data: + + .. code-block:: python + for data in _read_data(max_records=params.batch_size): + batch_data = feed_dict(data) + sess.run(train_step, feed_dict={input_x: batch_data}) + + finally we can create TFTextFileEstimator to train our model: + + .. code-block:: python + estimator = TFTextFileEstimator(inputCol="sentence_matrix", + outputCol="sentence_matrix", labelCol="preds", + kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", + "group_id": "sdl_1"}, + fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], + mapFnParam=map_fun) + estimator.fit(df) + + """ + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, mapFnParam=None): + super(TFTextFileEstimator, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, mapFnParam=None): + kwargs = self._input_kwargs + return self._set(**kwargs) + + def fit(self, dataset, params=None): + self._validateParams() + if params is None: + paramMaps = self.getFitParam() + elif isinstance(params, (list, tuple)): + if len(params) == 0: + paramMaps = [dict()] + else: + self._validateFitParams(params) + paramMaps = params + elif isinstance(params, dict): + paramMaps = [params] + else: + raise ValueError("Params must be either a param map or a list/tuple of param maps, " + "but got %s." % type(params)) + return self._fitInParallel(dataset, paramMaps) + + def _validateParams(self): + """ + Check Param values so we can throw errors on the driver, rather than workers. 
+ :return: True if parameters are valid + """ + if not self.isDefined(self.inputCol): + raise ValueError("Input column must be defined") + if not self.isDefined(self.outputCol): + raise ValueError("Output column must be defined") + return True + + def _fitInParallel(self, dataset, paramMaps): + + inputCol = self.getInputCol() + labelCol = self.getLabelCol() + + from time import gmtime, strftime + kafaParams = self.getKafkaParam() + topic = kafaParams["topic"] + "_" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + group_id = kafaParams["group_id"] + bootstrap_servers = kafaParams["bootstrap_servers"] + kafka_test_mode = kafaParams["test_mode"] if "test_mode" in kafaParams else False + + def _write_data(): + def _write_partition(index, d_iter): + producer = KafkaMockServer(index) if kafka_test_mode else KafkaProducer( + bootstrap_servers=bootstrap_servers) + try: + for d in d_iter: + producer.send(topic, pickle.dumps(d)) + producer.send(topic, pickle.dumps("_stop_")) + producer.flush() + finally: + producer.close() + return [] + + dataset.rdd.mapPartitionsWithIndex(_write_partition).count() + + if kafka_test_mode: + _write_data() + else: + t = threading.Thread(target=_write_data) + t.start() + + stop_flag_num = dataset.rdd.getNumPartitions() + temp_item = dataset.take(1)[0] + vocab_s = temp_item["vocab_size"] + embedding_size = temp_item["embedding_size"] + + sc = JVMAPI._curr_sc() + + paramMapsRDD = sc.parallelize(paramMaps, numSlices=len(paramMaps)) + + # Obtain params for this estimator instance + baseParamMap = self.extractParamMap() + baseParamDict = dict([(param.name, val) for param, val in baseParamMap.items()]) + baseParamDictBc = sc.broadcast(baseParamDict) + + def _local_fit(override_param_map): + # Update params + params = baseParamDictBc.value + params["fitParam"] = override_param_map + + def _read_data(max_records=64): + consumer = KafkaMockServer() if kafka_test_mode else KafkaConsumer(topic, + group_id=group_id, + bootstrap_servers=bootstrap_servers, + auto_offset_reset="earliest", + enable_auto_commit=False + ) + try: + stop_count = 0 + fail_msg_count = 0 + while True: + if kafka_test_mode: + time.sleep(1) + messages = consumer.poll(timeout_ms=1000, max_records=max_records) + group_msgs = [] + for tp, records in messages.items(): + for record in records: + try: + msg_value = pickle.loads(record.value) + if msg_value == "_stop_": + stop_count += 1 + else: + group_msgs.append(msg_value) + except: + fail_msg_count += 0 + pass + if len(group_msgs) > 0: + yield group_msgs + + if kafka_test_mode: + print( + "stop_count = {} " + "group_msgs = {} " + "stop_flag_num = {} " + "fail_msg_count = {}".format(stop_count, + len(group_msgs), + stop_flag_num, + fail_msg_count)) + + if stop_count >= stop_flag_num and len(group_msgs) == 0: + break + finally: + consumer.close() + + self.getMapFnParam()(_read_data, + feature=inputCol, + label=labelCol, + vacab_size=vocab_s, + embedding_size=embedding_size, + params=params + ) + + return paramMapsRDD.map(lambda paramMap: (paramMap, _local_fit(paramMap))) + + def _fit(self, dataset): # pylint: disable=unused-argument + err_msgs = ["This function should not have been called", + "Please contact library maintainers to file a bug"] + raise NotImplementedError('\n'.join(err_msgs)) + + +class KafkaMockServer(object): + """ + Restrictions of KafkaMockServer: + * Make sure all data have been writen before consume. + * Poll function will just ignore max_records and just return all data in queue. 
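+
+    A minimal usage sketch (this mirrors how the estimator's test mode and the
+    MockKakfaServerTest case drive it; "row" stands in for any picklable object):
+
+    .. code-block:: python
+        producer = KafkaMockServer(index=0)
+        producer.send("any_topic", pickle.dumps(row))
+        producer.flush()  # persists the queued messages under /tmp/mock-kafka/
+
+        consumer = KafkaMockServer()
+        records = consumer.poll(timeout_ms=1000, max_records=64)  # returns everything at once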
+ """ + _kafka_mock_server_tmp_file_ = "/tmp/mock-kafka/" + sended = False + + def __init__(self, index=0): + super(KafkaMockServer, self).__init__() + self.index = index + self.queue = [] + if not os.path.exists(self._kafka_mock_server_tmp_file_): + os.mkdir(self._kafka_mock_server_tmp_file_) + + def send(self, topic, msg): + self.queue.append(pickle.loads(msg)) + + def flush(self): + with open(self._kafka_mock_server_tmp_file_ + str(self.index), "w") as f: + pickle.dump(self.queue, f) + self.queue = [] + + def close(self): + pass + + def poll(self, timeout_ms, max_records): + if self.sended: + return {} + + records = [] + for file in os.listdir(self._kafka_mock_server_tmp_file_): + with open(self._kafka_mock_server_tmp_file_ + file) as f: + tmp = pickle.load(f) + records += tmp + result = {} + couter = 0 + for i in records: + obj = MockRecord() + obj.value = pickle.dumps(i) + couter += 1 + result[str(couter) + "_"] = [obj] + self.sended = True + return result + + +class MockRecord(list): + pass diff --git a/python/sparkdl/param/shared_params.py b/python/sparkdl/param/shared_params.py index e169e891..7305fc8b 100644 --- a/python/sparkdl/param/shared_params.py +++ b/python/sparkdl/param/shared_params.py @@ -27,6 +27,7 @@ import sparkdl.utils.keras_model as kmutil + # From pyspark def keyword_only(func): @@ -36,15 +37,75 @@ def keyword_only(func): .. note:: Should only be used to wrap a method where first arg is `self` """ + @wraps(func) def wrapper(self, *args, **kwargs): if len(args) > 0: raise TypeError("Method %s forces keyword arguments." % func.__name__) self._input_kwargs = kwargs return func(self, **kwargs) + return wrapper +class KafkaParam(Params): + kafkaParam = Param(Params._dummy(), "kafkaParam", "kafka", typeConverter=TypeConverters.identity) + + def __init__(self): + super(KafkaParam, self).__init__() + + def setKafkaParam(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(kafkaParam=value) + + def getKafkaParam(self): + """ + Gets the value of inputCol or its default value. + """ + return self.getOrDefault(self.kafkaParam) + + +class FitParam(Params): + fitParam = Param(Params._dummy(), "fitParam", "hyper parameter when training", + typeConverter=TypeConverters.identity) + + def __init__(self): + super(FitParam, self).__init__() + + def setFitParam(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(fitParam=value) + + def getFitParam(self): + """ + Gets the value of inputCol or its default value. + """ + return self.getOrDefault(self.fitParam) + + +class MapFnParam(Params): + mapFnParam = Param(Params._dummy(), "mapFnParam", "Tensorflow func", typeConverter=TypeConverters.identity) + + def __init__(self): + super(MapFnParam, self).__init__() + + def setMapFnParam(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(mapFnParam=value) + + def getMapFnParam(self): + """ + Gets the value of inputCol or its default value. + """ + return self.getOrDefault(self.mapFnParam) + + class HasInputCol(Params): """ Mixin for param inputCol: input column name. 
@@ -68,6 +129,42 @@ def getInputCol(self): return self.getOrDefault(self.inputCol) +class HasEmbeddingSize(Params): + """ + Mixin for param embeddingSize + """ + + embeddingSize = Param(Params._dummy(), "embeddingSize", "word embedding size", + typeConverter=TypeConverters.toInt) + + def __init__(self): + super(HasEmbeddingSize, self).__init__() + + def setEmbeddingSize(self, value): + return self._set(embeddingSize=value) + + def getEmbeddingSize(self): + return self.getOrDefault(self.embeddingSize) + + +class HasSequenceLength(Params): + """ + Mixin for param sequenceLength + """ + + sequenceLength = Param(Params._dummy(), "sequenceLength", "sequence length", + typeConverter=TypeConverters.toInt) + + def __init__(self): + super(HasSequenceLength, self).__init__() + + def setSequenceLength(self, value): + return self._set(sequenceLength=value) + + def getSequenceLength(self): + return self.getOrDefault(self.sequenceLength) + + class HasOutputCol(Params): """ Mixin for param outputCol: output column name. @@ -92,12 +189,12 @@ def getOutputCol(self): """ return self.getOrDefault(self.outputCol) + ############################################ # New in sparkdl ############################################ class SparkDLTypeConverters(object): - @staticmethod def toStringOrTFTensor(value): if isinstance(value, tf.Tensor): diff --git a/python/sparkdl/tf_fun.py b/python/sparkdl/tf_fun.py new file mode 100644 index 00000000..b870f5f8 --- /dev/null +++ b/python/sparkdl/tf_fun.py @@ -0,0 +1,90 @@ +def map_fun(_read_data, **args): + import tensorflow as tf + EMBEDDING_SIZE = args["embedding_size"] + feature = args['feature'] + label = args['label'] + params = args['params']['fitParam'] + SEQUENCE_LENGTH = 64 + + def feed_dict(batch): + # Convert from dict of named arrays to two numpy arrays of the proper type + features = [] + for i in batch: + features.append(i['sentence_matrix']) + + # print("{} {}".format(feature, features)) + return features + + encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + def encoder(x, name="encoder"): + with tf.name_scope(name): + encoder_w1 = encoder_variables_dict["encoder_w1"] + encoder_b1 = encoder_variables_dict["encoder_b1"] + + layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) + + encoder_w2 = encoder_variables_dict["encoder_w2"] + encoder_b2 = encoder_variables_dict["encoder_b2"] + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) + return layer_2 + + def decoder(x, name="decoder"): + with tf.name_scope(name): + decoder_w1 = tf.Variable(tf.random_normal([128, 256])) + decoder_b1 = tf.Variable(tf.random_normal([256])) + + layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) + + decoder_w2 = tf.Variable( + tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) + decoder_b2 = tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) + return layer_2 + + tf.reset_default_graph + sess = tf.Session() + + input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") + flattened = tf.reshape(input_x, + [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) + + encoder_op = encoder(flattened) + + 
tf.add_to_collection('encoder_op', encoder_op) + + y_pred = decoder(encoder_op) + + y_true = flattened + + with tf.name_scope("xent"): + consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), + tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), + tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) + xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) + tf.summary.scalar("xent", xent) + + with tf.name_scope("train"): + # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) + train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) + + summ = tf.summary.merge_all() + + sess.run(tf.global_variables_initializer()) + + for i in range(params["epochs"]): + print("epoll {}".format(i)) + for data in _read_data(max_records=params["batch_size"]): + batch_data = feed_dict(data) + sess.run(train_step, feed_dict={input_x: batch_data}) + + sess.close() diff --git a/python/sparkdl/transformers/named_text.py b/python/sparkdl/transformers/named_text.py new file mode 100644 index 00000000..ef51cd0c --- /dev/null +++ b/python/sparkdl/transformers/named_text.py @@ -0,0 +1,134 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from keras.applications.imagenet_utils import decode_predictions +import numpy as np + +from pyspark.ml import Transformer +from pyspark.ml.param import Param, Params, TypeConverters + +import sparkdl.graph.utils as tfx +from sparkdl.image.imageIO import resizeImage +import sparkdl.transformers.keras_applications as keras_apps +from sparkdl.param import ( + keyword_only, HasInputCol, HasOutputCol, SparkDLTypeConverters) +from sparkdl.transformers.tf_text import TFTextTransformer + +SUPPORTED_MODELS = ["CNN", "LSTM"] + + +class DeepTextFeaturizer(Transformer, HasInputCol, HasOutputCol): + """ + todo + """ + modelName = Param(Params._dummy(), "modelName", "A deep learning model name") + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, modelName=None): + """ + __init__(self, inputCol=None, outputCol=None, modelName=None) + """ + super(DeepTextFeaturizer, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, modelName=None): + """ + setParams(self, inputCol=None, outputCol=None, modelName=None) + """ + kwargs = self._input_kwargs + self._set(**kwargs) + return self + + def setModelName(self, value): + return self._set(modelName=value) + + def getModelName(self): + return self.getOrDefault(self.modelName) + + def _transform(self, dataset): + transformer = _NamedTextTransformer(inputCol=self.getInputCol(), + outputCol=self.getOutputCol(), + modelName=self.getModelName(), featurize=True) + return transformer.transform(dataset) + + +class _NamedTextTransformer(Transformer, HasInputCol, HasOutputCol): + modelName = Param(Params._dummy(), "modelName", "A deep learning model name", + typeConverter=SparkDLTypeConverters.supportedNameConverter(SUPPORTED_MODELS)) + featurize = 
Param(Params._dummy(), "featurize", + "If true, output features. If false, output predictions. Either way the output is a vector.", + typeConverter=TypeConverters.toBoolean) + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False): + """ + __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False) + """ + super(_NamedTextTransformer, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + self._inputTensorName = None + self._outputTensorName = None + self._outputMode = None + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False): + """ + setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False) + """ + kwargs = self._input_kwargs + self._set(**kwargs) + return self + + def setModelName(self, value): + return self._set(modelName=value) + + def getModelName(self): + return self.getOrDefault(self.modelName) + + def setFeaturize(self, value): + return self._set(featurize=value) + + def getFeaturize(self): + return self.getOrDefault(self.featurize) + + def _transform(self, dataset): + modelGraphSpec = _buildTFGraphForName(self.getModelName(), self.getFeaturize()) + inputCol = self.getInputCol() + resizedCol = "__sdl_textResized" + tfTransformer = TFTextTransformer(inputCol=resizedCol, + outputCol=self.getOutputCol(), + graph=modelGraphSpec["graph"], + inputTensor=modelGraphSpec["inputTensorName"], + outputTensor=modelGraphSpec["outputTensorName"], + outputMode=modelGraphSpec["outputMode"]) + resizeUdf = resizeImage(modelGraphSpec["inputTensorSize"]) + result = tfTransformer.transform(dataset.withColumn(resizedCol, resizeUdf(inputCol))) + return result.drop(resizedCol) + + +def _buildTFGraphForName(name, featurize): + """ + Currently only supports pre-trained models from the Keras applications module. + """ + modelData = keras_apps.getKerasApplicationModel(name).getModelData(featurize) + sess = modelData["session"] + outputTensorName = modelData["outputTensorName"] + graph = tfx.strip_and_freeze_until([outputTensorName], sess.graph, sess, return_graph=True) + modelData["graph"] = graph + + return modelData diff --git a/python/sparkdl/transformers/tf_image.py b/python/sparkdl/transformers/tf_image.py index da37fcad..2ca33846 100644 --- a/python/sparkdl/transformers/tf_image.py +++ b/python/sparkdl/transformers/tf_image.py @@ -120,7 +120,7 @@ def _transform(self, dataset): with final_graph.as_default(): image = dataset[self.getInputCol()] image_df_exploded = (dataset - .withColumn("__sdl_image_height", image.height) + .n("__sdl_image_height", image.height) .withColumn("__sdl_image_width", image.width) .withColumn("__sdl_image_nchannels", image.nChannels) .withColumn("__sdl_image_data", image.data) diff --git a/python/sparkdl/transformers/tf_text.py b/python/sparkdl/transformers/tf_text.py new file mode 100644 index 00000000..b040adc0 --- /dev/null +++ b/python/sparkdl/transformers/tf_text.py @@ -0,0 +1,91 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +from pyspark.ml import Transformer +from pyspark.ml.feature import Word2Vec +from pyspark.sql.functions import udf +from pyspark.sql import functions as f +from pyspark.sql.types import * +from pyspark.sql.functions import lit +from sparkdl.param.shared_params import HasEmbeddingSize, HasSequenceLength +from sparkdl.param import ( + keyword_only, HasInputCol, HasOutputCol) +import re + +import sparkdl.utils.jvmapi as JVMAPI + + +class TFTextTransformer(Transformer, HasInputCol, HasOutputCol, HasEmbeddingSize, HasSequenceLength): + """ + Convert sentence/document to a 2-D Array eg. [[word embedding],[....]] in DataFrame which can be processed + directly by tensorflow or keras who's backend is tensorflow. + + Processing Steps: + + * Using Word2Vec compute Map(word -> vector) from input column, then broadcast the map. + * Process input column (which is text),split it with white space, replace word with vector, padding the result to + the same size. + * Create a new dataframe with columns like new 2-D array , vocab_size, embedding_size + * return then new dataframe + """ + VOCAB_SIZE = 'vocab_size' + EMBEDDING_SIZE = 'embedding_size' + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, embeddingSize=100, sequenceLength=64): + super(TFTextTransformer, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, embeddingSize=100, sequenceLength=64): + kwargs = self._input_kwargs + return self._set(**kwargs) + + def _transform(self, dataset): + word2vec = Word2Vec(vectorSize=self.getEmbeddingSize(), minCount=1, inputCol=self.getInputCol(), + outputCol="word_embedding") + word_embedding = dict( + word2vec.fit( + dataset.select(f.split(self.getInputCol(), "\\s+").alias(self.getInputCol()))).getVectors().rdd.map( + lambda p: (p.word, p.vector.values.tolist())).collect()) + word_embedding["unk"] = np.zeros(self.getEmbeddingSize()).tolist() + sc = JVMAPI._curr_sc() + local_word_embedding = sc.broadcast(word_embedding) + + def convert_word_to_index(s): + def _pad_sequences(sequences, maxlen=None): + new_sequences = [] + + if len(sequences) <= maxlen: + for i in range(maxlen - len(sequences)): + new_sequences.append(np.zeros(self.getEmbeddingSize()).tolist()) + return sequences + new_sequences + else: + return sequences[0:maxlen] + + new_q = [local_word_embedding.value[word] for word in re.split(r"\s+", s) if + word in local_word_embedding.value.keys()] + result = _pad_sequences(new_q, maxlen=self.getSequenceLength()) + return result + + cwti_udf = udf(convert_word_to_index, ArrayType(ArrayType(FloatType()))) + doc_martic = (dataset.withColumn(self.getOutputCol(), cwti_udf(self.getInputCol()).alias(self.getOutputCol())) + .withColumn(self.VOCAB_SIZE, lit(len(word_embedding))) + .withColumn(self.EMBEDDING_SIZE, lit(self.getEmbeddingSize())) + ) + + return doc_martic diff --git a/python/sparkdl/transformers/utils.py b/python/sparkdl/transformers/utils.py index b244365b..9964f3df 100644 --- a/python/sparkdl/transformers/utils.py +++ b/python/sparkdl/transformers/utils.py @@ -18,6 +18,8 @@ # image stuff IMAGE_INPUT_PLACEHOLDER_NAME = "sparkdl_image_input" +TEXT_INPUT_PLACEHOLDER_NAME = "sparkdl_text_input" + def imageInputPlaceholder(nChannels=None): return tf.placeholder(tf.float32, [None, None, None, nChannels], diff --git a/python/tests/Test.py b/python/tests/Test.py new file 
mode 100644 index 00000000..6327cda4 --- /dev/null +++ b/python/tests/Test.py @@ -0,0 +1,30 @@ +import os +os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' + +from sparkdl import readImages +from pyspark.sql.functions import lit +from pyspark.ml.evaluation import MulticlassClassificationEvaluator +from pyspark.ml.classification import LogisticRegression +from pyspark.ml import Pipeline +from sparkdl import DeepImageFeaturizer + +img_dir="/Users/allwefantasy/resources/images/flower_photos" + +tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(1)) +daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0)) + +tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4]) +daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4]) +train_df = tulips_train.unionAll(daisy_train) +test_df = tulips_test.unionAll(daisy_test) + +featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3") +lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label") +p = Pipeline(stages=[featurizer, lr]) + +p_model = p.fit(train_df) +tested_df = p_model.transform(test_df) +evaluator = MulticlassClassificationEvaluator(metricName="accuracy") +print("Test set accuracy = " + str(evaluator.evaluate(tested_df.select("prediction", "label")))) + +# h5py,pil \ No newline at end of file diff --git a/python/tests/Test2.py b/python/tests/Test2.py new file mode 100644 index 00000000..b535a602 --- /dev/null +++ b/python/tests/Test2.py @@ -0,0 +1,22 @@ +import os +from pyspark import SparkContext + +from sparkdl.transformers.tf_text import TFTextTransformer + +os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' + +input_col = "text" +output_col = "preds" + +sc = SparkContext.getOrCreate() +documentDF = sc.createDataFrame([ + ("Hi I heard about Spark".split(" "), 1), + ("I wish Java could use case classes".split(" "), 0), + ("Logistic regression models are neat".split(" "), 2) +], ["text", "preds"]) + +transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col) + +df = transformer.transform(documentDF) +df.show() \ No newline at end of file diff --git a/python/tests/resources/text/sample.txt b/python/tests/resources/text/sample.txt new file mode 100644 index 00000000..8c5e8d99 --- /dev/null +++ b/python/tests/resources/text/sample.txt @@ -0,0 +1,4 @@ +接下 来 介绍 一种 非常 重要 的 神经网络 卷积神经网络 +这种 神经 网络 在 计算机 视觉 领域 取得了 重大 的 成功,而且 在 自然语言 处理 等 其它 领域 也有 很好 应用 +深度学习 受到 大家 关注 很大 一个 原因 就是 Alex 实现 AlexNet( 一种 深度卷积神经网络 )在 LSVRC-2010 ImageNet +此后 卷积神经网络 及其 变种 被广泛 应用于 各种图像 相关 任务 \ No newline at end of file diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py new file mode 100644 index 00000000..0e8b359d --- /dev/null +++ b/python/tests/transformers/tf_text_test.py @@ -0,0 +1,126 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import cPickle as pickle +import shutil +import threading + +from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator, KafkaMockServer +from sparkdl.transformers.tf_text import TFTextTransformer +from sparkdl.tf_fun import map_fun +from ..tests import SparkDLTestCase + + +class TFTextTransformerTest(SparkDLTestCase): + def test_convertText(self): + input_col = "text" + output_col = "sentence_matrix" + + documentDF = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + # transform text column to sentence_matrix column which contains 2-D array. + transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) + + df = transformer.transform(documentDF) + data = df.collect() + self.assertEquals(len(data), 3) + for row in data: + self.assertEqual(len(row[output_col]), 64) + self.assertEqual(len(row[output_col][0]), 100) + + +class TFTextFileEstimatorTest(SparkDLTestCase): + def test_trainText(self): + import os + if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): + shutil.rmtree(KafkaMockServer()._kafka_mock_server_tmp_file_) + + input_col = "text" + output_col = "sentence_matrix" + + documentDF = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + # transform text column to sentence_matrix column which contains 2-D array. + transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) + + df = transformer.transform(documentDF) + + # create a estimator to training where map_fun contains tensorflow's code + estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", + kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", + "group_id": "sdl_1", "test_mode": False}, + fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], + mapFnParam=map_fun) + estimator.fit(df).collect() + + +class MockKakfaServerTest(SparkDLTestCase): + def test_mockKafkaServerProduce(self): + dataset = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + def _write_data(): + def _write_partition(index, d_iter): + producer = KafkaMockServer(index) + try: + for d in d_iter: + producer.send("", pickle.dumps(d)) + producer.send("", pickle.dumps("_stop_")) + producer.flush() + finally: + producer.close() + return [] + + dataset.rdd.mapPartitionsWithIndex(_write_partition).count() + + _write_data() + + def _consume(): + consumer = KafkaMockServer() + stop_count = 0 + while True: + messages = consumer.poll(timeout_ms=1000, max_records=64) + group_msgs = [] + for tp, records in messages.items(): + for record in records: + try: + msg_value = pickle.loads(record.value) + print(msg_value) + if msg_value == "_stop_": + stop_count += 1 + else: + group_msgs.append(msg_value) + except: + pass + if stop_count >= 8: + break + self.assertEquals(stop_count, 8) + + t = threading.Thread(target=_consume) + t.start() + t2 = threading.Thread(target=_consume) + t2.start() From 3c3fd2dce20822f86bd112ed58260538980dce40 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Fri, 13 Oct 2017 17:28:39 +0800 Subject: [PATCH 02/29] set test_mode to True which can avoid 
to kafka dependency --- python/tests/transformers/tf_text_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index 0e8b359d..26f31d1f 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -69,7 +69,7 @@ def test_trainText(self): # create a estimator to training where map_fun contains tensorflow's code estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", - "group_id": "sdl_1", "test_mode": False}, + "group_id": "sdl_1", "test_mode": True}, fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], mapFnParam=map_fun) estimator.fit(df).collect() From e0cdad282a97a0c4541e2f8714bb5ee456b9fd70 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Fri, 13 Oct 2017 18:44:23 +0800 Subject: [PATCH 03/29] clean some file --- python/sparkdl/transformers/named_text.py | 134 ---------------------- python/sparkdl/transformers/tf_image.py | 2 +- python/tests/Test.py | 30 ----- python/tests/Test2.py | 22 ---- 4 files changed, 1 insertion(+), 187 deletions(-) delete mode 100644 python/sparkdl/transformers/named_text.py delete mode 100644 python/tests/Test.py delete mode 100644 python/tests/Test2.py diff --git a/python/sparkdl/transformers/named_text.py b/python/sparkdl/transformers/named_text.py deleted file mode 100644 index ef51cd0c..00000000 --- a/python/sparkdl/transformers/named_text.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2017 Databricks, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from keras.applications.imagenet_utils import decode_predictions -import numpy as np - -from pyspark.ml import Transformer -from pyspark.ml.param import Param, Params, TypeConverters - -import sparkdl.graph.utils as tfx -from sparkdl.image.imageIO import resizeImage -import sparkdl.transformers.keras_applications as keras_apps -from sparkdl.param import ( - keyword_only, HasInputCol, HasOutputCol, SparkDLTypeConverters) -from sparkdl.transformers.tf_text import TFTextTransformer - -SUPPORTED_MODELS = ["CNN", "LSTM"] - - -class DeepTextFeaturizer(Transformer, HasInputCol, HasOutputCol): - """ - todo - """ - modelName = Param(Params._dummy(), "modelName", "A deep learning model name") - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, modelName=None): - """ - __init__(self, inputCol=None, outputCol=None, modelName=None) - """ - super(DeepTextFeaturizer, self).__init__() - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - def setParams(self, inputCol=None, outputCol=None, modelName=None): - """ - setParams(self, inputCol=None, outputCol=None, modelName=None) - """ - kwargs = self._input_kwargs - self._set(**kwargs) - return self - - def setModelName(self, value): - return self._set(modelName=value) - - def getModelName(self): - return self.getOrDefault(self.modelName) - - def _transform(self, dataset): - transformer = _NamedTextTransformer(inputCol=self.getInputCol(), - outputCol=self.getOutputCol(), - modelName=self.getModelName(), featurize=True) - return transformer.transform(dataset) - - -class _NamedTextTransformer(Transformer, HasInputCol, HasOutputCol): - modelName = Param(Params._dummy(), "modelName", "A deep learning model name", - typeConverter=SparkDLTypeConverters.supportedNameConverter(SUPPORTED_MODELS)) - featurize = Param(Params._dummy(), "featurize", - "If true, output features. If false, output predictions. 
Either way the output is a vector.", - typeConverter=TypeConverters.toBoolean) - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False): - """ - __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False) - """ - super(_NamedTextTransformer, self).__init__() - kwargs = self._input_kwargs - self.setParams(**kwargs) - self._inputTensorName = None - self._outputTensorName = None - self._outputMode = None - - @keyword_only - def setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False): - """ - setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False) - """ - kwargs = self._input_kwargs - self._set(**kwargs) - return self - - def setModelName(self, value): - return self._set(modelName=value) - - def getModelName(self): - return self.getOrDefault(self.modelName) - - def setFeaturize(self, value): - return self._set(featurize=value) - - def getFeaturize(self): - return self.getOrDefault(self.featurize) - - def _transform(self, dataset): - modelGraphSpec = _buildTFGraphForName(self.getModelName(), self.getFeaturize()) - inputCol = self.getInputCol() - resizedCol = "__sdl_textResized" - tfTransformer = TFTextTransformer(inputCol=resizedCol, - outputCol=self.getOutputCol(), - graph=modelGraphSpec["graph"], - inputTensor=modelGraphSpec["inputTensorName"], - outputTensor=modelGraphSpec["outputTensorName"], - outputMode=modelGraphSpec["outputMode"]) - resizeUdf = resizeImage(modelGraphSpec["inputTensorSize"]) - result = tfTransformer.transform(dataset.withColumn(resizedCol, resizeUdf(inputCol))) - return result.drop(resizedCol) - - -def _buildTFGraphForName(name, featurize): - """ - Currently only supports pre-trained models from the Keras applications module. 
- """ - modelData = keras_apps.getKerasApplicationModel(name).getModelData(featurize) - sess = modelData["session"] - outputTensorName = modelData["outputTensorName"] - graph = tfx.strip_and_freeze_until([outputTensorName], sess.graph, sess, return_graph=True) - modelData["graph"] = graph - - return modelData diff --git a/python/sparkdl/transformers/tf_image.py b/python/sparkdl/transformers/tf_image.py index 2ca33846..da37fcad 100644 --- a/python/sparkdl/transformers/tf_image.py +++ b/python/sparkdl/transformers/tf_image.py @@ -120,7 +120,7 @@ def _transform(self, dataset): with final_graph.as_default(): image = dataset[self.getInputCol()] image_df_exploded = (dataset - .n("__sdl_image_height", image.height) + .withColumn("__sdl_image_height", image.height) .withColumn("__sdl_image_width", image.width) .withColumn("__sdl_image_nchannels", image.nChannels) .withColumn("__sdl_image_data", image.data) diff --git a/python/tests/Test.py b/python/tests/Test.py deleted file mode 100644 index 6327cda4..00000000 --- a/python/tests/Test.py +++ /dev/null @@ -1,30 +0,0 @@ -import os -os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' - -from sparkdl import readImages -from pyspark.sql.functions import lit -from pyspark.ml.evaluation import MulticlassClassificationEvaluator -from pyspark.ml.classification import LogisticRegression -from pyspark.ml import Pipeline -from sparkdl import DeepImageFeaturizer - -img_dir="/Users/allwefantasy/resources/images/flower_photos" - -tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(1)) -daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0)) - -tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4]) -daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4]) -train_df = tulips_train.unionAll(daisy_train) -test_df = tulips_test.unionAll(daisy_test) - -featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3") -lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label") -p = Pipeline(stages=[featurizer, lr]) - -p_model = p.fit(train_df) -tested_df = p_model.transform(test_df) -evaluator = MulticlassClassificationEvaluator(metricName="accuracy") -print("Test set accuracy = " + str(evaluator.evaluate(tested_df.select("prediction", "label")))) - -# h5py,pil \ No newline at end of file diff --git a/python/tests/Test2.py b/python/tests/Test2.py deleted file mode 100644 index b535a602..00000000 --- a/python/tests/Test2.py +++ /dev/null @@ -1,22 +0,0 @@ -import os -from pyspark import SparkContext - -from sparkdl.transformers.tf_text import TFTextTransformer - -os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' - -input_col = "text" -output_col = "preds" - -sc = SparkContext.getOrCreate() -documentDF = sc.createDataFrame([ - ("Hi I heard about Spark".split(" "), 1), - ("I wish Java could use case classes".split(" "), 0), - ("Logistic regression models are neat".split(" "), 2) -], ["text", "preds"]) - -transformer = TFTextTransformer( - inputCol=input_col, outputCol=output_col) - -df = transformer.transform(documentDF) -df.show() \ No newline at end of file From afbd95cecf29615586cedb7582e3cd2770159ae1 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Sat, 14 Oct 2017 16:46:00 +0800 Subject: [PATCH 04/29] Add TFoS support --- .../estimators/tf_text_file_estimator.py | 63 +++++- python/sparkdl/param/shared_params.py | 21 ++ python/sparkdl/tf_fun.py | 204 +++++++++++------- python/tests/TFoSTest.py | 174 
+++++++++++++++ python/tests/transformers/tf_text_test.py | 3 +- 5 files changed, 379 insertions(+), 86 deletions(-) create mode 100644 python/tests/TFoSTest.py diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py index 278ab8e5..83b647d2 100644 --- a/python/sparkdl/estimators/tf_text_file_estimator.py +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -28,10 +28,11 @@ from kafka import KafkaConsumer from kafka import KafkaProducer from pyspark.ml import Estimator +from tensorflowonspark import TFCluster from sparkdl.param import ( keyword_only, HasLabelCol, HasInputCol, HasOutputCol) -from sparkdl.param.shared_params import KafkaParam, FitParam, MapFnParam +from sparkdl.param.shared_params import KafkaParam, FitParam, MapFnParam, RunningMode import sparkdl.utils.jvmapi as JVMAPI __all__ = ['TFTextFileEstimator'] @@ -39,7 +40,8 @@ logger = logging.getLogger('sparkdl') -class TFTextFileEstimator(Estimator, HasInputCol, HasOutputCol, HasLabelCol, KafkaParam, FitParam, MapFnParam): +class TFTextFileEstimator(Estimator, HasInputCol, HasOutputCol, HasLabelCol, KafkaParam, FitParam, RunningMode, + MapFnParam): """ Build a Estimator from tensorflow or keras when backend is tensorflow. @@ -111,13 +113,15 @@ def feed_dict(batch): """ @keyword_only - def __init__(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, mapFnParam=None): + def __init__(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, + runningMode="Normal", mapFnParam=None): super(TFTextFileEstimator, self).__init__() kwargs = self._input_kwargs self.setParams(**kwargs) @keyword_only - def setParams(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, mapFnParam=None): + def setParams(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, + runningMode="Normal", mapFnParam=None): kwargs = self._input_kwargs return self._set(**kwargs) @@ -136,7 +140,10 @@ def fit(self, dataset, params=None): else: raise ValueError("Params must be either a param map or a list/tuple of param maps, " "but got %s." 
% type(params)) - return self._fitInParallel(dataset, paramMaps) + if self.getRunningMode() == "TFoS": + return self._fitInCluster(dataset, paramMaps) + else: + return self._fitInParallel(dataset, paramMaps) def _validateParams(self): """ @@ -149,6 +156,41 @@ def _validateParams(self): raise ValueError("Output column must be defined") return True + def _clusterModelDefaultValue(self, sc, args): + if "cluster_size" not in args: + executors = sc._conf.get("spark.executor.instances") + num_executors = int(executors) if executors is not None else 1 + args['cluster_size'] = num_executors + num_ps = 1 + if "num_ps" not in args: + args['num_ps'] = 1 + if "tensorboard" not in args: + args['tensorboard'] = None + return args + + def _fitInCluster(self, dataset, paramMaps): + sc = JVMAPI._curr_sc() + + temp_item = dataset.take(1)[0] + vocab_s = temp_item["vocab_size"] + embedding_size = temp_item["embedding_size"] + + baseParamMap = self.extractParamMap() + baseParamDict = dict([(param.name, val) for param, val in baseParamMap.items()]) + + args = self._clusterModelDefaultValue(sc, paramMaps[0]) + args["feature"] = self.getInputCol() + args["label"] = self.getLabelCol() + args["vacab_size"] = vocab_s + args["embedding_size"] = embedding_size + args["params"] = baseParamDict + + cluster = TFCluster.run(sc, self.getMapFnParam(), args, args['cluster_size'], args['num_ps'], + args['tensorboard'], + TFCluster.InputMode.SPARK) + cluster.train(dataset.rdd, args["epochs"]) + cluster.shutdown() + def _fitInParallel(self, dataset, paramMaps): inputCol = self.getInputCol() @@ -245,12 +287,11 @@ def _read_data(max_records=64): finally: consumer.close() - self.getMapFnParam()(_read_data, - feature=inputCol, - label=labelCol, - vacab_size=vocab_s, - embedding_size=embedding_size, - params=params + self.getMapFnParam()(args=dict(feature=inputCol, + label=labelCol, + vacab_size=vocab_s, + embedding_size=embedding_size, + params=params), ctx=None, _read_data=_read_data ) return paramMapsRDD.map(lambda paramMap: (paramMap, _local_fit(paramMap))) diff --git a/python/sparkdl/param/shared_params.py b/python/sparkdl/param/shared_params.py index 7305fc8b..04d4b3cf 100644 --- a/python/sparkdl/param/shared_params.py +++ b/python/sparkdl/param/shared_params.py @@ -147,6 +147,27 @@ def getEmbeddingSize(self): return self.getOrDefault(self.embeddingSize) +class RunningMode(Params): + """ + Mixin for param RunningMode + * TFoS + * Normal + """ + + runningMode = Param(Params._dummy(), "runningMode", "based on TFoS or Normal which is used to " + "hyper parameter tuning", + typeConverter=TypeConverters.toString) + + def __init__(self): + super(RunningMode, self).__init__() + + def setRunningMode(self, value): + return self._set(runningMode=value) + + def getRunningMode(self): + return self.getOrDefault(self.runningMode) + + class HasSequenceLength(Params): """ Mixin for param sequenceLength diff --git a/python/sparkdl/tf_fun.py b/python/sparkdl/tf_fun.py index b870f5f8..68025604 100644 --- a/python/sparkdl/tf_fun.py +++ b/python/sparkdl/tf_fun.py @@ -1,11 +1,28 @@ -def map_fun(_read_data, **args): +def map_fun(args={}, ctx=None, _read_data=None): + from tensorflowonspark import TFNode + from datetime import datetime + import math + import numpy import tensorflow as tf + import time + + print(args) + EMBEDDING_SIZE = args["embedding_size"] feature = args['feature'] label = args['label'] params = args['params']['fitParam'] + print(params) SEQUENCE_LENGTH = 64 + clusterMode = False if ctx is None else True + + if clusterMode and 
ctx.job_name == "ps": + time.sleep((ctx.worker_num + 1) * 5) + + if clusterMode: + cluster, server = TFNode.start_cluster_server(ctx, 1) + def feed_dict(batch): # Convert from dict of named arrays to two numpy arrays of the proper type features = [] @@ -15,76 +32,115 @@ def feed_dict(batch): # print("{} {}".format(feature, features)) return features - encoder_variables_dict = { - "encoder_w1": tf.Variable( - tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), - "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), - "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), - "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") - } - - def encoder(x, name="encoder"): - with tf.name_scope(name): - encoder_w1 = encoder_variables_dict["encoder_w1"] - encoder_b1 = encoder_variables_dict["encoder_b1"] - - layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) - - encoder_w2 = encoder_variables_dict["encoder_w2"] - encoder_b2 = encoder_variables_dict["encoder_b2"] - - layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) - return layer_2 - - def decoder(x, name="decoder"): - with tf.name_scope(name): - decoder_w1 = tf.Variable(tf.random_normal([128, 256])) - decoder_b1 = tf.Variable(tf.random_normal([256])) - - layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) - - decoder_w2 = tf.Variable( - tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) - decoder_b2 = tf.Variable( - tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) - - layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) - return layer_2 - - tf.reset_default_graph - sess = tf.Session() - - input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") - flattened = tf.reshape(input_x, - [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) - - encoder_op = encoder(flattened) - - tf.add_to_collection('encoder_op', encoder_op) - - y_pred = decoder(encoder_op) - - y_true = flattened - - with tf.name_scope("xent"): - consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), - tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), - tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) - xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) - tf.summary.scalar("xent", xent) - - with tf.name_scope("train"): - # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) - train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) - - summ = tf.summary.merge_all() - - sess.run(tf.global_variables_initializer()) - - for i in range(params["epochs"]): - print("epoll {}".format(i)) - for data in _read_data(max_records=params["batch_size"]): - batch_data = feed_dict(data) - sess.run(train_step, feed_dict={input_x: batch_data}) - - sess.close() + def build_graph(): + encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + def encoder(x, name="encoder"): + with tf.name_scope(name): + encoder_w1 = encoder_variables_dict["encoder_w1"] + encoder_b1 = encoder_variables_dict["encoder_b1"] + + layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) + + encoder_w2 = encoder_variables_dict["encoder_w2"] + encoder_b2 = 
encoder_variables_dict["encoder_b2"] + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) + return layer_2 + + def decoder(x, name="decoder"): + with tf.name_scope(name): + decoder_w1 = tf.Variable(tf.random_normal([128, 256])) + decoder_b1 = tf.Variable(tf.random_normal([256])) + + layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) + + decoder_w2 = tf.Variable( + tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) + decoder_b2 = tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) + return layer_2 + + tf.reset_default_graph + + input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") + flattened = tf.reshape(input_x, + [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) + + encoder_op = encoder(flattened) + + tf.add_to_collection('encoder_op', encoder_op) + + y_pred = decoder(encoder_op) + + y_true = flattened + + with tf.name_scope("xent"): + consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), + tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), + tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) + xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) + tf.summary.scalar("xent", xent) + + with tf.name_scope("train"): + # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) + train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) + summ = tf.summary.merge_all() + global_step = tf.Variable(0) + init_op = tf.global_variables_initializer() + return input_x, init_op, train_step, xent, global_step, summ + + def train_with_cluster(input_x, init_op, train_step, xent, global_step, summ): + + logdir = TFNode.hdfs_path(ctx, params['model']) if clusterMode else None + sv = tf.train.Supervisor(is_chief=ctx.task_index == 0, + logdir=logdir, + init_op=init_op, + summary_op=None, + saver=None, + global_step=global_step, + stop_grace_secs=300, + save_model_secs=10) + with sv.managed_session(server.target) as sess: + tf_feed = TFNode.DataFeed(ctx.mgr, True) + step = 0 + + while not sv.should_stop() and not tf_feed.should_stop() and step < 100: + data = tf_feed.next_batch(params["batch_size"]) + batch_data = feed_dict(data) + step += 1 + _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) + print("global_step:{} xent:{}".format(x, g)) + + if sv.should_stop() or step >= args.steps: + tf_feed.terminate() + sv.stop() + + def train(input_x, init_op, train_step, xent, global_step, summ): + + with tf.Session() as sess: + sess.run(init_op) + for data in _read_data(max_records=params["batch_size"]): + batch_data = feed_dict(data) + _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) + print("global_step:{} xent:{}".format(x, g)) + + if clusterMode and ctx.job_name == "ps": + server.join() + elif clusterMode and ctx.job_name == "worker": + with tf.device(tf.train.replica_device_setter( + worker_device="/job:worker/task:%d" % ctx.task_index, + cluster=cluster)): + input_x, init_op, train_step, xent, global_step, summ = build_graph() + train_with_cluster(input_x, init_op, train_step, xent, global_step, summ) + else: + input_x, init_op, train_step, xent, global_step, summ = build_graph() + train(input_x, init_op, train_step, xent, global_step, summ) \ No newline at end of file diff --git a/python/tests/TFoSTest.py b/python/tests/TFoSTest.py new file mode 100644 index 00000000..572a1a3c --- /dev/null +++ b/python/tests/TFoSTest.py @@ 
-0,0 +1,174 @@ +from pyspark.sql import SparkSession +from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator, KafkaMockServer +from sparkdl.transformers.tf_text import TFTextTransformer + + +def map_fun(args={}, ctx=None, _read_data=None): + from tensorflowonspark import TFNode + from datetime import datetime + import math + import numpy + import tensorflow as tf + import time + + print(args) + + EMBEDDING_SIZE = args["embedding_size"] + feature = args['feature'] + label = args['label'] + params = args['params']['fitParam'][0] + SEQUENCE_LENGTH = 64 + + clusterMode = False if ctx is None else True + + if clusterMode and ctx.job_name == "ps": + time.sleep((ctx.worker_num + 1) * 5) + + if clusterMode: + cluster, server = TFNode.start_cluster_server(ctx, 1) + + def feed_dict(batch): + # Convert from dict of named arrays to two numpy arrays of the proper type + features = [] + for i in batch: + features.append(i['sentence_matrix']) + + # print("{} {}".format(feature, features)) + return features + + def build_graph(): + encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + def encoder(x, name="encoder"): + with tf.name_scope(name): + encoder_w1 = encoder_variables_dict["encoder_w1"] + encoder_b1 = encoder_variables_dict["encoder_b1"] + + layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) + + encoder_w2 = encoder_variables_dict["encoder_w2"] + encoder_b2 = encoder_variables_dict["encoder_b2"] + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) + return layer_2 + + def decoder(x, name="decoder"): + with tf.name_scope(name): + decoder_w1 = tf.Variable(tf.random_normal([128, 256])) + decoder_b1 = tf.Variable(tf.random_normal([256])) + + layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) + + decoder_w2 = tf.Variable( + tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) + decoder_b2 = tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) + return layer_2 + + tf.reset_default_graph + + input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") + flattened = tf.reshape(input_x, + [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) + + encoder_op = encoder(flattened) + + tf.add_to_collection('encoder_op', encoder_op) + + y_pred = decoder(encoder_op) + + y_true = flattened + + with tf.name_scope("xent"): + consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), + tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), + tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) + xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) + tf.summary.scalar("xent", xent) + + with tf.name_scope("train"): + # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) + train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) + summ = tf.summary.merge_all() + global_step = tf.Variable(0) + init_op = tf.global_variables_initializer() + return input_x, init_op, train_step, xent, global_step, summ + + def train_with_cluster(input_x, init_op, train_step, xent, global_step, summ): + + logdir = TFNode.hdfs_path(ctx, params['model']) if clusterMode else None + sv = 
tf.train.Supervisor(is_chief=ctx.task_index == 0, + logdir=logdir, + init_op=init_op, + summary_op=None, + saver=None, + global_step=global_step, + stop_grace_secs=300, + save_model_secs=10) + with sv.managed_session(server.target) as sess: + tf_feed = TFNode.DataFeed(ctx.mgr, True) + step = 0 + + while not sv.should_stop() and not tf_feed.should_stop() and step < 100: + data = tf_feed.next_batch(params["batch_size"]) + batch_data = feed_dict(data) + step += 1 + _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) + print("global_step:{} xent:{}".format(x, g)) + + if sv.should_stop() or step >= args.steps: + tf_feed.terminate() + sv.stop() + + def train(input_x, init_op, train_step, xent, global_step, summ): + + with tf.Session() as sess: + sess.run(init_op) + for data in _read_data(max_records=params["batch_size"]): + batch_data = feed_dict(data) + _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) + print("global_step:{} xent:{}".format(x, g)) + + if clusterMode and ctx.job_name == "ps": + server.join() + elif clusterMode and ctx.job_name == "worker": + with tf.device(tf.train.replica_device_setter( + worker_device="/job:worker/task:%d" % ctx.task_index, + cluster=cluster)): + input_x, init_op, train_step, xent, global_step, summ = build_graph() + train_with_cluster(input_x, init_op, train_step, xent, global_step, summ) + else: + input_x, init_op, train_step, xent, global_step, summ = build_graph() + train(input_x, init_op, train_step, xent, global_step, summ) + + +input_col = "text" +output_col = "sentence_matrix" + +session = SparkSession.builder.master("spark://allwefantasy:7077").appName("test").getOrCreate() +documentDF = session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) +], ["text", "preds"]) + +# transform text column to sentence_matrix column which contains 2-D array. 
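+# A quick sanity check (sketch): once `df = transformer.transform(documentDF)` below has run,
+# each row should carry a sequenceLength x embeddingSize matrix plus the vocab_size and
+# embedding_size columns the estimator reads back, e.g.
+#   row = df.select("sentence_matrix", "vocab_size", "embedding_size").first()
+#   assert len(row["sentence_matrix"]) == 64       # sequenceLength
+#   assert len(row["sentence_matrix"][0]) == 100   # embeddingSize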
+transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) + +df = transformer.transform(documentDF) + +# create a estimator to training where map_fun contains tensorflow's code +estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", + fitParam=[{"epochs": 1, "cluster_size": 2, "batch_size": 64, "model": "/tmp/model"}], + runningMode="TFoS", + mapFnParam=map_fun) +estimator.fit(df).collect() diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index 26f31d1f..b23d7001 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -48,7 +48,7 @@ def test_convertText(self): class TFTextFileEstimatorTest(SparkDLTestCase): def test_trainText(self): import os - if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): + if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): shutil.rmtree(KafkaMockServer()._kafka_mock_server_tmp_file_) input_col = "text" @@ -71,6 +71,7 @@ def test_trainText(self): kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", "group_id": "sdl_1", "test_mode": True}, fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], + runningMode="Normal", mapFnParam=map_fun) estimator.fit(df).collect() From a1c1fa08fbb4aa31f84af33abb9c20eddd712eb1 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Sat, 14 Oct 2017 20:32:44 +0800 Subject: [PATCH 05/29] Fix TFoSTest --- python/tests/TFoSTest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tests/TFoSTest.py b/python/tests/TFoSTest.py index 572a1a3c..4c95c12a 100644 --- a/python/tests/TFoSTest.py +++ b/python/tests/TFoSTest.py @@ -122,7 +122,7 @@ def train_with_cluster(input_x, init_op, train_step, xent, global_step, summ): batch_data = feed_dict(data) step += 1 _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) - print("global_step:{} xent:{}".format(x, g)) + print("global_step:{} xent:{}".format(g, x)) if sv.should_stop() or step >= args.steps: tf_feed.terminate() @@ -168,7 +168,7 @@ def train(input_x, init_op, train_step, xent, global_step, summ): # create a estimator to training where map_fun contains tensorflow's code estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", - fitParam=[{"epochs": 1, "cluster_size": 2, "batch_size": 64, "model": "/tmp/model"}], + fitParam=[{"epochs": 1, "cluster_size": 2, "batch_size": 1, "model": "/tmp/model"}], runningMode="TFoS", mapFnParam=map_fun) estimator.fit(df).collect() From 06cad2e0b2b86722e9052d1f84f056eba0d26a80 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 11 Oct 2017 16:11:24 +0800 Subject: [PATCH 06/29] 1. Support NLP non-distribued training 2. 
Introduce Kafka to avoid broadcast huge tranning data --- .../estimators/tf_text_file_estimator.py | 312 ++++++++++++++++++ python/sparkdl/param/shared_params.py | 99 +++++- python/sparkdl/tf_fun.py | 90 +++++ python/sparkdl/transformers/named_text.py | 134 ++++++++ python/sparkdl/transformers/tf_image.py | 2 +- python/sparkdl/transformers/tf_text.py | 91 +++++ python/sparkdl/transformers/utils.py | 2 + python/tests/Test.py | 30 ++ python/tests/Test2.py | 22 ++ python/tests/resources/text/sample.txt | 4 + python/tests/transformers/tf_text_test.py | 126 +++++++ 11 files changed, 910 insertions(+), 2 deletions(-) create mode 100644 python/sparkdl/estimators/tf_text_file_estimator.py create mode 100644 python/sparkdl/tf_fun.py create mode 100644 python/sparkdl/transformers/named_text.py create mode 100644 python/sparkdl/transformers/tf_text.py create mode 100644 python/tests/Test.py create mode 100644 python/tests/Test2.py create mode 100644 python/tests/resources/text/sample.txt create mode 100644 python/tests/transformers/tf_text_test.py diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py new file mode 100644 index 00000000..278ab8e5 --- /dev/null +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -0,0 +1,312 @@ +# +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# pylint: disable=protected-access +from __future__ import absolute_import, division, print_function + +import logging +import threading +import time +import os +import shutil + +import cPickle as pickle + +from kafka import KafkaConsumer +from kafka import KafkaProducer +from pyspark.ml import Estimator + +from sparkdl.param import ( + keyword_only, HasLabelCol, HasInputCol, HasOutputCol) +from sparkdl.param.shared_params import KafkaParam, FitParam, MapFnParam +import sparkdl.utils.jvmapi as JVMAPI + +__all__ = ['TFTextFileEstimator'] + +logger = logging.getLogger('sparkdl') + + +class TFTextFileEstimator(Estimator, HasInputCol, HasOutputCol, HasLabelCol, KafkaParam, FitParam, MapFnParam): + """ + Build a Estimator from tensorflow or keras when backend is tensorflow. + + First,assume we have data in dataframe like following. + + .. code-block:: python + documentDF = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + transformer = TFTextTransformer( + inputCol=input_col, + outputCol=output_col) + + df = transformer.transform(documentDF) + + TFTextTransformer will transform text column to `output_col`, which is 2-D array. + + Then we create a tensorflow function. + + .. 
code-block:: python + def map_fun(_read_data, **args): + import tensorflow as tf + EMBEDDING_SIZE = args["embedding_size"] + feature = args['feature'] + label = args['label'] + params = args['params']['fitParam'] + SEQUENCE_LENGTH = 64 + + def feed_dict(batch): + # Convert from dict of named arrays to two numpy arrays of the proper type + features = [] + for i in batch: + features.append(i['sentence_matrix']) + + # print("{} {}".format(feature, features)) + return features + + encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + _read_data is a data generator. args provide hyper parameteres configured in this estimator. + + here is how to use _read_data: + + .. code-block:: python + for data in _read_data(max_records=params.batch_size): + batch_data = feed_dict(data) + sess.run(train_step, feed_dict={input_x: batch_data}) + + finally we can create TFTextFileEstimator to train our model: + + .. code-block:: python + estimator = TFTextFileEstimator(inputCol="sentence_matrix", + outputCol="sentence_matrix", labelCol="preds", + kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", + "group_id": "sdl_1"}, + fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], + mapFnParam=map_fun) + estimator.fit(df) + + """ + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, mapFnParam=None): + super(TFTextFileEstimator, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, mapFnParam=None): + kwargs = self._input_kwargs + return self._set(**kwargs) + + def fit(self, dataset, params=None): + self._validateParams() + if params is None: + paramMaps = self.getFitParam() + elif isinstance(params, (list, tuple)): + if len(params) == 0: + paramMaps = [dict()] + else: + self._validateFitParams(params) + paramMaps = params + elif isinstance(params, dict): + paramMaps = [params] + else: + raise ValueError("Params must be either a param map or a list/tuple of param maps, " + "but got %s." % type(params)) + return self._fitInParallel(dataset, paramMaps) + + def _validateParams(self): + """ + Check Param values so we can throw errors on the driver, rather than workers. 
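+        Note: only inputCol and outputCol are checked here; kafkaParam, fitParam and
+        mapFnParam are only exercised once fit() runs, so problems with them surface at fit time.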
+ :return: True if parameters are valid + """ + if not self.isDefined(self.inputCol): + raise ValueError("Input column must be defined") + if not self.isDefined(self.outputCol): + raise ValueError("Output column must be defined") + return True + + def _fitInParallel(self, dataset, paramMaps): + + inputCol = self.getInputCol() + labelCol = self.getLabelCol() + + from time import gmtime, strftime + kafaParams = self.getKafkaParam() + topic = kafaParams["topic"] + "_" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + group_id = kafaParams["group_id"] + bootstrap_servers = kafaParams["bootstrap_servers"] + kafka_test_mode = kafaParams["test_mode"] if "test_mode" in kafaParams else False + + def _write_data(): + def _write_partition(index, d_iter): + producer = KafkaMockServer(index) if kafka_test_mode else KafkaProducer( + bootstrap_servers=bootstrap_servers) + try: + for d in d_iter: + producer.send(topic, pickle.dumps(d)) + producer.send(topic, pickle.dumps("_stop_")) + producer.flush() + finally: + producer.close() + return [] + + dataset.rdd.mapPartitionsWithIndex(_write_partition).count() + + if kafka_test_mode: + _write_data() + else: + t = threading.Thread(target=_write_data) + t.start() + + stop_flag_num = dataset.rdd.getNumPartitions() + temp_item = dataset.take(1)[0] + vocab_s = temp_item["vocab_size"] + embedding_size = temp_item["embedding_size"] + + sc = JVMAPI._curr_sc() + + paramMapsRDD = sc.parallelize(paramMaps, numSlices=len(paramMaps)) + + # Obtain params for this estimator instance + baseParamMap = self.extractParamMap() + baseParamDict = dict([(param.name, val) for param, val in baseParamMap.items()]) + baseParamDictBc = sc.broadcast(baseParamDict) + + def _local_fit(override_param_map): + # Update params + params = baseParamDictBc.value + params["fitParam"] = override_param_map + + def _read_data(max_records=64): + consumer = KafkaMockServer() if kafka_test_mode else KafkaConsumer(topic, + group_id=group_id, + bootstrap_servers=bootstrap_servers, + auto_offset_reset="earliest", + enable_auto_commit=False + ) + try: + stop_count = 0 + fail_msg_count = 0 + while True: + if kafka_test_mode: + time.sleep(1) + messages = consumer.poll(timeout_ms=1000, max_records=max_records) + group_msgs = [] + for tp, records in messages.items(): + for record in records: + try: + msg_value = pickle.loads(record.value) + if msg_value == "_stop_": + stop_count += 1 + else: + group_msgs.append(msg_value) + except: + fail_msg_count += 0 + pass + if len(group_msgs) > 0: + yield group_msgs + + if kafka_test_mode: + print( + "stop_count = {} " + "group_msgs = {} " + "stop_flag_num = {} " + "fail_msg_count = {}".format(stop_count, + len(group_msgs), + stop_flag_num, + fail_msg_count)) + + if stop_count >= stop_flag_num and len(group_msgs) == 0: + break + finally: + consumer.close() + + self.getMapFnParam()(_read_data, + feature=inputCol, + label=labelCol, + vacab_size=vocab_s, + embedding_size=embedding_size, + params=params + ) + + return paramMapsRDD.map(lambda paramMap: (paramMap, _local_fit(paramMap))) + + def _fit(self, dataset): # pylint: disable=unused-argument + err_msgs = ["This function should not have been called", + "Please contact library maintainers to file a bug"] + raise NotImplementedError('\n'.join(err_msgs)) + + +class KafkaMockServer(object): + """ + Restrictions of KafkaMockServer: + * Make sure all data have been writen before consume. + * Poll function will just ignore max_records and just return all data in queue. 
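+    A minimal usage sketch, mirroring how the tests drive it (the topic argument is ignored,
+    and values must already be pickled; `row` stands for any picklable record):
+
+    .. code-block:: python
+        producer = KafkaMockServer(index=0)
+        producer.send("test", pickle.dumps(row))       # row: any picklable record
+        producer.send("test", pickle.dumps("_stop_"))
+        producer.flush()
+
+        consumer = KafkaMockServer()
+        messages = consumer.poll(timeout_ms=1000, max_records=64)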
+ """ + _kafka_mock_server_tmp_file_ = "/tmp/mock-kafka/" + sended = False + + def __init__(self, index=0): + super(KafkaMockServer, self).__init__() + self.index = index + self.queue = [] + if not os.path.exists(self._kafka_mock_server_tmp_file_): + os.mkdir(self._kafka_mock_server_tmp_file_) + + def send(self, topic, msg): + self.queue.append(pickle.loads(msg)) + + def flush(self): + with open(self._kafka_mock_server_tmp_file_ + str(self.index), "w") as f: + pickle.dump(self.queue, f) + self.queue = [] + + def close(self): + pass + + def poll(self, timeout_ms, max_records): + if self.sended: + return {} + + records = [] + for file in os.listdir(self._kafka_mock_server_tmp_file_): + with open(self._kafka_mock_server_tmp_file_ + file) as f: + tmp = pickle.load(f) + records += tmp + result = {} + couter = 0 + for i in records: + obj = MockRecord() + obj.value = pickle.dumps(i) + couter += 1 + result[str(couter) + "_"] = [obj] + self.sended = True + return result + + +class MockRecord(list): + pass diff --git a/python/sparkdl/param/shared_params.py b/python/sparkdl/param/shared_params.py index e169e891..7305fc8b 100644 --- a/python/sparkdl/param/shared_params.py +++ b/python/sparkdl/param/shared_params.py @@ -27,6 +27,7 @@ import sparkdl.utils.keras_model as kmutil + # From pyspark def keyword_only(func): @@ -36,15 +37,75 @@ def keyword_only(func): .. note:: Should only be used to wrap a method where first arg is `self` """ + @wraps(func) def wrapper(self, *args, **kwargs): if len(args) > 0: raise TypeError("Method %s forces keyword arguments." % func.__name__) self._input_kwargs = kwargs return func(self, **kwargs) + return wrapper +class KafkaParam(Params): + kafkaParam = Param(Params._dummy(), "kafkaParam", "kafka", typeConverter=TypeConverters.identity) + + def __init__(self): + super(KafkaParam, self).__init__() + + def setKafkaParam(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(kafkaParam=value) + + def getKafkaParam(self): + """ + Gets the value of inputCol or its default value. + """ + return self.getOrDefault(self.kafkaParam) + + +class FitParam(Params): + fitParam = Param(Params._dummy(), "fitParam", "hyper parameter when training", + typeConverter=TypeConverters.identity) + + def __init__(self): + super(FitParam, self).__init__() + + def setFitParam(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(fitParam=value) + + def getFitParam(self): + """ + Gets the value of inputCol or its default value. + """ + return self.getOrDefault(self.fitParam) + + +class MapFnParam(Params): + mapFnParam = Param(Params._dummy(), "mapFnParam", "Tensorflow func", typeConverter=TypeConverters.identity) + + def __init__(self): + super(MapFnParam, self).__init__() + + def setMapFnParam(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(mapFnParam=value) + + def getMapFnParam(self): + """ + Gets the value of inputCol or its default value. + """ + return self.getOrDefault(self.mapFnParam) + + class HasInputCol(Params): """ Mixin for param inputCol: input column name. 
@@ -68,6 +129,42 @@ def getInputCol(self): return self.getOrDefault(self.inputCol) +class HasEmbeddingSize(Params): + """ + Mixin for param embeddingSize + """ + + embeddingSize = Param(Params._dummy(), "embeddingSize", "word embedding size", + typeConverter=TypeConverters.toInt) + + def __init__(self): + super(HasEmbeddingSize, self).__init__() + + def setEmbeddingSize(self, value): + return self._set(embeddingSize=value) + + def getEmbeddingSize(self): + return self.getOrDefault(self.embeddingSize) + + +class HasSequenceLength(Params): + """ + Mixin for param sequenceLength + """ + + sequenceLength = Param(Params._dummy(), "sequenceLength", "sequence length", + typeConverter=TypeConverters.toInt) + + def __init__(self): + super(HasSequenceLength, self).__init__() + + def setSequenceLength(self, value): + return self._set(sequenceLength=value) + + def getSequenceLength(self): + return self.getOrDefault(self.sequenceLength) + + class HasOutputCol(Params): """ Mixin for param outputCol: output column name. @@ -92,12 +189,12 @@ def getOutputCol(self): """ return self.getOrDefault(self.outputCol) + ############################################ # New in sparkdl ############################################ class SparkDLTypeConverters(object): - @staticmethod def toStringOrTFTensor(value): if isinstance(value, tf.Tensor): diff --git a/python/sparkdl/tf_fun.py b/python/sparkdl/tf_fun.py new file mode 100644 index 00000000..b870f5f8 --- /dev/null +++ b/python/sparkdl/tf_fun.py @@ -0,0 +1,90 @@ +def map_fun(_read_data, **args): + import tensorflow as tf + EMBEDDING_SIZE = args["embedding_size"] + feature = args['feature'] + label = args['label'] + params = args['params']['fitParam'] + SEQUENCE_LENGTH = 64 + + def feed_dict(batch): + # Convert from dict of named arrays to two numpy arrays of the proper type + features = [] + for i in batch: + features.append(i['sentence_matrix']) + + # print("{} {}".format(feature, features)) + return features + + encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + def encoder(x, name="encoder"): + with tf.name_scope(name): + encoder_w1 = encoder_variables_dict["encoder_w1"] + encoder_b1 = encoder_variables_dict["encoder_b1"] + + layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) + + encoder_w2 = encoder_variables_dict["encoder_w2"] + encoder_b2 = encoder_variables_dict["encoder_b2"] + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) + return layer_2 + + def decoder(x, name="decoder"): + with tf.name_scope(name): + decoder_w1 = tf.Variable(tf.random_normal([128, 256])) + decoder_b1 = tf.Variable(tf.random_normal([256])) + + layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) + + decoder_w2 = tf.Variable( + tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) + decoder_b2 = tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) + return layer_2 + + tf.reset_default_graph + sess = tf.Session() + + input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") + flattened = tf.reshape(input_x, + [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) + + encoder_op = encoder(flattened) + + 
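+    # The decoder below reconstructs the flattened input from the 128-dim encoding;
+    # the "xent" scalar is really a cosine-distance reconstruction loss,
+    # sum(1 - cosine(y_pred, y_true)), minimized with RMSProp for params["epochs"]
+    # passes over batches pulled from _read_data.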
tf.add_to_collection('encoder_op', encoder_op) + + y_pred = decoder(encoder_op) + + y_true = flattened + + with tf.name_scope("xent"): + consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), + tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), + tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) + xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) + tf.summary.scalar("xent", xent) + + with tf.name_scope("train"): + # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) + train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) + + summ = tf.summary.merge_all() + + sess.run(tf.global_variables_initializer()) + + for i in range(params["epochs"]): + print("epoll {}".format(i)) + for data in _read_data(max_records=params["batch_size"]): + batch_data = feed_dict(data) + sess.run(train_step, feed_dict={input_x: batch_data}) + + sess.close() diff --git a/python/sparkdl/transformers/named_text.py b/python/sparkdl/transformers/named_text.py new file mode 100644 index 00000000..ef51cd0c --- /dev/null +++ b/python/sparkdl/transformers/named_text.py @@ -0,0 +1,134 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from keras.applications.imagenet_utils import decode_predictions +import numpy as np + +from pyspark.ml import Transformer +from pyspark.ml.param import Param, Params, TypeConverters + +import sparkdl.graph.utils as tfx +from sparkdl.image.imageIO import resizeImage +import sparkdl.transformers.keras_applications as keras_apps +from sparkdl.param import ( + keyword_only, HasInputCol, HasOutputCol, SparkDLTypeConverters) +from sparkdl.transformers.tf_text import TFTextTransformer + +SUPPORTED_MODELS = ["CNN", "LSTM"] + + +class DeepTextFeaturizer(Transformer, HasInputCol, HasOutputCol): + """ + todo + """ + modelName = Param(Params._dummy(), "modelName", "A deep learning model name") + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, modelName=None): + """ + __init__(self, inputCol=None, outputCol=None, modelName=None) + """ + super(DeepTextFeaturizer, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, modelName=None): + """ + setParams(self, inputCol=None, outputCol=None, modelName=None) + """ + kwargs = self._input_kwargs + self._set(**kwargs) + return self + + def setModelName(self, value): + return self._set(modelName=value) + + def getModelName(self): + return self.getOrDefault(self.modelName) + + def _transform(self, dataset): + transformer = _NamedTextTransformer(inputCol=self.getInputCol(), + outputCol=self.getOutputCol(), + modelName=self.getModelName(), featurize=True) + return transformer.transform(dataset) + + +class _NamedTextTransformer(Transformer, HasInputCol, HasOutputCol): + modelName = Param(Params._dummy(), "modelName", "A deep learning model name", + typeConverter=SparkDLTypeConverters.supportedNameConverter(SUPPORTED_MODELS)) + featurize = 
Param(Params._dummy(), "featurize", + "If true, output features. If false, output predictions. Either way the output is a vector.", + typeConverter=TypeConverters.toBoolean) + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False): + """ + __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False) + """ + super(_NamedTextTransformer, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + self._inputTensorName = None + self._outputTensorName = None + self._outputMode = None + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False): + """ + setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False) + """ + kwargs = self._input_kwargs + self._set(**kwargs) + return self + + def setModelName(self, value): + return self._set(modelName=value) + + def getModelName(self): + return self.getOrDefault(self.modelName) + + def setFeaturize(self, value): + return self._set(featurize=value) + + def getFeaturize(self): + return self.getOrDefault(self.featurize) + + def _transform(self, dataset): + modelGraphSpec = _buildTFGraphForName(self.getModelName(), self.getFeaturize()) + inputCol = self.getInputCol() + resizedCol = "__sdl_textResized" + tfTransformer = TFTextTransformer(inputCol=resizedCol, + outputCol=self.getOutputCol(), + graph=modelGraphSpec["graph"], + inputTensor=modelGraphSpec["inputTensorName"], + outputTensor=modelGraphSpec["outputTensorName"], + outputMode=modelGraphSpec["outputMode"]) + resizeUdf = resizeImage(modelGraphSpec["inputTensorSize"]) + result = tfTransformer.transform(dataset.withColumn(resizedCol, resizeUdf(inputCol))) + return result.drop(resizedCol) + + +def _buildTFGraphForName(name, featurize): + """ + Currently only supports pre-trained models from the Keras applications module. + """ + modelData = keras_apps.getKerasApplicationModel(name).getModelData(featurize) + sess = modelData["session"] + outputTensorName = modelData["outputTensorName"] + graph = tfx.strip_and_freeze_until([outputTensorName], sess.graph, sess, return_graph=True) + modelData["graph"] = graph + + return modelData diff --git a/python/sparkdl/transformers/tf_image.py b/python/sparkdl/transformers/tf_image.py index da37fcad..2ca33846 100644 --- a/python/sparkdl/transformers/tf_image.py +++ b/python/sparkdl/transformers/tf_image.py @@ -120,7 +120,7 @@ def _transform(self, dataset): with final_graph.as_default(): image = dataset[self.getInputCol()] image_df_exploded = (dataset - .withColumn("__sdl_image_height", image.height) + .n("__sdl_image_height", image.height) .withColumn("__sdl_image_width", image.width) .withColumn("__sdl_image_nchannels", image.nChannels) .withColumn("__sdl_image_data", image.data) diff --git a/python/sparkdl/transformers/tf_text.py b/python/sparkdl/transformers/tf_text.py new file mode 100644 index 00000000..b040adc0 --- /dev/null +++ b/python/sparkdl/transformers/tf_text.py @@ -0,0 +1,91 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +from pyspark.ml import Transformer +from pyspark.ml.feature import Word2Vec +from pyspark.sql.functions import udf +from pyspark.sql import functions as f +from pyspark.sql.types import * +from pyspark.sql.functions import lit +from sparkdl.param.shared_params import HasEmbeddingSize, HasSequenceLength +from sparkdl.param import ( + keyword_only, HasInputCol, HasOutputCol) +import re + +import sparkdl.utils.jvmapi as JVMAPI + + +class TFTextTransformer(Transformer, HasInputCol, HasOutputCol, HasEmbeddingSize, HasSequenceLength): + """ + Convert sentence/document to a 2-D Array eg. [[word embedding],[....]] in DataFrame which can be processed + directly by tensorflow or keras who's backend is tensorflow. + + Processing Steps: + + * Using Word2Vec compute Map(word -> vector) from input column, then broadcast the map. + * Process input column (which is text),split it with white space, replace word with vector, padding the result to + the same size. + * Create a new dataframe with columns like new 2-D array , vocab_size, embedding_size + * return then new dataframe + """ + VOCAB_SIZE = 'vocab_size' + EMBEDDING_SIZE = 'embedding_size' + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, embeddingSize=100, sequenceLength=64): + super(TFTextTransformer, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, embeddingSize=100, sequenceLength=64): + kwargs = self._input_kwargs + return self._set(**kwargs) + + def _transform(self, dataset): + word2vec = Word2Vec(vectorSize=self.getEmbeddingSize(), minCount=1, inputCol=self.getInputCol(), + outputCol="word_embedding") + word_embedding = dict( + word2vec.fit( + dataset.select(f.split(self.getInputCol(), "\\s+").alias(self.getInputCol()))).getVectors().rdd.map( + lambda p: (p.word, p.vector.values.tolist())).collect()) + word_embedding["unk"] = np.zeros(self.getEmbeddingSize()).tolist() + sc = JVMAPI._curr_sc() + local_word_embedding = sc.broadcast(word_embedding) + + def convert_word_to_index(s): + def _pad_sequences(sequences, maxlen=None): + new_sequences = [] + + if len(sequences) <= maxlen: + for i in range(maxlen - len(sequences)): + new_sequences.append(np.zeros(self.getEmbeddingSize()).tolist()) + return sequences + new_sequences + else: + return sequences[0:maxlen] + + new_q = [local_word_embedding.value[word] for word in re.split(r"\s+", s) if + word in local_word_embedding.value.keys()] + result = _pad_sequences(new_q, maxlen=self.getSequenceLength()) + return result + + cwti_udf = udf(convert_word_to_index, ArrayType(ArrayType(FloatType()))) + doc_martic = (dataset.withColumn(self.getOutputCol(), cwti_udf(self.getInputCol()).alias(self.getOutputCol())) + .withColumn(self.VOCAB_SIZE, lit(len(word_embedding))) + .withColumn(self.EMBEDDING_SIZE, lit(self.getEmbeddingSize())) + ) + + return doc_martic diff --git a/python/sparkdl/transformers/utils.py b/python/sparkdl/transformers/utils.py index b244365b..9964f3df 100644 --- a/python/sparkdl/transformers/utils.py +++ b/python/sparkdl/transformers/utils.py @@ -18,6 +18,8 @@ # image stuff IMAGE_INPUT_PLACEHOLDER_NAME = "sparkdl_image_input" +TEXT_INPUT_PLACEHOLDER_NAME = "sparkdl_text_input" + def imageInputPlaceholder(nChannels=None): return tf.placeholder(tf.float32, [None, None, None, nChannels], diff --git a/python/tests/Test.py b/python/tests/Test.py new file 
mode 100644 index 00000000..6327cda4 --- /dev/null +++ b/python/tests/Test.py @@ -0,0 +1,30 @@ +import os +os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' + +from sparkdl import readImages +from pyspark.sql.functions import lit +from pyspark.ml.evaluation import MulticlassClassificationEvaluator +from pyspark.ml.classification import LogisticRegression +from pyspark.ml import Pipeline +from sparkdl import DeepImageFeaturizer + +img_dir="/Users/allwefantasy/resources/images/flower_photos" + +tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(1)) +daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0)) + +tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4]) +daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4]) +train_df = tulips_train.unionAll(daisy_train) +test_df = tulips_test.unionAll(daisy_test) + +featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3") +lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label") +p = Pipeline(stages=[featurizer, lr]) + +p_model = p.fit(train_df) +tested_df = p_model.transform(test_df) +evaluator = MulticlassClassificationEvaluator(metricName="accuracy") +print("Test set accuracy = " + str(evaluator.evaluate(tested_df.select("prediction", "label")))) + +# h5py,pil \ No newline at end of file diff --git a/python/tests/Test2.py b/python/tests/Test2.py new file mode 100644 index 00000000..b535a602 --- /dev/null +++ b/python/tests/Test2.py @@ -0,0 +1,22 @@ +import os +from pyspark import SparkContext + +from sparkdl.transformers.tf_text import TFTextTransformer + +os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' + +input_col = "text" +output_col = "preds" + +sc = SparkContext.getOrCreate() +documentDF = sc.createDataFrame([ + ("Hi I heard about Spark".split(" "), 1), + ("I wish Java could use case classes".split(" "), 0), + ("Logistic regression models are neat".split(" "), 2) +], ["text", "preds"]) + +transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col) + +df = transformer.transform(documentDF) +df.show() \ No newline at end of file diff --git a/python/tests/resources/text/sample.txt b/python/tests/resources/text/sample.txt new file mode 100644 index 00000000..8c5e8d99 --- /dev/null +++ b/python/tests/resources/text/sample.txt @@ -0,0 +1,4 @@ +接下 来 介绍 一种 非常 重要 的 神经网络 卷积神经网络 +这种 神经 网络 在 计算机 视觉 领域 取得了 重大 的 成功,而且 在 自然语言 处理 等 其它 领域 也有 很好 应用 +深度学习 受到 大家 关注 很大 一个 原因 就是 Alex 实现 AlexNet( 一种 深度卷积神经网络 )在 LSVRC-2010 ImageNet +此后 卷积神经网络 及其 变种 被广泛 应用于 各种图像 相关 任务 \ No newline at end of file diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py new file mode 100644 index 00000000..0e8b359d --- /dev/null +++ b/python/tests/transformers/tf_text_test.py @@ -0,0 +1,126 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import cPickle as pickle +import shutil +import threading + +from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator, KafkaMockServer +from sparkdl.transformers.tf_text import TFTextTransformer +from sparkdl.tf_fun import map_fun +from ..tests import SparkDLTestCase + + +class TFTextTransformerTest(SparkDLTestCase): + def test_convertText(self): + input_col = "text" + output_col = "sentence_matrix" + + documentDF = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + # transform text column to sentence_matrix column which contains 2-D array. + transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) + + df = transformer.transform(documentDF) + data = df.collect() + self.assertEquals(len(data), 3) + for row in data: + self.assertEqual(len(row[output_col]), 64) + self.assertEqual(len(row[output_col][0]), 100) + + +class TFTextFileEstimatorTest(SparkDLTestCase): + def test_trainText(self): + import os + if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): + shutil.rmtree(KafkaMockServer()._kafka_mock_server_tmp_file_) + + input_col = "text" + output_col = "sentence_matrix" + + documentDF = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + # transform text column to sentence_matrix column which contains 2-D array. + transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) + + df = transformer.transform(documentDF) + + # create a estimator to training where map_fun contains tensorflow's code + estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", + kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", + "group_id": "sdl_1", "test_mode": False}, + fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], + mapFnParam=map_fun) + estimator.fit(df).collect() + + +class MockKakfaServerTest(SparkDLTestCase): + def test_mockKafkaServerProduce(self): + dataset = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + def _write_data(): + def _write_partition(index, d_iter): + producer = KafkaMockServer(index) + try: + for d in d_iter: + producer.send("", pickle.dumps(d)) + producer.send("", pickle.dumps("_stop_")) + producer.flush() + finally: + producer.close() + return [] + + dataset.rdd.mapPartitionsWithIndex(_write_partition).count() + + _write_data() + + def _consume(): + consumer = KafkaMockServer() + stop_count = 0 + while True: + messages = consumer.poll(timeout_ms=1000, max_records=64) + group_msgs = [] + for tp, records in messages.items(): + for record in records: + try: + msg_value = pickle.loads(record.value) + print(msg_value) + if msg_value == "_stop_": + stop_count += 1 + else: + group_msgs.append(msg_value) + except: + pass + if stop_count >= 8: + break + self.assertEquals(stop_count, 8) + + t = threading.Thread(target=_consume) + t.start() + t2 = threading.Thread(target=_consume) + t2.start() From 931d60327918f510e8225b4379d05e1017fd4e45 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Fri, 13 Oct 2017 17:28:39 +0800 Subject: [PATCH 07/29] set test_mode to True which can avoid 
to kafka dependency --- python/tests/transformers/tf_text_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index 0e8b359d..26f31d1f 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -69,7 +69,7 @@ def test_trainText(self): # create a estimator to training where map_fun contains tensorflow's code estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", - "group_id": "sdl_1", "test_mode": False}, + "group_id": "sdl_1", "test_mode": True}, fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], mapFnParam=map_fun) estimator.fit(df).collect() From d3c8a0c309cc30c1c8c7ca5718fa40cd5badd07f Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Fri, 13 Oct 2017 18:44:23 +0800 Subject: [PATCH 08/29] clean some file --- python/sparkdl/transformers/named_text.py | 134 ---------------------- python/sparkdl/transformers/tf_image.py | 2 +- python/tests/Test.py | 30 ----- python/tests/Test2.py | 22 ---- 4 files changed, 1 insertion(+), 187 deletions(-) delete mode 100644 python/sparkdl/transformers/named_text.py delete mode 100644 python/tests/Test.py delete mode 100644 python/tests/Test2.py diff --git a/python/sparkdl/transformers/named_text.py b/python/sparkdl/transformers/named_text.py deleted file mode 100644 index ef51cd0c..00000000 --- a/python/sparkdl/transformers/named_text.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2017 Databricks, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from keras.applications.imagenet_utils import decode_predictions -import numpy as np - -from pyspark.ml import Transformer -from pyspark.ml.param import Param, Params, TypeConverters - -import sparkdl.graph.utils as tfx -from sparkdl.image.imageIO import resizeImage -import sparkdl.transformers.keras_applications as keras_apps -from sparkdl.param import ( - keyword_only, HasInputCol, HasOutputCol, SparkDLTypeConverters) -from sparkdl.transformers.tf_text import TFTextTransformer - -SUPPORTED_MODELS = ["CNN", "LSTM"] - - -class DeepTextFeaturizer(Transformer, HasInputCol, HasOutputCol): - """ - todo - """ - modelName = Param(Params._dummy(), "modelName", "A deep learning model name") - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, modelName=None): - """ - __init__(self, inputCol=None, outputCol=None, modelName=None) - """ - super(DeepTextFeaturizer, self).__init__() - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - def setParams(self, inputCol=None, outputCol=None, modelName=None): - """ - setParams(self, inputCol=None, outputCol=None, modelName=None) - """ - kwargs = self._input_kwargs - self._set(**kwargs) - return self - - def setModelName(self, value): - return self._set(modelName=value) - - def getModelName(self): - return self.getOrDefault(self.modelName) - - def _transform(self, dataset): - transformer = _NamedTextTransformer(inputCol=self.getInputCol(), - outputCol=self.getOutputCol(), - modelName=self.getModelName(), featurize=True) - return transformer.transform(dataset) - - -class _NamedTextTransformer(Transformer, HasInputCol, HasOutputCol): - modelName = Param(Params._dummy(), "modelName", "A deep learning model name", - typeConverter=SparkDLTypeConverters.supportedNameConverter(SUPPORTED_MODELS)) - featurize = Param(Params._dummy(), "featurize", - "If true, output features. If false, output predictions. 
Either way the output is a vector.", - typeConverter=TypeConverters.toBoolean) - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False): - """ - __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False) - """ - super(_NamedTextTransformer, self).__init__() - kwargs = self._input_kwargs - self.setParams(**kwargs) - self._inputTensorName = None - self._outputTensorName = None - self._outputMode = None - - @keyword_only - def setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False): - """ - setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False) - """ - kwargs = self._input_kwargs - self._set(**kwargs) - return self - - def setModelName(self, value): - return self._set(modelName=value) - - def getModelName(self): - return self.getOrDefault(self.modelName) - - def setFeaturize(self, value): - return self._set(featurize=value) - - def getFeaturize(self): - return self.getOrDefault(self.featurize) - - def _transform(self, dataset): - modelGraphSpec = _buildTFGraphForName(self.getModelName(), self.getFeaturize()) - inputCol = self.getInputCol() - resizedCol = "__sdl_textResized" - tfTransformer = TFTextTransformer(inputCol=resizedCol, - outputCol=self.getOutputCol(), - graph=modelGraphSpec["graph"], - inputTensor=modelGraphSpec["inputTensorName"], - outputTensor=modelGraphSpec["outputTensorName"], - outputMode=modelGraphSpec["outputMode"]) - resizeUdf = resizeImage(modelGraphSpec["inputTensorSize"]) - result = tfTransformer.transform(dataset.withColumn(resizedCol, resizeUdf(inputCol))) - return result.drop(resizedCol) - - -def _buildTFGraphForName(name, featurize): - """ - Currently only supports pre-trained models from the Keras applications module. 
- """ - modelData = keras_apps.getKerasApplicationModel(name).getModelData(featurize) - sess = modelData["session"] - outputTensorName = modelData["outputTensorName"] - graph = tfx.strip_and_freeze_until([outputTensorName], sess.graph, sess, return_graph=True) - modelData["graph"] = graph - - return modelData diff --git a/python/sparkdl/transformers/tf_image.py b/python/sparkdl/transformers/tf_image.py index 2ca33846..da37fcad 100644 --- a/python/sparkdl/transformers/tf_image.py +++ b/python/sparkdl/transformers/tf_image.py @@ -120,7 +120,7 @@ def _transform(self, dataset): with final_graph.as_default(): image = dataset[self.getInputCol()] image_df_exploded = (dataset - .n("__sdl_image_height", image.height) + .withColumn("__sdl_image_height", image.height) .withColumn("__sdl_image_width", image.width) .withColumn("__sdl_image_nchannels", image.nChannels) .withColumn("__sdl_image_data", image.data) diff --git a/python/tests/Test.py b/python/tests/Test.py deleted file mode 100644 index 6327cda4..00000000 --- a/python/tests/Test.py +++ /dev/null @@ -1,30 +0,0 @@ -import os -os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' - -from sparkdl import readImages -from pyspark.sql.functions import lit -from pyspark.ml.evaluation import MulticlassClassificationEvaluator -from pyspark.ml.classification import LogisticRegression -from pyspark.ml import Pipeline -from sparkdl import DeepImageFeaturizer - -img_dir="/Users/allwefantasy/resources/images/flower_photos" - -tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(1)) -daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0)) - -tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4]) -daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4]) -train_df = tulips_train.unionAll(daisy_train) -test_df = tulips_test.unionAll(daisy_test) - -featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3") -lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label") -p = Pipeline(stages=[featurizer, lr]) - -p_model = p.fit(train_df) -tested_df = p_model.transform(test_df) -evaluator = MulticlassClassificationEvaluator(metricName="accuracy") -print("Test set accuracy = " + str(evaluator.evaluate(tested_df.select("prediction", "label")))) - -# h5py,pil \ No newline at end of file diff --git a/python/tests/Test2.py b/python/tests/Test2.py deleted file mode 100644 index b535a602..00000000 --- a/python/tests/Test2.py +++ /dev/null @@ -1,22 +0,0 @@ -import os -from pyspark import SparkContext - -from sparkdl.transformers.tf_text import TFTextTransformer - -os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' - -input_col = "text" -output_col = "preds" - -sc = SparkContext.getOrCreate() -documentDF = sc.createDataFrame([ - ("Hi I heard about Spark".split(" "), 1), - ("I wish Java could use case classes".split(" "), 0), - ("Logistic regression models are neat".split(" "), 2) -], ["text", "preds"]) - -transformer = TFTextTransformer( - inputCol=input_col, outputCol=output_col) - -df = transformer.transform(documentDF) -df.show() \ No newline at end of file From 6566c831614b937cdbe6bc74058350c856905da9 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Sat, 14 Oct 2017 16:46:00 +0800 Subject: [PATCH 09/29] Add TFoS support --- .../estimators/tf_text_file_estimator.py | 63 +++++- python/sparkdl/param/shared_params.py | 21 ++ python/sparkdl/tf_fun.py | 204 +++++++++++------- python/tests/TFoSTest.py | 174 
+++++++++++++++ python/tests/transformers/tf_text_test.py | 3 +- 5 files changed, 379 insertions(+), 86 deletions(-) create mode 100644 python/tests/TFoSTest.py diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py index 278ab8e5..83b647d2 100644 --- a/python/sparkdl/estimators/tf_text_file_estimator.py +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -28,10 +28,11 @@ from kafka import KafkaConsumer from kafka import KafkaProducer from pyspark.ml import Estimator +from tensorflowonspark import TFCluster from sparkdl.param import ( keyword_only, HasLabelCol, HasInputCol, HasOutputCol) -from sparkdl.param.shared_params import KafkaParam, FitParam, MapFnParam +from sparkdl.param.shared_params import KafkaParam, FitParam, MapFnParam, RunningMode import sparkdl.utils.jvmapi as JVMAPI __all__ = ['TFTextFileEstimator'] @@ -39,7 +40,8 @@ logger = logging.getLogger('sparkdl') -class TFTextFileEstimator(Estimator, HasInputCol, HasOutputCol, HasLabelCol, KafkaParam, FitParam, MapFnParam): +class TFTextFileEstimator(Estimator, HasInputCol, HasOutputCol, HasLabelCol, KafkaParam, FitParam, RunningMode, + MapFnParam): """ Build a Estimator from tensorflow or keras when backend is tensorflow. @@ -111,13 +113,15 @@ def feed_dict(batch): """ @keyword_only - def __init__(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, mapFnParam=None): + def __init__(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, + runningMode="Normal", mapFnParam=None): super(TFTextFileEstimator, self).__init__() kwargs = self._input_kwargs self.setParams(**kwargs) @keyword_only - def setParams(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, mapFnParam=None): + def setParams(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, + runningMode="Normal", mapFnParam=None): kwargs = self._input_kwargs return self._set(**kwargs) @@ -136,7 +140,10 @@ def fit(self, dataset, params=None): else: raise ValueError("Params must be either a param map or a list/tuple of param maps, " "but got %s." 
% type(params)) - return self._fitInParallel(dataset, paramMaps) + if self.getRunningMode() == "TFoS": + return self._fitInCluster(dataset, paramMaps) + else: + return self._fitInParallel(dataset, paramMaps) def _validateParams(self): """ @@ -149,6 +156,41 @@ def _validateParams(self): raise ValueError("Output column must be defined") return True + def _clusterModelDefaultValue(self, sc, args): + if "cluster_size" not in args: + executors = sc._conf.get("spark.executor.instances") + num_executors = int(executors) if executors is not None else 1 + args['cluster_size'] = num_executors + num_ps = 1 + if "num_ps" not in args: + args['num_ps'] = 1 + if "tensorboard" not in args: + args['tensorboard'] = None + return args + + def _fitInCluster(self, dataset, paramMaps): + sc = JVMAPI._curr_sc() + + temp_item = dataset.take(1)[0] + vocab_s = temp_item["vocab_size"] + embedding_size = temp_item["embedding_size"] + + baseParamMap = self.extractParamMap() + baseParamDict = dict([(param.name, val) for param, val in baseParamMap.items()]) + + args = self._clusterModelDefaultValue(sc, paramMaps[0]) + args["feature"] = self.getInputCol() + args["label"] = self.getLabelCol() + args["vacab_size"] = vocab_s + args["embedding_size"] = embedding_size + args["params"] = baseParamDict + + cluster = TFCluster.run(sc, self.getMapFnParam(), args, args['cluster_size'], args['num_ps'], + args['tensorboard'], + TFCluster.InputMode.SPARK) + cluster.train(dataset.rdd, args["epochs"]) + cluster.shutdown() + def _fitInParallel(self, dataset, paramMaps): inputCol = self.getInputCol() @@ -245,12 +287,11 @@ def _read_data(max_records=64): finally: consumer.close() - self.getMapFnParam()(_read_data, - feature=inputCol, - label=labelCol, - vacab_size=vocab_s, - embedding_size=embedding_size, - params=params + self.getMapFnParam()(args=dict(feature=inputCol, + label=labelCol, + vacab_size=vocab_s, + embedding_size=embedding_size, + params=params), ctx=None, _read_data=_read_data ) return paramMapsRDD.map(lambda paramMap: (paramMap, _local_fit(paramMap))) diff --git a/python/sparkdl/param/shared_params.py b/python/sparkdl/param/shared_params.py index 7305fc8b..04d4b3cf 100644 --- a/python/sparkdl/param/shared_params.py +++ b/python/sparkdl/param/shared_params.py @@ -147,6 +147,27 @@ def getEmbeddingSize(self): return self.getOrDefault(self.embeddingSize) +class RunningMode(Params): + """ + Mixin for param RunningMode + * TFoS + * Normal + """ + + runningMode = Param(Params._dummy(), "runningMode", "based on TFoS or Normal which is used to " + "hyper parameter tuning", + typeConverter=TypeConverters.toString) + + def __init__(self): + super(RunningMode, self).__init__() + + def setRunningMode(self, value): + return self._set(runningMode=value) + + def getRunningMode(self): + return self.getOrDefault(self.runningMode) + + class HasSequenceLength(Params): """ Mixin for param sequenceLength diff --git a/python/sparkdl/tf_fun.py b/python/sparkdl/tf_fun.py index b870f5f8..68025604 100644 --- a/python/sparkdl/tf_fun.py +++ b/python/sparkdl/tf_fun.py @@ -1,11 +1,28 @@ -def map_fun(_read_data, **args): +def map_fun(args={}, ctx=None, _read_data=None): + from tensorflowonspark import TFNode + from datetime import datetime + import math + import numpy import tensorflow as tf + import time + + print(args) + EMBEDDING_SIZE = args["embedding_size"] feature = args['feature'] label = args['label'] params = args['params']['fitParam'] + print(params) SEQUENCE_LENGTH = 64 + clusterMode = False if ctx is None else True + + if clusterMode and 
ctx.job_name == "ps": + time.sleep((ctx.worker_num + 1) * 5) + + if clusterMode: + cluster, server = TFNode.start_cluster_server(ctx, 1) + def feed_dict(batch): # Convert from dict of named arrays to two numpy arrays of the proper type features = [] @@ -15,76 +32,115 @@ def feed_dict(batch): # print("{} {}".format(feature, features)) return features - encoder_variables_dict = { - "encoder_w1": tf.Variable( - tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), - "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), - "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), - "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") - } - - def encoder(x, name="encoder"): - with tf.name_scope(name): - encoder_w1 = encoder_variables_dict["encoder_w1"] - encoder_b1 = encoder_variables_dict["encoder_b1"] - - layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) - - encoder_w2 = encoder_variables_dict["encoder_w2"] - encoder_b2 = encoder_variables_dict["encoder_b2"] - - layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) - return layer_2 - - def decoder(x, name="decoder"): - with tf.name_scope(name): - decoder_w1 = tf.Variable(tf.random_normal([128, 256])) - decoder_b1 = tf.Variable(tf.random_normal([256])) - - layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) - - decoder_w2 = tf.Variable( - tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) - decoder_b2 = tf.Variable( - tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) - - layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) - return layer_2 - - tf.reset_default_graph - sess = tf.Session() - - input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") - flattened = tf.reshape(input_x, - [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) - - encoder_op = encoder(flattened) - - tf.add_to_collection('encoder_op', encoder_op) - - y_pred = decoder(encoder_op) - - y_true = flattened - - with tf.name_scope("xent"): - consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), - tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), - tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) - xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) - tf.summary.scalar("xent", xent) - - with tf.name_scope("train"): - # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) - train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) - - summ = tf.summary.merge_all() - - sess.run(tf.global_variables_initializer()) - - for i in range(params["epochs"]): - print("epoll {}".format(i)) - for data in _read_data(max_records=params["batch_size"]): - batch_data = feed_dict(data) - sess.run(train_step, feed_dict={input_x: batch_data}) - - sess.close() + def build_graph(): + encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + def encoder(x, name="encoder"): + with tf.name_scope(name): + encoder_w1 = encoder_variables_dict["encoder_w1"] + encoder_b1 = encoder_variables_dict["encoder_b1"] + + layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) + + encoder_w2 = encoder_variables_dict["encoder_w2"] + encoder_b2 = 
encoder_variables_dict["encoder_b2"] + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) + return layer_2 + + def decoder(x, name="decoder"): + with tf.name_scope(name): + decoder_w1 = tf.Variable(tf.random_normal([128, 256])) + decoder_b1 = tf.Variable(tf.random_normal([256])) + + layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) + + decoder_w2 = tf.Variable( + tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) + decoder_b2 = tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) + return layer_2 + + tf.reset_default_graph + + input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") + flattened = tf.reshape(input_x, + [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) + + encoder_op = encoder(flattened) + + tf.add_to_collection('encoder_op', encoder_op) + + y_pred = decoder(encoder_op) + + y_true = flattened + + with tf.name_scope("xent"): + consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), + tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), + tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) + xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) + tf.summary.scalar("xent", xent) + + with tf.name_scope("train"): + # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) + train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) + summ = tf.summary.merge_all() + global_step = tf.Variable(0) + init_op = tf.global_variables_initializer() + return input_x, init_op, train_step, xent, global_step, summ + + def train_with_cluster(input_x, init_op, train_step, xent, global_step, summ): + + logdir = TFNode.hdfs_path(ctx, params['model']) if clusterMode else None + sv = tf.train.Supervisor(is_chief=ctx.task_index == 0, + logdir=logdir, + init_op=init_op, + summary_op=None, + saver=None, + global_step=global_step, + stop_grace_secs=300, + save_model_secs=10) + with sv.managed_session(server.target) as sess: + tf_feed = TFNode.DataFeed(ctx.mgr, True) + step = 0 + + while not sv.should_stop() and not tf_feed.should_stop() and step < 100: + data = tf_feed.next_batch(params["batch_size"]) + batch_data = feed_dict(data) + step += 1 + _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) + print("global_step:{} xent:{}".format(x, g)) + + if sv.should_stop() or step >= args.steps: + tf_feed.terminate() + sv.stop() + + def train(input_x, init_op, train_step, xent, global_step, summ): + + with tf.Session() as sess: + sess.run(init_op) + for data in _read_data(max_records=params["batch_size"]): + batch_data = feed_dict(data) + _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) + print("global_step:{} xent:{}".format(x, g)) + + if clusterMode and ctx.job_name == "ps": + server.join() + elif clusterMode and ctx.job_name == "worker": + with tf.device(tf.train.replica_device_setter( + worker_device="/job:worker/task:%d" % ctx.task_index, + cluster=cluster)): + input_x, init_op, train_step, xent, global_step, summ = build_graph() + train_with_cluster(input_x, init_op, train_step, xent, global_step, summ) + else: + input_x, init_op, train_step, xent, global_step, summ = build_graph() + train(input_x, init_op, train_step, xent, global_step, summ) \ No newline at end of file diff --git a/python/tests/TFoSTest.py b/python/tests/TFoSTest.py new file mode 100644 index 00000000..572a1a3c --- /dev/null +++ b/python/tests/TFoSTest.py @@ 
-0,0 +1,174 @@ +from pyspark.sql import SparkSession +from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator, KafkaMockServer +from sparkdl.transformers.tf_text import TFTextTransformer + + +def map_fun(args={}, ctx=None, _read_data=None): + from tensorflowonspark import TFNode + from datetime import datetime + import math + import numpy + import tensorflow as tf + import time + + print(args) + + EMBEDDING_SIZE = args["embedding_size"] + feature = args['feature'] + label = args['label'] + params = args['params']['fitParam'][0] + SEQUENCE_LENGTH = 64 + + clusterMode = False if ctx is None else True + + if clusterMode and ctx.job_name == "ps": + time.sleep((ctx.worker_num + 1) * 5) + + if clusterMode: + cluster, server = TFNode.start_cluster_server(ctx, 1) + + def feed_dict(batch): + # Convert from dict of named arrays to two numpy arrays of the proper type + features = [] + for i in batch: + features.append(i['sentence_matrix']) + + # print("{} {}".format(feature, features)) + return features + + def build_graph(): + encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + def encoder(x, name="encoder"): + with tf.name_scope(name): + encoder_w1 = encoder_variables_dict["encoder_w1"] + encoder_b1 = encoder_variables_dict["encoder_b1"] + + layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) + + encoder_w2 = encoder_variables_dict["encoder_w2"] + encoder_b2 = encoder_variables_dict["encoder_b2"] + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) + return layer_2 + + def decoder(x, name="decoder"): + with tf.name_scope(name): + decoder_w1 = tf.Variable(tf.random_normal([128, 256])) + decoder_b1 = tf.Variable(tf.random_normal([256])) + + layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) + + decoder_w2 = tf.Variable( + tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) + decoder_b2 = tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) + return layer_2 + + tf.reset_default_graph + + input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") + flattened = tf.reshape(input_x, + [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) + + encoder_op = encoder(flattened) + + tf.add_to_collection('encoder_op', encoder_op) + + y_pred = decoder(encoder_op) + + y_true = flattened + + with tf.name_scope("xent"): + consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), + tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), + tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) + xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) + tf.summary.scalar("xent", xent) + + with tf.name_scope("train"): + # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) + train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) + summ = tf.summary.merge_all() + global_step = tf.Variable(0) + init_op = tf.global_variables_initializer() + return input_x, init_op, train_step, xent, global_step, summ + + def train_with_cluster(input_x, init_op, train_step, xent, global_step, summ): + + logdir = TFNode.hdfs_path(ctx, params['model']) if clusterMode else None + sv = 
tf.train.Supervisor(is_chief=ctx.task_index == 0, + logdir=logdir, + init_op=init_op, + summary_op=None, + saver=None, + global_step=global_step, + stop_grace_secs=300, + save_model_secs=10) + with sv.managed_session(server.target) as sess: + tf_feed = TFNode.DataFeed(ctx.mgr, True) + step = 0 + + while not sv.should_stop() and not tf_feed.should_stop() and step < 100: + data = tf_feed.next_batch(params["batch_size"]) + batch_data = feed_dict(data) + step += 1 + _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) + print("global_step:{} xent:{}".format(x, g)) + + if sv.should_stop() or step >= args.steps: + tf_feed.terminate() + sv.stop() + + def train(input_x, init_op, train_step, xent, global_step, summ): + + with tf.Session() as sess: + sess.run(init_op) + for data in _read_data(max_records=params["batch_size"]): + batch_data = feed_dict(data) + _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) + print("global_step:{} xent:{}".format(x, g)) + + if clusterMode and ctx.job_name == "ps": + server.join() + elif clusterMode and ctx.job_name == "worker": + with tf.device(tf.train.replica_device_setter( + worker_device="/job:worker/task:%d" % ctx.task_index, + cluster=cluster)): + input_x, init_op, train_step, xent, global_step, summ = build_graph() + train_with_cluster(input_x, init_op, train_step, xent, global_step, summ) + else: + input_x, init_op, train_step, xent, global_step, summ = build_graph() + train(input_x, init_op, train_step, xent, global_step, summ) + + +input_col = "text" +output_col = "sentence_matrix" + +session = SparkSession.builder.master("spark://allwefantasy:7077").appName("test").getOrCreate() +documentDF = session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) +], ["text", "preds"]) + +# transform text column to sentence_matrix column which contains 2-D array. 
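+# With embeddingSize=100 and sequenceLength=64 below, each row's sentence_matrix is a 64 x 100
+# float array; shorter sentences are zero-padded to the full sequence length and longer ones truncated.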
+transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) + +df = transformer.transform(documentDF) + +# create a estimator to training where map_fun contains tensorflow's code +estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", + fitParam=[{"epochs": 1, "cluster_size": 2, "batch_size": 64, "model": "/tmp/model"}], + runningMode="TFoS", + mapFnParam=map_fun) +estimator.fit(df).collect() diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index 26f31d1f..b23d7001 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -48,7 +48,7 @@ def test_convertText(self): class TFTextFileEstimatorTest(SparkDLTestCase): def test_trainText(self): import os - if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): + if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): shutil.rmtree(KafkaMockServer()._kafka_mock_server_tmp_file_) input_col = "text" @@ -71,6 +71,7 @@ def test_trainText(self): kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", "group_id": "sdl_1", "test_mode": True}, fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], + runningMode="Normal", mapFnParam=map_fun) estimator.fit(df).collect() From e99862b232fa3e9bcc41c27a69e8eff94af6e6fc Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Sat, 14 Oct 2017 20:32:44 +0800 Subject: [PATCH 10/29] Fix TFoSTest --- python/tests/TFoSTest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tests/TFoSTest.py b/python/tests/TFoSTest.py index 572a1a3c..4c95c12a 100644 --- a/python/tests/TFoSTest.py +++ b/python/tests/TFoSTest.py @@ -122,7 +122,7 @@ def train_with_cluster(input_x, init_op, train_step, xent, global_step, summ): batch_data = feed_dict(data) step += 1 _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) - print("global_step:{} xent:{}".format(x, g)) + print("global_step:{} xent:{}".format(g, x)) if sv.should_stop() or step >= args.steps: tf_feed.terminate() @@ -168,7 +168,7 @@ def train(input_x, init_op, train_step, xent, global_step, summ): # create a estimator to training where map_fun contains tensorflow's code estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", - fitParam=[{"epochs": 1, "cluster_size": 2, "batch_size": 64, "model": "/tmp/model"}], + fitParam=[{"epochs": 1, "cluster_size": 2, "batch_size": 1, "model": "/tmp/model"}], runningMode="TFoS", mapFnParam=map_fun) estimator.fit(df).collect() From ce68c298f13b42fcfce80448ffe6456af3013126 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Sun, 15 Oct 2017 11:59:11 +0800 Subject: [PATCH 11/29] add TFoS test --- python/tests/TFoSBaseSparkTest.py | 42 ++++++++++++++++++ python/tests/transformers/tf_text_test.py | 54 +++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 python/tests/TFoSBaseSparkTest.py diff --git a/python/tests/TFoSBaseSparkTest.py b/python/tests/TFoSBaseSparkTest.py new file mode 100644 index 00000000..afc6992d --- /dev/null +++ b/python/tests/TFoSBaseSparkTest.py @@ -0,0 +1,42 @@ +import sys +from pyspark import SparkContext, SparkConf +from pyspark.sql import SparkSession + +if sys.version_info[:2] <= (2, 6): + try: + import unittest2 as unittest + except ImportError: + sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') + sys.exit(1) +else: 
+ import unittest + + +class TFoSBaseSparkTest(unittest.TestCase): + """Base class for unittests using Spark. Sets up and tears down a cluster per test class""" + + @classmethod + def setUpClass(cls): + import os + master = os.getenv('MASTER') + assert master is not None, "Please start a Spark standalone cluster and export MASTER to your env." + + num_workers = os.getenv('SPARK_WORKER_INSTANCES') + assert num_workers is not None, "Please export SPARK_WORKER_INSTANCES to your env." + cls.num_workers = int(num_workers) + + spark_jars = os.getenv('SPARK_CLASSPATH') + assert spark_jars and 'tensorflow-hadoop' in spark_jars, "Please add path to tensorflow-hadoop-*.jar to SPARK_CLASSPATH." + + cls.conf = SparkConf().set('spark.jars', spark_jars) + cls.sc = SparkContext(master, cls.__name__, conf=cls.conf) + cls.spark = SparkSession.builder.getOrCreate() + + @classmethod + def tearDownClass(cls): + cls.spark.stop() + cls.sc.stop() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index b23d7001..07411c4c 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -16,9 +16,12 @@ import shutil import threading +from tensorflowonspark import TFNode + from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator, KafkaMockServer from sparkdl.transformers.tf_text import TFTextTransformer from sparkdl.tf_fun import map_fun +from ..tests import TFoSBaseSparkTest from ..tests import SparkDLTestCase @@ -76,6 +79,57 @@ def test_trainText(self): estimator.fit(df).collect() +class TFTextFileEstimatorOnTFoSTest(TFoSBaseSparkTest): + def trainText(self): + """ + To make this test work,Please: + 1. Start a Spark standalone cluster and export MASTER to your env, + 2. Make sure spark-deep-learning assembly in spark classpath. + 3. Change method 'trainText' to 'test_trainText' + """ + input_col = "text" + output_col = "sentence_matrix" + + documentDF = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + # transform text column to sentence_matrix column which contains 2-D array. 
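+        # The map_fun defined below reads batches through TFNode.DataFeed; with these settings
+        # every record it receives is a 64 x 100 sentence matrix, which the batch1 assertions check.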
+ transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) + + df = transformer.transform(documentDF) + + def map_fun(args={}, ctx=None, _read_data=None): + import time + self.assertTrue(ctx is not None) + self.assertTrue(_read_data is None) + self.assertTrue(args["params"]["fitParam"][0]["cluster_size"] == 2) + clusterMode = ctx is not None + if clusterMode and ctx.job_name == "ps": + time.sleep((ctx.worker_num + 1) * 5) + + if clusterMode: + cluster, server = TFNode.start_cluster_server(ctx, 1) + + data = TFNode.DataFeed(ctx.mgr, True) + batch1 = data.next_batch(1) + self.assertTrue(len(batch1) == 1) + self.assertTrue(len(batch1[0]) == 64) + self.assertTrue(len(batch1[0][0]) == 100) + # consume all + data.next_batch(100) + + estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", + fitParam=[ + {"epochs": 1, "cluster_size": 2, "batch_size": 1, "model": "/tmp/model"}], + runningMode="TFoS", + mapFnParam=map_fun) + estimator.fit(df).collect() + + class MockKakfaServerTest(SparkDLTestCase): def test_mockKafkaServerProduce(self): dataset = self.session.createDataFrame([ From 196128a62a9d5afd734e4456a54a3d32755faa42 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 10:15:45 +0800 Subject: [PATCH 12/29] example --- .../estimators/tf_text_file_estimator.py | 2 +- python/tests/TFoSTest.py | 1 + python/tests/transformers/tf_text_test.py | 100 +++++++++--------- 3 files changed, 52 insertions(+), 51 deletions(-) diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py index 83b647d2..7058c154 100644 --- a/python/sparkdl/estimators/tf_text_file_estimator.py +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -65,7 +65,7 @@ class TFTextFileEstimator(Estimator, HasInputCol, HasOutputCol, HasLabelCol, Kaf Then we create a tensorflow function. .. 
code-block:: python - def map_fun(_read_data, **args): + def map_fun(args={}, ctx=None, _read_data=None): import tensorflow as tf EMBEDDING_SIZE = args["embedding_size"] feature = args['feature'] diff --git a/python/tests/TFoSTest.py b/python/tests/TFoSTest.py index 4c95c12a..b21c3206 100644 --- a/python/tests/TFoSTest.py +++ b/python/tests/TFoSTest.py @@ -132,6 +132,7 @@ def train(input_x, init_op, train_step, xent, global_step, summ): with tf.Session() as sess: sess.run(init_op) + ## for i in range(echo) for data in _read_data(max_records=params["batch_size"]): batch_data = feed_dict(data) _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index 07411c4c..41f45bc7 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -21,7 +21,6 @@ from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator, KafkaMockServer from sparkdl.transformers.tf_text import TFTextTransformer from sparkdl.tf_fun import map_fun -from ..tests import TFoSBaseSparkTest from ..tests import SparkDLTestCase @@ -41,6 +40,7 @@ def test_convertText(self): inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) df = transformer.transform(documentDF) + df.show() data = df.collect() self.assertEquals(len(data), 3) for row in data: @@ -79,55 +79,55 @@ def test_trainText(self): estimator.fit(df).collect() -class TFTextFileEstimatorOnTFoSTest(TFoSBaseSparkTest): - def trainText(self): - """ - To make this test work,Please: - 1. Start a Spark standalone cluster and export MASTER to your env, - 2. Make sure spark-deep-learning assembly in spark classpath. - 3. Change method 'trainText' to 'test_trainText' - """ - input_col = "text" - output_col = "sentence_matrix" - - documentDF = self.session.createDataFrame([ - ("Hi I heard about Spark", 1), - ("I wish Java could use case classes", 0), - ("Logistic regression models are neat", 2) - ], ["text", "preds"]) - - # transform text column to sentence_matrix column which contains 2-D array. - transformer = TFTextTransformer( - inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) - - df = transformer.transform(documentDF) - - def map_fun(args={}, ctx=None, _read_data=None): - import time - self.assertTrue(ctx is not None) - self.assertTrue(_read_data is None) - self.assertTrue(args["params"]["fitParam"][0]["cluster_size"] == 2) - clusterMode = ctx is not None - if clusterMode and ctx.job_name == "ps": - time.sleep((ctx.worker_num + 1) * 5) - - if clusterMode: - cluster, server = TFNode.start_cluster_server(ctx, 1) - - data = TFNode.DataFeed(ctx.mgr, True) - batch1 = data.next_batch(1) - self.assertTrue(len(batch1) == 1) - self.assertTrue(len(batch1[0]) == 64) - self.assertTrue(len(batch1[0][0]) == 100) - # consume all - data.next_batch(100) - - estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", - fitParam=[ - {"epochs": 1, "cluster_size": 2, "batch_size": 1, "model": "/tmp/model"}], - runningMode="TFoS", - mapFnParam=map_fun) - estimator.fit(df).collect() +# class TFTextFileEstimatorOnTFoSTest(TFoSBaseSparkTest): +# def trainText(self): +# """ +# To make this test work,Please: +# 1. Start a Spark standalone cluster and export MASTER to your env, +# 2. Make sure spark-deep-learning assembly in spark classpath. +# 3. 
Change method 'trainText' to 'test_trainText' +# """ +# input_col = "text" +# output_col = "sentence_matrix" +# +# documentDF = self.session.createDataFrame([ +# ("Hi I heard about Spark", 1), +# ("I wish Java could use case classes", 0), +# ("Logistic regression models are neat", 2) +# ], ["text", "preds"]) +# +# # transform text column to sentence_matrix column which contains 2-D array. +# transformer = TFTextTransformer( +# inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) +# +# df = transformer.transform(documentDF) +# +# def map_fun(args={}, ctx=None, _read_data=None): +# import time +# self.assertTrue(ctx is not None) +# self.assertTrue(_read_data is None) +# self.assertTrue(args["params"]["fitParam"][0]["cluster_size"] == 2) +# clusterMode = ctx is not None +# if clusterMode and ctx.job_name == "ps": +# time.sleep((ctx.worker_num + 1) * 5) +# +# if clusterMode: +# cluster, server = TFNode.start_cluster_server(ctx, 1) +# +# data = TFNode.DataFeed(ctx.mgr, True) +# batch1 = data.next_batch(1) +# self.assertTrue(len(batch1) == 1) +# self.assertTrue(len(batch1[0]) == 64) +# self.assertTrue(len(batch1[0][0]) == 100) +# # consume all +# data.next_batch(100) +# +# estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", +# fitParam=[ +# {"epochs": 1, "cluster_size": 2, "batch_size": 1, "model": "/tmp/model"}], +# runningMode="TFoS", +# mapFnParam=map_fun) +# estimator.fit(df).collect() class MockKakfaServerTest(SparkDLTestCase): From 4e8b11ed1d686c2bf859be2b8519e8f45d7fa0a6 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 10:25:48 +0800 Subject: [PATCH 13/29] move tensorflow map_fun to tf_text_test.py and modify the signature to support integrating TFoS infuture --- python/requirements.txt | 1 + .../estimators/tf_text_file_estimator.py | 11 +-- python/sparkdl/tf_fun.py | 90 ------------------ python/tests/transformers/tf_text_test.py | 93 ++++++++++++++++++- 4 files changed, 97 insertions(+), 98 deletions(-) delete mode 100644 python/sparkdl/tf_fun.py diff --git a/python/requirements.txt b/python/requirements.txt index a98a4d17..39981df5 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -9,3 +9,4 @@ pygments>=2.2.0 tensorflow==1.3.0 pandas>=0.19.1 six>=1.10.0 +kafka-python>=1.3.5 diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py index 278ab8e5..1f2fb116 100644 --- a/python/sparkdl/estimators/tf_text_file_estimator.py +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -245,12 +245,11 @@ def _read_data(max_records=64): finally: consumer.close() - self.getMapFnParam()(_read_data, - feature=inputCol, - label=labelCol, - vacab_size=vocab_s, - embedding_size=embedding_size, - params=params + self.getMapFnParam()(args={"feature": inputCol, + "label": labelCol, + "vacab_size": vocab_s, + "embedding_size": embedding_size, + "params": params}, ctx=None, _read_data=_read_data, ) return paramMapsRDD.map(lambda paramMap: (paramMap, _local_fit(paramMap))) diff --git a/python/sparkdl/tf_fun.py b/python/sparkdl/tf_fun.py deleted file mode 100644 index b870f5f8..00000000 --- a/python/sparkdl/tf_fun.py +++ /dev/null @@ -1,90 +0,0 @@ -def map_fun(_read_data, **args): - import tensorflow as tf - EMBEDDING_SIZE = args["embedding_size"] - feature = args['feature'] - label = args['label'] - params = args['params']['fitParam'] - SEQUENCE_LENGTH = 64 - - def feed_dict(batch): - # Convert from dict of named arrays to two 
numpy arrays of the proper type - features = [] - for i in batch: - features.append(i['sentence_matrix']) - - # print("{} {}".format(feature, features)) - return features - - encoder_variables_dict = { - "encoder_w1": tf.Variable( - tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), - "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), - "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), - "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") - } - - def encoder(x, name="encoder"): - with tf.name_scope(name): - encoder_w1 = encoder_variables_dict["encoder_w1"] - encoder_b1 = encoder_variables_dict["encoder_b1"] - - layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) - - encoder_w2 = encoder_variables_dict["encoder_w2"] - encoder_b2 = encoder_variables_dict["encoder_b2"] - - layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) - return layer_2 - - def decoder(x, name="decoder"): - with tf.name_scope(name): - decoder_w1 = tf.Variable(tf.random_normal([128, 256])) - decoder_b1 = tf.Variable(tf.random_normal([256])) - - layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) - - decoder_w2 = tf.Variable( - tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) - decoder_b2 = tf.Variable( - tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) - - layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) - return layer_2 - - tf.reset_default_graph - sess = tf.Session() - - input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") - flattened = tf.reshape(input_x, - [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) - - encoder_op = encoder(flattened) - - tf.add_to_collection('encoder_op', encoder_op) - - y_pred = decoder(encoder_op) - - y_true = flattened - - with tf.name_scope("xent"): - consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), - tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), - tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) - xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) - tf.summary.scalar("xent", xent) - - with tf.name_scope("train"): - # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) - train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) - - summ = tf.summary.merge_all() - - sess.run(tf.global_variables_initializer()) - - for i in range(params["epochs"]): - print("epoll {}".format(i)) - for data in _read_data(max_records=params["batch_size"]): - batch_data = feed_dict(data) - sess.run(train_step, feed_dict={input_x: batch_data}) - - sess.close() diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index 26f31d1f..25cb1e0d 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -18,10 +18,99 @@ from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator, KafkaMockServer from sparkdl.transformers.tf_text import TFTextTransformer -from sparkdl.tf_fun import map_fun from ..tests import SparkDLTestCase +def map_fun(args={}, ctx=None, _read_data=None): + import tensorflow as tf + EMBEDDING_SIZE = args["embedding_size"] + params = args['params']['fitParam'] + SEQUENCE_LENGTH = 64 + + def feed_dict(batch): + # Convert from dict of named arrays to two numpy arrays of the proper type + features = [] + for i in batch: + features.append(i['sentence_matrix']) + + # print("{} {}".format(feature, features)) + return features + + 
encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + def encoder(x, name="encoder"): + with tf.name_scope(name): + encoder_w1 = encoder_variables_dict["encoder_w1"] + encoder_b1 = encoder_variables_dict["encoder_b1"] + + layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) + + encoder_w2 = encoder_variables_dict["encoder_w2"] + encoder_b2 = encoder_variables_dict["encoder_b2"] + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) + return layer_2 + + def decoder(x, name="decoder"): + with tf.name_scope(name): + decoder_w1 = tf.Variable(tf.random_normal([128, 256])) + decoder_b1 = tf.Variable(tf.random_normal([256])) + + layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) + + decoder_w2 = tf.Variable( + tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) + decoder_b2 = tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) + return layer_2 + + tf.reset_default_graph + sess = tf.Session() + + input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") + flattened = tf.reshape(input_x, + [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) + + encoder_op = encoder(flattened) + + tf.add_to_collection('encoder_op', encoder_op) + + y_pred = decoder(encoder_op) + + y_true = flattened + + with tf.name_scope("xent"): + consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), + tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), + tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) + xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) + tf.summary.scalar("xent", xent) + + with tf.name_scope("train"): + # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) + train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) + + summ = tf.summary.merge_all() + + sess.run(tf.global_variables_initializer()) + + for i in range(params["epochs"]): + print("epoll {}".format(i)) + for data in _read_data(max_records=params["batch_size"]): + batch_data = feed_dict(data) + sess.run(train_step, feed_dict={input_x: batch_data}) + + sess.close() + + class TFTextTransformerTest(SparkDLTestCase): def test_convertText(self): input_col = "text" @@ -48,7 +137,7 @@ def test_convertText(self): class TFTextFileEstimatorTest(SparkDLTestCase): def test_trainText(self): import os - if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): + if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): shutil.rmtree(KafkaMockServer()._kafka_mock_server_tmp_file_) input_col = "text" From 15a0c400df7586516a6e03c652279ab80d011f0f Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 11 Oct 2017 16:11:24 +0800 Subject: [PATCH 14/29] 1. Support NLP non-distribued training 2. 
Introduce Kafka to avoid broadcast huge tranning data --- .../estimators/tf_text_file_estimator.py | 312 ++++++++++++++++++ python/sparkdl/param/shared_params.py | 99 +++++- python/sparkdl/tf_fun.py | 90 +++++ python/sparkdl/transformers/named_text.py | 134 ++++++++ python/sparkdl/transformers/tf_image.py | 2 +- python/sparkdl/transformers/tf_text.py | 91 +++++ python/sparkdl/transformers/utils.py | 2 + python/tests/Test.py | 30 ++ python/tests/Test2.py | 22 ++ python/tests/resources/text/sample.txt | 4 + python/tests/transformers/tf_text_test.py | 126 +++++++ 11 files changed, 910 insertions(+), 2 deletions(-) create mode 100644 python/sparkdl/estimators/tf_text_file_estimator.py create mode 100644 python/sparkdl/tf_fun.py create mode 100644 python/sparkdl/transformers/named_text.py create mode 100644 python/sparkdl/transformers/tf_text.py create mode 100644 python/tests/Test.py create mode 100644 python/tests/Test2.py create mode 100644 python/tests/resources/text/sample.txt create mode 100644 python/tests/transformers/tf_text_test.py diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py new file mode 100644 index 00000000..278ab8e5 --- /dev/null +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -0,0 +1,312 @@ +# +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# pylint: disable=protected-access +from __future__ import absolute_import, division, print_function + +import logging +import threading +import time +import os +import shutil + +import cPickle as pickle + +from kafka import KafkaConsumer +from kafka import KafkaProducer +from pyspark.ml import Estimator + +from sparkdl.param import ( + keyword_only, HasLabelCol, HasInputCol, HasOutputCol) +from sparkdl.param.shared_params import KafkaParam, FitParam, MapFnParam +import sparkdl.utils.jvmapi as JVMAPI + +__all__ = ['TFTextFileEstimator'] + +logger = logging.getLogger('sparkdl') + + +class TFTextFileEstimator(Estimator, HasInputCol, HasOutputCol, HasLabelCol, KafkaParam, FitParam, MapFnParam): + """ + Build a Estimator from tensorflow or keras when backend is tensorflow. + + First,assume we have data in dataframe like following. + + .. code-block:: python + documentDF = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + transformer = TFTextTransformer( + inputCol=input_col, + outputCol=output_col) + + df = transformer.transform(documentDF) + + TFTextTransformer will transform text column to `output_col`, which is 2-D array. + + Then we create a tensorflow function. + + .. 
code-block:: python + def map_fun(_read_data, **args): + import tensorflow as tf + EMBEDDING_SIZE = args["embedding_size"] + feature = args['feature'] + label = args['label'] + params = args['params']['fitParam'] + SEQUENCE_LENGTH = 64 + + def feed_dict(batch): + # Convert from dict of named arrays to two numpy arrays of the proper type + features = [] + for i in batch: + features.append(i['sentence_matrix']) + + # print("{} {}".format(feature, features)) + return features + + encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + _read_data is a data generator. args provide hyper parameteres configured in this estimator. + + here is how to use _read_data: + + .. code-block:: python + for data in _read_data(max_records=params.batch_size): + batch_data = feed_dict(data) + sess.run(train_step, feed_dict={input_x: batch_data}) + + finally we can create TFTextFileEstimator to train our model: + + .. code-block:: python + estimator = TFTextFileEstimator(inputCol="sentence_matrix", + outputCol="sentence_matrix", labelCol="preds", + kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", + "group_id": "sdl_1"}, + fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], + mapFnParam=map_fun) + estimator.fit(df) + + """ + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, mapFnParam=None): + super(TFTextFileEstimator, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, labelCol=None, kafkaParam=None, fitParam=None, mapFnParam=None): + kwargs = self._input_kwargs + return self._set(**kwargs) + + def fit(self, dataset, params=None): + self._validateParams() + if params is None: + paramMaps = self.getFitParam() + elif isinstance(params, (list, tuple)): + if len(params) == 0: + paramMaps = [dict()] + else: + self._validateFitParams(params) + paramMaps = params + elif isinstance(params, dict): + paramMaps = [params] + else: + raise ValueError("Params must be either a param map or a list/tuple of param maps, " + "but got %s." % type(params)) + return self._fitInParallel(dataset, paramMaps) + + def _validateParams(self): + """ + Check Param values so we can throw errors on the driver, rather than workers. 
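+        Currently only `inputCol` and `outputCol` are required to be defined.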
+ :return: True if parameters are valid + """ + if not self.isDefined(self.inputCol): + raise ValueError("Input column must be defined") + if not self.isDefined(self.outputCol): + raise ValueError("Output column must be defined") + return True + + def _fitInParallel(self, dataset, paramMaps): + + inputCol = self.getInputCol() + labelCol = self.getLabelCol() + + from time import gmtime, strftime + kafaParams = self.getKafkaParam() + topic = kafaParams["topic"] + "_" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + group_id = kafaParams["group_id"] + bootstrap_servers = kafaParams["bootstrap_servers"] + kafka_test_mode = kafaParams["test_mode"] if "test_mode" in kafaParams else False + + def _write_data(): + def _write_partition(index, d_iter): + producer = KafkaMockServer(index) if kafka_test_mode else KafkaProducer( + bootstrap_servers=bootstrap_servers) + try: + for d in d_iter: + producer.send(topic, pickle.dumps(d)) + producer.send(topic, pickle.dumps("_stop_")) + producer.flush() + finally: + producer.close() + return [] + + dataset.rdd.mapPartitionsWithIndex(_write_partition).count() + + if kafka_test_mode: + _write_data() + else: + t = threading.Thread(target=_write_data) + t.start() + + stop_flag_num = dataset.rdd.getNumPartitions() + temp_item = dataset.take(1)[0] + vocab_s = temp_item["vocab_size"] + embedding_size = temp_item["embedding_size"] + + sc = JVMAPI._curr_sc() + + paramMapsRDD = sc.parallelize(paramMaps, numSlices=len(paramMaps)) + + # Obtain params for this estimator instance + baseParamMap = self.extractParamMap() + baseParamDict = dict([(param.name, val) for param, val in baseParamMap.items()]) + baseParamDictBc = sc.broadcast(baseParamDict) + + def _local_fit(override_param_map): + # Update params + params = baseParamDictBc.value + params["fitParam"] = override_param_map + + def _read_data(max_records=64): + consumer = KafkaMockServer() if kafka_test_mode else KafkaConsumer(topic, + group_id=group_id, + bootstrap_servers=bootstrap_servers, + auto_offset_reset="earliest", + enable_auto_commit=False + ) + try: + stop_count = 0 + fail_msg_count = 0 + while True: + if kafka_test_mode: + time.sleep(1) + messages = consumer.poll(timeout_ms=1000, max_records=max_records) + group_msgs = [] + for tp, records in messages.items(): + for record in records: + try: + msg_value = pickle.loads(record.value) + if msg_value == "_stop_": + stop_count += 1 + else: + group_msgs.append(msg_value) + except: + fail_msg_count += 0 + pass + if len(group_msgs) > 0: + yield group_msgs + + if kafka_test_mode: + print( + "stop_count = {} " + "group_msgs = {} " + "stop_flag_num = {} " + "fail_msg_count = {}".format(stop_count, + len(group_msgs), + stop_flag_num, + fail_msg_count)) + + if stop_count >= stop_flag_num and len(group_msgs) == 0: + break + finally: + consumer.close() + + self.getMapFnParam()(_read_data, + feature=inputCol, + label=labelCol, + vacab_size=vocab_s, + embedding_size=embedding_size, + params=params + ) + + return paramMapsRDD.map(lambda paramMap: (paramMap, _local_fit(paramMap))) + + def _fit(self, dataset): # pylint: disable=unused-argument + err_msgs = ["This function should not have been called", + "Please contact library maintainers to file a bug"] + raise NotImplementedError('\n'.join(err_msgs)) + + +class KafkaMockServer(object): + """ + Restrictions of KafkaMockServer: + * Make sure all data have been writen before consume. + * Poll function will just ignore max_records and just return all data in queue. 
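+
+    A minimal usage sketch, mirroring `MockKakfaServerTest` (`msg` stands for any picklable record):
+
+    .. code-block:: python
+        producer = KafkaMockServer(index=0)
+        producer.send("test", pickle.dumps(msg))
+        producer.send("test", pickle.dumps("_stop_"))
+        producer.flush()
+
+        consumer = KafkaMockServer()
+        messages = consumer.poll(timeout_ms=1000, max_records=64)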
+ """ + _kafka_mock_server_tmp_file_ = "/tmp/mock-kafka/" + sended = False + + def __init__(self, index=0): + super(KafkaMockServer, self).__init__() + self.index = index + self.queue = [] + if not os.path.exists(self._kafka_mock_server_tmp_file_): + os.mkdir(self._kafka_mock_server_tmp_file_) + + def send(self, topic, msg): + self.queue.append(pickle.loads(msg)) + + def flush(self): + with open(self._kafka_mock_server_tmp_file_ + str(self.index), "w") as f: + pickle.dump(self.queue, f) + self.queue = [] + + def close(self): + pass + + def poll(self, timeout_ms, max_records): + if self.sended: + return {} + + records = [] + for file in os.listdir(self._kafka_mock_server_tmp_file_): + with open(self._kafka_mock_server_tmp_file_ + file) as f: + tmp = pickle.load(f) + records += tmp + result = {} + couter = 0 + for i in records: + obj = MockRecord() + obj.value = pickle.dumps(i) + couter += 1 + result[str(couter) + "_"] = [obj] + self.sended = True + return result + + +class MockRecord(list): + pass diff --git a/python/sparkdl/param/shared_params.py b/python/sparkdl/param/shared_params.py index e169e891..7305fc8b 100644 --- a/python/sparkdl/param/shared_params.py +++ b/python/sparkdl/param/shared_params.py @@ -27,6 +27,7 @@ import sparkdl.utils.keras_model as kmutil + # From pyspark def keyword_only(func): @@ -36,15 +37,75 @@ def keyword_only(func): .. note:: Should only be used to wrap a method where first arg is `self` """ + @wraps(func) def wrapper(self, *args, **kwargs): if len(args) > 0: raise TypeError("Method %s forces keyword arguments." % func.__name__) self._input_kwargs = kwargs return func(self, **kwargs) + return wrapper +class KafkaParam(Params): + kafkaParam = Param(Params._dummy(), "kafkaParam", "kafka", typeConverter=TypeConverters.identity) + + def __init__(self): + super(KafkaParam, self).__init__() + + def setKafkaParam(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(kafkaParam=value) + + def getKafkaParam(self): + """ + Gets the value of inputCol or its default value. + """ + return self.getOrDefault(self.kafkaParam) + + +class FitParam(Params): + fitParam = Param(Params._dummy(), "fitParam", "hyper parameter when training", + typeConverter=TypeConverters.identity) + + def __init__(self): + super(FitParam, self).__init__() + + def setFitParam(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(fitParam=value) + + def getFitParam(self): + """ + Gets the value of inputCol or its default value. + """ + return self.getOrDefault(self.fitParam) + + +class MapFnParam(Params): + mapFnParam = Param(Params._dummy(), "mapFnParam", "Tensorflow func", typeConverter=TypeConverters.identity) + + def __init__(self): + super(MapFnParam, self).__init__() + + def setMapFnParam(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(mapFnParam=value) + + def getMapFnParam(self): + """ + Gets the value of inputCol or its default value. + """ + return self.getOrDefault(self.mapFnParam) + + class HasInputCol(Params): """ Mixin for param inputCol: input column name. 
@@ -68,6 +129,42 @@ def getInputCol(self): return self.getOrDefault(self.inputCol) +class HasEmbeddingSize(Params): + """ + Mixin for param embeddingSize + """ + + embeddingSize = Param(Params._dummy(), "embeddingSize", "word embedding size", + typeConverter=TypeConverters.toInt) + + def __init__(self): + super(HasEmbeddingSize, self).__init__() + + def setEmbeddingSize(self, value): + return self._set(embeddingSize=value) + + def getEmbeddingSize(self): + return self.getOrDefault(self.embeddingSize) + + +class HasSequenceLength(Params): + """ + Mixin for param sequenceLength + """ + + sequenceLength = Param(Params._dummy(), "sequenceLength", "sequence length", + typeConverter=TypeConverters.toInt) + + def __init__(self): + super(HasSequenceLength, self).__init__() + + def setSequenceLength(self, value): + return self._set(sequenceLength=value) + + def getSequenceLength(self): + return self.getOrDefault(self.sequenceLength) + + class HasOutputCol(Params): """ Mixin for param outputCol: output column name. @@ -92,12 +189,12 @@ def getOutputCol(self): """ return self.getOrDefault(self.outputCol) + ############################################ # New in sparkdl ############################################ class SparkDLTypeConverters(object): - @staticmethod def toStringOrTFTensor(value): if isinstance(value, tf.Tensor): diff --git a/python/sparkdl/tf_fun.py b/python/sparkdl/tf_fun.py new file mode 100644 index 00000000..b870f5f8 --- /dev/null +++ b/python/sparkdl/tf_fun.py @@ -0,0 +1,90 @@ +def map_fun(_read_data, **args): + import tensorflow as tf + EMBEDDING_SIZE = args["embedding_size"] + feature = args['feature'] + label = args['label'] + params = args['params']['fitParam'] + SEQUENCE_LENGTH = 64 + + def feed_dict(batch): + # Convert from dict of named arrays to two numpy arrays of the proper type + features = [] + for i in batch: + features.append(i['sentence_matrix']) + + # print("{} {}".format(feature, features)) + return features + + encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + def encoder(x, name="encoder"): + with tf.name_scope(name): + encoder_w1 = encoder_variables_dict["encoder_w1"] + encoder_b1 = encoder_variables_dict["encoder_b1"] + + layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) + + encoder_w2 = encoder_variables_dict["encoder_w2"] + encoder_b2 = encoder_variables_dict["encoder_b2"] + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) + return layer_2 + + def decoder(x, name="decoder"): + with tf.name_scope(name): + decoder_w1 = tf.Variable(tf.random_normal([128, 256])) + decoder_b1 = tf.Variable(tf.random_normal([256])) + + layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) + + decoder_w2 = tf.Variable( + tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) + decoder_b2 = tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) + return layer_2 + + tf.reset_default_graph + sess = tf.Session() + + input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") + flattened = tf.reshape(input_x, + [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) + + encoder_op = encoder(flattened) + + 
tf.add_to_collection('encoder_op', encoder_op) + + y_pred = decoder(encoder_op) + + y_true = flattened + + with tf.name_scope("xent"): + consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), + tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), + tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) + xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) + tf.summary.scalar("xent", xent) + + with tf.name_scope("train"): + # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) + train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) + + summ = tf.summary.merge_all() + + sess.run(tf.global_variables_initializer()) + + for i in range(params["epochs"]): + print("epoll {}".format(i)) + for data in _read_data(max_records=params["batch_size"]): + batch_data = feed_dict(data) + sess.run(train_step, feed_dict={input_x: batch_data}) + + sess.close() diff --git a/python/sparkdl/transformers/named_text.py b/python/sparkdl/transformers/named_text.py new file mode 100644 index 00000000..ef51cd0c --- /dev/null +++ b/python/sparkdl/transformers/named_text.py @@ -0,0 +1,134 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from keras.applications.imagenet_utils import decode_predictions +import numpy as np + +from pyspark.ml import Transformer +from pyspark.ml.param import Param, Params, TypeConverters + +import sparkdl.graph.utils as tfx +from sparkdl.image.imageIO import resizeImage +import sparkdl.transformers.keras_applications as keras_apps +from sparkdl.param import ( + keyword_only, HasInputCol, HasOutputCol, SparkDLTypeConverters) +from sparkdl.transformers.tf_text import TFTextTransformer + +SUPPORTED_MODELS = ["CNN", "LSTM"] + + +class DeepTextFeaturizer(Transformer, HasInputCol, HasOutputCol): + """ + todo + """ + modelName = Param(Params._dummy(), "modelName", "A deep learning model name") + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, modelName=None): + """ + __init__(self, inputCol=None, outputCol=None, modelName=None) + """ + super(DeepTextFeaturizer, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, modelName=None): + """ + setParams(self, inputCol=None, outputCol=None, modelName=None) + """ + kwargs = self._input_kwargs + self._set(**kwargs) + return self + + def setModelName(self, value): + return self._set(modelName=value) + + def getModelName(self): + return self.getOrDefault(self.modelName) + + def _transform(self, dataset): + transformer = _NamedTextTransformer(inputCol=self.getInputCol(), + outputCol=self.getOutputCol(), + modelName=self.getModelName(), featurize=True) + return transformer.transform(dataset) + + +class _NamedTextTransformer(Transformer, HasInputCol, HasOutputCol): + modelName = Param(Params._dummy(), "modelName", "A deep learning model name", + typeConverter=SparkDLTypeConverters.supportedNameConverter(SUPPORTED_MODELS)) + featurize = 
Param(Params._dummy(), "featurize", + "If true, output features. If false, output predictions. Either way the output is a vector.", + typeConverter=TypeConverters.toBoolean) + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False): + """ + __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False) + """ + super(_NamedTextTransformer, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + self._inputTensorName = None + self._outputTensorName = None + self._outputMode = None + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False): + """ + setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False) + """ + kwargs = self._input_kwargs + self._set(**kwargs) + return self + + def setModelName(self, value): + return self._set(modelName=value) + + def getModelName(self): + return self.getOrDefault(self.modelName) + + def setFeaturize(self, value): + return self._set(featurize=value) + + def getFeaturize(self): + return self.getOrDefault(self.featurize) + + def _transform(self, dataset): + modelGraphSpec = _buildTFGraphForName(self.getModelName(), self.getFeaturize()) + inputCol = self.getInputCol() + resizedCol = "__sdl_textResized" + tfTransformer = TFTextTransformer(inputCol=resizedCol, + outputCol=self.getOutputCol(), + graph=modelGraphSpec["graph"], + inputTensor=modelGraphSpec["inputTensorName"], + outputTensor=modelGraphSpec["outputTensorName"], + outputMode=modelGraphSpec["outputMode"]) + resizeUdf = resizeImage(modelGraphSpec["inputTensorSize"]) + result = tfTransformer.transform(dataset.withColumn(resizedCol, resizeUdf(inputCol))) + return result.drop(resizedCol) + + +def _buildTFGraphForName(name, featurize): + """ + Currently only supports pre-trained models from the Keras applications module. + """ + modelData = keras_apps.getKerasApplicationModel(name).getModelData(featurize) + sess = modelData["session"] + outputTensorName = modelData["outputTensorName"] + graph = tfx.strip_and_freeze_until([outputTensorName], sess.graph, sess, return_graph=True) + modelData["graph"] = graph + + return modelData diff --git a/python/sparkdl/transformers/tf_image.py b/python/sparkdl/transformers/tf_image.py index da37fcad..2ca33846 100644 --- a/python/sparkdl/transformers/tf_image.py +++ b/python/sparkdl/transformers/tf_image.py @@ -120,7 +120,7 @@ def _transform(self, dataset): with final_graph.as_default(): image = dataset[self.getInputCol()] image_df_exploded = (dataset - .withColumn("__sdl_image_height", image.height) + .n("__sdl_image_height", image.height) .withColumn("__sdl_image_width", image.width) .withColumn("__sdl_image_nchannels", image.nChannels) .withColumn("__sdl_image_data", image.data) diff --git a/python/sparkdl/transformers/tf_text.py b/python/sparkdl/transformers/tf_text.py new file mode 100644 index 00000000..b040adc0 --- /dev/null +++ b/python/sparkdl/transformers/tf_text.py @@ -0,0 +1,91 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +from pyspark.ml import Transformer +from pyspark.ml.feature import Word2Vec +from pyspark.sql.functions import udf +from pyspark.sql import functions as f +from pyspark.sql.types import * +from pyspark.sql.functions import lit +from sparkdl.param.shared_params import HasEmbeddingSize, HasSequenceLength +from sparkdl.param import ( + keyword_only, HasInputCol, HasOutputCol) +import re + +import sparkdl.utils.jvmapi as JVMAPI + + +class TFTextTransformer(Transformer, HasInputCol, HasOutputCol, HasEmbeddingSize, HasSequenceLength): + """ + Convert sentence/document to a 2-D Array eg. [[word embedding],[....]] in DataFrame which can be processed + directly by tensorflow or keras who's backend is tensorflow. + + Processing Steps: + + * Using Word2Vec compute Map(word -> vector) from input column, then broadcast the map. + * Process input column (which is text),split it with white space, replace word with vector, padding the result to + the same size. + * Create a new dataframe with columns like new 2-D array , vocab_size, embedding_size + * return then new dataframe + """ + VOCAB_SIZE = 'vocab_size' + EMBEDDING_SIZE = 'embedding_size' + + @keyword_only + def __init__(self, inputCol=None, outputCol=None, embeddingSize=100, sequenceLength=64): + super(TFTextTransformer, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, embeddingSize=100, sequenceLength=64): + kwargs = self._input_kwargs + return self._set(**kwargs) + + def _transform(self, dataset): + word2vec = Word2Vec(vectorSize=self.getEmbeddingSize(), minCount=1, inputCol=self.getInputCol(), + outputCol="word_embedding") + word_embedding = dict( + word2vec.fit( + dataset.select(f.split(self.getInputCol(), "\\s+").alias(self.getInputCol()))).getVectors().rdd.map( + lambda p: (p.word, p.vector.values.tolist())).collect()) + word_embedding["unk"] = np.zeros(self.getEmbeddingSize()).tolist() + sc = JVMAPI._curr_sc() + local_word_embedding = sc.broadcast(word_embedding) + + def convert_word_to_index(s): + def _pad_sequences(sequences, maxlen=None): + new_sequences = [] + + if len(sequences) <= maxlen: + for i in range(maxlen - len(sequences)): + new_sequences.append(np.zeros(self.getEmbeddingSize()).tolist()) + return sequences + new_sequences + else: + return sequences[0:maxlen] + + new_q = [local_word_embedding.value[word] for word in re.split(r"\s+", s) if + word in local_word_embedding.value.keys()] + result = _pad_sequences(new_q, maxlen=self.getSequenceLength()) + return result + + cwti_udf = udf(convert_word_to_index, ArrayType(ArrayType(FloatType()))) + doc_martic = (dataset.withColumn(self.getOutputCol(), cwti_udf(self.getInputCol()).alias(self.getOutputCol())) + .withColumn(self.VOCAB_SIZE, lit(len(word_embedding))) + .withColumn(self.EMBEDDING_SIZE, lit(self.getEmbeddingSize())) + ) + + return doc_martic diff --git a/python/sparkdl/transformers/utils.py b/python/sparkdl/transformers/utils.py index b244365b..9964f3df 100644 --- a/python/sparkdl/transformers/utils.py +++ b/python/sparkdl/transformers/utils.py @@ -18,6 +18,8 @@ # image stuff IMAGE_INPUT_PLACEHOLDER_NAME = "sparkdl_image_input" +TEXT_INPUT_PLACEHOLDER_NAME = "sparkdl_text_input" + def imageInputPlaceholder(nChannels=None): return tf.placeholder(tf.float32, [None, None, None, nChannels], diff --git a/python/tests/Test.py b/python/tests/Test.py new file 
mode 100644 index 00000000..6327cda4 --- /dev/null +++ b/python/tests/Test.py @@ -0,0 +1,30 @@ +import os +os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' + +from sparkdl import readImages +from pyspark.sql.functions import lit +from pyspark.ml.evaluation import MulticlassClassificationEvaluator +from pyspark.ml.classification import LogisticRegression +from pyspark.ml import Pipeline +from sparkdl import DeepImageFeaturizer + +img_dir="/Users/allwefantasy/resources/images/flower_photos" + +tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(1)) +daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0)) + +tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4]) +daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4]) +train_df = tulips_train.unionAll(daisy_train) +test_df = tulips_test.unionAll(daisy_test) + +featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3") +lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label") +p = Pipeline(stages=[featurizer, lr]) + +p_model = p.fit(train_df) +tested_df = p_model.transform(test_df) +evaluator = MulticlassClassificationEvaluator(metricName="accuracy") +print("Test set accuracy = " + str(evaluator.evaluate(tested_df.select("prediction", "label")))) + +# h5py,pil \ No newline at end of file diff --git a/python/tests/Test2.py b/python/tests/Test2.py new file mode 100644 index 00000000..b535a602 --- /dev/null +++ b/python/tests/Test2.py @@ -0,0 +1,22 @@ +import os +from pyspark import SparkContext + +from sparkdl.transformers.tf_text import TFTextTransformer + +os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' + +input_col = "text" +output_col = "preds" + +sc = SparkContext.getOrCreate() +documentDF = sc.createDataFrame([ + ("Hi I heard about Spark".split(" "), 1), + ("I wish Java could use case classes".split(" "), 0), + ("Logistic regression models are neat".split(" "), 2) +], ["text", "preds"]) + +transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col) + +df = transformer.transform(documentDF) +df.show() \ No newline at end of file diff --git a/python/tests/resources/text/sample.txt b/python/tests/resources/text/sample.txt new file mode 100644 index 00000000..8c5e8d99 --- /dev/null +++ b/python/tests/resources/text/sample.txt @@ -0,0 +1,4 @@ +接下 来 介绍 一种 非常 重要 的 神经网络 卷积神经网络 +这种 神经 网络 在 计算机 视觉 领域 取得了 重大 的 成功,而且 在 自然语言 处理 等 其它 领域 也有 很好 应用 +深度学习 受到 大家 关注 很大 一个 原因 就是 Alex 实现 AlexNet( 一种 深度卷积神经网络 )在 LSVRC-2010 ImageNet +此后 卷积神经网络 及其 变种 被广泛 应用于 各种图像 相关 任务 \ No newline at end of file diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py new file mode 100644 index 00000000..0e8b359d --- /dev/null +++ b/python/tests/transformers/tf_text_test.py @@ -0,0 +1,126 @@ +# Copyright 2017 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import cPickle as pickle +import shutil +import threading + +from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator, KafkaMockServer +from sparkdl.transformers.tf_text import TFTextTransformer +from sparkdl.tf_fun import map_fun +from ..tests import SparkDLTestCase + + +class TFTextTransformerTest(SparkDLTestCase): + def test_convertText(self): + input_col = "text" + output_col = "sentence_matrix" + + documentDF = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + # transform text column to sentence_matrix column which contains 2-D array. + transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) + + df = transformer.transform(documentDF) + data = df.collect() + self.assertEquals(len(data), 3) + for row in data: + self.assertEqual(len(row[output_col]), 64) + self.assertEqual(len(row[output_col][0]), 100) + + +class TFTextFileEstimatorTest(SparkDLTestCase): + def test_trainText(self): + import os + if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): + shutil.rmtree(KafkaMockServer()._kafka_mock_server_tmp_file_) + + input_col = "text" + output_col = "sentence_matrix" + + documentDF = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + # transform text column to sentence_matrix column which contains 2-D array. + transformer = TFTextTransformer( + inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) + + df = transformer.transform(documentDF) + + # create a estimator to training where map_fun contains tensorflow's code + estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", + kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", + "group_id": "sdl_1", "test_mode": False}, + fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], + mapFnParam=map_fun) + estimator.fit(df).collect() + + +class MockKakfaServerTest(SparkDLTestCase): + def test_mockKafkaServerProduce(self): + dataset = self.session.createDataFrame([ + ("Hi I heard about Spark", 1), + ("I wish Java could use case classes", 0), + ("Logistic regression models are neat", 2) + ], ["text", "preds"]) + + def _write_data(): + def _write_partition(index, d_iter): + producer = KafkaMockServer(index) + try: + for d in d_iter: + producer.send("", pickle.dumps(d)) + producer.send("", pickle.dumps("_stop_")) + producer.flush() + finally: + producer.close() + return [] + + dataset.rdd.mapPartitionsWithIndex(_write_partition).count() + + _write_data() + + def _consume(): + consumer = KafkaMockServer() + stop_count = 0 + while True: + messages = consumer.poll(timeout_ms=1000, max_records=64) + group_msgs = [] + for tp, records in messages.items(): + for record in records: + try: + msg_value = pickle.loads(record.value) + print(msg_value) + if msg_value == "_stop_": + stop_count += 1 + else: + group_msgs.append(msg_value) + except: + pass + if stop_count >= 8: + break + self.assertEquals(stop_count, 8) + + t = threading.Thread(target=_consume) + t.start() + t2 = threading.Thread(target=_consume) + t2.start() From 08e61f34c3329210cb1fc9f4d0365dfe41d00aaa Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Fri, 13 Oct 2017 17:28:39 +0800 Subject: [PATCH 15/29] set test_mode to True which can avoid 
to kafka dependency --- python/tests/transformers/tf_text_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index 0e8b359d..26f31d1f 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -69,7 +69,7 @@ def test_trainText(self): # create a estimator to training where map_fun contains tensorflow's code estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", - "group_id": "sdl_1", "test_mode": False}, + "group_id": "sdl_1", "test_mode": True}, fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], mapFnParam=map_fun) estimator.fit(df).collect() From e51c508b50baaf0458add2f216e48aaeb607ba7a Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Fri, 13 Oct 2017 18:44:23 +0800 Subject: [PATCH 16/29] clean some file --- python/sparkdl/transformers/named_text.py | 134 ---------------------- python/sparkdl/transformers/tf_image.py | 2 +- python/tests/Test.py | 30 ----- python/tests/Test2.py | 22 ---- 4 files changed, 1 insertion(+), 187 deletions(-) delete mode 100644 python/sparkdl/transformers/named_text.py delete mode 100644 python/tests/Test.py delete mode 100644 python/tests/Test2.py diff --git a/python/sparkdl/transformers/named_text.py b/python/sparkdl/transformers/named_text.py deleted file mode 100644 index ef51cd0c..00000000 --- a/python/sparkdl/transformers/named_text.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2017 Databricks, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from keras.applications.imagenet_utils import decode_predictions -import numpy as np - -from pyspark.ml import Transformer -from pyspark.ml.param import Param, Params, TypeConverters - -import sparkdl.graph.utils as tfx -from sparkdl.image.imageIO import resizeImage -import sparkdl.transformers.keras_applications as keras_apps -from sparkdl.param import ( - keyword_only, HasInputCol, HasOutputCol, SparkDLTypeConverters) -from sparkdl.transformers.tf_text import TFTextTransformer - -SUPPORTED_MODELS = ["CNN", "LSTM"] - - -class DeepTextFeaturizer(Transformer, HasInputCol, HasOutputCol): - """ - todo - """ - modelName = Param(Params._dummy(), "modelName", "A deep learning model name") - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, modelName=None): - """ - __init__(self, inputCol=None, outputCol=None, modelName=None) - """ - super(DeepTextFeaturizer, self).__init__() - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - def setParams(self, inputCol=None, outputCol=None, modelName=None): - """ - setParams(self, inputCol=None, outputCol=None, modelName=None) - """ - kwargs = self._input_kwargs - self._set(**kwargs) - return self - - def setModelName(self, value): - return self._set(modelName=value) - - def getModelName(self): - return self.getOrDefault(self.modelName) - - def _transform(self, dataset): - transformer = _NamedTextTransformer(inputCol=self.getInputCol(), - outputCol=self.getOutputCol(), - modelName=self.getModelName(), featurize=True) - return transformer.transform(dataset) - - -class _NamedTextTransformer(Transformer, HasInputCol, HasOutputCol): - modelName = Param(Params._dummy(), "modelName", "A deep learning model name", - typeConverter=SparkDLTypeConverters.supportedNameConverter(SUPPORTED_MODELS)) - featurize = Param(Params._dummy(), "featurize", - "If true, output features. If false, output predictions. 
Either way the output is a vector.", - typeConverter=TypeConverters.toBoolean) - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False): - """ - __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False) - """ - super(_NamedTextTransformer, self).__init__() - kwargs = self._input_kwargs - self.setParams(**kwargs) - self._inputTensorName = None - self._outputTensorName = None - self._outputMode = None - - @keyword_only - def setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False): - """ - setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False) - """ - kwargs = self._input_kwargs - self._set(**kwargs) - return self - - def setModelName(self, value): - return self._set(modelName=value) - - def getModelName(self): - return self.getOrDefault(self.modelName) - - def setFeaturize(self, value): - return self._set(featurize=value) - - def getFeaturize(self): - return self.getOrDefault(self.featurize) - - def _transform(self, dataset): - modelGraphSpec = _buildTFGraphForName(self.getModelName(), self.getFeaturize()) - inputCol = self.getInputCol() - resizedCol = "__sdl_textResized" - tfTransformer = TFTextTransformer(inputCol=resizedCol, - outputCol=self.getOutputCol(), - graph=modelGraphSpec["graph"], - inputTensor=modelGraphSpec["inputTensorName"], - outputTensor=modelGraphSpec["outputTensorName"], - outputMode=modelGraphSpec["outputMode"]) - resizeUdf = resizeImage(modelGraphSpec["inputTensorSize"]) - result = tfTransformer.transform(dataset.withColumn(resizedCol, resizeUdf(inputCol))) - return result.drop(resizedCol) - - -def _buildTFGraphForName(name, featurize): - """ - Currently only supports pre-trained models from the Keras applications module. 
- """ - modelData = keras_apps.getKerasApplicationModel(name).getModelData(featurize) - sess = modelData["session"] - outputTensorName = modelData["outputTensorName"] - graph = tfx.strip_and_freeze_until([outputTensorName], sess.graph, sess, return_graph=True) - modelData["graph"] = graph - - return modelData diff --git a/python/sparkdl/transformers/tf_image.py b/python/sparkdl/transformers/tf_image.py index 2ca33846..da37fcad 100644 --- a/python/sparkdl/transformers/tf_image.py +++ b/python/sparkdl/transformers/tf_image.py @@ -120,7 +120,7 @@ def _transform(self, dataset): with final_graph.as_default(): image = dataset[self.getInputCol()] image_df_exploded = (dataset - .n("__sdl_image_height", image.height) + .withColumn("__sdl_image_height", image.height) .withColumn("__sdl_image_width", image.width) .withColumn("__sdl_image_nchannels", image.nChannels) .withColumn("__sdl_image_data", image.data) diff --git a/python/tests/Test.py b/python/tests/Test.py deleted file mode 100644 index 6327cda4..00000000 --- a/python/tests/Test.py +++ /dev/null @@ -1,30 +0,0 @@ -import os -os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' - -from sparkdl import readImages -from pyspark.sql.functions import lit -from pyspark.ml.evaluation import MulticlassClassificationEvaluator -from pyspark.ml.classification import LogisticRegression -from pyspark.ml import Pipeline -from sparkdl import DeepImageFeaturizer - -img_dir="/Users/allwefantasy/resources/images/flower_photos" - -tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(1)) -daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0)) - -tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4]) -daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4]) -train_df = tulips_train.unionAll(daisy_train) -test_df = tulips_test.unionAll(daisy_test) - -featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3") -lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label") -p = Pipeline(stages=[featurizer, lr]) - -p_model = p.fit(train_df) -tested_df = p_model.transform(test_df) -evaluator = MulticlassClassificationEvaluator(metricName="accuracy") -print("Test set accuracy = " + str(evaluator.evaluate(tested_df.select("prediction", "label")))) - -# h5py,pil \ No newline at end of file diff --git a/python/tests/Test2.py b/python/tests/Test2.py deleted file mode 100644 index b535a602..00000000 --- a/python/tests/Test2.py +++ /dev/null @@ -1,22 +0,0 @@ -import os -from pyspark import SparkContext - -from sparkdl.transformers.tf_text import TFTextTransformer - -os.environ['PYSPARK_PYTHON'] = '/Users/allwefantasy/python2.7/tensorflow/bin/python' - -input_col = "text" -output_col = "preds" - -sc = SparkContext.getOrCreate() -documentDF = sc.createDataFrame([ - ("Hi I heard about Spark".split(" "), 1), - ("I wish Java could use case classes".split(" "), 0), - ("Logistic regression models are neat".split(" "), 2) -], ["text", "preds"]) - -transformer = TFTextTransformer( - inputCol=input_col, outputCol=output_col) - -df = transformer.transform(documentDF) -df.show() \ No newline at end of file From 65a469497037749ca813c68dc389b3e9168b640c Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Sat, 14 Oct 2017 09:12:50 -0700 Subject: [PATCH 17/29] [#55] fix TFImageTransformer example in docs (#58) --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 5cd68ba7..fa4a16d9 100644 --- 
a/README.md +++ b/README.md @@ -131,16 +131,16 @@ Spark DataFrames are a natural construct for applying deep learning models to a ```python from sparkdl import readImages, TFImageTransformer + import sparkdl.graph.utils as tfx from sparkdl.transformers import utils import tensorflow as tf - g = tf.Graph() - with g.as_default(): + graph = tf.Graph() + with tf.Session(graph=graph) as sess: image_arr = utils.imageInputPlaceholder() resized_images = tf.image.resize_images(image_arr, (299, 299)) - # the following step is not necessary for this graph, but can be for graphs with variables, etc - frozen_graph = utils.stripAndFreezeGraph(g.as_graph_def(add_shapes=True), tf.Session(graph=g), - [resized_images]) + frozen_graph = tfx.strip_and_freeze_until([resized_images], graph, sess, + return_graph=True) transformer = TFImageTransformer(inputCol="image", outputCol="predictions", graph=frozen_graph, inputTensor=image_arr, outputTensor=resized_images, @@ -241,7 +241,7 @@ registerKerasImageUDF("my_keras_inception_udf", InceptionV3(weights="imagenet"), ``` +### Estimator ## Releases: * 0.1.0 initial release - From b812764cdef23d7523490babf47a6ab970386186 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 10:25:48 +0800 Subject: [PATCH 18/29] move tensorflow map_fun to tf_text_test.py and modify the signature to support integrating TFoS infuture --- python/requirements.txt | 1 + .../estimators/tf_text_file_estimator.py | 11 +-- python/sparkdl/tf_fun.py | 90 ------------------ python/tests/transformers/tf_text_test.py | 93 ++++++++++++++++++- 4 files changed, 97 insertions(+), 98 deletions(-) delete mode 100644 python/sparkdl/tf_fun.py diff --git a/python/requirements.txt b/python/requirements.txt index a98a4d17..39981df5 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -9,3 +9,4 @@ pygments>=2.2.0 tensorflow==1.3.0 pandas>=0.19.1 six>=1.10.0 +kafka-python>=1.3.5 diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py index 278ab8e5..1f2fb116 100644 --- a/python/sparkdl/estimators/tf_text_file_estimator.py +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -245,12 +245,11 @@ def _read_data(max_records=64): finally: consumer.close() - self.getMapFnParam()(_read_data, - feature=inputCol, - label=labelCol, - vacab_size=vocab_s, - embedding_size=embedding_size, - params=params + self.getMapFnParam()(args={"feature": inputCol, + "label": labelCol, + "vacab_size": vocab_s, + "embedding_size": embedding_size, + "params": params}, ctx=None, _read_data=_read_data, ) return paramMapsRDD.map(lambda paramMap: (paramMap, _local_fit(paramMap))) diff --git a/python/sparkdl/tf_fun.py b/python/sparkdl/tf_fun.py deleted file mode 100644 index b870f5f8..00000000 --- a/python/sparkdl/tf_fun.py +++ /dev/null @@ -1,90 +0,0 @@ -def map_fun(_read_data, **args): - import tensorflow as tf - EMBEDDING_SIZE = args["embedding_size"] - feature = args['feature'] - label = args['label'] - params = args['params']['fitParam'] - SEQUENCE_LENGTH = 64 - - def feed_dict(batch): - # Convert from dict of named arrays to two numpy arrays of the proper type - features = [] - for i in batch: - features.append(i['sentence_matrix']) - - # print("{} {}".format(feature, features)) - return features - - encoder_variables_dict = { - "encoder_w1": tf.Variable( - tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), - "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), - "encoder_w2": 
tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), - "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") - } - - def encoder(x, name="encoder"): - with tf.name_scope(name): - encoder_w1 = encoder_variables_dict["encoder_w1"] - encoder_b1 = encoder_variables_dict["encoder_b1"] - - layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) - - encoder_w2 = encoder_variables_dict["encoder_w2"] - encoder_b2 = encoder_variables_dict["encoder_b2"] - - layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) - return layer_2 - - def decoder(x, name="decoder"): - with tf.name_scope(name): - decoder_w1 = tf.Variable(tf.random_normal([128, 256])) - decoder_b1 = tf.Variable(tf.random_normal([256])) - - layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) - - decoder_w2 = tf.Variable( - tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) - decoder_b2 = tf.Variable( - tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) - - layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) - return layer_2 - - tf.reset_default_graph - sess = tf.Session() - - input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") - flattened = tf.reshape(input_x, - [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) - - encoder_op = encoder(flattened) - - tf.add_to_collection('encoder_op', encoder_op) - - y_pred = decoder(encoder_op) - - y_true = flattened - - with tf.name_scope("xent"): - consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), - tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), - tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) - xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) - tf.summary.scalar("xent", xent) - - with tf.name_scope("train"): - # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) - train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) - - summ = tf.summary.merge_all() - - sess.run(tf.global_variables_initializer()) - - for i in range(params["epochs"]): - print("epoll {}".format(i)) - for data in _read_data(max_records=params["batch_size"]): - batch_data = feed_dict(data) - sess.run(train_step, feed_dict={input_x: batch_data}) - - sess.close() diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index 26f31d1f..25cb1e0d 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -18,10 +18,99 @@ from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator, KafkaMockServer from sparkdl.transformers.tf_text import TFTextTransformer -from sparkdl.tf_fun import map_fun from ..tests import SparkDLTestCase +def map_fun(args={}, ctx=None, _read_data=None): + import tensorflow as tf + EMBEDDING_SIZE = args["embedding_size"] + params = args['params']['fitParam'] + SEQUENCE_LENGTH = 64 + + def feed_dict(batch): + # Convert from dict of named arrays to two numpy arrays of the proper type + features = [] + for i in batch: + features.append(i['sentence_matrix']) + + # print("{} {}".format(feature, features)) + return features + + encoder_variables_dict = { + "encoder_w1": tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), + "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), + "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), + "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") + } + + def encoder(x, name="encoder"): + with 
tf.name_scope(name): + encoder_w1 = encoder_variables_dict["encoder_w1"] + encoder_b1 = encoder_variables_dict["encoder_b1"] + + layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) + + encoder_w2 = encoder_variables_dict["encoder_w2"] + encoder_b2 = encoder_variables_dict["encoder_b2"] + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) + return layer_2 + + def decoder(x, name="decoder"): + with tf.name_scope(name): + decoder_w1 = tf.Variable(tf.random_normal([128, 256])) + decoder_b1 = tf.Variable(tf.random_normal([256])) + + layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) + + decoder_w2 = tf.Variable( + tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) + decoder_b2 = tf.Variable( + tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) + + layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) + return layer_2 + + tf.reset_default_graph + sess = tf.Session() + + input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") + flattened = tf.reshape(input_x, + [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) + + encoder_op = encoder(flattened) + + tf.add_to_collection('encoder_op', encoder_op) + + y_pred = decoder(encoder_op) + + y_true = flattened + + with tf.name_scope("xent"): + consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), + tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), + tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) + xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) + tf.summary.scalar("xent", xent) + + with tf.name_scope("train"): + # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) + train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) + + summ = tf.summary.merge_all() + + sess.run(tf.global_variables_initializer()) + + for i in range(params["epochs"]): + print("epoll {}".format(i)) + for data in _read_data(max_records=params["batch_size"]): + batch_data = feed_dict(data) + sess.run(train_step, feed_dict={input_x: batch_data}) + + sess.close() + + class TFTextTransformerTest(SparkDLTestCase): def test_convertText(self): input_col = "text" @@ -48,7 +137,7 @@ def test_convertText(self): class TFTextFileEstimatorTest(SparkDLTestCase): def test_trainText(self): import os - if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): + if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): shutil.rmtree(KafkaMockServer()._kafka_mock_server_tmp_file_) input_col = "text" From e277b24b0889005da2b3387edb11c569e0118aeb Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 11:07:40 +0800 Subject: [PATCH 19/29] fix code style in TFTextTransformer --- python/sparkdl/transformers/tf_text.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/sparkdl/transformers/tf_text.py b/python/sparkdl/transformers/tf_text.py index b040adc0..c224ec34 100644 --- a/python/sparkdl/transformers/tf_text.py +++ b/python/sparkdl/transformers/tf_text.py @@ -58,10 +58,12 @@ def setParams(self, inputCol=None, outputCol=None, embeddingSize=100, sequenceLe def _transform(self, dataset): word2vec = Word2Vec(vectorSize=self.getEmbeddingSize(), minCount=1, inputCol=self.getInputCol(), outputCol="word_embedding") - word_embedding = dict( - word2vec.fit( - dataset.select(f.split(self.getInputCol(), "\\s+").alias(self.getInputCol()))).getVectors().rdd.map( - lambda p: (p.word, p.vector.values.tolist())).collect()) + word2vecModel = word2vec.fit( + 
dataset.select(f.split(self.getInputCol(), "\\s+").alias(self.getInputCol()))) + + word_embedding = dict(word2vecModel.getVectors().rdd.map( + lambda p: (p.word, p.vector.values.tolist())).collect()) + word_embedding["unk"] = np.zeros(self.getEmbeddingSize()).tolist() sc = JVMAPI._curr_sc() local_word_embedding = sc.broadcast(word_embedding) From edd359c2911dd9aa45de75ca0b297dccdd68c265 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 15:03:23 +0800 Subject: [PATCH 20/29] make sure TFTextTransformer will pass the ./python/run-tests.sh --- .gitignore | 1 + python/sparkdl/transformers/tf_text.py | 21 +++++++++++++++++---- python/tests/transformers/tf_text_test.py | 3 ++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 26ae9a84..7f594401 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ README.org .cache/ .history/ .lib/ +.coverage dist/* target/ lib_managed/ diff --git a/python/sparkdl/transformers/tf_text.py b/python/sparkdl/transformers/tf_text.py index c224ec34..dc4fc40d 100644 --- a/python/sparkdl/transformers/tf_text.py +++ b/python/sparkdl/transformers/tf_text.py @@ -56,16 +56,29 @@ def setParams(self, inputCol=None, outputCol=None, embeddingSize=100, sequenceLe return self._set(**kwargs) def _transform(self, dataset): + + sc = JVMAPI._curr_sc() + word2vec = Word2Vec(vectorSize=self.getEmbeddingSize(), minCount=1, inputCol=self.getInputCol(), outputCol="word_embedding") - word2vecModel = word2vec.fit( - dataset.select(f.split(self.getInputCol(), "\\s+").alias(self.getInputCol()))) - word_embedding = dict(word2vecModel.getVectors().rdd.map( + vectorsDf = word2vec.fit( + dataset.select(f.split(self.getInputCol(), "\\s+").alias(self.getInputCol()))).getVectors() + + """ + It's strange here that after calling getVectors the df._sc._jsc will lose and this is + only happens when you run it with ./python/run-tests.sh script. + We add this code to make it pass the test. However it seems this will hit + "org.apache.spark.SparkException: EOF reached before Python server acknowledged" error. 
+ """ + if vectorsDf._sc._jsc is None: + vectorsDf._sc._jsc = sc._jsc + + word_embedding = dict(vectorsDf.rdd.map( lambda p: (p.word, p.vector.values.tolist())).collect()) word_embedding["unk"] = np.zeros(self.getEmbeddingSize()).tolist() - sc = JVMAPI._curr_sc() + local_word_embedding = sc.broadcast(word_embedding) def convert_word_to_index(s): diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index 25cb1e0d..62e44f15 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -16,6 +16,8 @@ import shutil import threading +from pyspark import SparkContext, SQLContext + from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator, KafkaMockServer from sparkdl.transformers.tf_text import TFTextTransformer from ..tests import SparkDLTestCase @@ -115,7 +117,6 @@ class TFTextTransformerTest(SparkDLTestCase): def test_convertText(self): input_col = "text" output_col = "sentence_matrix" - documentDF = self.session.createDataFrame([ ("Hi I heard about Spark", 1), ("I wish Java could use case classes", 0), From b2550c38aca9c5020689a9ade2f22ad508290467 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 15:08:02 +0800 Subject: [PATCH 21/29] fix conflict --- python/tests/transformers/tf_text_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index 84829c82..25cb1e0d 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -115,7 +115,7 @@ class TFTextTransformerTest(SparkDLTestCase): def test_convertText(self): input_col = "text" output_col = "sentence_matrix" - + documentDF = self.session.createDataFrame([ ("Hi I heard about Spark", 1), ("I wish Java could use case classes", 0), From 6012b3523b6721bb02aa0dec3dd99e83578cdd3d Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 15:28:39 +0800 Subject: [PATCH 22/29] add tensorflowonspark to python/requirements.txt --- python/requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/requirements.txt b/python/requirements.txt index 39981df5..25efe89e 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -10,3 +10,7 @@ tensorflow==1.3.0 pandas>=0.19.1 six>=1.10.0 kafka-python>=1.3.5 +tensorflowonspark>=1.0.5 +tensorflow-tensorboard>=0.1.6 + + From 0039a5a5748831cacd57a1c7b423eec5178fc6b6 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 16:05:42 +0800 Subject: [PATCH 23/29] fix pickle import for python 2/3 --- python/sparkdl/__init__.py | 4 +- .../estimators/tf_text_file_estimator.py | 7 +- python/sparkdl/tf_fun.py | 146 ------------------ python/tests/transformers/tf_text_test.py | 11 +- 4 files changed, 15 insertions(+), 153 deletions(-) delete mode 100644 python/sparkdl/tf_fun.py diff --git a/python/sparkdl/__init__.py b/python/sparkdl/__init__.py index aa15059a..2b557d54 100644 --- a/python/sparkdl/__init__.py +++ b/python/sparkdl/__init__.py @@ -21,7 +21,7 @@ __all__ = [ 'imageSchema', 'imageType', 'readImages', - 'TFImageTransformer', + 'TFImageTransformer', 'TFTextTransformer', 'DeepImagePredictor', 'DeepImageFeaturizer', - 'KerasImageFileTransformer', + 'KerasImageFileTransformer', 'TFTextFileEstimator', 'imageInputPlaceholder'] diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py index 3adcb65d..31bdb70c 100644 --- 
a/python/sparkdl/estimators/tf_text_file_estimator.py +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -21,7 +21,7 @@ import threading import time import os -import cPickle as pickle +import sys from kafka import KafkaConsumer from kafka import KafkaProducer @@ -33,6 +33,11 @@ from sparkdl.param.shared_params import KafkaParam, FitParam, MapFnParam, RunningMode import sparkdl.utils.jvmapi as JVMAPI +if sys.version_info[:2] <= (2, 7): + import cPickle as pickle +else: + import _pickle as pickle + __all__ = ['TFTextFileEstimator'] logger = logging.getLogger('sparkdl') diff --git a/python/sparkdl/tf_fun.py b/python/sparkdl/tf_fun.py deleted file mode 100644 index 68025604..00000000 --- a/python/sparkdl/tf_fun.py +++ /dev/null @@ -1,146 +0,0 @@ -def map_fun(args={}, ctx=None, _read_data=None): - from tensorflowonspark import TFNode - from datetime import datetime - import math - import numpy - import tensorflow as tf - import time - - print(args) - - EMBEDDING_SIZE = args["embedding_size"] - feature = args['feature'] - label = args['label'] - params = args['params']['fitParam'] - print(params) - SEQUENCE_LENGTH = 64 - - clusterMode = False if ctx is None else True - - if clusterMode and ctx.job_name == "ps": - time.sleep((ctx.worker_num + 1) * 5) - - if clusterMode: - cluster, server = TFNode.start_cluster_server(ctx, 1) - - def feed_dict(batch): - # Convert from dict of named arrays to two numpy arrays of the proper type - features = [] - for i in batch: - features.append(i['sentence_matrix']) - - # print("{} {}".format(feature, features)) - return features - - def build_graph(): - encoder_variables_dict = { - "encoder_w1": tf.Variable( - tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE, 256]), name="encoder_w1"), - "encoder_b1": tf.Variable(tf.random_normal([256]), name="encoder_b1"), - "encoder_w2": tf.Variable(tf.random_normal([256, 128]), name="encoder_w2"), - "encoder_b2": tf.Variable(tf.random_normal([128]), name="encoder_b2") - } - - def encoder(x, name="encoder"): - with tf.name_scope(name): - encoder_w1 = encoder_variables_dict["encoder_w1"] - encoder_b1 = encoder_variables_dict["encoder_b1"] - - layer_1 = tf.nn.sigmoid(tf.matmul(x, encoder_w1) + encoder_b1) - - encoder_w2 = encoder_variables_dict["encoder_w2"] - encoder_b2 = encoder_variables_dict["encoder_b2"] - - layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, encoder_w2) + encoder_b2) - return layer_2 - - def decoder(x, name="decoder"): - with tf.name_scope(name): - decoder_w1 = tf.Variable(tf.random_normal([128, 256])) - decoder_b1 = tf.Variable(tf.random_normal([256])) - - layer_1 = tf.nn.sigmoid(tf.matmul(x, decoder_w1) + decoder_b1) - - decoder_w2 = tf.Variable( - tf.random_normal([256, SEQUENCE_LENGTH * EMBEDDING_SIZE])) - decoder_b2 = tf.Variable( - tf.random_normal([SEQUENCE_LENGTH * EMBEDDING_SIZE])) - - layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, decoder_w2) + decoder_b2) - return layer_2 - - tf.reset_default_graph - - input_x = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE], name="input_x") - flattened = tf.reshape(input_x, - [-1, SEQUENCE_LENGTH * EMBEDDING_SIZE]) - - encoder_op = encoder(flattened) - - tf.add_to_collection('encoder_op', encoder_op) - - y_pred = decoder(encoder_op) - - y_true = flattened - - with tf.name_scope("xent"): - consine = tf.div(tf.reduce_sum(tf.multiply(y_pred, y_true), 1), - tf.multiply(tf.sqrt(tf.reduce_sum(tf.multiply(y_pred, y_pred), 1)), - tf.sqrt(tf.reduce_sum(tf.multiply(y_true, y_true), 1)))) - xent = tf.reduce_sum(tf.subtract(tf.constant(1.0), consine)) - 
tf.summary.scalar("xent", xent) - - with tf.name_scope("train"): - # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xent) - train_step = tf.train.RMSPropOptimizer(0.01).minimize(xent) - summ = tf.summary.merge_all() - global_step = tf.Variable(0) - init_op = tf.global_variables_initializer() - return input_x, init_op, train_step, xent, global_step, summ - - def train_with_cluster(input_x, init_op, train_step, xent, global_step, summ): - - logdir = TFNode.hdfs_path(ctx, params['model']) if clusterMode else None - sv = tf.train.Supervisor(is_chief=ctx.task_index == 0, - logdir=logdir, - init_op=init_op, - summary_op=None, - saver=None, - global_step=global_step, - stop_grace_secs=300, - save_model_secs=10) - with sv.managed_session(server.target) as sess: - tf_feed = TFNode.DataFeed(ctx.mgr, True) - step = 0 - - while not sv.should_stop() and not tf_feed.should_stop() and step < 100: - data = tf_feed.next_batch(params["batch_size"]) - batch_data = feed_dict(data) - step += 1 - _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) - print("global_step:{} xent:{}".format(x, g)) - - if sv.should_stop() or step >= args.steps: - tf_feed.terminate() - sv.stop() - - def train(input_x, init_op, train_step, xent, global_step, summ): - - with tf.Session() as sess: - sess.run(init_op) - for data in _read_data(max_records=params["batch_size"]): - batch_data = feed_dict(data) - _, x, g = sess.run([train_step, xent, global_step], feed_dict={input_x: batch_data}) - print("global_step:{} xent:{}".format(x, g)) - - if clusterMode and ctx.job_name == "ps": - server.join() - elif clusterMode and ctx.job_name == "worker": - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % ctx.task_index, - cluster=cluster)): - input_x, init_op, train_step, xent, global_step, summ = build_graph() - train_with_cluster(input_x, init_op, train_step, xent, global_step, summ) - else: - input_x, init_op, train_step, xent, global_step, summ = build_graph() - train(input_x, init_op, train_step, xent, global_step, summ) \ No newline at end of file diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index 3b895fed..ffdfe49c 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -12,16 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -import cPickle as pickle + import shutil import threading - -from tensorflowonspark import TFNode - +import sys from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator, KafkaMockServer from sparkdl.transformers.tf_text import TFTextTransformer from ..tests import SparkDLTestCase +if sys.version_info[:2] <= (2, 7): + import cPickle as pickle +else: + import _pickle as pickle + def map_fun(args={}, ctx=None, _read_data=None): import tensorflow as tf From 4574d91cc49479520a0edf779bc3568ac8dbe0fe Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 17:24:43 +0800 Subject: [PATCH 24/29] rm /tmp/mock-kafka/ before run test --- python/tests/transformers/tf_text_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index ffdfe49c..b713f582 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -224,6 +224,10 @@ def test_trainText(self): class MockKakfaServerTest(SparkDLTestCase): def test_mockKafkaServerProduce(self): + import os + if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): + shutil.rmtree(KafkaMockServer()._kafka_mock_server_tmp_file_) + dataset = self.session.createDataFrame([ ("Hi I heard about Spark", 1), ("I wish Java could use case classes", 0), From 4e2202a4ad2dfbd0855784cb0bc2b1d6ff4e3b35 Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 18:09:53 +0800 Subject: [PATCH 25/29] kafka file conflics --- python/sparkdl/estimators/tf_text_file_estimator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py index 31bdb70c..c7250b82 100644 --- a/python/sparkdl/estimators/tf_text_file_estimator.py +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -311,7 +311,8 @@ class KafkaMockServer(object): * Make sure all data have been writen before consume. * Poll function will just ignore max_records and just return all data in queue. 
""" - _kafka_mock_server_tmp_file_ = "/tmp/mock-kafka/" + import tempfile + _kafka_mock_server_tmp_file_ = tempfile.mkdtemp() sended = False def __init__(self, index=0): From 43c4cf78e0c2d1cc47c43bc0c3c412ca61ada27f Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 20:05:55 +0800 Subject: [PATCH 26/29] fix --- .../estimators/tf_text_file_estimator.py | 24 ++--- python/tests/transformers/tf_text_test.py | 99 +++---------------- 2 files changed, 25 insertions(+), 98 deletions(-) diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py index c7250b82..86c2596f 100644 --- a/python/sparkdl/estimators/tf_text_file_estimator.py +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -205,10 +205,11 @@ def _fitInParallel(self, dataset, paramMaps): group_id = kafaParams["group_id"] bootstrap_servers = kafaParams["bootstrap_servers"] kafka_test_mode = kafaParams["test_mode"] if "test_mode" in kafaParams else False + mock_kafka_file = kafaParams["mock_kafka_file"] if kafka_test_mode else None def _write_data(): def _write_partition(index, d_iter): - producer = KafkaMockServer(index) if kafka_test_mode else KafkaProducer( + producer = KafkaMockServer(index, mock_kafka_file) if kafka_test_mode else KafkaProducer( bootstrap_servers=bootstrap_servers) try: for d in d_iter: @@ -247,12 +248,12 @@ def _local_fit(override_param_map): params["fitParam"] = override_param_map def _read_data(max_records=64): - consumer = KafkaMockServer() if kafka_test_mode else KafkaConsumer(topic, - group_id=group_id, - bootstrap_servers=bootstrap_servers, - auto_offset_reset="earliest", - enable_auto_commit=False - ) + consumer = KafkaMockServer(0, mock_kafka_file) if kafka_test_mode else KafkaConsumer(topic, + group_id=group_id, + bootstrap_servers=bootstrap_servers, + auto_offset_reset="earliest", + enable_auto_commit=False + ) try: stop_count = 0 fail_msg_count = 0 @@ -312,13 +313,14 @@ class KafkaMockServer(object): * Poll function will just ignore max_records and just return all data in queue. 
""" import tempfile - _kafka_mock_server_tmp_file_ = tempfile.mkdtemp() + _kafka_mock_server_tmp_file_ = None sended = False - def __init__(self, index=0): + def __init__(self, index=0, mock_kafka_file=None): super(KafkaMockServer, self).__init__() self.index = index self.queue = [] + self._kafka_mock_server_tmp_file_ = mock_kafka_file if not os.path.exists(self._kafka_mock_server_tmp_file_): os.mkdir(self._kafka_mock_server_tmp_file_) @@ -326,7 +328,7 @@ def send(self, topic, msg): self.queue.append(pickle.loads(msg)) def flush(self): - with open(self._kafka_mock_server_tmp_file_ + str(self.index), "w") as f: + with open(self._kafka_mock_server_tmp_file_ + "/" + str(self.index), "w") as f: pickle.dump(self.queue, f) self.queue = [] @@ -339,7 +341,7 @@ def poll(self, timeout_ms, max_records): records = [] for file in os.listdir(self._kafka_mock_server_tmp_file_): - with open(self._kafka_mock_server_tmp_file_ + file) as f: + with open(self._kafka_mock_server_tmp_file_ + "/" + file) as f: tmp = pickle.load(f) records += tmp result = {} diff --git a/python/tests/transformers/tf_text_test.py b/python/tests/transformers/tf_text_test.py index b713f582..942cf874 100644 --- a/python/tests/transformers/tf_text_test.py +++ b/python/tests/transformers/tf_text_test.py @@ -116,36 +116,8 @@ def decoder(x, name="decoder"): sess.close() -class TFTextTransformerTest(SparkDLTestCase): - def test_convertText(self): - input_col = "text" - output_col = "sentence_matrix" - - documentDF = self.session.createDataFrame([ - ("Hi I heard about Spark", 1), - ("I wish Java could use case classes", 0), - ("Logistic regression models are neat", 2) - ], ["text", "preds"]) - - # transform text column to sentence_matrix column which contains 2-D array. - transformer = TFTextTransformer( - inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) - - df = transformer.transform(documentDF) - df.show() - data = df.collect() - self.assertEquals(len(data), 3) - for row in data: - self.assertEqual(len(row[output_col]), 64) - self.assertEqual(len(row[output_col][0]), 100) - - class TFTextFileEstimatorTest(SparkDLTestCase): def test_trainText(self): - import os - if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): - shutil.rmtree(KafkaMockServer()._kafka_mock_server_tmp_file_) - input_col = "text" output_col = "sentence_matrix" @@ -160,74 +132,24 @@ def test_trainText(self): inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) df = transformer.transform(documentDF) - + import tempfile + mock_kafka_file = tempfile.mkdtemp() # create a estimator to training where map_fun contains tensorflow's code estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", kafkaParam={"bootstrap_servers": ["127.0.0.1"], "topic": "test", + "mock_kafka_file": mock_kafka_file, "group_id": "sdl_1", "test_mode": True}, - fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], runningMode="Normal", + fitParam=[{"epochs": 5, "batch_size": 64}, {"epochs": 5, "batch_size": 1}], mapFnParam=map_fun) estimator.fit(df).collect() - - -# class TFTextFileEstimatorOnTFoSTest(TFoSBaseSparkTest): -# def trainText(self): -# """ -# To make this test work,Please: -# 1. Start a Spark standalone cluster and export MASTER to your env, -# 2. Make sure spark-deep-learning assembly in spark classpath. -# 3. 
Change method 'trainText' to 'test_trainText' -# """ -# input_col = "text" -# output_col = "sentence_matrix" -# -# documentDF = self.session.createDataFrame([ -# ("Hi I heard about Spark", 1), -# ("I wish Java could use case classes", 0), -# ("Logistic regression models are neat", 2) -# ], ["text", "preds"]) -# -# # transform text column to sentence_matrix column which contains 2-D array. -# transformer = TFTextTransformer( -# inputCol=input_col, outputCol=output_col, embeddingSize=100, sequenceLength=64) -# -# df = transformer.transform(documentDF) -# -# def map_fun(args={}, ctx=None, _read_data=None): -# import time -# self.assertTrue(ctx is not None) -# self.assertTrue(_read_data is None) -# self.assertTrue(args["params"]["fitParam"][0]["cluster_size"] == 2) -# clusterMode = ctx is not None -# if clusterMode and ctx.job_name == "ps": -# time.sleep((ctx.worker_num + 1) * 5) -# -# if clusterMode: -# cluster, server = TFNode.start_cluster_server(ctx, 1) -# -# data = TFNode.DataFeed(ctx.mgr, True) -# batch1 = data.next_batch(1) -# self.assertTrue(len(batch1) == 1) -# self.assertTrue(len(batch1[0]) == 64) -# self.assertTrue(len(batch1[0][0]) == 100) -# # consume all -# data.next_batch(100) -# -# estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix", labelCol="preds", -# fitParam=[ -# {"epochs": 1, "cluster_size": 2, "batch_size": 1, "model": "/tmp/model"}], -# runningMode="TFoS", -# mapFnParam=map_fun) -# estimator.fit(df).collect() + shutil.rmtree(mock_kafka_file) class MockKakfaServerTest(SparkDLTestCase): def test_mockKafkaServerProduce(self): - import os - if os.path.exists(KafkaMockServer()._kafka_mock_server_tmp_file_): - shutil.rmtree(KafkaMockServer()._kafka_mock_server_tmp_file_) - + import tempfile + mock_kafka_file = tempfile.mkdtemp() dataset = self.session.createDataFrame([ ("Hi I heard about Spark", 1), ("I wish Java could use case classes", 0), @@ -236,7 +158,7 @@ def test_mockKafkaServerProduce(self): def _write_data(): def _write_partition(index, d_iter): - producer = KafkaMockServer(index) + producer = KafkaMockServer(index, mock_kafka_file) try: for d in d_iter: producer.send("", pickle.dumps(d)) @@ -251,7 +173,7 @@ def _write_partition(index, d_iter): _write_data() def _consume(): - consumer = KafkaMockServer() + consumer = KafkaMockServer(0, mock_kafka_file) stop_count = 0 while True: messages = consumer.poll(timeout_ms=1000, max_records=64) @@ -275,3 +197,6 @@ def _consume(): t.start() t2 = threading.Thread(target=_consume) t2.start() + import time + time.sleep(10) + shutil.rmtree(mock_kafka_file) From 713946b008b620ea1a8df92aa13c2ae485075d5c Mon Sep 17 00:00:00 2001 From: WilliamZhu Date: Wed, 18 Oct 2017 22:04:54 +0800 Subject: [PATCH 27/29] fix pickle in python 3 --- python/sparkdl/estimators/tf_text_file_estimator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sparkdl/estimators/tf_text_file_estimator.py b/python/sparkdl/estimators/tf_text_file_estimator.py index 86c2596f..6a1898d7 100644 --- a/python/sparkdl/estimators/tf_text_file_estimator.py +++ b/python/sparkdl/estimators/tf_text_file_estimator.py @@ -328,7 +328,7 @@ def send(self, topic, msg): self.queue.append(pickle.loads(msg)) def flush(self): - with open(self._kafka_mock_server_tmp_file_ + "/" + str(self.index), "w") as f: + with open(self._kafka_mock_server_tmp_file_ + "/" + str(self.index), "wb") as f: pickle.dump(self.queue, f) self.queue = [] @@ -341,7 +341,7 @@ def poll(self, timeout_ms, max_records): records = [] for file in 
os.listdir(self._kafka_mock_server_tmp_file_):
-            with open(self._kafka_mock_server_tmp_file_ + "/" + file) as f:
+            with open(self._kafka_mock_server_tmp_file_ + "/" + file, "rb") as f:
                 tmp = pickle.load(f)
                 records += tmp
         result = {}

From 63e326588431dee667d1d30452776955e2acbd68 Mon Sep 17 00:00:00 2001
From: WilliamZhu
Date: Wed, 18 Oct 2017 22:31:35 +0800
Subject: [PATCH 28/29] changing TFoSTest.py to TFoSExample.py to avoid unit test

---
 python/tests/{TFoSTest.py => TFoSExample.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename python/tests/{TFoSTest.py => TFoSExample.py} (100%)

diff --git a/python/tests/TFoSTest.py b/python/tests/TFoSExample.py
similarity index 100%
rename from python/tests/TFoSTest.py
rename to python/tests/TFoSExample.py

From bbfcb2052c40a4a69f0eec20da737627c83dfdde Mon Sep 17 00:00:00 2001
From: WilliamZhu
Date: Wed, 18 Oct 2017 23:02:45 +0800
Subject: [PATCH 29/29] move TFoSExample from tests

---
 python/{tests => }/TFoSExample.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename python/{tests => }/TFoSExample.py (100%)

diff --git a/python/tests/TFoSExample.py b/python/TFoSExample.py
similarity index 100%
rename from python/tests/TFoSExample.py
rename to python/TFoSExample.py
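
The padding rule that TFTextTransformer applies in its _pad_sequences helper (right-pad with zero vectors up to sequenceLength, truncate anything longer) is easy to check in isolation. The following is only a minimal standalone sketch, assuming a toy embedding size of 2; it is not code from the patches above.

.. code-block:: python

    import numpy as np

    def pad_sequences(sequences, maxlen, embedding_size=2):
        # Same rule as TFTextTransformer: right-pad with zero vectors, or truncate.
        if len(sequences) <= maxlen:
            padding = [np.zeros(embedding_size).tolist()
                       for _ in range(maxlen - len(sequences))]
            return sequences + padding
        return sequences[0:maxlen]

    # Three word vectors padded out to a sequence length of 5.
    print(pad_sequences([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]], maxlen=5))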
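
A mapFnParam function only has to follow the map_fun(args={}, ctx=None, _read_data=None) signature introduced above and consume the _read_data generator. This is a minimal sketch rather than the autoencoder training code from the patches: the single-layer model, the variable names, and the numpy usage are assumptions; the args keys and the epochs/batch_size parameters mirror the tests.

.. code-block:: python

    def minimal_map_fun(args={}, ctx=None, _read_data=None):
        # Illustrative sketch only: consumes batches from the estimator's
        # _read_data generator and runs a trivial TensorFlow training loop.
        import numpy as np
        import tensorflow as tf

        embedding_size = args["embedding_size"]
        sequence_length = 64  # assumed, matches the tests above
        params = args["params"]["fitParam"]

        input_x = tf.placeholder(tf.float32, [None, sequence_length, embedding_size])
        flattened = tf.reshape(input_x, [-1, sequence_length * embedding_size])
        w = tf.Variable(tf.random_normal([sequence_length * embedding_size, 1]))
        loss = tf.reduce_mean(tf.square(tf.matmul(flattened, w)))
        train_step = tf.train.RMSPropOptimizer(0.01).minimize(loss)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(params["epochs"]):
                for batch in _read_data(max_records=params["batch_size"]):
                    # each record is a Row; 'sentence_matrix' holds the 2-D embedding
                    matrices = [r["sentence_matrix"] for r in batch]
                    sess.run(train_step, feed_dict={input_x: np.array(matrices)})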
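
Assuming an active SparkSession named session and the mock Kafka options added above (test_mode plus mock_kafka_file), an end-to-end run can be sketched as follows; the data, column names and parameter values come from the tests, the rest is illustrative.

.. code-block:: python

    import shutil
    import tempfile

    from sparkdl.estimators.tf_text_file_estimator import TFTextFileEstimator
    from sparkdl.transformers.tf_text import TFTextTransformer

    # 'session' is assumed to be an existing SparkSession with sparkdl available.
    documentDF = session.createDataFrame([
        ("Hi I heard about Spark", 1),
        ("I wish Java could use case classes", 0),
        ("Logistic regression models are neat", 2)
    ], ["text", "preds"])

    # text -> 2-D array of word embeddings (sequenceLength x embeddingSize)
    transformer = TFTextTransformer(inputCol="text", outputCol="sentence_matrix",
                                    embeddingSize=100, sequenceLength=64)
    df = transformer.transform(documentDF)

    mock_kafka_file = tempfile.mkdtemp()  # backing directory for KafkaMockServer
    estimator = TFTextFileEstimator(inputCol="sentence_matrix", outputCol="sentence_matrix",
                                    labelCol="preds",
                                    kafkaParam={"bootstrap_servers": ["127.0.0.1"],
                                                "topic": "test", "group_id": "sdl_1",
                                                "mock_kafka_file": mock_kafka_file,
                                                "test_mode": True},
                                    fitParam=[{"epochs": 5, "batch_size": 64}],
                                    mapFnParam=minimal_map_fun)  # e.g. the sketch above
    estimator.fit(df).collect()
    shutil.rmtree(mock_kafka_file)

Because test_mode is set, KafkaMockServer writes and reads pickled batches under mock_kafka_file instead of talking to a real broker, which is why the directory is created up front and removed afterwards.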