From 050d421d016639171f9e648f0d277233cccb5a7a Mon Sep 17 00:00:00 2001 From: Matt Watson Date: Wed, 22 Nov 2023 18:51:54 -0800 Subject: [PATCH 1/3] Port sentence_embeddings_with_sbert to Keras 3 Only tf backend for now, and `jit_compile=False` to avoid a XLA error with variables on the wrong device. https://www.tensorflow.org/xla/known_issues#tfvariable_on_a_different_device The long term fix is probably to move all preprocessing out of the call graph of these models (no `Input(dtype="string")`), but for now we can do the simple fix. --- .../sentence_embeddings_with_sbert.ipynb | 50 ++-- .../nlp/md/sentence_embeddings_with_sbert.md | 214 +++++++++++------- .../nlp/sentence_embeddings_with_sbert.py | 49 ++-- scripts/examples_master.py | 6 + 4 files changed, 186 insertions(+), 133 deletions(-) diff --git a/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb b/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb index 6cc3a958f5..5db7431274 100644 --- a/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb +++ b/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb @@ -64,7 +64,7 @@ "Let's install and import the libraries we need. We'll be using the KerasNLP library in\n", "this example.\n", "\n", - "We will also enable [mixed perceciosn](https://www.tensorflow.org/guide/mixed_precision)\n", + "We will also enable [mixed precision](https://www.tensorflow.org/guide/mixed_precision)\n", "training. This will help us reduce the training time." ] }, @@ -76,7 +76,8 @@ }, "outputs": [], "source": [ - "!pip install keras-nlp -q" + "!pip install -q --upgrade keras-nlp\n", + "!pip install -q --upgrade keras # Upgrade to Keras 3." ] }, { @@ -87,15 +88,17 @@ }, "outputs": [], "source": [ + "import os\n", + "\n", + "os.environ[\"KERAS_BACKEND\"] = \"tensorflow\"\n", + "\n", + "import keras\n", "import keras_nlp\n", "import tensorflow as tf\n", "import tensorflow_datasets as tfds\n", "import sklearn.cluster as cluster\n", "\n", - "from tensorflow import keras\n", - "\n", - "policy = keras.mixed_precision.Policy(\"mixed_float16\")\n", - "keras.mixed_precision.set_global_policy(policy)" + "keras.mixed_precision.set_global_policy(\"mixed_float16\")" ] }, { @@ -254,13 +257,13 @@ "source": [ "preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset(\"roberta_base_en\")\n", "backbone = keras_nlp.models.RobertaBackbone.from_preset(\"roberta_base_en\")\n", - "inputs = keras.Input(shape=(1), dtype=\"string\", name=\"sentence\")\n", + "inputs = keras.Input(shape=(1,), dtype=\"string\", name=\"sentence\")\n", "x = preprocessor(inputs)\n", "h = backbone(x)\n", "embedding = keras.layers.GlobalAveragePooling1D(name=\"pooling_layer\")(\n", " h, x[\"padding_mask\"]\n", ")\n", - "n_embedding = tf.linalg.normalize(embedding, axis=1)[0]\n", + "n_embedding = keras.layers.UnitNormalization(axis=1)(embedding)\n", "roberta_normal_encoder = keras.Model(inputs=inputs, outputs=n_embedding)\n", "\n", "roberta_normal_encoder.summary()" @@ -295,11 +298,11 @@ "\n", "class RegressionSiamese(keras.Model):\n", " def __init__(self, encoder, **kwargs):\n", - " inputs = keras.Input(shape=(2), dtype=\"string\", name=\"sentences\")\n", - " sen1, sen2 = tf.split(inputs, num_or_size_splits=2, axis=1, name=\"split\")\n", + " inputs = keras.Input(shape=(2,), dtype=\"string\", name=\"sentences\")\n", + " sen1, sen2 = keras.ops.split(inputs, 2, axis=1)\n", " u = encoder(sen1)\n", " v = encoder(sen2)\n", - " cosine_similarity_scores = tf.matmul(u, tf.transpose(v))\n", + " cosine_similarity_scores = keras.ops.matmul(u, keras.ops.transpose(v))\n", "\n", " super().__init__(\n", " inputs=inputs,\n", @@ -373,6 +376,7 @@ "roberta_regression_siamese.compile(\n", " loss=keras.losses.MeanSquaredError(),\n", " optimizer=keras.optimizers.Adam(2e-5),\n", + " jit_compile=False,\n", ")\n", "\n", "roberta_regression_siamese.fit(stsb_train, validation_data=stsb_valid, epochs=1)" @@ -525,7 +529,7 @@ "source": [ "preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset(\"roberta_base_en\")\n", "backbone = keras_nlp.models.RobertaBackbone.from_preset(\"roberta_base_en\")\n", - "input = keras.Input(shape=(1), dtype=\"string\", name=\"sentence\")\n", + "input = keras.Input(shape=(1,), dtype=\"string\", name=\"sentence\")\n", "\n", "x = preprocessor(input)\n", "h = backbone(x)\n", @@ -564,22 +568,21 @@ "\n", "class TripletSiamese(keras.Model):\n", " def __init__(self, encoder, **kwargs):\n", - "\n", - " anchor = keras.Input(shape=(1), dtype=\"string\")\n", - " positive = keras.Input(shape=(1), dtype=\"string\")\n", - " negative = keras.Input(shape=(1), dtype=\"string\")\n", + " anchor = keras.Input(shape=(1,), dtype=\"string\")\n", + " positive = keras.Input(shape=(1,), dtype=\"string\")\n", + " negative = keras.Input(shape=(1,), dtype=\"string\")\n", "\n", " ea = encoder(anchor)\n", " ep = encoder(positive)\n", " en = encoder(negative)\n", "\n", - " positive_dist = tf.math.reduce_sum(tf.math.pow(ea - ep, 2), axis=1)\n", - " negative_dist = tf.math.reduce_sum(tf.math.pow(ea - en, 2), axis=1)\n", + " positive_dist = keras.ops.sum(keras.ops.power(ea - ep, 2), axis=1)\n", + " negative_dist = keras.ops.sum(keras.ops.power(ea - en, 2), axis=1)\n", "\n", - " positive_dist = tf.math.sqrt(positive_dist)\n", - " negative_dist = tf.math.sqrt(negative_dist)\n", + " positive_dist = keras.ops.sqrt(positive_dist)\n", + " negative_dist = keras.ops.sqrt(negative_dist)\n", "\n", - " output = tf.stack([positive_dist, negative_dist], axis=0)\n", + " output = keras.ops.stack([positive_dist, negative_dist], axis=0)\n", "\n", " super().__init__(inputs=[anchor, positive, negative], outputs=output, **kwargs)\n", "\n", @@ -627,8 +630,8 @@ " def call(self, y_true, y_pred):\n", " positive_dist, negative_dist = tf.unstack(y_pred, axis=0)\n", "\n", - " losses = tf.nn.relu(positive_dist - negative_dist + self.margin)\n", - " return tf.math.reduce_mean(losses, axis=0)\n", + " losses = keras.ops.relu(positive_dist - negative_dist + self.margin)\n", + " return keras.ops.mean(losses, axis=0)\n", "" ] }, @@ -657,6 +660,7 @@ "roberta_triplet_siamese.compile(\n", " loss=TripletLoss(),\n", " optimizer=keras.optimizers.Adam(2e-5),\n", + " jit_compile=False,\n", ")\n", "\n", "roberta_triplet_siamese.fit(wiki_train, validation_data=wiki_test, epochs=1)" diff --git a/examples/nlp/md/sentence_embeddings_with_sbert.md b/examples/nlp/md/sentence_embeddings_with_sbert.md index 2f4a4b428b..157f5c8594 100644 --- a/examples/nlp/md/sentence_embeddings_with_sbert.md +++ b/examples/nlp/md/sentence_embeddings_with_sbert.md @@ -49,25 +49,28 @@ This method of fine-tuning was introduced in Let's install and import the libraries we need. We'll be using the KerasNLP library in this example. -We will also enable [mixed perceciosn](https://www.tensorflow.org/guide/mixed_precision) +We will also enable [mixed precision](https://www.tensorflow.org/guide/mixed_precision) training. This will help us reduce the training time. ```python -!pip install keras-nlp -q +!pip install -q --upgrade keras-nlp +!pip install -q --upgrade keras # Upgrade to Keras 3. ``` ```python +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras import keras_nlp import tensorflow as tf import tensorflow_datasets as tfds import sklearn.cluster as cluster -from tensorflow import keras - -policy = keras.mixed_precision.Policy("mixed_float16") -keras.mixed_precision.set_global_policy(policy) +keras.mixed_precision.set_global_policy("mixed_float16") ``` --- @@ -224,48 +227,65 @@ layer to exclude padded tokens from being averaged. ```python preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en") backbone = keras_nlp.models.RobertaBackbone.from_preset("roberta_base_en") -inputs = keras.Input(shape=(1), dtype="string", name="sentence") +inputs = keras.Input(shape=(1,), dtype="string", name="sentence") x = preprocessor(inputs) h = backbone(x) embedding = keras.layers.GlobalAveragePooling1D(name="pooling_layer")( h, x["padding_mask"] ) -n_embedding = tf.linalg.normalize(embedding, axis=1)[0] +n_embedding = keras.layers.UnitNormalization(axis=1)(embedding) roberta_normal_encoder = keras.Model(inputs=inputs, outputs=n_embedding) roberta_normal_encoder.summary() ``` -
-``` -Model: "model" -__________________________________________________________________________________________________ - Layer (type) Output Shape Param # Connected to -================================================================================================== - sentence (InputLayer) [(None, 1)] 0 [] - - roberta_preprocessor (RobertaP {'token_ids': (None 0 ['sentence[0][0]'] - reprocessor) , 512), - 'padding_mask': (N - one, 512)} - - roberta_backbone (RobertaBackb (None, None, 768) 124052736 ['roberta_preprocessor[0][0]', - one) 'roberta_preprocessor[0][1]'] - - pooling_layer (GlobalAveragePo (None, 768) 0 ['roberta_backbone[0][0]', - oling1D) 'roberta_preprocessor[0][0]'] - - tf.linalg.normalize (TFOpLambd ((None, 768), 0 ['pooling_layer[0][0]'] - a) (None, 1)) - -================================================================================================== -Total params: 124,052,736 -Trainable params: 124,052,736 -Non-trainable params: 0 -__________________________________________________________________________________________________ -``` -
+
Model: "functional_1"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Layer (type)         Output Shape       Param #  Connected to         ┃
+┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
+│ sentence            │ (None, 1)         │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ roberta_preprocess… │ [(None, 512),     │       0 │ sentence[0][0]       │
+│ (RobertaPreprocess… │ (None, 512)]      │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ roberta_backbone    │ (None, 512, 768)  │ 124,05… │ roberta_preprocesso… │
+│ (RobertaBackbone)   │                   │         │ roberta_preprocesso… │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ pooling_layer       │ (None, 768)       │       0 │ roberta_backbone[0]… │
+│ (GlobalAveragePool… │                   │         │ roberta_preprocesso… │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ unit_normalization  │ (None, 768)       │       0 │ pooling_layer[0][0]  │
+│ (UnitNormalization) │                   │         │                      │
+└─────────────────────┴───────────────────┴─────────┴──────────────────────┘
+
+ + + + +
 Total params: 124,052,736 (473.22 MB)
+
+ + + + +
 Trainable params: 124,052,736 (473.22 MB)
+
+ + + + +
 Non-trainable params: 0 (0.00 B)
+
+ + + #### Build the Siamese network with the regression objective function. It's described above that the Siamese network has two or more subnetworks, and for this @@ -282,11 +302,11 @@ sentences. class RegressionSiamese(keras.Model): def __init__(self, encoder, **kwargs): - inputs = keras.Input(shape=(2), dtype="string", name="sentences") - sen1, sen2 = tf.split(inputs, num_or_size_splits=2, axis=1, name="split") + inputs = keras.Input(shape=(2,), dtype="string", name="sentences") + sen1, sen2 = keras.ops.split(inputs, 2, axis=1) u = encoder(sen1) v = encoder(sen2) - cosine_similarity_scores = tf.matmul(u, tf.transpose(v)) + cosine_similarity_scores = keras.ops.matmul(u, keras.ops.transpose(v)) super().__init__( inputs=inputs, @@ -326,9 +346,9 @@ for i, sim in enumerate(cosine_similarity_scores[0]):
``` -cosine similarity score between sentence 1 and the query = 0.966796875 -cosine similarity score between sentence 2 and the query = 0.9765625 -cosine similarity score between sentence 3 and the query = 0.9931640625 +cosine similarity score between sentence 1 and the query = 0.96630859375 +cosine similarity score between sentence 2 and the query = 0.97607421875 +cosine similarity score between sentence 3 and the query = 0.99365234375 ```
@@ -342,6 +362,7 @@ roberta_regression_siamese = RegressionSiamese(roberta_normal_encoder) roberta_regression_siamese.compile( loss=keras.losses.MeanSquaredError(), optimizer=keras.optimizers.Adam(2e-5), + jit_compile=False, ) roberta_regression_siamese.fit(stsb_train, validation_data=stsb_valid, epochs=1) @@ -349,9 +370,9 @@ roberta_regression_siamese.fit(stsb_train, validation_data=stsb_valid, epochs=1)
``` -300/300 [==============================] - 541s 1s/step - loss: 0.3977 - val_loss: 0.4083 + 300/300 ━━━━━━━━━━━━━━━━━━━━ 116s 304ms/step - loss: 0.4691 - val_loss: 0.4052 - + ```
@@ -381,9 +402,9 @@ for i, sim in enumerate(cosine_simalarities[0]):
``` -cosine similarity between sentence 1 and the query = 0.1326904296875 -cosine similarity between sentence 2 and the query = 0.458740234375 -cosine similarity between sentence 3 and the query = 0.79931640625 +cosine similarity between sentence 1 and the query = 0.050384521484375 +cosine similarity between sentence 2 and the query = 0.468505859375 +cosine similarity between sentence 3 and the query = 0.669921875 ```
@@ -466,7 +487,7 @@ sentence. ```python preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en") backbone = keras_nlp.models.RobertaBackbone.from_preset("roberta_base_en") -input = keras.Input(shape=(1), dtype="string", name="sentence") +input = keras.Input(shape=(1,), dtype="string", name="sentence") x = preprocessor(input) h = backbone(x) @@ -480,33 +501,50 @@ roberta_encoder = keras.Model(inputs=input, outputs=embedding) roberta_encoder.summary() ``` -
-``` -Model: "model_1" -__________________________________________________________________________________________________ - Layer (type) Output Shape Param # Connected to -================================================================================================== - sentence (InputLayer) [(None, 1)] 0 [] - - roberta_preprocessor_1 (Robert {'token_ids': (None 0 ['sentence[0][0]'] - aPreprocessor) , 512), - 'padding_mask': (N - one, 512)} - - roberta_backbone_1 (RobertaBac (None, None, 768) 124052736 ['roberta_preprocessor_1[0][0]', - kbone) 'roberta_preprocessor_1[0][1]'] - - pooling_layer (GlobalAveragePo (None, 768) 0 ['roberta_backbone_1[0][0]', - oling1D) 'roberta_preprocessor_1[0][0]'] - -================================================================================================== -Total params: 124,052,736 -Trainable params: 124,052,736 -Non-trainable params: 0 -__________________________________________________________________________________________________ -``` -
+
Model: "functional_3"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Layer (type)         Output Shape       Param #  Connected to         ┃
+┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
+│ sentence            │ (None, 1)         │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ roberta_preprocess… │ [(None, 512),     │       0 │ sentence[0][0]       │
+│ (RobertaPreprocess… │ (None, 512)]      │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ roberta_backbone_1  │ (None, 512, 768)  │ 124,05… │ roberta_preprocesso… │
+│ (RobertaBackbone)   │                   │         │ roberta_preprocesso… │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ pooling_layer       │ (None, 768)       │       0 │ roberta_backbone_1[ │
+│ (GlobalAveragePool… │                   │         │ roberta_preprocesso… │
+└─────────────────────┴───────────────────┴─────────┴──────────────────────┘
+
+ + + + +
 Total params: 124,052,736 (473.22 MB)
+
+ + + + +
 Trainable params: 124,052,736 (473.22 MB)
+
+ + + + +
 Non-trainable params: 0 (0.00 B)
+
+ + + #### Build the Siamese network with the triplet objective function For the Siamese network with the triplet objective function, we will build the model with @@ -519,22 +557,21 @@ embedding for each sentence, and we will calculate the `positive_dist` and class TripletSiamese(keras.Model): def __init__(self, encoder, **kwargs): - - anchor = keras.Input(shape=(1), dtype="string") - positive = keras.Input(shape=(1), dtype="string") - negative = keras.Input(shape=(1), dtype="string") + anchor = keras.Input(shape=(1,), dtype="string") + positive = keras.Input(shape=(1,), dtype="string") + negative = keras.Input(shape=(1,), dtype="string") ea = encoder(anchor) ep = encoder(positive) en = encoder(negative) - positive_dist = tf.math.reduce_sum(tf.math.pow(ea - ep, 2), axis=1) - negative_dist = tf.math.reduce_sum(tf.math.pow(ea - en, 2), axis=1) + positive_dist = keras.ops.sum(keras.ops.power(ea - ep, 2), axis=1) + negative_dist = keras.ops.sum(keras.ops.power(ea - en, 2), axis=1) - positive_dist = tf.math.sqrt(positive_dist) - negative_dist = tf.math.sqrt(negative_dist) + positive_dist = keras.ops.sqrt(positive_dist) + negative_dist = keras.ops.sqrt(negative_dist) - output = tf.stack([positive_dist, negative_dist], axis=0) + output = keras.ops.stack([positive_dist, negative_dist], axis=0) super().__init__(inputs=[anchor, positive, negative], outputs=output, **kwargs) @@ -569,8 +606,8 @@ class TripletLoss(keras.losses.Loss): def call(self, y_true, y_pred): positive_dist, negative_dist = tf.unstack(y_pred, axis=0) - losses = tf.nn.relu(positive_dist - negative_dist + self.margin) - return tf.math.reduce_mean(losses, axis=0) + losses = keras.ops.relu(positive_dist - negative_dist + self.margin) + return keras.ops.mean(losses, axis=0) ``` @@ -586,6 +623,7 @@ roberta_triplet_siamese = TripletSiamese(roberta_encoder) roberta_triplet_siamese.compile( loss=TripletLoss(), optimizer=keras.optimizers.Adam(2e-5), + jit_compile=False, ) roberta_triplet_siamese.fit(wiki_train, validation_data=wiki_test, epochs=1) @@ -593,9 +631,9 @@ roberta_triplet_siamese.fit(wiki_train, validation_data=wiki_test, epochs=1)
``` -200/200 [==============================] - 641s 3s/step - loss: 0.7426 - val_loss: 0.6533 + 200/200 ━━━━━━━━━━━━━━━━━━━━ 128s 472ms/step - loss: 0.7984 - val_loss: 0.6179 - + ```
diff --git a/examples/nlp/sentence_embeddings_with_sbert.py b/examples/nlp/sentence_embeddings_with_sbert.py index 90266d6d8e..4a994fe6bd 100644 --- a/examples/nlp/sentence_embeddings_with_sbert.py +++ b/examples/nlp/sentence_embeddings_with_sbert.py @@ -47,23 +47,26 @@ Let's install and import the libraries we need. We'll be using the KerasNLP library in this example. -We will also enable [mixed perceciosn](https://www.tensorflow.org/guide/mixed_precision) +We will also enable [mixed precision](https://www.tensorflow.org/guide/mixed_precision) training. This will help us reduce the training time. """ """shell -pip install keras-nlp -q +pip install -q --upgrade keras-nlp +pip install -q --upgrade keras # Upgrade to Keras 3. """ +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras import keras_nlp import tensorflow as tf import tensorflow_datasets as tfds import sklearn.cluster as cluster -from tensorflow import keras - -policy = keras.mixed_precision.Policy("mixed_float16") -keras.mixed_precision.set_global_policy(policy) +keras.mixed_precision.set_global_policy("mixed_float16") """ ## Fine-tune the model using siamese networks @@ -170,13 +173,13 @@ def prepare_dataset(dataset, num_batchs, batch_size): preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en") backbone = keras_nlp.models.RobertaBackbone.from_preset("roberta_base_en") -inputs = keras.Input(shape=(1), dtype="string", name="sentence") +inputs = keras.Input(shape=(1,), dtype="string", name="sentence") x = preprocessor(inputs) h = backbone(x) embedding = keras.layers.GlobalAveragePooling1D(name="pooling_layer")( h, x["padding_mask"] ) -n_embedding = tf.linalg.normalize(embedding, axis=1)[0] +n_embedding = keras.layers.UnitNormalization(axis=1)(embedding) roberta_normal_encoder = keras.Model(inputs=inputs, outputs=n_embedding) roberta_normal_encoder.summary() @@ -197,11 +200,11 @@ def prepare_dataset(dataset, num_batchs, batch_size): class RegressionSiamese(keras.Model): def __init__(self, encoder, **kwargs): - inputs = keras.Input(shape=(2), dtype="string", name="sentences") - sen1, sen2 = tf.split(inputs, num_or_size_splits=2, axis=1, name="split") + inputs = keras.Input(shape=(2,), dtype="string", name="sentences") + sen1, sen2 = keras.ops.split(inputs, 2, axis=1) u = encoder(sen1) v = encoder(sen2) - cosine_similarity_scores = tf.matmul(u, tf.transpose(v)) + cosine_similarity_scores = keras.ops.matmul(u, keras.ops.transpose(v)) super().__init__( inputs=inputs, @@ -247,6 +250,7 @@ def get_encoder(self): roberta_regression_siamese.compile( loss=keras.losses.MeanSquaredError(), optimizer=keras.optimizers.Adam(2e-5), + jit_compile=False, ) roberta_regression_siamese.fit(stsb_train, validation_data=stsb_valid, epochs=1) @@ -345,7 +349,7 @@ def prepare_wiki_data(dataset, num_batchs): preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en") backbone = keras_nlp.models.RobertaBackbone.from_preset("roberta_base_en") -input = keras.Input(shape=(1), dtype="string", name="sentence") +input = keras.Input(shape=(1,), dtype="string", name="sentence") x = preprocessor(input) h = backbone(x) @@ -370,21 +374,21 @@ def prepare_wiki_data(dataset, num_batchs): class TripletSiamese(keras.Model): def __init__(self, encoder, **kwargs): - anchor = keras.Input(shape=(1), dtype="string") - positive = keras.Input(shape=(1), dtype="string") - negative = keras.Input(shape=(1), dtype="string") + anchor = keras.Input(shape=(1,), dtype="string") + positive = keras.Input(shape=(1,), dtype="string") + negative = keras.Input(shape=(1,), dtype="string") ea = encoder(anchor) ep = encoder(positive) en = encoder(negative) - positive_dist = tf.math.reduce_sum(tf.math.pow(ea - ep, 2), axis=1) - negative_dist = tf.math.reduce_sum(tf.math.pow(ea - en, 2), axis=1) + positive_dist = keras.ops.sum(keras.ops.power(ea - ep, 2), axis=1) + negative_dist = keras.ops.sum(keras.ops.power(ea - en, 2), axis=1) - positive_dist = tf.math.sqrt(positive_dist) - negative_dist = tf.math.sqrt(negative_dist) + positive_dist = keras.ops.sqrt(positive_dist) + negative_dist = keras.ops.sqrt(negative_dist) - output = tf.stack([positive_dist, negative_dist], axis=0) + output = keras.ops.stack([positive_dist, negative_dist], axis=0) super().__init__(inputs=[anchor, positive, negative], outputs=output, **kwargs) @@ -418,8 +422,8 @@ def __init__(self, margin=1, **kwargs): def call(self, y_true, y_pred): positive_dist, negative_dist = tf.unstack(y_pred, axis=0) - losses = tf.nn.relu(positive_dist - negative_dist + self.margin) - return tf.math.reduce_mean(losses, axis=0) + losses = keras.ops.relu(positive_dist - negative_dist + self.margin) + return keras.ops.mean(losses, axis=0) """ @@ -434,6 +438,7 @@ def call(self, y_true, y_pred): roberta_triplet_siamese.compile( loss=TripletLoss(), optimizer=keras.optimizers.Adam(2e-5), + jit_compile=False, ) roberta_triplet_siamese.fit(wiki_train, validation_data=wiki_test, epochs=1) diff --git a/scripts/examples_master.py b/scripts/examples_master.py index bfbafdd152..99a3dcc221 100644 --- a/scripts/examples_master.py +++ b/scripts/examples_master.py @@ -478,6 +478,12 @@ "subcategory": "Text similarity search", "keras_3": True, }, + { + "path": "sentence_embeddings_with_sbert", + "title": "Sentence embeddings using Siamese RoBERTa-networks", + "subcategory": "Text similarity search", + "keras_3": True, + }, # Language modeling { "path": "masked_language_modeling", From c5ad0524bd24f70a53348bd1ef253982b1bd2a86 Mon Sep 17 00:00:00 2001 From: Matt Watson Date: Mon, 27 Nov 2023 13:29:19 -0800 Subject: [PATCH 2/3] Fix typos --- .../sentence_embeddings_with_sbert.ipynb | 26 ++++++------ .../nlp/md/sentence_embeddings_with_sbert.md | 42 +++++++++---------- .../nlp/sentence_embeddings_with_sbert.py | 26 ++++++------ 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb b/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb index 5db7431274..8d7ce2019b 100644 --- a/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb +++ b/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb @@ -169,8 +169,8 @@ "TRAIN_BATCH_SIZE = 6\n", "VALIDATION_BATCH_SIZE = 8\n", "\n", - "TRAIN_NUM_BATCHS = 300\n", - "VALIDATION_NUM_BATCHS = 40\n", + "TRAIN_NUM_BATCHES = 300\n", + "VALIDATION_NUM_BATCHES = 40\n", "\n", "AUTOTUNE = tf.data.experimental.AUTOTUNE\n", "\n", @@ -179,7 +179,7 @@ " return (x / 2.5) - 1\n", "\n", "\n", - "def prepare_dataset(dataset, num_batchs, batch_size):\n", + "def prepare_dataset(dataset, num_batches, batch_size):\n", " dataset = dataset.map(\n", " lambda z: (\n", " [z[\"sentence1\"], z[\"sentence2\"]],\n", @@ -188,7 +188,7 @@ " num_parallel_calls=AUTOTUNE,\n", " )\n", " dataset = dataset.batch(batch_size)\n", - " dataset = dataset.take(num_batchs)\n", + " dataset = dataset.take(num_batches)\n", " dataset = dataset.prefetch(AUTOTUNE)\n", " return dataset\n", "\n", @@ -198,8 +198,8 @@ ")\n", "stsb_train, stsb_valid = stsb_ds[\"train\"], stsb_ds[\"validation\"]\n", "\n", - "stsb_train = prepare_dataset(stsb_train, TRAIN_NUM_BATCHS, TRAIN_BATCH_SIZE)\n", - "stsb_valid = prepare_dataset(stsb_valid, VALIDATION_NUM_BATCHS, VALIDATION_BATCH_SIZE)" + "stsb_train = prepare_dataset(stsb_train, TRAIN_NUM_BATCHES, TRAIN_BATCH_SIZE)\n", + "stsb_valid = prepare_dataset(stsb_valid, VALIDATION_NUM_BATCHES, VALIDATION_BATCH_SIZE)" ] }, { @@ -472,17 +472,17 @@ }, "outputs": [], "source": [ - "NUM_TRAIN_BATCHS = 200\n", - "NUM_TEST_BATCHS = 75\n", + "NUM_TRAIN_BATCHES = 200\n", + "NUM_TEST_BATCHES = 75\n", "AUTOTUNE = tf.data.experimental.AUTOTUNE\n", "\n", "\n", - "def prepare_wiki_data(dataset, num_batchs):\n", + "def prepare_wiki_data(dataset, num_batches):\n", " dataset = dataset.map(\n", " lambda z: ((z[\"Sentence1\"], z[\"Sentence2\"], z[\"Sentence3\"]), 0)\n", " )\n", " dataset = dataset.batch(6)\n", - " dataset = dataset.take(num_batchs)\n", + " dataset = dataset.take(num_batches)\n", " dataset = dataset.prefetch(AUTOTUNE)\n", " return dataset\n", "\n", @@ -498,8 +498,8 @@ " num_epochs=1,\n", ")\n", "\n", - "wiki_train = prepare_wiki_data(wiki_train, NUM_TRAIN_BATCHS)\n", - "wiki_test = prepare_wiki_data(wiki_test, NUM_TEST_BATCHS)" + "wiki_train = prepare_wiki_data(wiki_train, NUM_TRAIN_BATCHES)\n", + "wiki_test = prepare_wiki_data(wiki_test, NUM_TEST_BATCHES)" ] }, { @@ -691,7 +691,7 @@ " \"How can I improve my English?\",\n", " \"How to earn money online?\",\n", " \"How do I earn money online?\",\n", - " \"How to work and ean money through internet?\",\n", + " \"How to work and earn money through internet?\",\n", "]\n", "\n", "encoder = roberta_triplet_siamese.get_encoder()\n", diff --git a/examples/nlp/md/sentence_embeddings_with_sbert.md b/examples/nlp/md/sentence_embeddings_with_sbert.md index 157f5c8594..1f6b5dafed 100644 --- a/examples/nlp/md/sentence_embeddings_with_sbert.md +++ b/examples/nlp/md/sentence_embeddings_with_sbert.md @@ -115,8 +115,8 @@ divide the labels by 2.5 and subtract 1. TRAIN_BATCH_SIZE = 6 VALIDATION_BATCH_SIZE = 8 -TRAIN_NUM_BATCHS = 300 -VALIDATION_NUM_BATCHS = 40 +TRAIN_NUM_BATCHES = 300 +VALIDATION_NUM_BATCHES = 40 AUTOTUNE = tf.data.experimental.AUTOTUNE @@ -125,7 +125,7 @@ def change_range(x): return (x / 2.5) - 1 -def prepare_dataset(dataset, num_batchs, batch_size): +def prepare_dataset(dataset, num_batches, batch_size): dataset = dataset.map( lambda z: ( [z["sentence1"], z["sentence2"]], @@ -134,7 +134,7 @@ def prepare_dataset(dataset, num_batchs, batch_size): num_parallel_calls=AUTOTUNE, ) dataset = dataset.batch(batch_size) - dataset = dataset.take(num_batchs) + dataset = dataset.take(num_batches) dataset = dataset.prefetch(AUTOTUNE) return dataset @@ -144,8 +144,8 @@ stsb_ds = tfds.load( ) stsb_train, stsb_valid = stsb_ds["train"], stsb_ds["validation"] -stsb_train = prepare_dataset(stsb_train, TRAIN_NUM_BATCHS, TRAIN_BATCH_SIZE) -stsb_valid = prepare_dataset(stsb_valid, VALIDATION_NUM_BATCHS, VALIDATION_BATCH_SIZE) +stsb_train = prepare_dataset(stsb_train, TRAIN_NUM_BATCHES, TRAIN_BATCH_SIZE) +stsb_valid = prepare_dataset(stsb_valid, VALIDATION_NUM_BATCHES, VALIDATION_BATCH_SIZE) ``` Let's see examples from the dataset of two sentenses and their similarity. @@ -370,9 +370,9 @@ roberta_regression_siamese.fit(stsb_train, validation_data=stsb_valid, epochs=1)
``` - 300/300 ━━━━━━━━━━━━━━━━━━━━ 116s 304ms/step - loss: 0.4691 - val_loss: 0.4052 + 300/300 ━━━━━━━━━━━━━━━━━━━━ 115s 297ms/step - loss: 0.4751 - val_loss: 0.4025 - + ```
@@ -402,9 +402,9 @@ for i, sim in enumerate(cosine_simalarities[0]):
``` -cosine similarity between sentence 1 and the query = 0.050384521484375 -cosine similarity between sentence 2 and the query = 0.468505859375 -cosine similarity between sentence 3 and the query = 0.669921875 +cosine similarity between sentence 1 and the query = 0.10986328125 +cosine similarity between sentence 2 and the query = 0.53466796875 +cosine similarity between sentence 3 and the query = 0.83544921875 ```
@@ -434,17 +434,17 @@ example, we will only use 1200 triplets for training and 300 for testing. ``` ```python -NUM_TRAIN_BATCHS = 200 -NUM_TEST_BATCHS = 75 +NUM_TRAIN_BATCHES = 200 +NUM_TEST_BATCHES = 75 AUTOTUNE = tf.data.experimental.AUTOTUNE -def prepare_wiki_data(dataset, num_batchs): +def prepare_wiki_data(dataset, num_batches): dataset = dataset.map( lambda z: ((z["Sentence1"], z["Sentence2"], z["Sentence3"]), 0) ) dataset = dataset.batch(6) - dataset = dataset.take(num_batchs) + dataset = dataset.take(num_batches) dataset = dataset.prefetch(AUTOTUNE) return dataset @@ -460,8 +460,8 @@ wiki_test = tf.data.experimental.make_csv_dataset( num_epochs=1, ) -wiki_train = prepare_wiki_data(wiki_train, NUM_TRAIN_BATCHS) -wiki_test = prepare_wiki_data(wiki_test, NUM_TEST_BATCHS) +wiki_train = prepare_wiki_data(wiki_train, NUM_TRAIN_BATCHES) +wiki_test = prepare_wiki_data(wiki_test, NUM_TEST_BATCHES) ```
``` @@ -631,9 +631,9 @@ roberta_triplet_siamese.fit(wiki_train, validation_data=wiki_test, epochs=1)
``` - 200/200 ━━━━━━━━━━━━━━━━━━━━ 128s 472ms/step - loss: 0.7984 - val_loss: 0.6179 + 200/200 ━━━━━━━━━━━━━━━━━━━━ 128s 467ms/step - loss: 0.7822 - val_loss: 0.7126 - + ```
@@ -649,7 +649,7 @@ questions = [ "How can I improve my English?", "How to earn money online?", "How do I earn money online?", - "How to work and ean money through internet?", + "How to work and earn money through internet?", ] encoder = roberta_triplet_siamese.get_encoder() @@ -667,7 +667,7 @@ sentence (How to be good at speaking English?) belongs to cluster 1 sentence (How can I improve my English?) belongs to cluster 1 sentence (How to earn money online?) belongs to cluster 0 sentence (How do I earn money online?) belongs to cluster 0 -sentence (How to work and ean money through internet?) belongs to cluster 0 +sentence (How to work and earn money through internet?) belongs to cluster 0 ```
\ No newline at end of file diff --git a/examples/nlp/sentence_embeddings_with_sbert.py b/examples/nlp/sentence_embeddings_with_sbert.py index 4a994fe6bd..3f98562b64 100644 --- a/examples/nlp/sentence_embeddings_with_sbert.py +++ b/examples/nlp/sentence_embeddings_with_sbert.py @@ -113,8 +113,8 @@ TRAIN_BATCH_SIZE = 6 VALIDATION_BATCH_SIZE = 8 -TRAIN_NUM_BATCHS = 300 -VALIDATION_NUM_BATCHS = 40 +TRAIN_NUM_BATCHES = 300 +VALIDATION_NUM_BATCHES = 40 AUTOTUNE = tf.data.experimental.AUTOTUNE @@ -123,7 +123,7 @@ def change_range(x): return (x / 2.5) - 1 -def prepare_dataset(dataset, num_batchs, batch_size): +def prepare_dataset(dataset, num_batches, batch_size): dataset = dataset.map( lambda z: ( [z["sentence1"], z["sentence2"]], @@ -132,7 +132,7 @@ def prepare_dataset(dataset, num_batchs, batch_size): num_parallel_calls=AUTOTUNE, ) dataset = dataset.batch(batch_size) - dataset = dataset.take(num_batchs) + dataset = dataset.take(num_batches) dataset = dataset.prefetch(AUTOTUNE) return dataset @@ -142,8 +142,8 @@ def prepare_dataset(dataset, num_batchs, batch_size): ) stsb_train, stsb_valid = stsb_ds["train"], stsb_ds["validation"] -stsb_train = prepare_dataset(stsb_train, TRAIN_NUM_BATCHS, TRAIN_BATCH_SIZE) -stsb_valid = prepare_dataset(stsb_valid, VALIDATION_NUM_BATCHS, VALIDATION_BATCH_SIZE) +stsb_train = prepare_dataset(stsb_train, TRAIN_NUM_BATCHES, TRAIN_BATCH_SIZE) +stsb_valid = prepare_dataset(stsb_valid, VALIDATION_NUM_BATCHES, VALIDATION_BATCH_SIZE) """ Let's see examples from the dataset of two sentenses and their similarity. @@ -306,17 +306,17 @@ def get_encoder(self): unzip wikipedia-sections-triplets.zip -d wikipedia-sections-triplets """ -NUM_TRAIN_BATCHS = 200 -NUM_TEST_BATCHS = 75 +NUM_TRAIN_BATCHES = 200 +NUM_TEST_BATCHES = 75 AUTOTUNE = tf.data.experimental.AUTOTUNE -def prepare_wiki_data(dataset, num_batchs): +def prepare_wiki_data(dataset, num_batches): dataset = dataset.map( lambda z: ((z["Sentence1"], z["Sentence2"], z["Sentence3"]), 0) ) dataset = dataset.batch(6) - dataset = dataset.take(num_batchs) + dataset = dataset.take(num_batches) dataset = dataset.prefetch(AUTOTUNE) return dataset @@ -332,8 +332,8 @@ def prepare_wiki_data(dataset, num_batchs): num_epochs=1, ) -wiki_train = prepare_wiki_data(wiki_train, NUM_TRAIN_BATCHS) -wiki_test = prepare_wiki_data(wiki_test, NUM_TEST_BATCHS) +wiki_train = prepare_wiki_data(wiki_train, NUM_TRAIN_BATCHES) +wiki_test = prepare_wiki_data(wiki_test, NUM_TEST_BATCHES) """ #### Build the encoder model @@ -455,7 +455,7 @@ def call(self, y_true, y_pred): "How can I improve my English?", "How to earn money online?", "How do I earn money online?", - "How to work and ean money through internet?", + "How to work and earn money through internet?", ] encoder = roberta_triplet_siamese.get_encoder() From 73b40fdeeae54b63186779203b185a8f99fc9a8f Mon Sep 17 00:00:00 2001 From: Matt Watson Date: Mon, 27 Nov 2023 13:46:09 -0800 Subject: [PATCH 3/3] ops.square --- examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb | 4 ++-- examples/nlp/md/sentence_embeddings_with_sbert.md | 4 ++-- examples/nlp/sentence_embeddings_with_sbert.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb b/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb index 8d7ce2019b..4d0d1841d9 100644 --- a/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb +++ b/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb @@ -576,8 +576,8 @@ " ep = encoder(positive)\n", " en = encoder(negative)\n", "\n", - " positive_dist = keras.ops.sum(keras.ops.power(ea - ep, 2), axis=1)\n", - " negative_dist = keras.ops.sum(keras.ops.power(ea - en, 2), axis=1)\n", + " positive_dist = keras.ops.sum(keras.ops.square(ea - ep), axis=1)\n", + " negative_dist = keras.ops.sum(keras.ops.square(ea - en), axis=1)\n", "\n", " positive_dist = keras.ops.sqrt(positive_dist)\n", " negative_dist = keras.ops.sqrt(negative_dist)\n", diff --git a/examples/nlp/md/sentence_embeddings_with_sbert.md b/examples/nlp/md/sentence_embeddings_with_sbert.md index 1f6b5dafed..b40a19b62a 100644 --- a/examples/nlp/md/sentence_embeddings_with_sbert.md +++ b/examples/nlp/md/sentence_embeddings_with_sbert.md @@ -565,8 +565,8 @@ class TripletSiamese(keras.Model): ep = encoder(positive) en = encoder(negative) - positive_dist = keras.ops.sum(keras.ops.power(ea - ep, 2), axis=1) - negative_dist = keras.ops.sum(keras.ops.power(ea - en, 2), axis=1) + positive_dist = keras.ops.sum(keras.ops.square(ea - ep), axis=1) + negative_dist = keras.ops.sum(keras.ops.square(ea - en), axis=1) positive_dist = keras.ops.sqrt(positive_dist) negative_dist = keras.ops.sqrt(negative_dist) diff --git a/examples/nlp/sentence_embeddings_with_sbert.py b/examples/nlp/sentence_embeddings_with_sbert.py index 3f98562b64..2e3158ed82 100644 --- a/examples/nlp/sentence_embeddings_with_sbert.py +++ b/examples/nlp/sentence_embeddings_with_sbert.py @@ -382,8 +382,8 @@ def __init__(self, encoder, **kwargs): ep = encoder(positive) en = encoder(negative) - positive_dist = keras.ops.sum(keras.ops.power(ea - ep, 2), axis=1) - negative_dist = keras.ops.sum(keras.ops.power(ea - en, 2), axis=1) + positive_dist = keras.ops.sum(keras.ops.square(ea - ep), axis=1) + negative_dist = keras.ops.sum(keras.ops.square(ea - en), axis=1) positive_dist = keras.ops.sqrt(positive_dist) negative_dist = keras.ops.sqrt(negative_dist)