From ea2584b4f7ac4ef8a114e3ea85bee2458e9f122c Mon Sep 17 00:00:00 2001
From: MaartenGr
Date: Sun, 15 Oct 2023 09:33:53 +0200
Subject: [PATCH] Update docs

---
 README.md                             | 8 ++++----
 bertopic/_bertopic.py                 | 3 +--
 docs/getting_started/online/online.md | 6 +++++-
 docs/index.md                         | 8 ++++----
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 63ba5b35..3f080631 100644
--- a/README.md
+++ b/README.md
@@ -33,12 +33,12 @@ BERTopic supports all kinds of topic modeling techniques:
     Multimodal
     Multi-aspect
-    Text Generation/LLM
+    Text Generation/LLM
-    Merge Models *new!*
-    Zeroshot *new!*
-    Seed Words *new!*
+    Zeroshot (new!)
+    Merge Models (new!)
+    Seed Words (new!)
diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index c834d24e..272ab779 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -3804,14 +3804,13 @@ def _c_tf_idf(self,
         if self.ctfidf_model.seed_words and self.seed_topic_list:
             seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]
             multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words])
-            multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier,words)])
+            multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier, words)])
         elif self.ctfidf_model.seed_words:
             multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words])
         elif self.seed_topic_list:
             seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]
             multiplier = np.array([1.2 if word in seed_topic_list else 1 for word in words])
-

         if fit:
             self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier)
diff --git a/docs/getting_started/online/online.md b/docs/getting_started/online/online.md
index deb7451b..86e28240 100644
--- a/docs/getting_started/online/online.md
+++ b/docs/getting_started/online/online.md
@@ -13,7 +13,7 @@ In BERTopic, online topic modeling can be a bit tricky as there are several step
 3. Cluster reduced embeddings
 4. Tokenize topics
 5. Extract topic words
-6. Diversify topic words
+6. (Optional) Fine-tune topic words
 
 For some steps, an online variant is more important than others. Typically, in step 1 we use pre-trained language models that rarely need continuous updates. This means that we can use an embedding model like Sentence-Transformers to extract the embeddings and still use it in an online setting. Similarly, steps 5 and 6 do not necessarily need online variants since they are built upon step 4, tokenization. If the tokenization itself is incremental, then steps 5 and 6 will be as well.
@@ -28,6 +28,10 @@ This means that we will need online variants for steps 2 through 4. Steps 2 and 3
 Lastly, we need to develop an online variant for step 4, tokenization. In this step, a bag-of-words representation is created through the `CountVectorizer`. However, as new data comes in, its vocabulary will need to be updated. For that purpose, `bertopic.vectorizers.OnlineCountVectorizer` was created; it not only adds out-of-vocabulary words to its vocabulary but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix from becoming too large. Most notably, the `decay` parameter is a value between 0 and 1 that specifies the fraction by which the frequencies in the bag-of-words matrix are reduced at each iteration. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix by 10% at each iteration, making sure that recent data weighs more heavily than previous iterations. Similarly, `delete_min_df` will remove words from the vocabulary if their frequency drops below a set value. This ties in with the `decay` parameter, as the frequencies of unused words will decay over time until they fall below that threshold. For more information regarding the `OnlineCountVectorizer`, please see the [vectorizers documentation](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html#onlinecountvectorizer).
 
+!!! Tip
+    If you want to use the original UMAP and HDBSCAN models instead, consider using the [**merge model**](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html)
+    functionality of BERTopic. It allows you to merge multiple BERTopic models into a single new one. You can use it to discover new topics by training a new model on incoming documents and checking whether merging it with the original model adds any new topics.
+
 ## **Example**
 
 Online topic modeling in BERTopic is rather straightforward. We first need to have our documents split into chunks such that we can train and update our topic model incrementally.
diff --git a/docs/index.md b/docs/index.md
index 6a60404e..594164e2 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -30,12 +30,12 @@ BERTopic supports all kinds of topic modeling techniques:
     Multimodal
     Multi-aspect
-    Text Generation
+    Text Generation
-    Merge Models *new!*
-    Zeroshot *new!*
-    Seed Words *new!*
+    Zeroshot (new!)
+    Merge Models (new!)
+    Seed Words (new!)
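
To make the incremental workflow described in `docs/getting_started/online/online.md` concrete, here is a minimal sketch of online training. It assumes `IncrementalPCA` and `MiniBatchKMeans` as online-capable stand-ins for UMAP and HDBSCAN, an arbitrary chunk size of 1000, and illustrative parameter values; only `OnlineCountVectorizer` and its `decay` parameter come directly from the docs above.

```python
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import IncrementalPCA
from sklearn.cluster import MiniBatchKMeans

from bertopic import BERTopic
from bertopic.vectorizers import OnlineCountVectorizer

# Split the documents into chunks so the model can be updated incrementally
docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
doc_chunks = [docs[i:i + 1000] for i in range(0, len(docs), 1000)]

# Online-capable stand-ins for step 2 (dimensionality reduction),
# step 3 (clustering), and step 4 (tokenization)
umap_model = IncrementalPCA(n_components=5)
cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)
vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01)

topic_model = BERTopic(umap_model=umap_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model)

# Incrementally fit the topic model, 1000 documents at a time
for chunk in doc_chunks:
    topic_model.partial_fit(chunk)
```

Each call to `partial_fit` updates the sub-models in place, so recent chunks refine the topics rather than retrain them from scratch.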
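The tip added to online.md refers to the merge functionality. The following sketch assumes `BERTopic.merge_models` accepts a list of fitted models and a `min_similarity` threshold, and that the document split is illustrative; check the linked merge documentation for the exact signature.

```python
from sklearn.datasets import fetch_20newsgroups

from bertopic import BERTopic

docs = fetch_20newsgroups(subset="all")["data"]
old_docs, new_docs = docs[:5000], docs[5000:10000]  # illustrative split

# Train the original model and, later, a new model on incoming documents,
# both with the default (original) UMAP and HDBSCAN sub-models
original_model = BERTopic().fit(old_docs)
new_model = BERTopic().fit(new_docs)

# Merge the models; topics in `new_model` that are not similar enough to an
# existing topic are added to the merged model as new topics
merged_model = BERTopic.merge_models([original_model, new_model], min_similarity=0.7)

# Any topics beyond those of the original model were newly discovered
nr_new_topics = len(merged_model.get_topic_info()) - len(original_model.get_topic_info())
print(f"{nr_new_topics} new topics discovered")
```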