From ea2584b4f7ac4ef8a114e3ea85bee2458e9f122c Mon Sep 17 00:00:00 2001
From: MaartenGr
Date: Sun, 15 Oct 2023 09:33:53 +0200
Subject: [PATCH] Update docs

---
 README.md                             | 8 ++++----
 bertopic/_bertopic.py                 | 3 +--
 docs/getting_started/online/online.md | 6 +++++-
 docs/index.md                         | 8 ++++----
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 63ba5b35..3f080631 100644
--- a/README.md
+++ b/README.md
@@ -33,12 +33,12 @@ BERTopic supports all kinds of topic modeling techniques:
     Multimodal
     Multi-aspect
-    Text Generation/LLM
+    Text Generation/LLM
-    Merge Models *new!*
-    Zeroshot *new!*
-    Seed Words *new!*
+    Zeroshot (new!)
+    Merge Models (new!)
+    Seed Words (new!)
diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index c834d24e..272ab779 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -3804,14 +3804,13 @@ def _c_tf_idf(self,
         if self.ctfidf_model.seed_words and self.seed_topic_list:
             seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]
             multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words])
-            multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier,words)])
+            multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier, words)])
         elif self.ctfidf_model.seed_words:
             multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words])
         elif self.seed_topic_list:
             seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]
             multiplier = np.array([1.2 if word in seed_topic_list else 1 for word in words])
-

         if fit:
             self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier)
diff --git a/docs/getting_started/online/online.md b/docs/getting_started/online/online.md
index deb7451b..86e28240 100644
--- a/docs/getting_started/online/online.md
+++ b/docs/getting_started/online/online.md
@@ -13,7 +13,7 @@ In BERTopic, online topic modeling can be a bit tricky as there are several step
 3. Cluster reduced embeddings
 4. Tokenize topics
 5. Extract topic words
-6. Diversify topic words
+6. (Optional) Fine-tune topic words
 
 For some steps, an online variant is more important than others. Typically, in step 1 we use pre-trained language models that rarely need continuous updates. This means that we can use an embedding model like Sentence-Transformers to extract the embeddings and still use it in an online setting. Similarly, steps 5 and 6 do not necessarily need online variants since they are built upon step 4, tokenization. If the tokenization itself is incremental, then steps 5 and 6 will be as well.
@@ -28,6 +28,10 @@ This means that we will need online variants for steps 2 through 4. Steps 2 and 3
 Lastly, we need to develop an online variant for step 4, tokenization. In this step, a bag-of-words representation is created through the `CountVectorizer`. However, as new data comes in, its vocabulary will need to be updated. For that purpose, `bertopic.vectorizers.OnlineCountVectorizer` was created; it not only adds out-of-vocabulary words to its vocabulary but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix from becoming too large. Most notably, the `decay` parameter is a value between 0 and 1 that specifies the fraction by which the frequencies in the bag-of-words matrix are reduced at each iteration. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix by 10% at each iteration, making sure that recent data weighs more heavily than previous iterations. Similarly, `delete_min_df` will remove words from the vocabulary if their frequency drops below a set value. This ties in with the `decay` parameter, as the frequencies of unused words will decay over time until they fall below that threshold. For more information regarding the `OnlineCountVectorizer`, please see the [vectorizers documentation](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html#onlinecountvectorizer).
 
+!!! Tip
+    If you want to use the original UMAP and HDBSCAN models instead, consider using the [**merge model**](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html)
+    functionality of BERTopic. It allows you to merge multiple BERTopic models into a single new one. You can use it to discover new topics by training a new model on incoming documents and checking whether merging it with the original model adds any new topics.
+
 ## **Example**
 
 Online topic modeling in BERTopic is rather straightforward. We first need to have our documents split into chunks such that we can train and update our topic model incrementally.
diff --git a/docs/index.md b/docs/index.md
index 6a60404e..594164e2 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -30,12 +30,12 @@ BERTopic supports all kinds of topic modeling techniques:
     Multimodal
     Multi-aspect
-    Text Generation
+    Text Generation
-    Merge Models *new!*
-    Zeroshot *new!*
-    Seed Words *new!*
+    Zeroshot (new!)
+    Merge Models (new!)
+    Seed Words (new!)
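
To make the incremental workflow described in `docs/getting_started/online/online.md` concrete, here is a minimal sketch of online training. It assumes `IncrementalPCA` and `MiniBatchKMeans` as online-capable stand-ins for UMAP and HDBSCAN, an arbitrary chunk size of 1000, and illustrative parameter values; only `OnlineCountVectorizer` and its `decay` parameter come directly from the docs above.

```python
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import IncrementalPCA
from sklearn.cluster import MiniBatchKMeans

from bertopic import BERTopic
from bertopic.vectorizers import OnlineCountVectorizer

# Split the documents into chunks so the model can be updated incrementally
docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
doc_chunks = [docs[i:i + 1000] for i in range(0, len(docs), 1000)]

# Online-capable stand-ins for step 2 (dimensionality reduction),
# step 3 (clustering), and step 4 (tokenization)
umap_model = IncrementalPCA(n_components=5)
cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)
vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01)

topic_model = BERTopic(umap_model=umap_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model)

# Incrementally fit the topic model, 1000 documents at a time
for chunk in doc_chunks:
    topic_model.partial_fit(chunk)
```

Each call to `partial_fit` updates the sub-models in place, so recent chunks refine the topics rather than retrain them from scratch.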
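The tip added to online.md refers to the merge functionality. The following sketch assumes `BERTopic.merge_models` accepts a list of fitted models and a `min_similarity` threshold, and that the document split is illustrative; check the linked merge documentation for the exact signature.

```python
from sklearn.datasets import fetch_20newsgroups

from bertopic import BERTopic

docs = fetch_20newsgroups(subset="all")["data"]
old_docs, new_docs = docs[:5000], docs[5000:10000]  # illustrative split

# Train the original model and, later, a new model on incoming documents,
# both with the default (original) UMAP and HDBSCAN sub-models
original_model = BERTopic().fit(old_docs)
new_model = BERTopic().fit(new_docs)

# Merge the models; topics in `new_model` that are not similar enough to an
# existing topic are added to the merged model as new topics
merged_model = BERTopic.merge_models([original_model, new_model], min_similarity=0.7)

# Any topics beyond those of the original model were newly discovered
nr_new_topics = len(merged_model.get_topic_info()) - len(original_model.get_topic_info())
print(f"{nr_new_topics} new topics discovered")
```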