@@ -159,8 +159,8 @@ import openai
from bertopic.representation import OpenAI
# Fine-tune topic representations with GPT
-openai.api_key = "sk-..."
-representation_model = OpenAI(model="gpt-3.5-turbo", chat=True)
+client = openai.OpenAI(api_key="sk-...")
+representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True)
topic_model = BERTopic(representation_model=representation_model)
```
@@ -259,6 +259,7 @@ There are many different use cases in which topic modeling can be used. As such,
| [Dynamic Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html) | `.topics_over_time(docs, timestamps)` |
| [Hierarchical Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html) | `.hierarchical_topics(docs)` |
| [Guided Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/guided/guided.html) | `BERTopic(seed_topic_list=seed_topic_list)` |
+| [Zero-shot Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html) | `BERTopic(zeroshot_topic_list=zeroshot_topic_list)` |
| [Merge Multiple Models](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html) | `BERTopic.merge_models([topic_model_1, topic_model_2])` |
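The zero-shot row added above maps to the new `zeroshot_topic_list` and `zeroshot_min_similarity` parameters introduced later in this diff. A minimal usage sketch; the embedding model, candidate topic names, and threshold below are illustrative rather than part of the change:

```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]

# Candidate labels to match documents against (illustrative)
zeroshot_topic_list = ["Clustering", "Topic Modeling", "Large Language Models"]

topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",   # assumed sentence-transformers model
    min_topic_size=15,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=.85,          # below this similarity, documents are clustered as usual
)
topics, probs = topic_model.fit_transform(docs)
```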
diff --git a/bertopic/__init__.py b/bertopic/__init__.py
index 533092fa..a8784ace 100644
--- a/bertopic/__init__.py
+++ b/bertopic/__init__.py
@@ -1,6 +1,6 @@
from bertopic._bertopic import BERTopic
-__version__ = "0.15.0"
+__version__ = "0.16.0"
__all__ = [
"BERTopic",
diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index bf4613f1..22b6c03c 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -24,7 +24,6 @@
from collections import defaultdict, Counter
from scipy.sparse import csr_matrix
from scipy.cluster import hierarchy as sch
-from scipy.spatial.distance import squareform
# Typing
import sys
@@ -91,7 +90,7 @@ class BERTopic:
topic_labels_ (Mapping[int, str]) : The default labels for each topic.
custom_labels_ (List[str]) : Custom labels for each topic.
topic_embeddings_ (np.ndarray) : The embeddings for each topic. It is calculated by taking the
- weighted average of word embeddings in a topic based on their c-TF-IDF values.
+ centroid embedding of each cluster.
representative_docs_ (Mapping[int, str]) : The representative documents for each topic.
Examples:
@@ -131,6 +130,8 @@ def __init__(self,
low_memory: bool = False,
calculate_probabilities: bool = False,
seed_topic_list: List[List[str]] = None,
+ zeroshot_topic_list: List[str] = None,
+ zeroshot_min_similarity: float = .7,
embedding_model=None,
umap_model: UMAP = None,
hdbscan_model: hdbscan.HDBSCAN = None,
@@ -147,7 +148,7 @@ def __init__(self,
supported languages see bertopic.backend.languages. Select
"multilingual" to load in the `paraphrase-multilingual-MiniLM-L12-v2`
sentence-transformers model that supports 50+ languages.
- NOTE: This is not used if `embedding_model` is used.
+ NOTE: This is not used if `embedding_model` is used.
top_n_words: The number of words per topic to extract. Setting this
too high can negatively impact topic embeddings as topics
are typically best represented by at most 10 words.
@@ -157,25 +158,32 @@ def __init__(self,
NOTE: This param will not be used if you pass in your own
CountVectorizer.
min_topic_size: The minimum size of the topic. Increasing this value will lead
- to a lower number of clusters/topics.
- NOTE: This param will not be used if you are not using HDBSCAN.
+ to a lower number of clusters/topics and vice versa.
+ It is the same parameter as `min_cluster_size` in HDBSCAN.
+                        NOTE: This param will not be used if you are using `hdbscan_model`.
nr_topics: Specifying the number of topics will reduce the initial
number of topics to the value specified. This reduction can take
a while as each reduction in topics (-1) activates a c-TF-IDF
calculation. If this is set to None, no reduction is applied. Use
"auto" to automatically reduce topics using HDBSCAN.
+ NOTE: Controlling the number of topics is best done by adjusting
+ `min_topic_size` first before adjusting this parameter.
low_memory: Sets UMAP low memory to True to make sure less memory is used.
NOTE: This is only used in UMAP. For example, if you use PCA instead of UMAP
this parameter will not be used.
calculate_probabilities: Calculate the probabilities of all topics
per document instead of the probability of the assigned
topic per document. This could slow down the extraction
- of topics if you have many documents (> 100_000).
+ of topics if you have many documents (> 100_000).
NOTE: If false you cannot use the corresponding
visualization method `visualize_probabilities`.
NOTE: This is an approximation of topic probabilities
as used in HDBSCAN and not an exact representation.
seed_topic_list: A list of seed words per topic to converge around
+ zeroshot_topic_list: A list of topic names to use for zero-shot classification
+ zeroshot_min_similarity: The minimum similarity between a zero-shot topic and
+ a document for assignment. The higher this value, the more
+ confident the model needs to be to assign a zero-shot topic to a document.
verbose: Changes the verbosity of the model, Set to True if you want
to track the stages of the model.
embedding_model: Use a custom embedding model.
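The updated notes above recommend tuning `min_topic_size` before resorting to `nr_topics`. A small sketch of that advice, with arbitrary values:

```python
from bertopic import BERTopic

# Fewer, larger topics: raise min_topic_size (HDBSCAN's min_cluster_size)
topic_model = BERTopic(min_topic_size=50)

# Only force a further reduction afterwards if needed; every merge step
# triggers another c-TF-IDF computation and can be slow
topic_model = BERTopic(min_topic_size=50, nr_topics="auto")
```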
@@ -196,14 +204,14 @@ def __init__(self,
`.fit` and `.predict` functions along with the `.labels_` variable.
vectorizer_model: Pass in a custom `CountVectorizer` instead of the default model.
ctfidf_model: Pass in a custom ClassTfidfTransformer instead of the default model.
- representation_model: Pass in a model that fine-tunes the topic representations
+ representation_model: Pass in a model that fine-tunes the topic representations
calculated through c-TF-IDF. Models from `bertopic.representation`
are supported.
"""
# Topic-based parameters
if top_n_words > 100:
- warnings.warn("Note that extracting more than 100 words from a sparse "
- "can slow down computation quite a bit.")
+            logger.warning("Note that extracting more than 100 words from a sparse matrix "
+ "can slow down computation quite a bit.")
self.top_n_words = top_n_words
self.min_topic_size = min_topic_size
@@ -212,6 +220,8 @@ def __init__(self,
self.calculate_probabilities = calculate_probabilities
self.verbose = verbose
self.seed_topic_list = seed_topic_list
+ self.zeroshot_topic_list = zeroshot_topic_list
+ self.zeroshot_min_similarity = zeroshot_min_similarity
# Embedding model
self.language = language if not embedding_model else None
@@ -259,6 +269,8 @@ def __init__(self,
if verbose:
logger.set_level("DEBUG")
+ else:
+ logger.set_level("WARNING")
def fit(self,
documents: List[str],
@@ -310,7 +322,7 @@ def fit_transform(self,
images: List[str] = None,
y: Union[List[int], np.ndarray] = None) -> Tuple[List[int],
Union[np.ndarray, None]]:
- """ Fit the models on a collection of documents, generate topics,
+ """ Fit the models on a collection of documents, generate topics,
and return the probabilities and topic per document.
Arguments:
@@ -369,21 +381,30 @@ def fit_transform(self,
# Extract embeddings
if embeddings is None:
+ logger.info("Embedding - Transforming documents to embeddings.")
self.embedding_model = select_backend(self.embedding_model,
language=self.language)
embeddings = self._extract_embeddings(documents.Document.values.tolist(),
images=images,
method="document",
verbose=self.verbose)
- logger.info("Transformed documents to Embeddings")
+ logger.info("Embedding - Completed \u2713")
else:
if self.embedding_model is not None:
self.embedding_model = select_backend(self.embedding_model,
language=self.language)
- # Reduce dimensionality
+ # Guided Topic Modeling
if self.seed_topic_list is not None and self.embedding_model is not None:
y, embeddings = self._guided_topic_modeling(embeddings)
+
+ # Zero-shot Topic Modeling
+ if self._is_zeroshot():
+ documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling(documents, embeddings)
+ if documents is None:
+ return self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings)
+
+ # Reduce dimensionality
umap_embeddings = self._reduce_dimensionality(embeddings, y)
# Cluster reduced embeddings
@@ -395,8 +416,8 @@ def fit_transform(self,
# Create documents from images if we have images only
if documents.Document.values[0] is None:
- custom_documents = self._images_to_text(documents, embeddings)
-
+ custom_documents = self._images_to_text(documents, embeddings)
+
# Extract topics by calculating c-TF-IDF
self._extract_topics(custom_documents, embeddings=embeddings)
self._create_topic_vectors(documents=documents, embeddings=embeddings)
@@ -408,8 +429,8 @@ def fit_transform(self,
# Save the top 3 most representative documents per topic
self._save_representative_docs(custom_documents)
else:
- # Extract topics by calculating c-TF-IDF
- self._extract_topics(documents, embeddings=embeddings)
+ # Extract topics by calculating c-TF-IDF
+ self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)
# Reduce topics
if self.nr_topics:
@@ -417,16 +438,20 @@ def fit_transform(self,
# Save the top 3 most representative documents per topic
self._save_representative_docs(documents)
-
+
# Resulting output
self.probabilities_ = self._map_probabilities(probabilities, original_topics=True)
predictions = documents.Topic.to_list()
+ # Combine Zero-shot with outliers
+ if self._is_zeroshot() and len(documents) != len(doc_ids):
+ predictions = self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings)
+
return predictions, self.probabilities_
def transform(self,
documents: Union[str, List[str]],
- embeddings: np.ndarray = None,
+ embeddings: np.ndarray = None,
images: List[str] = None) -> Tuple[List[int], np.ndarray]:
""" After having fit a model, use transform to predict new instances
@@ -482,7 +507,7 @@ def transform(self,
images=images,
method="document",
verbose=self.verbose)
-
+
# Check if an embedding model was found
if embeddings is None:
raise ValueError("No embedding model was found to embed the documents."
@@ -491,6 +516,7 @@ def transform(self,
# Transform without hdbscan_model and umap_model using only cosine similarity
elif type(self.hdbscan_model) == BaseCluster:
+ logger.info("Predicting topic assignments through cosine similarity of topic and document embeddings.")
sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_))
predictions = np.argmax(sim_matrix, axis=1) - self._outliers
@@ -501,21 +527,24 @@ def transform(self,
# Transform with full pipeline
else:
+ logger.info("Dimensionality - Reducing dimensionality of input embeddings.")
umap_embeddings = self.umap_model.transform(embeddings)
- logger.info("Reduced dimensionality")
+ logger.info("Dimensionality - Completed \u2713")
# Extract predictions and probabilities if it is a HDBSCAN-like model
+ logger.info("Clustering - Approximating new points with `hdbscan_model`")
if is_supported_hdbscan(self.hdbscan_model):
predictions, probabilities = hdbscan_delegator(self.hdbscan_model, "approximate_predict", umap_embeddings)
# Calculate probabilities
if self.calculate_probabilities:
+ logger.info("Probabilities - Start calculation of probabilities with HDBSCAN")
probabilities = hdbscan_delegator(self.hdbscan_model, "membership_vector", umap_embeddings)
- logger.info("Calculated probabilities with HDBSCAN")
+ logger.info("Probabilities - Completed \u2713")
else:
predictions = self.hdbscan_model.predict(umap_embeddings)
probabilities = None
- logger.info("Predicted clusters")
+ logger.info("Cluster - Completed \u2713")
# Map probabilities and predictions
probabilities = self._map_probabilities(probabilities, original_topics=True)
@@ -672,6 +701,7 @@ def topics_over_time(self,
docs: List[str],
timestamps: Union[List[str],
List[int]],
+ topics: List[int] = None,
nr_bins: int = None,
datetime_format: str = None,
evolution_tuning: bool = True,
@@ -697,6 +727,10 @@ def topics_over_time(self,
If it is a list of strings, then the datetime format will be automatically
inferred. If it is a list of ints, then the documents will be ordered by
ascending order.
+ topics: A list of topics where each topic is related to a document in `docs` and
+ a timestamp in `timestamps`. You can use this to apply topics_over_time on
+ a subset of the data. Make sure that `docs`, `timestamps`, and `topics`
+ all correspond to one another and have the same size.
nr_bins: The number of bins you want to create for the timestamps. The left interval will
be chosen as the timestamp. An additional column will be created with the
entire interval.
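Given the new `topics` argument documented above, `.topics_over_time` can now run on a subset of the corpus. A hedged sketch, assuming a fitted `topic_model` and parallel `docs`/`timestamps` lists:

```python
# Keep only the documents assigned to topics 0-9 (illustrative subset)
keep = [topic in range(10) for topic in topic_model.topics_]
sub_docs = [doc for doc, k in zip(docs, keep) if k]
sub_stamps = [ts for ts, k in zip(timestamps, keep) if k]
sub_topics = [topic for topic, k in zip(topic_model.topics_, keep) if k]

# docs, timestamps, and topics must stay aligned one-to-one
topics_over_time = topic_model.topics_over_time(sub_docs, sub_stamps,
                                                topics=sub_topics, nr_bins=20)
```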
@@ -729,7 +763,8 @@ def topics_over_time(self,
"""
check_is_fitted(self)
check_documents_type(docs)
- documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Timestamps": timestamps})
+ selected_topics = topics if topics else self.topics_
+ documents = pd.DataFrame({"Document": docs, "Topic": selected_topics, "Timestamps": timestamps})
global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False)
all_topics = sorted(list(documents.Topic.unique()))
@@ -749,9 +784,9 @@ def topics_over_time(self,
documents = documents.sort_values("Timestamps")
timestamps = documents.Timestamps.unique()
if len(timestamps) > 100:
- warnings.warn(f"There are more than 100 unique timestamps (i.e., {len(timestamps)}) "
- "which significantly slows down the application. Consider setting `nr_bins` "
- "to a value lower than 100 to speed up calculation. ")
+ logger.warning(f"There are more than 100 unique timestamps (i.e., {len(timestamps)}) "
+ "which significantly slows down the application. Consider setting `nr_bins` "
+ "to a value lower than 100 to speed up calculation. ")
# For each unique timestamp, create topic representations
topics_over_time = []
@@ -840,6 +875,7 @@ def topics_per_class(self,
topics_per_class = topic_model.topics_per_class(docs, classes)
```
"""
+ check_documents_type(docs)
documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Class": classes})
global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False)
@@ -928,6 +964,7 @@ def hierarchical_topics(self,
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)
```
"""
+ check_documents_type(docs)
if distance_function is None:
distance_function = lambda x: 1 - cosine_similarity(x)
@@ -1258,7 +1295,7 @@ def find_topics(self,
below 5 words.
Arguments:
- search_term: the term you want to use to search for topics.
+ search_term: the term you want to use to search for topics.
top_n: the number of topics to return
Returns:
@@ -1361,13 +1398,14 @@ def update_topics(self,
topic_model.update_topics(docs, my_updated_topics)
```
"""
+ check_documents_type(docs)
check_is_fitted(self)
if not n_gram_range:
n_gram_range = self.n_gram_range
if top_n_words > 100:
- warnings.warn("Note that extracting more than 100 words from a sparse "
- "can slow down computation quite a bit.")
+            logger.warning("Note that extracting more than 100 words from a sparse matrix "
+ "can slow down computation quite a bit.")
self.top_n_words = top_n_words
self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range)
self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()
@@ -1376,11 +1414,11 @@ def update_topics(self,
if topics is None:
topics = self.topics_
else:
- warnings.warn("Using a custom list of topic assignments may lead to errors if "
- "topic reduction techniques are used afterwards. Make sure that "
- "manually assigning topics is the last step in the pipeline."
- "Note that topic embeddings will also be created through weighted"
- "c-TF-IDF embeddings instead of centroid embeddings.")
+ logger.warning("Using a custom list of topic assignments may lead to errors if "
+ "topic reduction techniques are used afterwards. Make sure that "
+                           "manually assigning topics is the last step in the pipeline. "
+                           "Note that topic embeddings will also be created through weighted "
+ "c-TF-IDF embeddings instead of centroid embeddings.")
self._outliers = 1 if -1 in set(topics) else 0
# Extract words
@@ -1414,7 +1452,7 @@ def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]:
check_is_fitted(self)
if full:
- topic_representations = {"Main": self.topic_representations_}
+ topic_representations = {"Main": self.topic_representations_}
topic_representations.update(self.topic_aspects_)
return topic_representations
else:
@@ -1586,6 +1624,7 @@ def get_document_info(self,
metadata={"Topic_distribution": distributions})
```
"""
+ check_documents_type(docs)
if df is not None:
document_info = df.copy()
document_info["Document"] = docs
@@ -1898,7 +1937,7 @@ def merge_topics(self,
[1, 2, 3] will merge topics 1, 2 and 3
[[1, 2], [3, 4]] will merge topics 1 and 2, and
separately merge topics 3 and 4.
- images: A list of paths to the images used when calling either
+ images: A list of paths to the images used when calling either
`fit` or `fit_transform`
Examples:
@@ -1920,6 +1959,7 @@ def merge_topics(self,
```
"""
check_is_fitted(self)
+ check_documents_type(docs)
documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Image": images, "ID": range(len(docs))})
mapping = {topic: topic for topic in set(self.topics_)}
@@ -1933,15 +1973,15 @@ def merge_topics(self,
else:
raise ValueError("Make sure that `topics_to_merge` is either"
"a list of topics or a list of list of topics.")
-
+
# Track mappings and sizes of topics for merging topic embeddings
mappings = defaultdict(list)
for key, val in sorted(mapping.items()):
mappings[val].append(key)
- mappings = {topic_from:
+ mappings = {topic_from:
{"topics_to": topics_to,
- "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
- for topic_from, topics_to in mappings.items()}
+ "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
+ for topic_from, topics_to in mappings.items()}
# Update topics
documents.Topic = documents.Topic.map(mapping)
@@ -1971,7 +2011,7 @@ def reduce_topics(self,
Arguments:
docs: The docs you used when calling either `fit` or `fit_transform`
nr_topics: The number of topics you want reduced to
- images: A list of paths to the images used when calling either
+ images: A list of paths to the images used when calling either
`fit` or `fit_transform`
Updates:
@@ -1995,6 +2035,7 @@ def reduce_topics(self,
```
"""
check_is_fitted(self)
+ check_documents_type(docs)
self.nr_topics = nr_topics
documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Image": images, "ID": range(len(docs))})
@@ -2044,7 +2085,7 @@ def reduce_outliers(self,
Arguments:
documents: A list of documents for which we reduce or remove the outliers.
topics: The topics that correspond to the documents
- images: A list of paths to the images used when calling either
+ images: A list of paths to the images used when calling either
`fit` or `fit_transform`
strategy: The strategy used for reducing outliers.
Options:
@@ -2132,7 +2173,7 @@ def reduce_outliers(self,
elif strategy.lower() == "embeddings":
if self.embedding_model is None and embeddings is None:
raise ValueError("To use this strategy, you will need to pass a model to `embedding_model`"
- "when instantiating BERTopic.")
+ "when instantiating BERTopic.")
outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]
if images is not None:
outlier_docs = [images[index] for index in outlier_ids]
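For the `embeddings` strategy touched in this hunk, a usage sketch; it assumes `docs` and precomputed `embeddings` produced by the same embedding model that fitted `topic_model`:

```python
# Re-assign outliers (-1) to their closest topic based on document embeddings
new_topics = topic_model.reduce_outliers(docs, topic_model.topics_,
                                         strategy="embeddings",
                                         embeddings=embeddings)

# Optionally refresh the topic representations with the new assignments
topic_model.update_topics(docs, topics=new_topics)
```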
@@ -2170,8 +2211,11 @@ def visualize_topics(self,
Arguments:
topics: A selection of topics to visualize
+ Not to be confused with the topics that you get from `.fit_transform`.
+ For example, if you want to visualize only topics 1 through 5:
+ `topics = [1, 2, 3, 4, 5]`.
top_n_topics: Only select the top n most frequent topics
- custom_labels: Whether to use custom topic labels that were defined using
+ custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
title: Title of the plot.
width: The width of the figure.
@@ -2283,6 +2327,7 @@ def visualize_documents(self,
style="width:1000px; height: 800px; border: 0px;"">
"""
check_is_fitted(self)
+ check_documents_type(docs)
return plotting.visualize_documents(self,
docs=docs,
topics=topics,
@@ -2396,6 +2441,7 @@ def visualize_hierarchical_documents(self,
style="width:1000px; height: 770px; border: 0px;"">
"""
check_is_fitted(self)
+ check_documents_type(docs)
return plotting.visualize_hierarchical_documents(self,
docs=docs,
hierarchical_topics=hierarchical_topics,
@@ -2921,10 +2967,10 @@ def save(self,
safetensors.
"""
if serialization == "pickle":
- warnings.warn("When you use `pickle` to save/load a BERTopic model,"
- "please make sure that the environments in which you save"
- "and load the model are **exactly** the same. The version of BERTopic,"
- "its dependencies, and python need to remain the same.")
+            logger.warning("When you use `pickle` to save/load a BERTopic model, "
+                           "please make sure that the environments in which you save "
+                           "and load the model are **exactly** the same. The version of BERTopic, "
+ "its dependencies, and python need to remain the same.")
with open(path, 'wb') as file:
@@ -2949,17 +2995,17 @@ def save(self,
if save_embedding_model and hasattr(self.embedding_model, '_hf_model') and not isinstance(save_embedding_model, str):
save_embedding_model = self.embedding_model._hf_model
elif not save_embedding_model:
- warnings.warn("You are saving a BERTopic model without explicitly defining an embedding model."
- "If you are using a sentence-transformers model or a HuggingFace model supported"
- "by sentence-transformers, please save the model by using a pointer towards that model."
- "For example, `save_embedding_model=sentence-transformers/all-mpnet-base-v2`")
+            logger.warning("You are saving a BERTopic model without explicitly defining an embedding model. "
+                           "If you are using a sentence-transformers model or a HuggingFace model supported "
+                           "by sentence-transformers, please save the model by using a pointer towards that model. "
+                           "For example, `save_embedding_model=sentence-transformers/all-mpnet-base-v2`")
# Minimal
save_utils.save_hf(model=self, save_directory=save_directory, serialization=serialization)
save_utils.save_topics(model=self, path=save_directory / "topics.json")
save_utils.save_images(model=self, path=save_directory / "images")
save_utils.save_config(model=self, path=save_directory / 'config.json', embedding_model=save_embedding_model)
-
+
# Additional
if save_ctfidf:
save_utils.save_ctfidf(model=self, save_directory=save_directory, serialization=serialization)
@@ -3009,7 +3055,7 @@ def load(cls,
else:
raise ValueError("Make sure to either pass a valid directory or HF model.")
topic_model = _create_model_from_files(topics, params, tensors, ctfidf_tensors, ctfidf_config, images)
-
+
# Replace embedding model if one is specifically chosen
if embedding_model is not None and type(topic_model.embedding_model) == BaseEmbedder:
topic_model.embedding_model = select_backend(embedding_model)
@@ -3067,7 +3113,7 @@ def merge_models(cls, models, min_similarity: float = .7, embedding_model=None):
with TemporaryDirectory() as tmpdir:
# Save model weights and config.
- all_topics, all_params, all_tensors = [], [], []
+ all_topics, all_params, all_tensors = [], [], []
for index, model in enumerate(models):
model.save(tmpdir, serialization="pytorch")
topics, params, tensors, _, _, _ = save_utils.load_local_files(Path(tmpdir))
@@ -3086,7 +3132,6 @@ def merge_models(cls, models, min_similarity: float = .7, embedding_model=None):
# Calculate similarity matrix
sim_matrix = cosine_similarity(tensors, merged_tensors)
sims = np.max(sim_matrix, axis=1)
- min_similarity = 0.7
# Extract new topics
new_topics = sorted([index - selected_topics["_outliers"] for index, sim in enumerate(sims) if sim < min_similarity])
@@ -3094,6 +3139,7 @@ def merge_models(cls, models, min_similarity: float = .7, embedding_model=None):
# Merge Topic Representations
new_topics_dict = {}
+ new_topic_val = max_topic + 1
for index, new_topic in enumerate(new_topics):
new_topic_val = max_topic + index + 1
new_topics_dict[new_topic] = new_topic_val
@@ -3108,25 +3154,25 @@ def merge_models(cls, models, min_similarity: float = .7, embedding_model=None):
merged_tensors = np.vstack([merged_tensors, new_tensors])
# Topic Mapper
- merged_topics["topic_mapper"] = TopicMapper(list(range(-1, new_topic_val+1, 1)))
+ merged_topics["topic_mapper"] = TopicMapper(list(range(-1, new_topic_val+1, 1))).mappings_
# Find similar topics and re-assign those from the new models
sims_idx = np.argmax(sim_matrix, axis=1)
sims = np.max(sim_matrix, axis=1)
to_merge = {
- a- selected_topics["_outliers"]:
- b - merged_topics["_outliers"] for a, (b, val) in enumerate(zip(sims_idx, sims))
+ a - selected_topics["_outliers"]:
+ b - merged_topics["_outliers"] for a, (b, val) in enumerate(zip(sims_idx, sims))
if val >= min_similarity
}
to_merge.update(new_topics_dict)
to_merge[-1] = -1
- topics = [to_merge[topic] for topic in selected_topics["topics"]]
+ topics = [to_merge[topic] for topic in selected_topics["topics"]]
merged_topics["topics"].extend(topics)
merged_topics["topic_sizes"] = dict(Counter(merged_topics["topics"]))
# Create a new model from the merged parameters
merged_tensors = {"topic_embeddings": torch.from_numpy(merged_tensors)}
- merged_model = _create_model_from_files(merged_topics, merged_params, merged_tensors, None, None, None)
+ merged_model = _create_model_from_files(merged_topics, merged_params, merged_tensors, None, None, None, warn_no_backend=False)
merged_model.embedding_model = models[0].embedding_model
# Replace embedding model if one is specifically chosen
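The merging logic above backs the public `BERTopic.merge_models` entry point. A short sketch, where the two corpora and the 0.9 threshold are placeholders:

```python
from bertopic import BERTopic

topic_model_1 = BERTopic(min_topic_size=20).fit(docs_part_1)   # placeholder corpora
topic_model_2 = BERTopic(min_topic_size=20).fit(docs_part_2)

# Topics in the second model that are less similar than `min_similarity` to any
# topic in the first model are appended as new topics; similar ones are re-assigned
merged_model = BERTopic.merge_models([topic_model_1, topic_model_2],
                                     min_similarity=0.9)
```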
@@ -3238,7 +3284,7 @@ def _extract_embeddings(self,
"""
if isinstance(documents, str):
documents = [documents]
-
+
if images is not None and hasattr(self.embedding_model, "embed_images"):
embeddings = self.embedding_model.embed(documents=documents, images=images, verbose=verbose)
elif method == "word":
@@ -3255,6 +3301,7 @@ def _extract_embeddings(self,
def _images_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame:
""" Convert images to text """
+ logger.info("Images - Converting images to text. This might take a while.")
if isinstance(self.representation_model, dict):
for tuner in self.representation_model.values():
if getattr(tuner, 'image_to_text_model', False):
@@ -3263,9 +3310,10 @@ def _images_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd
for tuner in self.representation_model:
if getattr(tuner, 'image_to_text_model', False):
documents = tuner.image_to_text(documents, embeddings)
- elif isinstance(self.representation_model, BaseRepresentation):
- if getattr(self.representation_model, 'image_to_text_model', False):
- documents = self.representation_model.image_to_text(documents, embeddings)
+ elif isinstance(self.representation_model, BaseRepresentation):
+ if getattr(self.representation_model, 'image_to_text_model', False):
+ documents = self.representation_model.image_to_text(documents, embeddings)
+ logger.info("Images - Completed \u2713")
return documents
def _map_predictions(self, predictions: List[int]) -> List[int]:
@@ -3291,6 +3339,7 @@ def _reduce_dimensionality(self,
Returns:
umap_embeddings: The reduced embeddings
"""
+ logger.info("Dimensionality - Fitting the dimensionality reduction algorithm")
# Partial fit
if partial_fit:
if hasattr(self.umap_model, "partial_fit"):
@@ -3305,12 +3354,11 @@ def _reduce_dimensionality(self,
y = np.array(y) if y is not None else None
self.umap_model.fit(embeddings, y=y)
except TypeError:
- logger.info("The dimensionality reduction algorithm did not contain the `y` parameter and"
- " therefore the `y` parameter was not used")
+
self.umap_model.fit(embeddings)
umap_embeddings = self.umap_model.transform(embeddings)
- logger.info("Reduced dimensionality")
+ logger.info("Dimensionality - Completed \u2713")
return np.nan_to_num(umap_embeddings)
def _cluster_embeddings(self,
@@ -3331,6 +3379,7 @@ def _cluster_embeddings(self,
and newly added Topics
probabilities: The distribution of probabilities
"""
+ logger.info("Cluster - Start clustering the reduced embeddings")
if partial_fit:
self.hdbscan_model = self.hdbscan_model.partial_fit(umap_embeddings)
labels = self.hdbscan_model.labels_
@@ -3364,9 +3413,171 @@ def _cluster_embeddings(self,
if not partial_fit:
self.topic_mapper_ = TopicMapper(self.topics_)
- logger.info("Clustered reduced embeddings")
+ logger.info("Cluster - Completed \u2713")
return documents, probabilities
+ def _zeroshot_topic_modeling(self, documents: pd.DataFrame, embeddings: np.ndarray) -> Tuple[pd.DataFrame, np.array,
+ pd.DataFrame, np.array]:
+ """ Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list
+
+        We transform the topics in `self.zeroshot_topic_list` to embeddings and
+ through cosine similarity compare them with the document embeddings.
+ If they pass the `self.zeroshot_min_similarity` threshold, they are assigned.
+
+ Arguments:
+ documents: Dataframe with documents and their corresponding IDs
+ embeddings: The document embeddings
+
+ Returns:
+            documents: The leftover documents that were not assigned to any topic
+            embeddings: The leftover embeddings that were not assigned to any topic
+            assigned_documents: The documents that were assigned to a zero-shot topic
+            assigned_embeddings: The embeddings of the documents assigned to a zero-shot topic
+ """
+ logger.info("Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics")
+ # Similarity between document and zero-shot topic embeddings
+ zeroshot_embeddings = self._extract_embeddings(self.zeroshot_topic_list)
+ cosine_similarities = cosine_similarity(embeddings, zeroshot_embeddings)
+ assignment = np.argmax(cosine_similarities, 1)
+ assignment_vals = np.max(cosine_similarities, 1)
+ assigned_ids = [index for index, value in enumerate(assignment_vals) if value >= self.zeroshot_min_similarity]
+ non_assigned_ids = [index for index, value in enumerate(assignment_vals) if value < self.zeroshot_min_similarity]
+
+ # Assign topics
+ assigned_documents = documents.iloc[assigned_ids]
+ assigned_documents["Topic"] = [topic for topic in assignment[assigned_ids]]
+ assigned_documents["Old_ID"] = assigned_documents["ID"].copy()
+ assigned_documents["ID"] = range(len(assigned_documents))
+ assigned_embeddings = embeddings[assigned_ids]
+
+ # Select non-assigned topics to be clustered
+ documents = documents.iloc[non_assigned_ids]
+ documents["Old_ID"] = documents["ID"].copy()
+ documents["ID"] = range(len(documents))
+ embeddings = embeddings[non_assigned_ids]
+
+        # If all documents were assigned to a zero-shot topic
+ if len(non_assigned_ids) == 0:
+ return None, None, assigned_documents, assigned_embeddings
+ logger.info("Zeroshot Step 1 - Completed \u2713")
+ return documents, embeddings, assigned_documents, assigned_embeddings
+
+ def _is_zeroshot(self):
+ """ Check whether zero-shot topic modeling is possible
+
+ * There should be a cluster model used
+ * Embedding model is necessary to convert zero-shot topics to embeddings
+ * Zero-shot topics should be defined
+ """
+ if self.zeroshot_topic_list is not None and self.embedding_model is not None and type(self.hdbscan_model) != BaseCluster:
+ return True
+ return False
+
+ def _combine_zeroshot_topics(self,
+ documents: pd.DataFrame,
+ assigned_documents: pd.DataFrame,
+ embeddings: np.ndarray) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray]:
+ """ Combine the zero-shot topics with the clustered topics
+
+ There are three cases considered:
+ * Only zero-shot topics were found which will only return the zero-shot topic model
+ * Only clustered topics were found which will only return the clustered topic model
+ * Both zero-shot and clustered topics were found which will return a merged model
+ * This merged model is created using the `merge_models` function which will ignore
+ the underlying UMAP and HDBSCAN models
+
+ Arguments:
+ documents: Dataframe with documents and their corresponding IDs
+ assigned_documents: Dataframe with documents and their corresponding IDs
+ that were assigned to a zero-shot topic
+ embeddings: The document embeddings
+
+ Returns:
+ topics: The topics for each document
+ probabilities: The probabilities for each document
+ """
+ logger.info("Zeroshot Step 2 - Clustering documents that were not found in the zero-shot model...")
+
+ # Fit BERTopic without actually performing any clustering
+ docs = assigned_documents.Document.tolist()
+ y = assigned_documents.Topic.tolist()
+ empty_dimensionality_model = BaseDimensionalityReduction()
+ empty_cluster_model = BaseCluster()
+ zeroshot_model = BERTopic(
+ n_gram_range=self.n_gram_range,
+ low_memory=self.low_memory,
+ calculate_probabilities=self.calculate_probabilities,
+ embedding_model=self.embedding_model,
+ umap_model=empty_dimensionality_model,
+ hdbscan_model=empty_cluster_model,
+ vectorizer_model=self.vectorizer_model,
+ ctfidf_model=self.ctfidf_model,
+ representation_model=self.representation_model,
+ verbose=self.verbose
+ ).fit(docs, embeddings=embeddings, y=y)
+ logger.info("Zeroshot Step 2 - Completed \u2713")
+ logger.info("Zeroshot Step 3 - Combining clustered topics with the zeroshot model")
+
+ # Update model
+ self.umap_model = BaseDimensionalityReduction()
+ self.hdbscan_model = BaseCluster()
+
+ # Update topic label
+ assigned_topics = assigned_documents.groupby("Topic").first().reset_index()
+ indices, topics = assigned_topics.ID.values, assigned_topics.Topic.values
+ labels = [zeroshot_model.topic_labels_[zeroshot_model.topics_[index]] for index in indices]
+ labels = {label: self.zeroshot_topic_list[topic] for label, topic in zip(labels, topics)}
+
+ # If only zero-shot matches were found and clustering was not performed
+ if documents is None:
+ for topic in range(len(set(y))):
+ if zeroshot_model.topic_labels_.get(topic):
+ if labels.get(zeroshot_model.topic_labels_[topic]):
+ zeroshot_model.topic_labels_[topic] = labels[zeroshot_model.topic_labels_[topic]]
+ self.__dict__.clear()
+ self.__dict__.update(zeroshot_model.__dict__)
+ return self.topics_, self.probabilities_
+
+ # Merge the two topic models
+ merged_model = BERTopic.merge_models([zeroshot_model, self], min_similarity=1)
+
+ # Update topic labels and representative docs of the zero-shot model
+ for topic in range(len(set(y))):
+ if merged_model.topic_labels_.get(topic):
+ if labels.get(merged_model.topic_labels_[topic]):
+ label = labels[merged_model.topic_labels_[topic]]
+ merged_model.topic_labels_[topic] = label
+ merged_model.representative_docs_[topic] = zeroshot_model.representative_docs_[topic]
+
+ # Add representative docs of the clustered model
+ for topic in set(self.topics_):
+ merged_model.representative_docs_[topic + self._outliers + len(set(y))] = self.representative_docs_[topic]
+
+ if self._outliers and merged_model.topic_sizes_.get(-1):
+ merged_model.topic_sizes_[len(set(y))] = merged_model.topic_sizes_[-1]
+ del merged_model.topic_sizes_[-1]
+
+ # Update topic assignment by finding the documents with the
+ # correct updated topics
+ zeroshot_indices = list(assigned_documents.Old_ID.values)
+ zeroshot_topics = [self.zeroshot_topic_list[topic] for topic in assigned_documents.Topic.values]
+
+ cluster_indices = list(documents.Old_ID.values)
+ cluster_names = list(merged_model.topic_labels_.values())[len(set(y)):]
+ cluster_topics = [cluster_names[topic + self._outliers] for topic in documents.Topic.values]
+
+ df = pd.DataFrame({
+ "Indices": zeroshot_indices + cluster_indices,
+ "Label": zeroshot_topics + cluster_topics}
+ ).sort_values("Indices")
+ reverse_topic_labels = dict((v, k) for k, v in merged_model.topic_labels_.items())
+ df.Label = df.Label.map(reverse_topic_labels)
+ merged_model.topics_ = df.Label.values
+
+ # Update the class internally
+ self.__dict__.clear()
+ self.__dict__.update(merged_model.__dict__)
+ logger.info("Zeroshot Step 3 - Completed \u2713")
+ return self.topics_
+
def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.array]:
""" Apply Guided Topic Modeling
@@ -3388,6 +3599,7 @@ def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.
y: The labels for each seeded topic
embeddings: Updated embeddings
"""
+ logger.info("Guided - Find embeddings highly related to seeded topics.")
# Create embeddings from the seeded topics
seed_topic_list = [" ".join(seed_topic) for seed_topic in self.seed_topic_list]
seed_topic_embeddings = self._extract_embeddings(seed_topic_list, verbose=self.verbose)
@@ -3403,17 +3615,23 @@ def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.
for seed_topic in range(len(seed_topic_list)):
indices = [index for index, topic in enumerate(y) if topic == seed_topic]
embeddings[indices] = np.average([embeddings[indices], seed_topic_embeddings[seed_topic]], weights=[3, 1])
+ logger.info("Guided - Completed \u2713")
return y, embeddings
- def _extract_topics(self, documents: pd.DataFrame, embeddings: np.ndarray = None, mappings=None):
+ def _extract_topics(self, documents: pd.DataFrame, embeddings: np.ndarray = None, mappings=None, verbose: bool = False):
""" Extract topics from the clusters using a class-based TF-IDF
Arguments:
documents: Dataframe with documents and their corresponding IDs
+ embeddings: The document embeddings
+            mappings: The mappings from the original topics to the newly merged topics
+            verbose: Whether to log the process of extracting topics
Returns:
c_tf_idf: The resulting matrix giving a value (importance score) for each word per topic
"""
+ if verbose:
+ logger.info("Representation - Extracting topics from clusters using representation models.")
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)
self.topic_representations_ = self._extract_words_per_topic(words, documents)
@@ -3421,6 +3639,8 @@ def _extract_topics(self, documents: pd.DataFrame, embeddings: np.ndarray = None
self.topic_labels_ = {key: f"{key}_" + "_".join([word[0] for word in values[:4]])
for key, values in
self.topic_representations_.items()}
+ if verbose:
+ logger.info("Representation - Completed \u2713")
def _save_representative_docs(self, documents: pd.DataFrame):
""" Save the 3 most representative docs per topic
@@ -3431,13 +3651,15 @@ def _save_representative_docs(self, documents: pd.DataFrame):
Updates:
self.representative_docs_: Populate each topic with 3 representative docs
"""
- repr_docs, _, _, _= self._extract_representative_docs(self.c_tf_idf_,
- documents,
- self.topic_representations_,
- nr_samples=500,
- nr_repr_docs=3)
+ repr_docs, _, _, _ = self._extract_representative_docs(
+ self.c_tf_idf_,
+ documents,
+ self.topic_representations_,
+ nr_samples=500,
+ nr_repr_docs=3
+ )
self.representative_docs_ = repr_docs
-
+
def _extract_representative_docs(self,
c_tf_idf: csr_matrix,
documents: pd.DataFrame,
@@ -3488,7 +3710,7 @@ def _extract_representative_docs(self,
selection = documents_per_topic.loc[documents_per_topic.Topic == topic, :]
selected_docs = selection["Document"].values
selected_docs_ids = selection.index.tolist()
-
+
# Calculate similarity
nr_docs = nr_repr_docs if len(selected_docs) > nr_repr_docs else len(selected_docs)
bow = self.vectorizer_model.transform(selected_docs)
@@ -3503,7 +3725,7 @@ def _extract_representative_docs(self,
else:
indices = np.argpartition(sim_matrix.reshape(1, -1)[0], -nr_docs)[-nr_docs:]
docs = [selected_docs[index] for index in indices]
-
+
doc_ids = [selected_docs_ids[index] for index, doc in enumerate(selected_docs) if doc in docs]
repr_docs_ids.append(doc_ids)
repr_docs.extend(docs)
@@ -3515,14 +3737,14 @@ def _extract_representative_docs(self,
def _create_topic_vectors(self, documents: pd.DataFrame = None, embeddings: np.ndarray = None, mappings=None):
""" Creates embeddings per topics based on their topic representation
- As a default, topic vectors (topic embeddings) or created by taking
- the average of all document embeddings within a topic. If topics are
- merged, then a weighted average of topic embeddings is taken based on
+        As a default, topic vectors (topic embeddings) are created by taking
+        the average of all document embeddings within a topic. If topics are
+        merged, then a weighted average of topic embeddings is taken based on
the initial topic sizes.
- For the `.partial_fit` and `.update_topics` method, the average
+        For the `.partial_fit` and `.update_topics` methods, the average
of all document embeddings is not taken since those are not known.
- Instead, the weighted average of the embeddings of the top n words
+ Instead, the weighted average of the embeddings of the top n words
is taken for each topic. The weighting is done based on the c-TF-IDF
score. This will put more emphasis to words that represent a topic best.
"""
@@ -3569,9 +3791,11 @@ def _create_topic_vectors(self, documents: pd.DataFrame = None, embeddings: np.n
# Extract embeddings for all words in all topics
topic_words = [self.get_topic(topic) for topic in topic_list]
topic_words = [word[0] for topic in topic_words for word in topic]
- word_embeddings = self._extract_embeddings(topic_words,
- method="word",
- verbose=False)
+ word_embeddings = self._extract_embeddings(
+ topic_words,
+ method="word",
+ verbose=False
+ )
# Take the weighted average of word embeddings in a topic based on their c-TF-IDF value
# The embeddings var is a single numpy matrix and therefore slicing is necessary to
@@ -3620,11 +3844,16 @@ def _c_tf_idf(self,
else:
words = self.vectorizer_model.get_feature_names()
- if self.seed_topic_list:
+ multiplier = None
+ if self.ctfidf_model.seed_words and self.seed_topic_list:
+ seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]
+ multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words])
+ multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier, words)])
+ elif self.ctfidf_model.seed_words:
+ multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words])
+ elif self.seed_topic_list:
seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]
multiplier = np.array([1.2 if word in seed_topic_list else 1 for word in words])
- else:
- multiplier = None
if fit:
self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier)
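The branches above combine `seed_words`/`seed_multiplier` on the c-TF-IDF model with the older `seed_topic_list`. A sketch of the c-TF-IDF side, assuming `ClassTfidfTransformer` exposes the `seed_words` and `seed_multiplier` arguments referenced here; the words and multiplier are examples:

```python
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

# Boost the weight of chosen domain terms in every topic representation
ctfidf_model = ClassTfidfTransformer(seed_words=["agent", "robot", "llm"],
                                     seed_multiplier=2)
topic_model = BERTopic(ctfidf_model=ctfidf_model)
```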
@@ -3719,6 +3948,7 @@ def _reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
Returns:
documents: Updated dataframe with documents and the reduced number of Topics
"""
+ logger.info("Topic reduction - Reducing number of topics")
initial_nr_topics = len(self.get_topics())
if isinstance(self.nr_topics, int):
@@ -3729,7 +3959,7 @@ def _reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
else:
raise ValueError("nr_topics needs to be an int or 'auto'! ")
- logger.info(f"Reduced number of topics from {initial_nr_topics} to {len(self.get_topic_freq())}")
+ logger.info(f"Topic reduction - Reduced number of topics from {initial_nr_topics} to {len(self.get_topic_freq())}")
return documents
def _reduce_to_n_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
@@ -3764,10 +3994,10 @@ def _reduce_to_n_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
mappings = defaultdict(list)
for key, val in sorted(mapped_topics.items()):
mappings[val].append(key)
- mappings = {topic_from:
+ mappings = {topic_from:
{"topics_to": topics_to,
- "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
- for topic_from, topics_to in mappings.items()}
+ "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
+ for topic_from, topics_to in mappings.items()}
# Map topics
documents.Topic = new_topics
@@ -3815,10 +4045,10 @@ def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
mappings = defaultdict(list)
for key, val in sorted(mapped_topics.items()):
mappings[val].append(key)
- mappings = {topic_from:
+ mappings = {topic_from:
{"topics_to": topics_to,
- "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
- for topic_from, topics_to in mappings.items()}
+ "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
+ for topic_from, topics_to in mappings.items()}
# Update documents and topics
self.topic_mapper_.add_mappings(mapped_topics)
@@ -4073,7 +4303,8 @@ def _create_model_from_files(
tensors: Mapping[str, np.array],
ctfidf_tensors: Mapping[str, Any] = None,
ctfidf_config: Mapping[str, Any] = None,
- images: Mapping[int, Any] = None):
+ images: Mapping[int, Any] = None,
+ warn_no_backend: bool = True):
""" Create a BERTopic model from a variety of inputs
Arguments:
@@ -4084,6 +4315,8 @@ def _create_model_from_files(
tensors: The topic embeddings
ctfidf_tensors: The c-TF-IDF representations
ctfidf_config: The config for CountVectorizer and c-TF-IDF
+ images: The images per topic
+ warn_no_backend: Whether to warn the user if no backend is given
"""
from sentence_transformers import SentenceTransformer
params["n_gram_range"] = tuple(params["n_gram_range"])
@@ -4100,9 +4333,11 @@ def _create_model_from_files(
embedding_model = select_backend(SentenceTransformer(params['embedding_model']))
except:
embedding_model = BaseEmbedder()
- warnings.warn("You are loading a BERTopic model without explicitly defining an embedding model."
- "If you want to also load in an embedding model, make sure to use"
- "BERTopic.load(my_model, embedding_model=my_embedding_model).")
+
+ if warn_no_backend:
+            logger.warning("You are loading a BERTopic model without explicitly defining an embedding model. "
+                           "If you want to also load in an embedding model, make sure to use "
+ "BERTopic.load(my_model, embedding_model=my_embedding_model).")
if params.get("embedding_model") is not None:
del params['embedding_model']
diff --git a/bertopic/_utils.py b/bertopic/_utils.py
index 5e7dd523..c067c33b 100644
--- a/bertopic/_utils.py
+++ b/bertopic/_utils.py
@@ -1,4 +1,5 @@
import numpy as np
+import pandas as pd
import logging
from collections.abc import Iterable
from scipy.sparse import csr_matrix
@@ -13,7 +14,10 @@ def __init__(self, level):
self.logger.propagate = False
def info(self, message):
- self.logger.info("{}".format(message))
+ self.logger.info(f"{message}")
+
+ def warning(self, message):
+ self.logger.warning(f"WARNING: {message}")
def set_level(self, level):
levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
@@ -32,10 +36,11 @@ def _add_handler(self):
def check_documents_type(documents):
""" Check whether the input documents are indeed a list of strings """
- if isinstance(documents, Iterable) and not isinstance(documents, str):
+ if isinstance(documents, pd.DataFrame):
+ raise TypeError("Make sure to supply a list of strings, not a dataframe.")
+ elif isinstance(documents, Iterable) and not isinstance(documents, str):
if not any([isinstance(doc, str) for doc in documents]):
raise TypeError("Make sure that the iterable only contains strings.")
-
else:
raise TypeError("Make sure that the documents variable is an iterable containing strings only.")
@@ -94,15 +99,16 @@ def __getattr__(self, *args, **kwargs):
def __call__(self, *args, **kwargs):
raise ModuleNotFoundError(self.msg)
+
def validate_distance_matrix(X, n_samples):
""" Validate the distance matrix and convert it to a condensed distance matrix
if necessary.
- A valid distance matrix is either a square matrix of shape (n_samples, n_samples)
- with zeros on the diagonal and non-negative values or condensed distance matrix
- of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the
+ A valid distance matrix is either a square matrix of shape (n_samples, n_samples)
+ with zeros on the diagonal and non-negative values or condensed distance matrix
+ of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the
distance matrix.
-
+
Arguments:
X: Distance matrix to validate.
n_samples: Number of samples in the dataset.
@@ -118,26 +124,26 @@ def validate_distance_matrix(X, n_samples):
if len(s) == 1:
# check it has correct size
n = s[0]
- if n != (n_samples * (n_samples -1) / 2):
+ if n != (n_samples * (n_samples - 1) / 2):
raise ValueError("The condensed distance matrix must have "
- "shape (n*(n-1)/2,).")
+ "shape (n*(n-1)/2,).")
elif len(s) == 2:
# check it has correct size
if (s[0] != n_samples) or (s[1] != n_samples):
raise ValueError("The distance matrix must be of shape "
- "(n, n) where n is the number of samples.")
+ "(n, n) where n is the number of samples.")
# force zero diagonal and convert to condensed
np.fill_diagonal(X, 0)
X = squareform(X)
else:
raise ValueError("The distance matrix must be either a 1-D condensed "
- "distance matrix of shape (n*(n-1)/2,) or a "
- "2-D square distance matrix of shape (n, n)."
- "where n is the number of documents."
- "Got a distance matrix of shape %s" % str(s))
+ "distance matrix of shape (n*(n-1)/2,) or a "
+                         "2-D square distance matrix of shape (n, n), "
+                         "where n is the number of documents. "
+ "Got a distance matrix of shape %s" % str(s))
# Make sure its entries are non-negative
if np.any(X < 0):
raise ValueError("Distance matrix cannot contain negative values.")
- return X
\ No newline at end of file
+ return X
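As a quick illustration of the behaviour documented above, a sketch showing that `validate_distance_matrix` accepts either form and returns the condensed representation:

```python
import numpy as np
from scipy.spatial.distance import squareform
from bertopic._utils import validate_distance_matrix

square = np.array([[0., 1., 2., 3.],
                   [1., 0., 4., 5.],
                   [2., 4., 0., 6.],
                   [3., 5., 6., 0.]])

condensed = validate_distance_matrix(square, n_samples=4)   # (4, 4) -> (6,)
assert condensed.shape == (4 * 3 // 2,)

# An already condensed matrix of the right length passes through unchanged
assert np.allclose(validate_distance_matrix(squareform(square), n_samples=4), condensed)
```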
diff --git a/bertopic/backend/_cohere.py b/bertopic/backend/_cohere.py
index 3e71d4fb..54562ee7 100644
--- a/bertopic/backend/_cohere.py
+++ b/bertopic/backend/_cohere.py
@@ -1,14 +1,13 @@
import time
-import cohere
import numpy as np
from tqdm import tqdm
-from typing import List
+from typing import Any, List, Mapping
from bertopic.backend import BaseEmbedder
class CohereBackend(BaseEmbedder):
""" Cohere Embedding Model
-
+
Arguments:
client: A `cohere` client.
embedding_model: A Cohere model. Default is "large".
@@ -17,6 +16,9 @@ class CohereBackend(BaseEmbedder):
delay_in_seconds: If a `batch_size` is given, use this set
the delay in seconds between batches.
batch_size: The size of each batch.
+ embed_kwargs: Kwargs passed to `cohere.Client.embed`.
+ Can be used to define additional parameters
+ such as `input_type`
Examples:
@@ -27,17 +29,34 @@ class CohereBackend(BaseEmbedder):
client = cohere.Client("APIKEY")
cohere_model = CohereBackend(client)
```
+
+ If you want to specify `input_type`:
+
+ ```python
+ cohere_model = CohereBackend(
+ client,
+ embedding_model="embed-english-v3.0",
+ embed_kwargs={"input_type": "clustering"}
+ )
+ ```
"""
- def __init__(self,
+ def __init__(self,
client,
embedding_model: str = "large",
delay_in_seconds: float = None,
- batch_size: int = None):
+ batch_size: int = None,
+ embed_kwargs: Mapping[str, Any] = {}):
super().__init__()
self.client = client
self.embedding_model = embedding_model
self.delay_in_seconds = delay_in_seconds
self.batch_size = batch_size
+ self.embed_kwargs = embed_kwargs
+
+ if self.embed_kwargs.get("model"):
+ self.embedding_model = embed_kwargs.get("model")
+ else:
+ self.embed_kwargs["model"] = self.embedding_model
def embed(self,
documents: List[str],
@@ -57,19 +76,19 @@ def embed(self,
if self.batch_size is not None:
embeddings = []
for batch in tqdm(self._chunks(documents), disable=not verbose):
- response = self.client.embed(batch, model=self.embedding_model)
+ response = self.client.embed(batch, **self.embed_kwargs)
embeddings.extend(response.embeddings)
-
+
# Delay subsequent calls
if self.delay_in_seconds:
time.sleep(self.delay_in_seconds)
# Extract embeddings all at once
else:
- response = self.client.embed(documents, model=self.embedding_model)
+ response = self.client.embed(documents, **self.embed_kwargs)
embeddings = response.embeddings
return np.array(embeddings)
-
- def _chunks(self, documents):
+
+ def _chunks(self, documents):
for i in range(0, len(documents), self.batch_size):
- yield documents[i:i + self.batch_size]
\ No newline at end of file
+ yield documents[i:i + self.batch_size]
diff --git a/bertopic/backend/_openai.py b/bertopic/backend/_openai.py
index 0ef50cbc..f4b35eb9 100644
--- a/bertopic/backend/_openai.py
+++ b/bertopic/backend/_openai.py
@@ -2,20 +2,24 @@
import openai
import numpy as np
from tqdm import tqdm
-from typing import List
+from typing import List, Mapping, Any
from bertopic.backend import BaseEmbedder
class OpenAIBackend(BaseEmbedder):
""" OpenAI Embedding Model
-
+
Arguments:
- embedding_model: An OpenAI model. Default is
+        client: An `openai.OpenAI` client.
+        embedding_model: An OpenAI model. Defaults to "text-embedding-ada-002".
For an overview of models see:
https://platform.openai.com/docs/models/embeddings
delay_in_seconds: If a `batch_size` is given, use this set
the delay in seconds between batches.
batch_size: The size of each batch.
+        generator_kwargs: Kwargs passed to `client.embeddings.create`.
+ Can be used to define custom engines or
+ deployment_ids.
Examples:
@@ -23,18 +27,25 @@ class OpenAIBackend(BaseEmbedder):
import openai
from bertopic.backend import OpenAIBackend
- openai.api_key = MY_API_KEY
- openai_embedder = OpenAIBackend("text-embedding-ada-002")
+ client = openai.OpenAI(api_key="sk-...")
+ openai_embedder = OpenAIBackend(client, "text-embedding-ada-002")
```
"""
- def __init__(self,
+    def __init__(self,
+                 client: openai.OpenAI,
embedding_model: str = "text-embedding-ada-002",
delay_in_seconds: float = None,
- batch_size: int = None):
+ batch_size: int = None,
+ generator_kwargs: Mapping[str, Any] = {}):
super().__init__()
self.embedding_model = embedding_model
self.delay_in_seconds = delay_in_seconds
self.batch_size = batch_size
+        self.client = client
+        self.generator_kwargs = generator_kwargs
+
+ if self.generator_kwargs.get("model"):
+ self.embedding_model = generator_kwargs.get("model")
+ elif not self.generator_kwargs.get("engine"):
+ self.generator_kwargs["model"] = self.embedding_model
def embed(self,
documents: List[str],
@@ -54,7 +65,7 @@ def embed(self,
if self.batch_size is not None:
embeddings = []
for batch in tqdm(self._chunks(documents), disable=not verbose):
-            response = openai.Embedding.create(input=batch, model=self.embedding_model)
-            embeddings.extend([r["embedding"] for r in response["data"]])
+            response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
+            embeddings.extend([r.embedding for r in response.data])
# Delay subsequent calls
@@ -63,10 +74,10 @@ def embed(self,
# Extract embeddings all at once
else:
-            response = openai.Embedding.create(input=documents, model=self.embedding_model)
-            embeddings = [r["embedding"] for r in response["data"]]
+            response = self.client.embeddings.create(input=documents, **self.generator_kwargs)
+            embeddings = [r.embedding for r in response.data]
return np.array(embeddings)
-
- def _chunks(self, documents):
+
+ def _chunks(self, documents):
for i in range(0, len(documents), self.batch_size):
yield documents[i:i + self.batch_size]
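With the client-based constructor described in the docstring, the backend can be configured roughly as follows; the batch settings and `generator_kwargs` value are illustrative:

```python
import openai
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend

client = openai.OpenAI(api_key="sk-...")

# generator_kwargs are forwarded to the embeddings call; a "model" key in
# generator_kwargs would override the embedding_model argument
embedding_model = OpenAIBackend(client, "text-embedding-ada-002",
                                delay_in_seconds=1, batch_size=32,
                                generator_kwargs={"user": "bertopic-demo"})
topic_model = BERTopic(embedding_model=embedding_model)
```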
diff --git a/bertopic/representation/__init__.py b/bertopic/representation/__init__.py
index 2fdcfc66..3fd5ce63 100644
--- a/bertopic/representation/__init__.py
+++ b/bertopic/representation/__init__.py
@@ -4,6 +4,14 @@
from bertopic.representation._keybert import KeyBERTInspired
from bertopic.representation._mmr import MaximalMarginalRelevance
+
+# Llama CPP Generator
+try:
+ from bertopic.representation._llamacpp import LlamaCPP
+except ModuleNotFoundError:
+ msg = "`pip install llama-cpp-python` \n\n"
+ LlamaCPP = NotInstalled("llama.cpp", "llama-cpp-python", custom_msg=msg)
+
# Text Generation using transformers
try:
from bertopic.representation._textgeneration import TextGeneration
@@ -25,7 +33,7 @@
msg = "`pip install openai` \n\n"
OpenAI = NotInstalled("OpenAI", "openai", custom_msg=msg)
-# OpenAI Generator
+# LangChain Generator
try:
from bertopic.representation._langchain import LangChain
except ModuleNotFoundError:
@@ -45,7 +53,6 @@
VisualRepresentation = NotInstalled("a visual representation model", "vision")
-
__all__ = [
"BaseRepresentation",
"TextGeneration",
@@ -56,5 +63,6 @@
"Cohere",
"OpenAI",
"LangChain",
+ "LlamaCPP",
"VisualRepresentation"
]
diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py
index 6d002293..494cb3ac 100644
--- a/bertopic/representation/_cohere.py
+++ b/bertopic/representation/_cohere.py
@@ -44,7 +44,7 @@ class Cohere(BaseRepresentation):
https://docs.cohere.ai/docs
Arguments:
- client: A cohere.Client
+ client: A `cohere.Client`
model: Model to use within Cohere, defaults to `"xlarge"`.
prompt: The prompt to be used in the model. If no prompt is given,
`self.default_prompt_` is used instead.
diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py
new file mode 100644
index 00000000..a7cfbecf
--- /dev/null
+++ b/bertopic/representation/_llamacpp.py
@@ -0,0 +1,183 @@
+import pandas as pd
+from tqdm import tqdm
+from scipy.sparse import csr_matrix
+from llama_cpp import Llama
+from typing import Mapping, List, Tuple, Any, Union, Callable
+from bertopic.representation._base import BaseRepresentation
+from bertopic.representation._utils import truncate_document
+
+
+DEFAULT_PROMPT = """
+Q: I have a topic that contains the following documents:
+[DOCUMENTS]
+
+The topic is described by the following keywords: '[KEYWORDS]'.
+
+Based on the above information, can you give a short label of the topic?
+A: """
+
+
+class LlamaCPP(BaseRepresentation):
+ """ A llama.cpp implementation to use as a representation model.
+
+ Arguments:
+ model: Either a string pointing towards a local LLM or a
+ `llama_cpp.Llama` object.
+ prompt: The prompt to be used in the model. If no prompt is given,
+ `self.default_prompt_` is used instead.
+ NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
+ to decide where the keywords and documents need to be
+ inserted.
+ pipeline_kwargs: Kwargs that you can pass to the `llama_cpp.Llama`
+ when it is called such as `max_tokens` to be generated.
+        nr_docs: The number of documents to pass to the LLM if a prompt
+                 with the `"[DOCUMENTS]"` tag is used.
+        diversity: The diversity of documents to pass to the LLM.
+                   Accepts values between 0 and 1. Higher
+                   values result in passing more diverse documents
+                   whereas lower values pass more similar documents.
+ doc_length: The maximum length of each document. If a document is longer,
+ it will be truncated. If None, the entire document is passed.
+        tokenizer: The tokenizer used to split the document into segments,
+                   which are counted to determine the length of a document.
+ * If tokenizer is 'char', then the document is split up
+ into characters which are counted to adhere to `doc_length`
+                   * If tokenizer is 'whitespace', then the document is split up
+ into words separated by whitespaces. These words are counted
+ and truncated depending on `doc_length`
+ * If tokenizer is 'vectorizer', then the internal CountVectorizer
+ is used to tokenize the document. These tokens are counted
+                     and truncated depending on `doc_length`
+ * If tokenizer is a callable, then that callable is used to tokenize
+ the document. These tokens are counted and truncated depending
+ on `doc_length`
+
+ Usage:
+
+    To use llama.cpp, first download the LLM:
+
+ ```bash
+ wget https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q4_K_M.gguf
+ ```
+
+    Then, we can use the model with BERTopic in just a couple of lines:
+
+ ```python
+ from bertopic import BERTopic
+ from bertopic.representation import LlamaCPP
+
+ # Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha
+ representation_model = LlamaCPP("zephyr-7b-alpha.Q4_K_M.gguf")
+
+ # Create our BERTopic model
+ topic_model = BERTopic(representation_model=representation_model, verbose=True)
+ ```
+
+    If you want to have more control over the LLM's parameters, you can run it like so:
+
+ ```python
+ from bertopic import BERTopic
+ from bertopic.representation import LlamaCPP
+ from llama_cpp import Llama
+
+ # Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha
+ llm = Llama(model_path="zephyr-7b-alpha.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop="Q:")
+ representation_model = LlamaCPP(llm)
+
+ # Create our BERTopic model
+ topic_model = BERTopic(representation_model=representation_model, verbose=True)
+ ```
+ """
+ def __init__(self,
+ model: Union[str, Llama],
+ prompt: str = None,
+ pipeline_kwargs: Mapping[str, Any] = {},
+ nr_docs: int = 4,
+ diversity: float = None,
+ doc_length: int = None,
+ tokenizer: Union[str, Callable] = None
+ ):
+ if isinstance(model, str):
+ self.model = Llama(model_path=model, n_gpu_layers=-1, stop="Q:")
+ elif isinstance(model, Llama):
+ self.model = model
+ else:
+            raise ValueError("Make sure that the model that you "
+                             "pass is either a string referring to a "
+                             "local LLM or a `llama_cpp.Llama` object.")
+ self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
+ self.default_prompt_ = DEFAULT_PROMPT
+ self.pipeline_kwargs = pipeline_kwargs
+ self.nr_docs = nr_docs
+ self.diversity = diversity
+ self.doc_length = doc_length
+ self.tokenizer = tokenizer
+
+ self.prompts_ = []
+
+ def extract_topics(self,
+ topic_model,
+ documents: pd.DataFrame,
+ c_tf_idf: csr_matrix,
+ topics: Mapping[str, List[Tuple[str, float]]]
+ ) -> Mapping[str, List[Tuple[str, float]]]:
+ """ Extract topic representations and return a single label
+
+ Arguments:
+ topic_model: A BERTopic model
+            documents: All training documents, used to extract the representative documents per topic
+            c_tf_idf: The topic c-TF-IDF representations, used to extract the representative documents per topic
+ topics: The candidate topics as calculated with c-TF-IDF
+
+ Returns:
+ updated_topics: Updated topic representations
+ """
+        # Extract the top n representative documents per topic
+ repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
+ c_tf_idf,
+ documents,
+ topics,
+ 500,
+ self.nr_docs,
+ self.diversity
+ )
+
+ updated_topics = {}
+ for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):
+
+ # Prepare prompt
+ truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]
+ prompt = self._create_prompt(truncated_docs, topic, topics)
+ self.prompts_.append(prompt)
+
+ # Extract result from generator and use that as label
+ topic_description = self.model(prompt, **self.pipeline_kwargs)['choices']
+ topic_description = [(description["text"].replace(prompt, ""), 1) for description in topic_description]
+
+ if len(topic_description) < 10:
+ topic_description += [("", 0) for _ in range(10-len(topic_description))]
+
+ updated_topics[topic] = topic_description
+
+ return updated_topics
+
+ def _create_prompt(self, docs, topic, topics):
+ keywords = ", ".join(list(zip(*topics[topic]))[0])
+
+ # Use the default prompt and replace keywords
+ if self.prompt == DEFAULT_PROMPT:
+ prompt = self.prompt.replace("[KEYWORDS]", keywords)
+
+ # Use a prompt that leverages either keywords or documents in
+ # a custom location
+ else:
+ prompt = self.prompt
+ if "[KEYWORDS]" in prompt:
+ prompt = prompt.replace("[KEYWORDS]", keywords)
+ if "[DOCUMENTS]" in prompt:
+ to_replace = ""
+ for doc in docs:
+ to_replace += f"- {doc}\n"
+ prompt = prompt.replace("[DOCUMENTS]", to_replace)
+
+ return prompt
diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index fd66ded3..cdeb49dd 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -59,6 +59,7 @@ class OpenAI(BaseRepresentation):
https://platform.openai.com/docs/models
Arguments:
+        client: An `openai.OpenAI` client
model: Model to use within OpenAI, defaults to `"text-ada-001"`.
NOTE: If a `gpt-3.5-turbo` model is used, make sure to set
`chat` to True.
@@ -69,26 +70,26 @@ class OpenAI(BaseRepresentation):
NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
to decide where the keywords and documents need to be
inserted.
- delay_in_seconds: The delay in seconds between consecutive prompts
- in order to prevent RateLimitErrors.
- exponential_backoff: Retry requests with a random exponential backoff.
- A short sleep is used when a rate limit error is hit,
+ delay_in_seconds: The delay in seconds between consecutive prompts
+ in order to prevent RateLimitErrors.
+ exponential_backoff: Retry requests with a random exponential backoff.
+ A short sleep is used when a rate limit error is hit,
then the requests is retried. Increase the sleep length
- if errors are hit until 10 unsuccesfull requests.
+ if errors are hit until 10 unsuccesfull requests.
If True, overrides `delay_in_seconds`.
chat: Set this to True if a GPT-3.5 model is used.
See: https://platform.openai.com/docs/models/gpt-3-5
nr_docs: The number of documents to pass to OpenAI if a prompt
with the `["DOCUMENTS"]` tag is used.
diversity: The diversity of documents to pass to OpenAI.
- Accepts values between 0 and 1. A higher
+ Accepts values between 0 and 1. A higher
values results in passing more diverse documents
whereas lower values passes more similar documents.
doc_length: The maximum length of each document. If a document is longer,
it will be truncated. If None, the entire document is passed.
tokenizer: The tokenizer used to calculate to split the document into segments
- used to count the length of a document.
- * If tokenizer is 'char', then the document is split up
+ used to count the length of a document.
+ * If tokenizer is 'char', then the document is split up
into characters which are counted to adhere to `doc_length`
* If tokenizer is 'whitespace', the document is split up
into words separated by whitespaces. These words are counted
@@ -114,7 +115,8 @@ class OpenAI(BaseRepresentation):
from bertopic import BERTopic
# Create your representation model
- representation_model = OpenAI(delay_in_seconds=5)
+ client = openai.OpenAI(api_key=MY_API_KEY)
+ representation_model = OpenAI(client, delay_in_seconds=5)
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
@@ -124,16 +126,17 @@ class OpenAI(BaseRepresentation):
```python
prompt = "I have the following documents: [DOCUMENTS] \nThese documents are about the following topic: '"
- representation_model = OpenAI(prompt=prompt, delay_in_seconds=5)
+ representation_model = OpenAI(client, prompt=prompt, delay_in_seconds=5)
```
If you want to use OpenAI's ChatGPT model:
```python
- representation_model = OpenAI(model="gpt-3.5-turbo", delay_in_seconds=10, chat=True)
+ representation_model = OpenAI(client, model="gpt-3.5-turbo", delay_in_seconds=10, chat=True)
```
"""
def __init__(self,
+ client,
model: str = "text-ada-001",
prompt: str = None,
generator_kwargs: Mapping[str, Any] = {},
@@ -145,6 +148,7 @@ def __init__(self,
doc_length: int = None,
tokenizer: Union[str, Callable] = None
):
+ self.client = client
self.model = model
if prompt is None:
@@ -208,16 +212,22 @@ def extract_topics(self,
]
kwargs = {"model": self.model, "messages": messages, **self.generator_kwargs}
if self.exponential_backoff:
- response = chat_completions_with_backoff(**kwargs)
+ response = chat_completions_with_backoff(self.client, **kwargs)
else:
- response = openai.ChatCompletion.create(**kwargs)
- label = response["choices"][0]["message"]["content"].strip().replace("topic: ", "")
+ response = self.client.chat.completions.create(**kwargs)
+
+ # Check whether content was actually generated
+ # Adresses #1570 for potential issues with OpenAI's content filter
+ if hasattr(response.choices[0].message, "content"):
+ label = response.choices[0].message.content.strip().replace("topic: ", "")
+ else:
+ label = "No label returned"
else:
if self.exponential_backoff:
- response = completions_with_backoff(model=self.model, prompt=prompt, **self.generator_kwargs)
+ response = completions_with_backoff(self.client, model=self.model, prompt=prompt, **self.generator_kwargs)
else:
- response = openai.Completion.create(model=self.model, prompt=prompt, **self.generator_kwargs)
- label = response["choices"][0]["text"].strip()
+ response = self.client.completions.create(model=self.model, prompt=prompt, **self.generator_kwargs)
+                label = response.choices[0].text.strip()
updated_topics[topic] = [(label, 1)]
@@ -251,21 +261,19 @@ def _replace_documents(prompt, docs):
return prompt
-def completions_with_backoff(**kwargs):
+def completions_with_backoff(client, **kwargs):
return retry_with_exponential_backoff(
- openai.Completion.create,
+ client.completions.create,
errors=(
- openai.error.RateLimitError,
- openai.error.ServiceUnavailableError,
+ openai.RateLimitError,
),
)(**kwargs)
-def chat_completions_with_backoff(**kwargs):
+def chat_completions_with_backoff(client, **kwargs):
return retry_with_exponential_backoff(
- openai.ChatCompletion.create,
+ client.chat.completions.create,
errors=(
- openai.error.RateLimitError,
- openai.error.ServiceUnavailableError,
+ openai.RateLimitError,
),
)(**kwargs)
diff --git a/bertopic/vectorizers/_ctfidf.py b/bertopic/vectorizers/_ctfidf.py
index 1e0151a0..4ae16a34 100644
--- a/bertopic/vectorizers/_ctfidf.py
+++ b/bertopic/vectorizers/_ctfidf.py
@@ -1,3 +1,4 @@
+from typing import List
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
from sklearn.utils import check_array
@@ -26,6 +27,11 @@ class ClassTfidfTransformer(TfidfTransformer):
`log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))`
reduce_frequent_words: Takes the square root of the bag-of-words after normalizing the matrix.
Helps to reduce the impact of words that appear too frequently.
+ seed_words: Specific words that will have their idf value increased by
+ the value of `seed_multiplier`.
+ NOTE: This will only increase the value of words that have an exact match.
+ seed_multiplier: The value with which the idf values of the words in `seed_words`
+ are multiplied.
Examples:
@@ -33,9 +39,16 @@ class ClassTfidfTransformer(TfidfTransformer):
transformer = ClassTfidfTransformer()
```
"""
- def __init__(self, bm25_weighting: bool = False, reduce_frequent_words: bool = False):
+ def __init__(self,
+ bm25_weighting: bool = False,
+ reduce_frequent_words: bool = False,
+ seed_words: List[str] = None,
+                 seed_multiplier: float = 2
+ ):
self.bm25_weighting = bm25_weighting
self.reduce_frequent_words = reduce_frequent_words
+ self.seed_words = seed_words
+ self.seed_multiplier = seed_multiplier
super(ClassTfidfTransformer, self).__init__()
def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):
diff --git a/docs/algorithm/algorithm.md b/docs/algorithm/algorithm.md
index 47858a3c..6a9a2b62 100644
--- a/docs/algorithm/algorithm.md
+++ b/docs/algorithm/algorithm.md
@@ -21,6 +21,16 @@ As a result, BERTopic is quite modular and can maintain its quality of topic gen
+There is extensive documentation on how to use each step in this pipeline:
+
+1. [Embeddings](../getting_started/embeddings/embeddings.html)
+2. [Dimensionality Reduction](../getting_started/dim_reduction/dim_reduction.html)
+3. [Clustering](../getting_started/clustering/clustering.html)
+4. [Tokenizer](../getting_started/vectorizers/vectorizers.html)
+5. [Weighting Scheme](../getting_started/ctfidf/ctfidf.html)
+6. [Representation Tuning](../getting_started/representation/representation.html)
+ * [Large Language Models (LLM)](../getting_started/representation/llm.html)
+
## **Code Overview**
After going through the visual overview, this code overview demonstrates the algorithm using BERTopic. An advantage of using BERTopic is each major step in its algorithm can be explicitly defined, thereby making the process not only transparent but also more intuitive.
diff --git a/docs/changelog.md b/docs/changelog.md
index 60c7acc0..98be9f72 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -5,6 +5,229 @@ hide:
# Changelog
+## **Version 0.16.0**
+*Release date: 26 November, 2023*
+
+
Highlights:
+
+* Merge pre-trained BERTopic models with [**`.merge_models`**](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html)
+ * Combine models with different representations together!
+ * Use this for *incremental/online topic modeling* to detect new incoming topics
+ * First step towards *federated learning* with BERTopic
+* [**Zero-shot**](https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html) Topic Modeling
+ * Use a predefined list of topics to assign documents
+ * If needed, allows for further exploration of undefined topics
+* [**Seed (domain-specific) words**](https://maartengr.github.io/BERTopic/getting_started/seed_words/seed_words.html) with `ClassTfidfTransformer`
+ * Make sure selected words are more likely to end up in the representation without influencing the clustering process
+* Added params to [**truncate documents**](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#truncating-documents) to length when using LLMs
+* Added [**LlamaCPP**](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#llamacpp) as a representation model
+* LangChain: Support for **LCEL Runnables** by [@joshuasundance-swca](/~https://github.com/joshuasundance-swca) in [#1586](/~https://github.com/MaartenGr/BERTopic/pull/1586)
+* Added `topics` parameter to `.topics_over_time` to select a subset of documents and topics
+* Documentation:
+ * [Best practices Guide](https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html)
+ * [Llama 2 Tutorial](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#llama-2)
+ * [Zephyr Tutorial](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#zephyr-mistral-7b)
+ * Improved [embeddings guidance](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html#sentence-transformers) (MTEB)
+ * Improved logging throughout the package
+* Added support for **Cohere's Embed v3**:
+```python
+cohere_model = CohereBackend(
+ client,
+ embedding_model="embed-english-v3.0",
+ embed_kwargs={"input_type": "clustering"}
+)
+```
+
+
Fixes:
+
+* Fixed n-gram Keywords need delimiting in OpenAI() [#1546](/~https://github.com/MaartenGr/BERTopic/issues/1546)
+* Fixed OpenAI v1.0 issues [#1629](/~https://github.com/MaartenGr/BERTopic/issues/1629)
+* Improved documentation/logging to address [#1589](/~https://github.com/MaartenGr/BERTopic/issues/1589), [#1591](/~https://github.com/MaartenGr/BERTopic/issues/1591)
+* Fixed engine support for Azure OpenAI embeddings [#1577](/~https://github.com/MaartenGr/BERTopic/issues/1487)
+* Fixed OpenAI Representation: KeyError: 'content' [#1570](/~https://github.com/MaartenGr/BERTopic/issues/1570)
+* Fixed Loading topic model with multiple topic aspects changes their format [#1487](/~https://github.com/MaartenGr/BERTopic/issues/1487)
+* Fix expired link in algorithm.md by [@burugaria7](/~https://github.com/burugaria7) in [#1396](/~https://github.com/MaartenGr/BERTopic/pull/1396)
+* Fix guided topic modeling in cuML's UMAP by [@stevetracvc](/~https://github.com/stevetracvc) in [#1326](/~https://github.com/MaartenGr/BERTopic/pull/1326)
+* OpenAI: Allow retrying on Service Unavailable errors by [@agamble](/~https://github.com/agamble) in [#1407](/~https://github.com/MaartenGr/BERTopic/pull/1407)
+* Fixed parameter naming for HDBSCAN in best practices by [@rnckp](/~https://github.com/rnckp) in [#1408](/~https://github.com/MaartenGr/BERTopic/pull/1408)
+* Fixed typo in tips_and_tricks.md by [@aronnoordhoek](/~https://github.com/aronnoordhoek) in [#1446](/~https://github.com/MaartenGr/BERTopic/pull/1446)
+* Fix typos in documentation by [@bobchien](/~https://github.com/bobchien) in [#1481](/~https://github.com/MaartenGr/BERTopic/pull/1481)
+* Fix IndexError when all outliers are removed by reduce_outliers by [@Aratako](/~https://github.com/Aratako) in [#1466](/~https://github.com/MaartenGr/BERTopic/pull/1466)
+* Fix TypeError on reduce_outliers "probabilities" by [@ananaphasia](/~https://github.com/ananaphasia) in [#1501](/~https://github.com/MaartenGr/BERTopic/pull/1501)
+* Add new line to fix markdown bullet point formatting by [@saeedesmaili](/~https://github.com/saeedesmaili) in [#1519](/~https://github.com/MaartenGr/BERTopic/pull/1519)
+* Update typo in topicrepresentation.md by [@oliviercaron](/~https://github.com/oliviercaron) in [#1537](/~https://github.com/MaartenGr/BERTopic/pull/1537)
+* Fix typo in FAQ by [@sandijou](/~https://github.com/sandijou) in [#1542](/~https://github.com/MaartenGr/BERTopic/pull/1542)
+* Fixed typos in best practices documentation by [@poomkusa](/~https://github.com/poomkusa) in [#1557](/~https://github.com/MaartenGr/BERTopic/pull/1557)
+* Correct TopicMapper doc example by [@chrisji](/~https://github.com/chrisji) in [#1637](/~https://github.com/MaartenGr/BERTopic/pull/1637)
+* Fix typing in hierarchical_topics by [@dschwalm](/~https://github.com/dschwalm) in [#1364](/~https://github.com/MaartenGr/BERTopic/pull/1364)
+* Fixed typing issue with threshold parameter in reduce_outliers by [@dschwalm](/~https://github.com/dschwalm) in [#1380](/~https://github.com/MaartenGr/BERTopic/pull/1380)
+* Fix several typos by [@mertyyanik](/~https://github.com/mertyyanik) in [#1307](/~https://github.com/MaartenGr/BERTopic/pull/1307)
+* Fix inconsistent naming by [@rolanderdei](/~https://github.com/rolanderdei) in [#1073](/~https://github.com/MaartenGr/BERTopic/pull/1073)
+
+
+
+The new `.merge_models` feature allows for any number of fitted BERTopic models to be merged. Doing so allows for a number of use cases:
+
+* **Incremental topic modeling** -- Continuously merge models together to detect whether new topics have appeared
+* **Federated Learning** - Train BERTopic models on different clients and combine them on a central server
+* **Minimal compute** - We can essentially batch the training process into multiple instances to reduce compute
+* **Different datasets** - When you have different datasets that you want to train separately on, for example with different languages, you can train each model separately and join them after training
+
+To demonstrate merging different topic models with BERTopic, we use the ArXiv paper abstracts to see which topics they generally contain.
+
+First, we train three separate models on different parts of the data:
+
+```python
+from umap import UMAP
+from bertopic import BERTopic
+from datasets import load_dataset
+
+dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]
+
+# Extract abstracts to train on, split into three parts
+abstracts_1 = dataset["abstract"][:5_000]
+abstracts_2 = dataset["abstract"][5_000:10_000]
+abstracts_3 = dataset["abstract"][10_000:15_000]
+
+# Create topic models
+umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
+topic_model_1 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_1)
+topic_model_2 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_2)
+topic_model_3 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_3)
+```
+
+Then, we can combine all three models into one with `.merge_models`:
+
+```python
+# Combine all models into one
+merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])
+```
+
+
+Zero-shot Topic Modeling is a technique that allows you to find predefined topics in large amounts of documents. This method allows you to not only find those specific topics but also create new topics for documents that would not fit with your predefined topics.
+This allows for extensive flexibility as there are three scenarios to explore.
+
+* No zeroshot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run.
+* Only zeroshot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics.
+* Both zeroshot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found.
+
+
+
+In order to use zero-shot BERTopic, we create a list of topics that we want to assign to our documents. However,
+there may be several other topics that we know should be in the documents. The dataset that we use is a small subset of ArXiv papers.
+We know the data and believe there to be at least the following topics: *clustering*, *topic modeling*, and *large language models*.
+However, we are not sure whether other topics exist and want to explore those.
+
+Using this feature is straightforward:
+
+```python
+from datasets import load_dataset
+
+from bertopic import BERTopic
+from bertopic.representation import KeyBERTInspired
+
+# We select a subsample of 5000 abstracts from ArXiv
+dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]
+docs = dataset["abstract"][:5_000]
+
+# We define a number of topics that we know are in the documents
+zeroshot_topic_list = ["Clustering", "Topic Modeling", "Large Language Models"]
+
+# We fit our model using the zero-shot topics
+# and we define a minimum similarity. For each document,
+# if the similarity does not exceed that value, it will be used
+# for clustering instead.
+topic_model = BERTopic(
+ embedding_model="thenlper/gte-small",
+ min_topic_size=15,
+ zeroshot_topic_list=zeroshot_topic_list,
+ zeroshot_min_similarity=.85,
+ representation_model=KeyBERTInspired()
+)
+topics, _ = topic_model.fit_transform(docs)
+```
+
+When we run `topic_model.get_topic_info()`, we see something like this:
+
+
+
+
+
+
+When performing Topic Modeling, you are often faced with data that you are familiar with to a certain extent or that speaks a very specific language. In those cases, topic modeling techniques might have difficulties capturing and representing the semantic nature of domain specific abbreviations, slang, short forms, acronyms, etc. For example, the *"TNM"* classification is a method for identifying the stage of most cancers. The word *"TNM"* is an abbreviation and might not be correctly captured in generic embedding models.
+
+To make sure that certain domain specific words are weighted higher and are more often used in topic representations, you can set any number of `seed_words` in the `bertopic.vectorizers.ClassTfidfTransformer`. To do so, let's take a look at an example. We have a dataset of article abstracts and want to perform some topic modeling. Since we might be familiar with the data, there are certain words that we know should be generally important. Let's assume that we have in-depth knowledge about reinforcement learning and know that words like "agent" and "robot" should be important in such a topic were it to be found. Using the `ClassTfidfTransformer`, we can define those `seed_words` and also choose by how much their values are multiplied.
+
+The full example is then as follows:
+
+```python
+from umap import UMAP
+from datasets import load_dataset
+from bertopic import BERTopic
+from bertopic.vectorizers import ClassTfidfTransformer
+
+# Let's take a subset of ArXiv abstracts as the training data
+dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]
+abstracts = dataset["abstract"][:5_000]
+
+# For illustration purposes, we make sure the output is fixed when running this code multiple times
+umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
+
+# We can choose any number of seed words for which we want their representation
+# to be strengthened. We increase the importance of these words as we want them to be more
+# likely to end up in the topic representations.
+ctfidf_model = ClassTfidfTransformer(
+ seed_words=["agent", "robot", "behavior", "policies", "environment"],
+ seed_multiplier=2
+)
+
+# We run the topic model with the seeded words
+topic_model = BERTopic(
+ umap_model=umap_model,
+ min_topic_size=15,
+ ctfidf_model=ctfidf_model,
+).fit(abstracts)
+```
+
+
+
+When using LLMs with BERTopic, we can truncate the input documents in `[DOCUMENTS]` in order to reduce the number of tokens that we have in our input prompt. To do so, all text generation modules have two parameters that we can tweak:
+
+* `doc_length` - The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed.
+* `tokenizer` - The tokenizer used to split the document into segments, which are counted to determine the length of a document.
+ * Options include `'char'`, `'whitespace'`, `'vectorizer'`, and a callable
+
+This means that the definition of `doc_length` changes depending on what constitutes a token in the `tokenizer` parameter. If a token is a character, then `doc_length` refers to the max length in characters. If a token is a word, then `doc_length` refers to the max length in words.
+
+Let's illustrate this with an example. In the code below, we will use [`tiktoken`](/~https://github.com/openai/tiktoken) to count the number of tokens in each document and limit them to 100 tokens. All documents that have more than 100 tokens will be truncated.
+
+We use `bertopic.representation.OpenAI` to represent our topics with nicely written labels. We specify that documents that we put in the prompt cannot exceed 100 tokens each. Since we will put 4 documents in the prompt, they will total roughly 400 tokens:
+
+```python
+import openai
+import tiktoken
+from bertopic.representation import OpenAI
+from bertopic import BERTopic
+
+# Tokenizer
+tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
+
+# Create your representation model
+client = openai.OpenAI(api_key=MY_API_KEY)
+representation_model = OpenAI(
+    client,
+ model="gpt-3.5-turbo",
+ delay_in_seconds=2,
+ chat=True,
+ nr_docs=4,
+ doc_length=100,
+ tokenizer=tokenizer
+)
+
+# Use the representation model in BERTopic on top of the default pipeline
+topic_model = BERTopic(representation_model=representation_model)
+```
+
## **Version 0.15.0**
*Release date: 29 May, 2023*
diff --git a/docs/getting_started/best_practices/best_practices.md b/docs/getting_started/best_practices/best_practices.md
index 3b6a86fa..dd963a7d 100644
--- a/docs/getting_started/best_practices/best_practices.md
+++ b/docs/getting_started/best_practices/best_practices.md
@@ -127,7 +127,7 @@ pos_model = PartOfSpeech("en_core_web_sm")
mmr_model = MaximalMarginalRelevance(diversity=0.3)
# GPT-3.5
-openai.api_key = "sk-..."
+client = openai.OpenAI(api_key="sk-...")
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
@@ -136,7 +136,7 @@ The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic:
"""
-openai_model = OpenAI(model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)
+openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)
# All representation models
representation_model = {
diff --git a/docs/getting_started/embeddings/embeddings.md b/docs/getting_started/embeddings/embeddings.md
index de9867e6..06587bd8 100644
--- a/docs/getting_started/embeddings/embeddings.md
+++ b/docs/getting_started/embeddings/embeddings.md
@@ -195,8 +195,8 @@ to be used in our topic model:
import openai
from bertopic.backend import OpenAIBackend
-openai.api_key = MY_API_KEY
-embedding_model = OpenAIBackend("text-embedding-ada-002")
+client = openai.OpenAI(api_key="sk-...")
+embedding_model = OpenAIBackend(client, "text-embedding-ada-002")
topic_model = BERTopic(embedding_model=embedding_model)
```
diff --git a/docs/getting_started/online/online.md b/docs/getting_started/online/online.md
index deb7451b..d858cb4d 100644
--- a/docs/getting_started/online/online.md
+++ b/docs/getting_started/online/online.md
@@ -1,5 +1,8 @@
Online topic modeling (sometimes called "incremental topic modeling") is the ability to learn incrementally from a mini-batch of instances. Essentially, it is a way to update your topic model with data on which it was not trained before. In Scikit-Learn, this technique is often modeled through a `.partial_fit` function, which is also used in BERTopic.
+!!! Tip
+    Another method for online topic modeling can be found with the [**.merge_models**](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html) functionality of BERTopic. It allows for merging multiple BERTopic models to create a single new one. This method can be used to discover new topics by training a new model and exploring whether that new model added new topics to the original model when merging, as illustrated below. A major benefit compared to `.partial_fit` is that you can keep using the original UMAP and HDBSCAN models, which tends to result in improved performance and gives you significantly more flexibility.
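+
+    As a minimal sketch (assuming `topic_model` is an already fitted BERTopic model and `new_docs` is a new batch of documents), this could look as follows:
+
+    ```python
+    from bertopic import BERTopic
+
+    # Train a separate model on the new documents and merge it with the existing model;
+    # topics in the new model that were not yet in the original model are added to it.
+    new_model = BERTopic(min_topic_size=15).fit(new_docs)
+    merged_model = BERTopic.merge_models([topic_model, new_model])
+    ```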
+
In BERTopic, there are three main goals for using this technique.
* To reduce the memory necessary for training a topic model.
@@ -13,7 +16,7 @@ In BERTopic, online topic modeling can be a bit tricky as there are several step
3. Cluster reduced embeddings
4. Tokenize topics
5. Extract topic words
-6. Diversify topic words
+6. (Optional) Fine-tune topic words
For some steps, an online variant is more important than others. Typically, in step 1 we use pre-trained language models that are in less need of continuous updates. This means that we can use an embedding model like Sentence-Transformers for extracting the embeddings and still use it in an online setting. Similarly, steps 5 and 6 do not necessarily need online variants since they are built upon step 4, tokenization. If that tokenization is by itself incremental, then so will steps 5 and 6.
@@ -28,6 +31,7 @@ This means that we will need online variants for steps 2 through 4. Steps 2 and
Lastly, we need to develop an online variant for step 4, tokenization. In this step, a Bag-of-words representation is created through the `CountVectorizer`. However, as new data comes in, its vocabulary will need to be updated. For that purpose, `bertopic.vectorizers.OnlineCountVectorizer` was created that not only updates out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix from becoming too large. Most notably, the `decay` parameter is a value between 0 and 1 to weigh the percentage of frequencies that the previous bag-of-words matrix should be reduced to. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix by 10% at each iteration. This will make sure that recent data has more weight than previous iterations. Similarly, `delete_min_df` will remove certain words from its vocabulary if their frequency is lower than a set value. This ties together with the `decay` parameter as some words will decay over time if not used. For more information regarding the `OnlineCountVectorizer`, please see the [vectorizers documentation](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html#onlinecountvectorizer).
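+
+For illustration, a minimal sketch of such an online vectorizer (the parameter values are only examples) could look like this:
+
+```python
+from bertopic.vectorizers import OnlineCountVectorizer
+
+# Decay 1% of the previous bag-of-words frequencies at each iteration and
+# remove words from the vocabulary once their frequency drops below 2
+vectorizer_model = OnlineCountVectorizer(decay=.01, delete_min_df=2)
+```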
+
## **Example**
Online topic modeling in BERTopic is rather straightforward. We first need to have our documents split into chunks such that we can train and update our topic model incrementally.
diff --git a/docs/getting_started/quickstart/quickstart.md b/docs/getting_started/quickstart/quickstart.md
index 67d12217..9f22c49a 100644
--- a/docs/getting_started/quickstart/quickstart.md
+++ b/docs/getting_started/quickstart/quickstart.md
@@ -97,8 +97,8 @@ import openai
from bertopic.representation import OpenAI
# Fine-tune topic representations with GPT
-openai.api_key = "sk-..."
-representation_model = OpenAI(model="gpt-3.5-turbo", chat=True)
+client = openai.OpenAI(api_key="sk-...")
+representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True)
topic_model = BERTopic(representation_model=representation_model)
```
diff --git a/docs/getting_started/representation/llm.md b/docs/getting_started/representation/llm.md
index e26c9d9b..b5ca6576 100644
--- a/docs/getting_started/representation/llm.md
+++ b/docs/getting_started/representation/llm.md
@@ -1,4 +1,4 @@
-As we have seen in the [previous section](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html), the topics that you get from BERTopic can be fine-tuned using a number of approaches. Here, we are going to focus on text generation Large Language Models such as ChatGPT, GPT-4, and open-source solution.
+As we have seen in the [previous section](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html), the topics that you get from BERTopic can be fine-tuned using a number of approaches. Here, we are going to focus on text generation Large Language Models such as ChatGPT, GPT-4, and open-source solutions.
Using these techniques, we can further fine-tune topics to generate labels, summaries, poems of topics, and more. To do so, we first generate a set of keywords and documents that describe a topic best using BERTopic's c-TF-IDF calculation. Then, these candidate keywords and documents are passed to the text generation model and asked to generate output that fits the topic best.
@@ -36,8 +36,50 @@ Based on the above information, can you give a short label of the topic?
"""
```
-!!! tip Tip
- You can access the default prompts of these models with `representation_model.default_prompt_`
+!!! tip "Tip 1"
+ You can access the default prompts of these models with `representation_model.default_prompt_`. The prompts that were generated after training can be accessed with `topic_model.representation_model.prompts_`.
+
+### **Selecting Documents**
+
+By default, four of the most representative documents will be passed to `[DOCUMENTS]`. These documents are selected by calculating their similarity (through c-TF-IDF representations) with the main c-TF-IDF representation of the topics. The four best matching documents per topic are selected.
+
+To increase the number of documents passed to `[DOCUMENTS]`, we can use the `nr_docs` parameter which is accessible in all LLMs on this page. Using this value allows you to select the top *n* most representative documents instead. If you have a long enough context length, then you could even give the LLM dozens of documents.
+
+However, some of these documents might be very similar to one another and might be near duplicates. They will not provide much additional information about the content of the topic. Instead, we can use the `diversity` parameter in each LLM to only select documents that are sufficiently diverse. It takes values between 0 and 1 but a value of 0.1 already does wonders!
+
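+As a minimal sketch (the model choice and exact values are only illustrative), selecting more, yet sufficiently diverse, documents could look like this:
+
+```python
+import openai
+from bertopic.representation import OpenAI
+from bertopic import BERTopic
+
+# Pass the 10 most representative documents per topic and filter out near-duplicates
+client = openai.OpenAI(api_key="sk-...")
+representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True, nr_docs=10, diversity=0.1)
+
+topic_model = BERTopic(representation_model=representation_model)
+```
+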
+### **Truncating Documents**
+
+If you increase the number of documents passed to `[DOCUMENTS]`, then the token limit can quickly be filled. Instead, we could only pass a part of the document by truncating it to a specific length. For that, we use two parameters that are accessible in all LLMs on this page, namely `doc_length` and `tokenizer`.
+
+Let's start with the `tokenizer`. It is used to split a document up into tokens/segments. Each segment can then be used to calculate the complete document length.
+The methods for tokenization are as follows:
+
+* If tokenizer is `char`, then the document is split up into characters.
+* If tokenizer is `whitespace`, then the document is split up into words separated by whitespaces.
+* If tokenizer is `vectorizer`, then the internal CountVectorizer is used to tokenize the document.
+* If tokenizer is a `callable`, then that callable is used to tokenize the document.
+
+After having tokenized the document according to one of the strategies above, the `doc_length` is used to truncate the document to its specified value.
+
+For example, if the tokenizer is `whitespace`, then a document is split up into individual words and the length of the document is counted by the total number of words. In contrast, if the tokenizer is a `callable`, we can use any callable that has an `.encode` and `.decode` function. If we were to use [tiktoken](/~https://github.com/openai/tiktoken), then the document would be split up into tokens and the length of the document is counted by the total number of tokens.
+
+To give an example, using tiktoken would work as follows:
+
+```python
+import openai
+import tiktoken
+from bertopic.representation import OpenAI
+from bertopic import BERTopic
+
+# Create tokenizer
+tokenizer = tiktoken.get_encoding("cl100k_base")
+
+# Create your representation model
+client = openai.OpenAI(api_key="sk-...")
+representation_model = OpenAI(client, tokenizer=tokenizer, doc_length=50)
+```
+
+In this example, each document will be at most 50 tokens; anything longer will be truncated.
### **Document Truncation**
@@ -75,8 +117,9 @@ from bertopic import BERTopic
tokenizer= tiktoken.encoding_for_model("gpt-3.5-turbo")
# Create your representation model
-openai.api_key = MY_API_KEY
+client = openai.OpenAI(api_key="sk-...")
representation_model = OpenAI(
+ client,
model="gpt-3.5-turbo",
delay_in_seconds=2,
chat=True,
@@ -130,7 +173,73 @@ representation_model = TextGeneration(generator)
As can be seen from the example above, if you would like to use a `text2text-generation` model, you will need to
pass a `transformers.pipeline` with the `"text2text-generation"` parameter. Moreover, you can use a custom prompt and decide where the keywords should
-be inserted by using the `[KEYWORDS]` or documents with the `[DOCUMENTS]` tag:
+be inserted by using the `[KEYWORDS]` or documents with the `[DOCUMENTS]` tag.
+
+### **Zephyr** (Mistral 7B)
+
+We can go a step further with open-source Large Language Models (LLMs) that have shown to match the performance of closed-source LLMs like ChatGPT.
+
+In this example, we will show you how to use Zephyr, a fine-tuned version of Mistral 7B. Mistral 7B outperforms other open-source LLMs at a much smaller scale and is a worthwhile solution for use cases such as topic modeling. We want to keep inference as fast as possible and a relatively small model helps with that. Zephyr itself was trained on a mix of publicly available and synthetic datasets using Direct Preference Optimization (DPO).
+
+To use Zephyr in BERTopic, we will first need to install and update a couple of packages that can handle quantized versions of Zephyr:
+
+```bash
+pip install ctransformers[cuda]
+pip install --upgrade git+/~https://github.com/huggingface/transformers
+```
+
+Instead of loading in the full model, we can instead load a quantized model which is a compressed version of the original model:
+
+```python
+from ctransformers import AutoModelForCausalLM
+from transformers import AutoTokenizer, pipeline
+
+# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
+model = AutoModelForCausalLM.from_pretrained(
+ "TheBloke/zephyr-7B-alpha-GGUF",
+ model_file="zephyr-7b-alpha.Q4_K_M.gguf",
+ model_type="mistral",
+ gpu_layers=50,
+ hf=True
+)
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")
+
+# Pipeline
+generator = pipeline(
+ model=model, tokenizer=tokenizer,
+ task='text-generation',
+ max_new_tokens=50,
+ repetition_penalty=1.1
+)
+```
+
+This Zephyr model requires a specific prompt template in order to work:
+
+```python
+prompt = """<|system|>You are a helpful, respectful and honest assistant for labeling topics.
+<|user|>
+I have a topic that contains the following documents:
+[DOCUMENTS]
+
+The topic is described by the following keywords: '[KEYWORDS]'.
+
+Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
+<|assistant|>"""
+```
+
+After creating this prompt template, we can create our representation model to be used in BERTopic:
+
+
+```python
+from bertopic.representation import TextGeneration
+
+# Text generation with Zephyr
+zephyr = TextGeneration(generator, prompt=prompt)
+representation_model = {"Zephyr": zephyr}
+
+# Topic Modeling
+topic_model = BERTopic(representation_model=representation_model, verbose=True)
+```
### **Llama 2**
@@ -237,6 +346,71 @@ representation_model = {
topic_model = BERTopic(representation_model=representation_model, verbose=True)
```
+## **llama.cpp**
+
+An amazing framework for using LLMs for inference is [`llama.cpp`](/~https://github.com/ggerganov/llama.cpp) which has [python bindings](/~https://github.com/abetlen/llama-cpp-python) that we can use in BERTopic. To start with, we first need to install `llama-cpp-python`:
+
+```bash
+pip install llama-cpp-python
+```
+
+or using the following for hardware acceleration:
+
+```bash
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+```
+
+!!! Note
+ There are a number of [installation options](/~https://github.com/abetlen/llama-cpp-python#installation-with-hardware-acceleration) depending on your hardware and OS. Make sure that you select the correct one to optimize your performance.
+
+After installation, you need to download your LLM locally before we use it in BERTopic, like so:
+
+```bash
+wget https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q4_K_M.gguf
+```
+
+Finally, we can use the model with BERTopic in just a couple of lines:
+
+```python
+from bertopic import BERTopic
+from bertopic.representation import LlamaCPP
+
+# Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha
+representation_model = LlamaCPP("zephyr-7b-alpha.Q4_K_M.gguf")
+
+# Create our BERTopic model
+topic_model = BERTopic(representation_model=representation_model, verbose=True)
+```
+
+If you want to have more control over the LLM's parameters, you can run it like so:
+
+```python
+from bertopic import BERTopic
+from bertopic.representation import LlamaCPP
+from llama_cpp import Llama
+
+# Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha
+llm = Llama(model_path="zephyr-7b-alpha.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop="Q:")
+representation_model = LlamaCPP(llm)
+
+# Create our BERTopic model
+topic_model = BERTopic(representation_model=representation_model, verbose=True)
+```
+
+!!! Note
+ The default template that is being used uses a "Q: ... A: ... " type of structure which is why the `stop` is set at `"Q:"`.
+ The default template is:
+ ```python
+ """
+ Q: I have a topic that contains the following documents:
+ [DOCUMENTS]
+
+ The topic is described by the following keywords: '[KEYWORDS]'.
+
+ Based on the above information, can you give a short label of the topic?
+ A:
+ """
+ ```
## **OpenAI**
@@ -256,8 +430,8 @@ from bertopic.representation import OpenAI
from bertopic import BERTopic
# Create your representation model
-openai.api_key = MY_API_KEY
-representation_model = OpenAI()
+client = openai.OpenAI(api_key="sk-...")
+representation_model = OpenAI(client)
# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)
@@ -273,7 +447,7 @@ You can also use a custom prompt:
```python
prompt = "I have the following documents: [DOCUMENTS] \nThese documents are about the following topic: '"
-representation_model = OpenAI(prompt=prompt)
+representation_model = OpenAI(client, prompt=prompt)
```
### **ChatGPT**
@@ -282,7 +456,7 @@ Within OpenAI's API, the ChatGPT models use a different API structure compared t
In order to use ChatGPT with BERTopic, we need to define the model and make sure to enable `chat`:
```python
-representation_model = OpenAI(model="gpt-3.5-turbo", delay_in_seconds=10, chat=True)
+representation_model = OpenAI(client, model="gpt-3.5-turbo", delay_in_seconds=10, chat=True)
```
Prompting with ChatGPT is very satisfying and is customizable as follows:
@@ -323,7 +497,7 @@ Based on the information above, please give a description of this topic in the f
topic:
"""
-representation_model = OpenAI(model="gpt-3.5-turbo", chat=True, prompt=summarization_prompt, nr_docs=5, delay_in_seconds=3)
+representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True, prompt=summarization_prompt, nr_docs=5, delay_in_seconds=3)
```
The above is not constrained to just creating a short description or summary of the topic; we can extract labels, keywords, poems, example documents, extensive descriptions, and more using this method!
diff --git a/docs/getting_started/representation/representation.md b/docs/getting_started/representation/representation.md
index b1fbd247..2fb1d03e 100644
--- a/docs/getting_started/representation/representation.md
+++ b/docs/getting_started/representation/representation.md
@@ -177,8 +177,8 @@ from bertopic import BERTopic
import openai
# Create your representation models
-openai.api_key = MY_API_KEY
-openai_generator = OpenAI()
+client = openai.OpenAI(api_key="sk-...")
+openai_generator = OpenAI(client)
mmr = MaximalMarginalRelevance(diversity=0.3)
representation_models = [mmr, openai_generator]
diff --git a/docs/getting_started/seed_words/seed_words.md b/docs/getting_started/seed_words/seed_words.md
new file mode 100644
index 00000000..2bf75e0f
--- /dev/null
+++ b/docs/getting_started/seed_words/seed_words.md
@@ -0,0 +1,59 @@
+When performing Topic Modeling, you are often faced with data that you are familiar with to a certain extent or that speaks a very specific language. In those cases, topic modeling techniques might have difficulties capturing and representing the semantic nature of domain specific abbreviations, slang, short forms, acronyms, etc. For example, the *"TNM"* classification is a method for identifying the stage of most cancers. The word *"TNM"* is an abbreviation and might not be correctly captured in generic embedding models.
+
+To make sure that certain domain specific words are weighted higher and are more often used in topic representations, you can set any number of `seed_words` in the `bertopic.vectorizers.ClassTfidfTransformer`. The `ClassTfidfTransformer` is the base representation of BERTopic and essentially represents each topic as a bag of words. As such, we can choose to increase the importance of certain words, such as *"TNM"*.
+
+To do so, let's take a look at an example. We have a dataset of article abstracts and want to perform some topic modeling. Since we might be familiar with the data, there are certain words that we know should be generally important. Let's assume that we have in-depth knowledge about reinforcement learning and know that words like "agent" and "robot" should be important in such a topic were it to be found. Using the `ClassTfidfTransformer`, we can define those `seed_words` and also choose by how much their values are multiplied.
+
+The full example is then as follows:
+
+```python
+from umap import UMAP
+from datasets import load_dataset
+from bertopic import BERTopic
+from bertopic.vectorizers import ClassTfidfTransformer
+
+# Let's take a subset of ArXiv abstracts as the training data
+dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]
+abstracts = dataset["abstract"][:5_000]
+
+# For illustration purposes, we make sure the output is fixed when running this code multiple times
+umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
+
+# We can choose any number of seed words for which we want their representation
+# to be strengthened. We increase the importance of these words as we want them to be more
+# likely to end up in the topic representations.
+ctfidf_model = ClassTfidfTransformer(
+ seed_words=["agent", "robot", "behavior", "policies", "environment"],
+ seed_multiplier=2
+)
+
+# We run the topic model with the seeded words
+topic_model = BERTopic(
+ umap_model=umap_model,
+ min_topic_size=15,
+ ctfidf_model=ctfidf_model,
+).fit(abstracts)
+```
+
+Then, when we run `topic_model.get_topic(0)`, we get the following output:
+
+```python
+[('policy', 0.023413102511982354),
+ ('reinforcement', 0.021796126795834238),
+ ('agent', 0.021131601305431902),
+ ('policies', 0.01888385271486409),
+ ('environment', 0.017819874593917057),
+ ('learning', 0.015321710504308708),
+ ('robot', 0.013881115279230468),
+ ('control', 0.013297705894983875),
+ ('the', 0.013247933839985382),
+ ('to', 0.013058208312484141)]
+```
+
+As we can see, the output includes some of the seed words that we assigned. However, if a word is not found to be important in a topic, then multiplying its importance will still leave it relatively low. This is a great feature as it allows you to increase the importance of seed words with less risk of making words important in topics where they really should not be.
+
+A benefit of this method is that it often influences all other representation methods, like KeyBERTInspired and OpenAI. The reason for this is that each representation model uses the words generated by the `ClassTfidfTransformer` as candidate words to be further optimized. In many cases, words like *"TNM"* might not end up in the candidate words. By increasing their importance, they are more likely to end up as candidate words in representation models.
+
+Another benefit of using this method is that it artificially increases the interpretability of topics. Sure, some words might be more important than others, but they might not mean much to a domain expert. For them, certain words, like *"TNM"*, are highly descriptive, and that is something difficult to capture using any method (embedding model, large language model, etc.).
+
+Moreover, these `seed_words` can be defined together with the domain expert as they can decide what type of words are generally important and might need a nudge from you, the algorithmic developer.
diff --git a/docs/getting_started/serialization/serialization.md b/docs/getting_started/serialization/serialization.md
index d4b153e7..7decd37e 100644
--- a/docs/getting_started/serialization/serialization.md
+++ b/docs/getting_started/serialization/serialization.md
@@ -136,8 +136,8 @@ The embedding model cannot always be saved using a non-pickle method if, for exa
import openai
from bertopic.backend import OpenAIBackend
-openai.api_key = MY_API_KEY
-embedding_model = OpenAIBackend("text-embedding-ada-002")
+client = openai.OpenAI(api_key="sk-...")
+embedding_model = OpenAIBackend(client, "text-embedding-ada-002")
# Load model and add embedding model
loaded_model = BERTopic.load("path/to/my/model_dir", embedding_model=embedding_model)
diff --git a/docs/getting_started/zeroshot/zeroshot.md b/docs/getting_started/zeroshot/zeroshot.md
new file mode 100644
index 00000000..81916da9
--- /dev/null
+++ b/docs/getting_started/zeroshot/zeroshot.md
@@ -0,0 +1,76 @@
+Zero-shot Topic Modeling is a technique that allows you to find predefined topics in large amounts of documents. When faced with many documents, you often have an idea of which topics will definitely be in there, whether as a result of simply knowing your data or because a domain expert is involved in defining those topics.
+
+This method allows you to not only find those specific topics but also create new topics for documents that would not fit with your predefined topics.
+This allows for extensive flexibility as there are three scenarios to explore.
+
+First, both zero-shot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found.
+
+Second, only zero-shot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics.
+
+Third, no zero-shot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run.
+
+
+
+This method works as follows. First, we create a number of labels for our predefined topics and embed them using any embedding model. Then, we compare the embeddings of the documents with the predefined labels using cosine similarity. If the similarity passes a user-defined threshold, the zero-shot topic is assigned to the document. If it does not, then that document, along with others, will be put through a regular BERTopic model.
+
+This creates two models. One for the zero-shot topics and one for the non-zero-shot topics. We combine these two BERTopic models to create a single model that contains both zero-shot and non-zero-shot topics.
+
+### **Example**
+In order to use zero-shot BERTopic, we create a list of topics that we want to assign to our documents. However,
+there may be several other topics that we know should be in the documents. The dataset that we use is a small subset of ArXiv papers.
+We know the data and believe there to be at least the following topics: *clustering*, *topic modeling*, and *large language models*.
+However, we are not sure whether other topics exist and want to explore those.
+
+Zero-shot BERTopic needs two parameters:
+
+* `zeroshot_topic_list` - The names of the topics to assign documents to. Making sure these are as descriptive as possible helps improve the assignment since it is based on cosine similarities between embeddings.
+* `zeroshot_min_similarity` - The minimum cosine similarity needed to match a document to a predefined topic. It is a value between 0 and 1.
+
+
+Using this feature is straightforward:
+
+```python
+from datasets import load_dataset
+
+from bertopic import BERTopic
+from bertopic.representation import KeyBERTInspired
+
+# We select a subsample of 5000 abstracts from ArXiv
+dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]
+docs = dataset["abstract"][:5_000]
+
+# We define a number of topics that we know are in the documents
+zeroshot_topic_list = ["Clustering", "Topic Modeling", "Large Language Models"]
+
+# We fit our model using the zero-shot topics
+# and we define a minimum similarity. For each document,
+# if the similarity does not exceed that value, it will be used
+# for clustering instead.
+topic_model = BERTopic(
+ embedding_model="thenlper/gte-small",
+ min_topic_size=15,
+ zeroshot_topic_list=zeroshot_topic_list,
+ zeroshot_min_similarity=.85,
+ representation_model=KeyBERTInspired()
+)
+topics, _ = topic_model.fit_transform(docs)
+```
+
+When we run `topic_model.get_topic_info()`, we see something like this:
+
+
+
+
+The `zeroshot_min_similarity` parameter controls how many of the documents are assigned to the predefined zero-shot topics. Lower this value and you will have more documents assigned to zero-shot topics and fewer documents will be clustered. Increase this value and you will have fewer documents assigned to zero-shot topics and more documents will be clustered.
+
+!!! Note
+ Setting the `zeroshot_min_similarity` parameter requires a bit of experimentation. Some embedding
+ models have different similarity distributions, so trying out the values manually and exploring the results
+ is highly advised.
+
+
+!!! Tip
+ Because zero-shot topic modeling is essentially merging two different topic models, the
+ `probs` will be empty initially. If you want to have the probabilities of topics across documents,
+ you can run `topic_model.transform` on your documents to extract the updated `probs`.
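+
+    As a minimal sketch (reusing `topic_model` and `docs` from the example above):
+
+    ```python
+    # Re-assign the documents to the merged zero-shot model to obtain the topic-document probabilities
+    topics, probs = topic_model.transform(docs)
+    ```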
diff --git a/docs/getting_started/zeroshot/zeroshot.svg b/docs/getting_started/zeroshot/zeroshot.svg
new file mode 100644
index 00000000..94432498
--- /dev/null
+++ b/docs/getting_started/zeroshot/zeroshot.svg
@@ -0,0 +1,282 @@
+
diff --git a/docs/getting_started/zeroshot/zeroshot_output.png b/docs/getting_started/zeroshot/zeroshot_output.png
new file mode 100644
index 00000000..a4229cf7
Binary files /dev/null and b/docs/getting_started/zeroshot/zeroshot_output.png differ
diff --git a/docs/index.md b/docs/index.md
index 0ec2acb6..7e7ccab9 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -30,12 +30,12 @@ BERTopic supports all kinds of topic modeling techniques: