diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index 66a9dcf5..276129ba 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -505,6 +505,9 @@ def fit_transform( documents = assigned_documents embeddings = assigned_embeddings + # Update topic_sizes_ when all documents are assigned to zero-shot topics + self._update_topic_size(documents) + # Sort and Map Topic IDs by their frequency if not self.nr_topics: documents = self._sort_mappings_by_frequency(documents) diff --git a/docs/changelog.md b/docs/changelog.md index f2ff9850..eaf0cd5c 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -5,6 +5,12 @@ hide: # Changelog +## **Unreleased** + +

Fixes:

+
+* Fix `topic_sizes_` not being updated in zero-shot topic modeling when using `nr_topics` parameter ([#2384](https://github.com/MaartenGr/BERTopic/issues/2384))
+
 ## **Version 0.17.3**
 
 *Release date: 8 July, 2025*
diff --git a/tests/test_variations/test_zeroshot.py b/tests/test_variations/test_zeroshot.py
new file mode 100644
index 00000000..05356191
--- /dev/null
+++ b/tests/test_variations/test_zeroshot.py
@@ -0,0 +1,185 @@
+"""
+Tests for zero-shot topic modeling functionality.
+
+This module tests various aspects of zero-shot topic modeling, including
+edge cases with the nr_topics parameter and topic_sizes_ consistency.
+"""
+
+from bertopic import BERTopic
+from umap import UMAP
+
+
+def test_zeroshot_with_nr_topics():
+    """Test zero-shot topic modeling with nr_topics parameter."""
+    docs = [
+        "This is about machine learning and artificial intelligence",
+        "Deep learning neural networks are powerful",
+        "Python programming for data science",
+        "Machine learning algorithms and models",
+        "Artificial intelligence and deep learning",
+        "Data science with Python programming",
+        "Neural networks and machine learning",
+        "Programming in Python for AI",
+        "Deep learning models and algorithms",
+        "Artificial intelligence programming",
+    ]
+
+    zeroshot_topics = ["Technology and Programming"]
+
+    topic_model = BERTopic(
+        zeroshot_topic_list=zeroshot_topics, zeroshot_min_similarity=0.1, nr_topics=2, min_topic_size=2
+    )
+
+    topics, _ = topic_model.fit_transform(docs)
+
+    # Verify topic_sizes_ is properly populated
+    assert topic_model.topic_sizes_ is not None
+    assert len(topic_model.topic_sizes_) > 0
+
+    # Verify total document count matches
+    total_in_sizes = sum(topic_model.topic_sizes_.values())
+    assert total_in_sizes == len(docs)
+
+    # Verify all topics are accounted for
+    for topic in set(topics):
+        assert topic in topic_model.topic_sizes_
+
+
+def test_zeroshot_all_documents_assigned():
+    """Test edge case where all documents are assigned to zero-shot topics."""
+    docs = [
+        "Technology is advancing rapidly",
+        "Software development is important",
+        "Programming languages are evolving",
+        "Computer science research continues",
+        "Digital transformation is happening",
+        "Innovation in technology sector",
+        "Software engineering best practices",
+        "Modern programming techniques",
+        "Computer systems and architecture",
+        "Digital solutions and platforms",
+        "Technology trends and developments",
+        "Software design patterns",
+        "Programming paradigms evolution",
+        "Computing infrastructure advances",
+        "Digital innovation strategies",
+    ]
+
+    zeroshot_topics = ["Technology"]
+    umap_model = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric="cosine", random_state=42)
+
+    topic_model = BERTopic(
+        zeroshot_topic_list=zeroshot_topics,
+        zeroshot_min_similarity=0.05,
+        nr_topics=2,
+        min_topic_size=1,
+        umap_model=umap_model,
+    )
+
+    topic_model.fit_transform(docs)
+
+    # Check topic_sizes_ exists first, then verify all documents are accounted for
+    assert topic_model.topic_sizes_ is not None
+    total_in_sizes = sum(topic_model.topic_sizes_.values())
+    assert total_in_sizes == len(docs)
+
+
+def test_zeroshot_topic_info_consistency():
+    """Test consistency between topic_sizes_ and get_topic_info()."""
+    docs = [
+        "AI and machine learning research",
+        "Deep learning neural networks",
+        "Neural network architectures",
+        "Machine learning algorithms",
+        "Artificial intelligence systems",
+        "Deep learning models training",
+        "Neural network optimization",
+        "Machine learning applications",
+        "AI research and development",
+        "Deep learning frameworks",
+    ]
+    zeroshot_topics = ["Artificial Intelligence"]
+    umap_model = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric="cosine", random_state=42)
+
+    topic_model = BERTopic(
+        zeroshot_topic_list=zeroshot_topics,
+        zeroshot_min_similarity=0.1,
+        nr_topics=2,
+        min_topic_size=1,
+        umap_model=umap_model,
+    )
+
+    topic_model.fit_transform(docs)
+
+    # Verify topic info consistency
+    topic_info = topic_model.get_topic_info()
+    assert not topic_info.empty
+    assert topic_info.shape[0] > 0
+
+    # Verify topic_sizes_ and topic_info are consistent
+    topic_info_counts = dict(zip(topic_info.Topic, topic_info.Count))
+    for topic_id, count in topic_model.topic_sizes_.items():
+        assert topic_id in topic_info_counts
+        assert topic_info_counts[topic_id] == count
+
+
+def test_github_issue_2384_reproduction():
+    """Test exact reproduction case from GitHub issue #2384."""
+    # Exact reproduction case from GitHub issue #2384
+    docs = ["I need help with my voucher", "Gift card not working", "Customer service was poor"] * 50
+    zeroshot_topics = ["Voucher inquiries", "Gift card issues", "Customer service feedback"]
+
+    model = BERTopic(
+        zeroshot_topic_list=zeroshot_topics,
+        zeroshot_min_similarity=-1,
+        nr_topics=4,
+    )
+
+    model.fit_transform(docs)
+
+    # Verify the fix
+    assert model.topic_sizes_ is not None
+    assert len(model.topic_sizes_) > 0
+
+    # Verify get_topic_info() works
+    topic_info = model.get_topic_info()
+    assert not topic_info.empty
+    assert topic_info.shape[0] > 0
+
+    # Verify total document count matches
+    total_docs_in_sizes = sum(model.topic_sizes_.values())
+    assert total_docs_in_sizes == len(docs)
+
+    # Verify topic_representations_ still works (no regression)
+    assert model.topic_representations_ is not None
+    assert len(model.topic_representations_) > 0
+
+
+def test_zeroshot_nr_topics_consistency():
+    """Test consistency between using nr_topics and not using it."""
+    docs = ["I need help with my voucher", "Gift card not working", "Customer service was poor"] * 20
+    zeroshot_topics = ["Voucher inquiries", "Gift card issues", "Customer service feedback"]
+
+    # Test without nr_topics
+    model_without = BERTopic(zeroshot_topic_list=zeroshot_topics, zeroshot_min_similarity=-1)
+    model_without.fit_transform(docs)
+
+    # Test with nr_topics
+    model_with = BERTopic(zeroshot_topic_list=zeroshot_topics, zeroshot_min_similarity=-1, nr_topics=4)
+    model_with.fit_transform(docs)
+
+    # Both should have properly populated topic_sizes_
+    assert model_without.topic_sizes_ is not None
+    assert model_with.topic_sizes_ is not None
+
+    # Both should have same total document count
+    total_without = sum(model_without.topic_sizes_.values())
+    total_with = sum(model_with.topic_sizes_.values())
+    assert total_without == len(docs)
+    assert total_with == len(docs)
+
+    # Both should have working get_topic_info()
+    info_without = model_without.get_topic_info()
+    info_with = model_with.get_topic_info()
+    assert not info_without.empty
+    assert not info_with.empty