Commit fec0041

Fix topic_sizes_ not updated in zero-shot topic modeling (#2384)
1 parent 144ab7b commit fec0041

File tree

3 files changed: +204 -0 lines changed

bertopic/_bertopic.py

Lines changed: 3 additions & 0 deletions
@@ -504,6 +504,9 @@ def fit_transform(
             # All documents matches zero-shot topics
             documents = assigned_documents
             embeddings = assigned_embeddings
+
+            # Update topic_sizes_ when all documents are assigned to zero-shot topics
+            self._update_topic_size(documents)

         # Sort and Map Topic IDs by their frequency
         if not self.nr_topics:
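
For context, the only functional change in this file is the added `self._update_topic_size(documents)` call, which rebuilds the per-topic document counts from the working documents dataframe. The helper's body is not part of this diff, so the following is only a rough sketch of what such a method is expected to do; the `collections.Counter`-based implementation shown here is an assumption, not the verbatim BERTopic code:

import collections

import pandas as pd


def _update_topic_size(self, documents: pd.DataFrame) -> None:
    # Recount how many documents carry each topic label and cache the result,
    # so topic_sizes_ (and the cached topics_ list) reflect the current
    # assignments on the branch where every document matched a zero-shot topic.
    self.topic_sizes_ = collections.Counter(documents.Topic.values.tolist())
    self.topics_ = documents.Topic.astype(int).tolist()

If `topic_sizes_` is only ever rebuilt through this helper, skipping the call on the all-zero-shot branch leaves the attribute unpopulated, which is what the added line addresses.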

docs/changelog.md

Lines changed: 6 additions & 0 deletions
@@ -5,6 +5,12 @@ hide:

 # Changelog

+## **Unreleased**
+
+<h3><b>Fixes:</a></b></h3>
+
+* Fix `topic_sizes_` not being updated in zero-shot topic modeling when using `nr_topics` parameter ([#2384](https://github.com/MaartenGr/BERTopic/issues/2384))
+
 ## **Version 0.17.3**
 *Release date: 8 July, 2025*
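
As a quick illustration of the behavior the changelog entry describes, the sketch below mirrors the reproduction case from issue #2384 that the new tests encode; the print calls are only for inspection and are not part of the committed code:

from bertopic import BERTopic

# A zeroshot_min_similarity of -1 means every document clears the similarity
# threshold, so all documents are assigned to zero-shot topics while
# nr_topics is also set.
docs = ["I need help with my voucher", "Gift card not working", "Customer service was poor"] * 50
zeroshot_topics = ["Voucher inquiries", "Gift card issues", "Customer service feedback"]

model = BERTopic(
    zeroshot_topic_list=zeroshot_topics,
    zeroshot_min_similarity=-1,
    nr_topics=4,
)
topics, _ = model.fit_transform(docs)

# With this commit, topic_sizes_ is populated and its counts sum to len(docs),
# matching what get_topic_info() reports.
print(model.topic_sizes_)
print(model.get_topic_info())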

Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
"""
Tests for zero-shot topic modeling functionality.

This module tests various aspects of zero-shot topic modeling, including
edge cases with the nr_topics parameter and topic_sizes_ consistency.
"""

from bertopic import BERTopic
from umap import UMAP


def test_zeroshot_with_nr_topics():
    """Test zero-shot topic modeling with nr_topics parameter."""
    docs = [
        "This is about machine learning and artificial intelligence",
        "Deep learning neural networks are powerful",
        "Python programming for data science",
        "Machine learning algorithms and models",
        "Artificial intelligence and deep learning",
        "Data science with Python programming",
        "Neural networks and machine learning",
        "Programming in Python for AI",
        "Deep learning models and algorithms",
        "Artificial intelligence programming"
    ]

    zeroshot_topics = ["Technology and Programming"]

    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.1,
        nr_topics=2,
        min_topic_size=2
    )

    topics, probs = topic_model.fit_transform(docs)

    # Verify topic_sizes_ is properly populated
    assert topic_model.topic_sizes_ is not None
    assert len(topic_model.topic_sizes_) > 0

    # Verify total document count matches
    total_in_sizes = sum(topic_model.topic_sizes_.values())
    assert total_in_sizes == len(docs)

    # Verify all topics are accounted for
    for topic in set(topics):
        assert topic in topic_model.topic_sizes_


def test_zeroshot_all_documents_assigned():
    """Test edge case where all documents are assigned to zero-shot topics."""
    docs = [
        "Technology is advancing rapidly",
        "Software development is important",
        "Programming languages are evolving",
        "Computer science research continues",
        "Digital transformation is happening",
        "Innovation in technology sector",
        "Software engineering best practices",
        "Modern programming techniques",
        "Computer systems and architecture",
        "Digital solutions and platforms",
        "Technology trends and developments",
        "Software design patterns",
        "Programming paradigms evolution",
        "Computing infrastructure advances",
        "Digital innovation strategies"
    ]

    zeroshot_topics = ["Technology"]
    umap_model = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine', random_state=42)

    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.05,
        nr_topics=2,
        min_topic_size=1,
        umap_model=umap_model
    )

    topics, probs = topic_model.fit_transform(docs)

    # Verify all documents are accounted for
    total_in_sizes = sum(topic_model.topic_sizes_.values())
    assert total_in_sizes == len(docs)
    assert topic_model.topic_sizes_ is not None


def test_zeroshot_topic_info_consistency():
    """Test consistency between topic_sizes_ and get_topic_info()."""
    docs = [
        "AI and machine learning research",
        "Deep learning neural networks",
        "Neural network architectures",
        "Machine learning algorithms",
        "Artificial intelligence systems",
        "Deep learning models training",
        "Neural network optimization",
        "Machine learning applications",
        "AI research and development",
        "Deep learning frameworks"
    ]
    zeroshot_topics = ["Artificial Intelligence"]
    umap_model = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine', random_state=42)

    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.1,
        nr_topics=2,
        min_topic_size=1,
        umap_model=umap_model
    )

    topics, probs = topic_model.fit_transform(docs)

    # Verify topic info consistency
    topic_info = topic_model.get_topic_info()
    assert not topic_info.empty
    assert topic_info.shape[0] > 0

    # Verify topic_sizes_ and topic_info are consistent
    topic_info_counts = dict(zip(topic_info.Topic, topic_info.Count))
    for topic_id, count in topic_model.topic_sizes_.items():
        assert topic_id in topic_info_counts
        assert topic_info_counts[topic_id] == count


def test_github_issue_2384_reproduction():
    """Test exact reproduction case from GitHub issue #2384."""
    # Exact reproduction case from GitHub issue #2384
    docs = ["I need help with my voucher", "Gift card not working", "Customer service was poor"] * 50
    zeroshot_topics = ["Voucher inquiries", "Gift card issues", "Customer service feedback"]

    model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=-1,
        nr_topics=4,
    )

    topics, _ = model.fit_transform(docs)

    # Verify the fix
    assert model.topic_sizes_ is not None
    assert len(model.topic_sizes_) > 0

    # Verify get_topic_info() works
    topic_info = model.get_topic_info()
    assert not topic_info.empty
    assert topic_info.shape[0] > 0

    # Verify total document count matches
    total_docs_in_sizes = sum(model.topic_sizes_.values())
    assert total_docs_in_sizes == len(docs)

    # Verify topic_representations_ still works (no regression)
    assert model.topic_representations_ is not None
    assert len(model.topic_representations_) > 0


def test_zeroshot_nr_topics_consistency():
    """Test consistency between using nr_topics and not using it."""
    docs = ["I need help with my voucher", "Gift card not working", "Customer service was poor"] * 20
    zeroshot_topics = ["Voucher inquiries", "Gift card issues", "Customer service feedback"]

    # Test without nr_topics
    model_without = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=-1
    )
    topics_without, _ = model_without.fit_transform(docs)

    # Test with nr_topics
    model_with = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=-1,
        nr_topics=4
    )
    topics_with, _ = model_with.fit_transform(docs)

    # Both should have properly populated topic_sizes_
    assert model_without.topic_sizes_ is not None
    assert model_with.topic_sizes_ is not None

    # Both should have same total document count
    total_without = sum(model_without.topic_sizes_.values())
    total_with = sum(model_with.topic_sizes_.values())
    assert total_without == len(docs)
    assert total_with == len(docs)

    # Both should have working get_topic_info()
    info_without = model_without.get_topic_info()
    info_with = model_with.get_topic_info()
    assert not info_without.empty
    assert not info_with.empty
