"""
Tests for zero-shot topic modeling functionality.

This module tests various aspects of zero-shot topic modeling, including
edge cases with the nr_topics parameter and topic_sizes_ consistency.
"""

from bertopic import BERTopic
from umap import UMAP


def test_zeroshot_with_nr_topics():
    """Test zero-shot topic modeling with nr_topics parameter."""
    docs = [
        "This is about machine learning and artificial intelligence",
        "Deep learning neural networks are powerful",
        "Python programming for data science",
        "Machine learning algorithms and models",
        "Artificial intelligence and deep learning",
        "Data science with Python programming",
        "Neural networks and machine learning",
        "Programming in Python for AI",
        "Deep learning models and algorithms",
        "Artificial intelligence programming"
    ]

    zeroshot_topics = ["Technology and Programming"]

    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.1,
        nr_topics=2,
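        # nr_topics triggers topic reduction on top of the zero-shot assignment,
        # which is the code path this test exercises.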
        min_topic_size=2
    )

    topics, probs = topic_model.fit_transform(docs)

    # Verify topic_sizes_ is properly populated
    assert topic_model.topic_sizes_ is not None
    assert len(topic_model.topic_sizes_) > 0

    # Verify total document count matches
    total_in_sizes = sum(topic_model.topic_sizes_.values())
    assert total_in_sizes == len(docs)

    # Verify all topics are accounted for
    for topic in set(topics):
        assert topic in topic_model.topic_sizes_


def test_zeroshot_all_documents_assigned():
    """Test edge case where all documents are assigned to zero-shot topics."""
    docs = [
        "Technology is advancing rapidly",
        "Software development is important",
        "Programming languages are evolving",
        "Computer science research continues",
        "Digital transformation is happening",
        "Innovation in technology sector",
        "Software engineering best practices",
        "Modern programming techniques",
        "Computer systems and architecture",
        "Digital solutions and platforms",
        "Technology trends and developments",
        "Software design patterns",
        "Programming paradigms evolution",
        "Computing infrastructure advances",
        "Digital innovation strategies"
    ]

    zeroshot_topics = ["Technology"]
    umap_model = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric="cosine", random_state=42)

    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.05,
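        # Deliberately low threshold so that every document is pulled into the
        # single zero-shot topic, which is the edge case this test covers.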
        nr_topics=2,
        min_topic_size=1,
        umap_model=umap_model
    )

    topics, probs = topic_model.fit_transform(docs)

    # Verify all documents are accounted for
    assert topic_model.topic_sizes_ is not None
    total_in_sizes = sum(topic_model.topic_sizes_.values())
    assert total_in_sizes == len(docs)


def test_zeroshot_topic_info_consistency():
    """Test consistency between topic_sizes_ and get_topic_info()."""
    docs = [
        "AI and machine learning research",
        "Deep learning neural networks",
        "Neural network architectures",
        "Machine learning algorithms",
        "Artificial intelligence systems",
        "Deep learning models training",
        "Neural network optimization",
        "Machine learning applications",
        "AI research and development",
        "Deep learning frameworks"
    ]
    zeroshot_topics = ["Artificial Intelligence"]
    umap_model = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric="cosine", random_state=42)

    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.1,
        nr_topics=2,
        min_topic_size=1,
        umap_model=umap_model
    )

    topics, probs = topic_model.fit_transform(docs)

    # Verify topic info consistency
    topic_info = topic_model.get_topic_info()
    assert not topic_info.empty
    assert topic_info.shape[0] > 0

    # Verify topic_sizes_ and topic_info are consistent
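    # get_topic_info() reports one row per topic with a Count column, so every
    # entry in topic_sizes_ should appear there with the same count.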
    topic_info_counts = dict(zip(topic_info.Topic, topic_info.Count))
    for topic_id, count in topic_model.topic_sizes_.items():
        assert topic_id in topic_info_counts
        assert topic_info_counts[topic_id] == count


def test_github_issue_2384_reproduction():
    """Test exact reproduction case from GitHub issue #2384."""
    docs = ["I need help with my voucher", "Gift card not working", "Customer service was poor"] * 50
    zeroshot_topics = ["Voucher inquiries", "Gift card issues", "Customer service feedback"]

    model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=-1,
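        # -1 is below any possible cosine similarity, so every document is matched
        # to a zero-shot topic, matching the reproduction code from the issue.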
        nr_topics=4,
    )

    topics, _ = model.fit_transform(docs)

    # Verify topic_sizes_ is properly populated
    assert model.topic_sizes_ is not None
    assert len(model.topic_sizes_) > 0

    # Verify get_topic_info() works
    topic_info = model.get_topic_info()
    assert not topic_info.empty
    assert topic_info.shape[0] > 0

    # Verify total document count matches
    total_docs_in_sizes = sum(model.topic_sizes_.values())
    assert total_docs_in_sizes == len(docs)

    # Verify topic_representations_ still works (no regression)
    assert model.topic_representations_ is not None
    assert len(model.topic_representations_) > 0


def test_zeroshot_nr_topics_consistency():
    """Test consistency between using nr_topics and not using it."""
    docs = ["I need help with my voucher", "Gift card not working", "Customer service was poor"] * 20
    zeroshot_topics = ["Voucher inquiries", "Gift card issues", "Customer service feedback"]

    # Test without nr_topics
    model_without = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=-1
    )
    topics_without, _ = model_without.fit_transform(docs)

    # Test with nr_topics
    model_with = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=-1,
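        # Same corpus and zero-shot configuration as model_without; only nr_topics differs.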
        nr_topics=4
    )
    topics_with, _ = model_with.fit_transform(docs)

    # Both should have properly populated topic_sizes_
    assert model_without.topic_sizes_ is not None
    assert model_with.topic_sizes_ is not None

    # Both should have the same total document count
    total_without = sum(model_without.topic_sizes_.values())
    total_with = sum(model_with.topic_sizes_.values())
    assert total_without == len(docs)
    assert total_with == len(docs)

    # Both should have a working get_topic_info()
    info_without = model_without.get_topic_info()
    info_with = model_with.get_topic_info()
    assert not info_without.empty
    assert not info_with.empty