26 changes: 13 additions & 13 deletions bertopic/_bertopic.py
@@ -457,7 +457,7 @@ def fit_transform(
logger.info("Embedding - Transforming documents to embeddings.")
self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose)
embeddings = self._extract_embeddings(
- documents.Document.values.tolist(),
+ documents.Document.to_numpy().tolist(),
images=images,
method="document",
verbose=self.verbose,
@@ -503,7 +503,7 @@ def fit_transform(
documents = self._sort_mappings_by_frequency(documents)

# Create documents from images if we have images only
- if documents.Document.values[0] is None:
+ if documents.Document.to_numpy()[0] is None:
custom_documents = self._images_to_text(documents, embeddings)

# Extract topics by calculating c-TF-IDF, reduce topics if needed, and get representations.
@@ -726,7 +726,7 @@ def partial_fit(
self.embedding_model, language=self.language, verbose=self.verbose
)
embeddings = self._extract_embeddings(
- documents.Document.values.tolist(),
+ documents.Document.to_numpy().tolist(),
method="document",
verbose=self.verbose,
)
@@ -926,7 +926,7 @@ def topics_over_time(
# Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation
# by simply taking the average of the two
if global_tuning:
- selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values]
+ selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.to_numpy()]
c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0

# Extract the words per topic
@@ -1010,11 +1010,11 @@ def topics_per_class(
# by simply taking the average of the two
if global_tuning:
c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False)
- c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0
+ c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.to_numpy() + self._outliers] + c_tf_idf) / 2.0

# Extract the words per topic
words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)
- topic_frequency = pd.Series(documents_per_topic.Class.values, index=documents_per_topic.Topic).to_dict()
+ topic_frequency = pd.Series(documents_per_topic.Class.to_numpy(), index=documents_per_topic.Topic).to_dict()

# Fill dataframe with results
topics_at_class = [
@@ -1796,7 +1796,7 @@ def get_document_info(

# Add topic info through `.get_topic_info()`
topic_info = self.get_topic_info().drop("Count", axis=1)
- document_info = pd.merge(document_info, topic_info, on="Topic", how="left")
+ document_info = document_info.merge(topic_info, on="Topic", how="left")

# Add top n words
top_n_words = {topic: " - ".join(next(zip(*self.get_topic(topic)))) for topic in set(self.topics_)}
@@ -1941,7 +1941,7 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""):
(hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent),
"Distance",
]
- distance = distance.values[0] if len(distance) > 0 else 10
+ distance = distance.to_numpy()[0] if len(distance) > 0 else 10

if parent != start:
if grandpa is None:
@@ -4059,7 +4059,7 @@ def _zeroshot_topic_modeling(
embeddings = embeddings[non_assigned_ids]

if len(documents) == 0:
- self.topics_ = assigned_documents["Topic"].values.tolist()
+ self.topics_ = assigned_documents["Topic"].to_numpy().tolist()
self.topic_mapper_ = TopicMapper(self.topics_)

logger.info("Zeroshot Step 1 - Completed \u2713")
@@ -4280,7 +4280,7 @@ def _extract_representative_docs(
for index, topic in enumerate(labels):
# Slice data
selection = documents_per_topic.loc[documents_per_topic.Topic == topic, :]
- selected_docs = selection["Document"].values
+ selected_docs = selection["Document"].to_numpy()
selected_docs_ids = selection.index.tolist()

# Calculate similarity
@@ -4335,8 +4335,8 @@ def _create_topic_vectors(
if embeddings is not None and documents is not None:
topic_embeddings = []
topics = documents.sort_values("Topic").Topic.unique()
topic_ids = documents["Topic"].values
doc_ids = documents["ID"].values.astype(int)
topic_ids = documents["Topic"].to_numpy()
doc_ids = documents["ID"].to_numpy().astype(int)
for topic in topics:
mask = topic_ids == topic
topic_embeddings.append(embeddings[doc_ids[mask]].mean(axis=0))
@@ -4458,7 +4458,7 @@ def _update_topic_size(self, documents: pd.DataFrame):
Arguments:
documents: Updated dataframe with documents and their corresponding IDs and newly added Topics
"""
- self.topic_sizes_ = collections.Counter(documents.Topic.values.tolist())
+ self.topic_sizes_ = collections.Counter(documents.Topic.to_numpy().tolist())
self.topics_ = documents.Topic.astype(int).tolist()

def _extract_words_per_topic(
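An aside on the pattern above (context only, not part of the diff): every change in this file swaps the pandas `.values` attribute for the `.to_numpy()` method, which is what the pandas-vet rule enabled in pyproject.toml below (PD011, pandas-use-of-dot-values) recommends. `.values` can return either a NumPy array or an extension array depending on the dtype, whereas `.to_numpy()` always yields a NumPy array. A minimal sketch of the equivalence, using a toy frame rather than BERTopic's internal `documents` dataframe:

import pandas as pd

# Stand-in for the internal `documents` dataframe used by fit_transform.
documents = pd.DataFrame({"Document": ["doc a", "doc b"], "Topic": [0, 1]})

# Old pattern flagged by pandas-vet: the .values attribute.
old_style = documents.Document.values.tolist()

# New pattern used throughout this PR: the .to_numpy() method.
new_style = documents.Document.to_numpy().tolist()

assert old_style == new_style == ["doc a", "doc b"]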
2 changes: 1 addition & 1 deletion bertopic/plotting/_datamap.py
@@ -162,7 +162,7 @@ def visualize_document_datamap(
topic_name_mapping[topic_num] = "Unlabelled"

# Map in topic names and plot
- named_topic_per_doc = pd.Series(topic_per_doc).map(topic_name_mapping).values
+ named_topic_per_doc = pd.Series(topic_per_doc).map(topic_name_mapping).to_numpy()

if interactive:
figure = datamapplot.create_interactive_plot(
2 changes: 1 addition & 1 deletion bertopic/plotting/_hierarchical_documents.py
@@ -230,7 +230,7 @@ def visualize_hierarchical_documents(
else:
trace_name = (
f"{topic}_"
-     + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].values[0]
+     + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].to_numpy()[0]
)
plot_text = "_".join([name[:20] for name in trace_name.split("_")[:3]])
topic_names[topic] = {
4 changes: 2 additions & 2 deletions bertopic/plotting/_hierarchy.py
@@ -306,7 +306,7 @@ def _get_annotations(
else:
for key, value in parent_topic.items():
if set(value) == set(fst_topic):
- fst_name = df.loc[df.Parent_ID == key, "Parent_Name"].values[0]
+ fst_name = df.loc[df.Parent_ID == key, "Parent_Name"].to_numpy()[0]

if len(scnd_topic) == 1:
if isinstance(custom_labels, str):
@@ -320,7 +320,7 @@
else:
for key, value in parent_topic.items():
if set(value) == set(scnd_topic):
- scnd_name = df.loc[df.Parent_ID == key, "Parent_Name"].values[0]
+ scnd_name = df.loc[df.Parent_ID == key, "Parent_Name"].to_numpy()[0]

text_annotations.append([fst_name, "", "", scnd_name])

6 changes: 3 additions & 3 deletions bertopic/plotting/_topics_over_time.py
@@ -92,10 +92,10 @@ def visualize_topics_over_time(
fig = go.Figure()
for index, topic in enumerate(data.Topic.unique()):
trace_data = data.loc[data.Topic == topic, :]
- topic_name = trace_data.Name.values[0]
- words = trace_data.Words.values
+ topic_name = trace_data.Name.to_numpy()[0]
+ words = trace_data.Words.to_numpy()
if normalize_frequency:
- y = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
+ y = normalize(trace_data.Frequency.to_numpy().reshape(1, -1))[0]
else:
y = trace_data.Frequency
fig.add_trace(
6 changes: 3 additions & 3 deletions bertopic/plotting/_topics_per_class.py
@@ -96,10 +96,10 @@ def visualize_topics_per_class(
else:
visible = "legendonly"
trace_data = data.loc[data.Topic == topic, :]
- topic_name = trace_data.Name.values[0]
- words = trace_data.Words.values
+ topic_name = trace_data.Name.to_numpy()[0]
+ words = trace_data.Words.to_numpy()
if normalize_frequency:
- x = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
+ x = normalize(trace_data.Frequency.to_numpy().reshape(1, -1))[0]
else:
x = trace_data.Frequency
fig.add_trace(
8 changes: 4 additions & 4 deletions bertopic/representation/_visual.py
@@ -92,7 +92,7 @@ def extract_topics(
representative_images: Representative images per topic
"""
# Extract image ids of most representative documents
images = documents["Image"].values.tolist()
images = documents["Image"].to_numpy().tolist()
(_, _, _, repr_docs_ids) = topic_model._extract_representative_docs(
c_tf_idf,
documents,
@@ -156,10 +156,10 @@ def _convert_image_to_text(self, images: List[str], verbose: bool = False) -> Li
def image_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame:
"""Convert images to text."""
# Create image topic embeddings
- topics = documents.Topic.values.tolist()
- images = documents.Image.values.tolist()
+ topics = documents.Topic.to_numpy().tolist()
+ images = documents.Image.to_numpy().tolist()
df = pd.DataFrame(np.hstack([np.array(topics).reshape(-1, 1), embeddings]))
- image_topic_embeddings = df.groupby(0).mean().values
+ image_topic_embeddings = df.groupby(0).mean().to_numpy()

# Extract image centroids
image_centroids = {}
1 change: 1 addition & 0 deletions pyproject.toml
@@ -110,6 +110,7 @@ select = [
"E9",
"F", # pyflakes
"D", # pydocstyle
"PD", # pandas-vet
"RUF", # ruff
]

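The single pyproject.toml addition above lines up with the code changes elsewhere in this PR: enabling ruff's pandas-vet ("PD") rule set flags `.values` usage (PD011) and the `pd.merge` function form (PD015), so the library code switches to the method-based equivalents. A small sketch of the `get_document_info` change, with toy frames standing in for the real document and topic tables:

import pandas as pd

document_info = pd.DataFrame({"Topic": [0, 1], "Document": ["doc a", "doc b"]})
topic_info = pd.DataFrame({"Topic": [0, 1], "Name": ["0_topic_a", "1_topic_b"]})

# Function form discouraged by pandas-vet (PD015)...
merged_old = pd.merge(document_info, topic_info, on="Topic", how="left")

# ...and the equivalent DataFrame.merge method form adopted in the diff.
merged_new = document_info.merge(topic_info, on="Topic", how="left")

assert merged_old.equals(merged_new)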