diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index 56b90412..cfafb58a 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -457,7 +457,7 @@ def fit_transform(
             logger.info("Embedding - Transforming documents to embeddings.")
             self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose)
             embeddings = self._extract_embeddings(
-                documents.Document.values.tolist(),
+                documents.Document.to_numpy().tolist(),
                 images=images,
                 method="document",
                 verbose=self.verbose,
@@ -503,7 +503,7 @@ def fit_transform(
         documents = self._sort_mappings_by_frequency(documents)
 
         # Create documents from images if we have images only
-        if documents.Document.values[0] is None:
+        if documents.Document.to_numpy()[0] is None:
             custom_documents = self._images_to_text(documents, embeddings)
 
             # Extract topics by calculating c-TF-IDF, reduce topics if needed, and get representations.
@@ -726,7 +726,7 @@ def partial_fit(
                 self.embedding_model, language=self.language, verbose=self.verbose
             )
             embeddings = self._extract_embeddings(
-                documents.Document.values.tolist(),
+                documents.Document.to_numpy().tolist(),
                 method="document",
                 verbose=self.verbose,
             )
@@ -926,7 +926,7 @@ def topics_over_time(
             # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation
             # by simply taking the average of the two
             if global_tuning:
-                selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values]
+                selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.to_numpy()]
                 c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0
 
             # Extract the words per topic
@@ -1010,11 +1010,11 @@ def topics_per_class(
             # by simply taking the average of the two
             if global_tuning:
                 c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False)
-                c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0
+                c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.to_numpy() + self._outliers] + c_tf_idf) / 2.0
 
             # Extract the words per topic
             words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)
-            topic_frequency = pd.Series(documents_per_topic.Class.values, index=documents_per_topic.Topic).to_dict()
+            topic_frequency = pd.Series(documents_per_topic.Class.to_numpy(), index=documents_per_topic.Topic).to_dict()
 
             # Fill dataframe with results
             topics_at_class = [
@@ -1796,7 +1796,7 @@ def get_document_info(
 
         # Add topic info through `.get_topic_info()`
         topic_info = self.get_topic_info().drop("Count", axis=1)
-        document_info = pd.merge(document_info, topic_info, on="Topic", how="left")
+        document_info = document_info.merge(topic_info, on="Topic", how="left")
 
         # Add top n words
         top_n_words = {topic: " - ".join(next(zip(*self.get_topic(topic)))) for topic in set(self.topics_)}
@@ -1941,7 +1941,7 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""):
                 (hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent),
                 "Distance",
             ]
-            distance = distance.values[0] if len(distance) > 0 else 10
+            distance = distance.to_numpy()[0] if len(distance) > 0 else 10
 
             if parent != start:
                 if grandpa is None:
@@ -4059,7 +4059,7 @@ def _zeroshot_topic_modeling(
         embeddings = embeddings[non_assigned_ids]
 
         if len(documents) == 0:
-            self.topics_ = assigned_documents["Topic"].values.tolist()
+            self.topics_ = assigned_documents["Topic"].to_numpy().tolist()
             self.topic_mapper_ = TopicMapper(self.topics_)
 
         logger.info("Zeroshot Step 1 - Completed \u2713")
@@ -4280,7 +4280,7 @@ def _extract_representative_docs(
         for index, topic in enumerate(labels):
             # Slice data
             selection = documents_per_topic.loc[documents_per_topic.Topic == topic, :]
-            selected_docs = selection["Document"].values
+            selected_docs = selection["Document"].to_numpy()
             selected_docs_ids = selection.index.tolist()
 
             # Calculate similarity
@@ -4335,8 +4335,8 @@ def _create_topic_vectors(
         if embeddings is not None and documents is not None:
             topic_embeddings = []
             topics = documents.sort_values("Topic").Topic.unique()
-            topic_ids = documents["Topic"].values
-            doc_ids = documents["ID"].values.astype(int)
+            topic_ids = documents["Topic"].to_numpy()
+            doc_ids = documents["ID"].to_numpy().astype(int)
             for topic in topics:
                 mask = topic_ids == topic
                 topic_embeddings.append(embeddings[doc_ids[mask]].mean(axis=0))
@@ -4458,7 +4458,7 @@ def _update_topic_size(self, documents: pd.DataFrame):
         Arguments:
             documents: Updated dataframe with documents and their corresponding IDs and newly added Topics
         """
-        self.topic_sizes_ = collections.Counter(documents.Topic.values.tolist())
+        self.topic_sizes_ = collections.Counter(documents.Topic.to_numpy().tolist())
         self.topics_ = documents.Topic.astype(int).tolist()
 
     def _extract_words_per_topic(
diff --git a/bertopic/plotting/_datamap.py b/bertopic/plotting/_datamap.py
index 11fdb6bf..58522fdc 100644
--- a/bertopic/plotting/_datamap.py
+++ b/bertopic/plotting/_datamap.py
@@ -162,7 +162,7 @@ def visualize_document_datamap(
                 topic_name_mapping[topic_num] = "Unlabelled"
 
     # Map in topic names and plot
-    named_topic_per_doc = pd.Series(topic_per_doc).map(topic_name_mapping).values
+    named_topic_per_doc = pd.Series(topic_per_doc).map(topic_name_mapping).to_numpy()
 
     if interactive:
         figure = datamapplot.create_interactive_plot(
diff --git a/bertopic/plotting/_hierarchical_documents.py b/bertopic/plotting/_hierarchical_documents.py
index b3453475..6e974e6e 100644
--- a/bertopic/plotting/_hierarchical_documents.py
+++ b/bertopic/plotting/_hierarchical_documents.py
@@ -230,7 +230,7 @@ def visualize_hierarchical_documents(
             else:
                 trace_name = (
                     f"{topic}_"
-                    + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].values[0]
+                    + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].to_numpy()[0]
                 )
             plot_text = "_".join([name[:20] for name in trace_name.split("_")[:3]])
             topic_names[topic] = {
diff --git a/bertopic/plotting/_hierarchy.py b/bertopic/plotting/_hierarchy.py
index 2449de50..177ae874 100644
--- a/bertopic/plotting/_hierarchy.py
+++ b/bertopic/plotting/_hierarchy.py
@@ -306,7 +306,7 @@ def _get_annotations(
         else:
             for key, value in parent_topic.items():
                 if set(value) == set(fst_topic):
-                    fst_name = df.loc[df.Parent_ID == key, "Parent_Name"].values[0]
+                    fst_name = df.loc[df.Parent_ID == key, "Parent_Name"].to_numpy()[0]
 
         if len(scnd_topic) == 1:
             if isinstance(custom_labels, str):
@@ -320,7 +320,7 @@
         else:
             for key, value in parent_topic.items():
                 if set(value) == set(scnd_topic):
-                    scnd_name = df.loc[df.Parent_ID == key, "Parent_Name"].values[0]
+                    scnd_name = df.loc[df.Parent_ID == key, "Parent_Name"].to_numpy()[0]
 
         text_annotations.append([fst_name, "", "", scnd_name])
diff --git a/bertopic/plotting/_topics_over_time.py b/bertopic/plotting/_topics_over_time.py
index fdde18be..9966e1c5 100644
--- a/bertopic/plotting/_topics_over_time.py
+++ b/bertopic/plotting/_topics_over_time.py
@@ -92,10 +92,10 @@ def visualize_topics_over_time(
     fig = go.Figure()
     for index, topic in enumerate(data.Topic.unique()):
         trace_data = data.loc[data.Topic == topic, :]
-        topic_name = trace_data.Name.values[0]
-        words = trace_data.Words.values
+        topic_name = trace_data.Name.to_numpy()[0]
+        words = trace_data.Words.to_numpy()
         if normalize_frequency:
-            y = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
+            y = normalize(trace_data.Frequency.to_numpy().reshape(1, -1))[0]
         else:
             y = trace_data.Frequency
         fig.add_trace(
diff --git a/bertopic/plotting/_topics_per_class.py b/bertopic/plotting/_topics_per_class.py
index 1da151d1..c7fafc3c 100644
--- a/bertopic/plotting/_topics_per_class.py
+++ b/bertopic/plotting/_topics_per_class.py
@@ -96,10 +96,10 @@ def visualize_topics_per_class(
         else:
            visible = "legendonly"
         trace_data = data.loc[data.Topic == topic, :]
-        topic_name = trace_data.Name.values[0]
-        words = trace_data.Words.values
+        topic_name = trace_data.Name.to_numpy()[0]
+        words = trace_data.Words.to_numpy()
         if normalize_frequency:
-            x = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
+            x = normalize(trace_data.Frequency.to_numpy().reshape(1, -1))[0]
         else:
             x = trace_data.Frequency
         fig.add_trace(
diff --git a/bertopic/representation/_visual.py b/bertopic/representation/_visual.py
index 07968596..8c98d5a6 100644
--- a/bertopic/representation/_visual.py
+++ b/bertopic/representation/_visual.py
@@ -92,7 +92,7 @@ def extract_topics(
             representative_images: Representative images per topic
         """
         # Extract image ids of most representative documents
-        images = documents["Image"].values.tolist()
+        images = documents["Image"].to_numpy().tolist()
         (_, _, _, repr_docs_ids) = topic_model._extract_representative_docs(
             c_tf_idf,
             documents,
@@ -156,10 +156,10 @@ def _convert_image_to_text(self, images: List[str], verbose: bool = False) -> Li
     def image_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame:
         """Convert images to text."""
         # Create image topic embeddings
-        topics = documents.Topic.values.tolist()
-        images = documents.Image.values.tolist()
+        topics = documents.Topic.to_numpy().tolist()
+        images = documents.Image.to_numpy().tolist()
         df = pd.DataFrame(np.hstack([np.array(topics).reshape(-1, 1), embeddings]))
-        image_topic_embeddings = df.groupby(0).mean().values
+        image_topic_embeddings = df.groupby(0).mean().to_numpy()
 
         # Extract image centroids
         image_centroids = {}
diff --git a/pyproject.toml b/pyproject.toml
index 77c7bc26..d3019893 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -110,6 +110,7 @@ select = [
     "E9",
     "F", # pyflakes
    "D", # pydocstyle
+    "PD", # pandas-vet
     "RUF", # ruff
 ]
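For reference, the pattern applied throughout this diff is the one checked by Ruff's pandas-vet ("PD") rules, which the pyproject.toml change enables: PD011 flags `.values` in favor of `.to_numpy()` (which always returns a plain NumPy array, whereas `.values` can return a pandas extension array), and PD015 flags the `pd.merge` function in favor of the `DataFrame.merge` method. The snippet below is only an illustrative sketch with made-up data, not part of the patch:

import pandas as pd

# Hypothetical example frames, used only to illustrate the lint rules above.
docs = pd.DataFrame({"Topic": [0, 0, 1], "Document": ["a", "b", "c"]})
info = pd.DataFrame({"Topic": [0, 1], "Name": ["0_topic", "1_topic"]})

# PD011: prefer `.to_numpy()` over `.values`
documents_list = docs.Document.to_numpy().tolist()  # was: docs.Document.values.tolist()

# PD015: prefer the `.merge` method over the `pd.merge` function
merged = docs.merge(info, on="Topic", how="left")  # was: pd.merge(docs, info, on="Topic", how="left")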