
Commit 9f5d175

committed
the project is finished and I added a complete README.md
1 parent a1b5a17 commit 9f5d175

File tree

5 files changed: +89 −8 lines changed

Text_Predication/README.md

+69
@@ -0,0 +1,69 @@
# Text Prediction Using an N-gram Language Model

<p align="center">
  <img src="assets/predict.jpg" width=40% height=40%>
</p>

## ⚙️ Languages or Frameworks Used

This Python project uses the Natural Language Toolkit (NLTK) library to implement an N-gram language model. The code relies on the following packages (a short example of how they fit together appears after the list):
### Packages

1. **Pandas:** A data manipulation library used for handling and analyzing tabular data.

2. **NLTK (Natural Language Toolkit):**
   - `bigrams`: Function for extracting bigrams from a sequence of words.
   - `lm.preprocessing.pad_both_ends`: Preprocessing function for padding both ends of a sequence.
   - `tokenize.WordPunctTokenizer`: Tokenizer for breaking text into words using punctuation and whitespace.
   - `lm.Vocabulary`: Class for constructing a vocabulary from a given text corpus.
   - `lm.Laplace`: Class implementing Laplace smoothing for language modeling.
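To make the roles of these pieces concrete, here is a minimal sketch (an illustrative snippet with a made-up sentence, not code from this repository) showing what the tokenizer, `pad_both_ends`, and `bigrams` produce:

```python
from nltk import bigrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.tokenize import WordPunctTokenizer

# Tokenize on punctuation and whitespace, pad the sentence, then take bigrams.
tokens = WordPunctTokenizer().tokenize("How do I read a CSV file?")
padded = list(pad_both_ends(tokens, n=2))   # ['<s>', 'How', 'do', ..., '?', '</s>']
pairs = list(bigrams(padded))               # [('<s>', 'How'), ('How', 'do'), ...]
print(pairs)
```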
## 🛠️ Description

### N-gram Language Model Project

This Python project implements a text prediction system using a Laplace-smoothed bigram model. The goal is to predict the next word in a sentence based on a prefix provided by the user. The project uses the Natural Language Toolkit (NLTK) library for processing and modeling natural language data.

### How It Works
1. **Data Preprocessing:**
   - The project starts by reading a CSV file (`train.csv`) containing text data.
   - HTML tags are removed from the 'Body' column of the dataset using a function called `remove_html_tags`.
   - The text is tokenized using the `WordPunctTokenizer` from NLTK.

2. **N-gram Model Building:**
   - Each tokenized sentence is padded at both ends with the special symbols `<s>` and `</s>` using the `pad_both_ends` function.
   - Bigrams are extracted from the padded sentences using the `bigrams` function.
   - The vocabulary is constructed using the `Vocabulary` class from NLTK.

3. **Laplace Smoothing Model:**
   - The Laplace smoothing model is implemented using the `Laplace` class from NLTK's language modeling module.
   - The model is trained on the bigram data.

4. **Next-word Prediction:**
   - The user is prompted for a prefix to predict the next word from.
   - The Laplace model scores each word in the vocabulary by how likely it is to follow the given prefix.
   - The top three predictions are displayed with their scores (see the sketch after this list).
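The four steps above can be sketched end to end as follows. This is a minimal, self-contained example: it uses a tiny in-memory corpus instead of `train.csv`, and names such as `toy_corpus` and `prefix` are illustrative rather than part of the project code.

```python
from nltk import bigrams
from nltk.lm import Laplace, Vocabulary
from nltk.lm.preprocessing import pad_both_ends
from nltk.tokenize import WordPunctTokenizer

# A tiny stand-in for the 'Body' column of train.csv.
toy_corpus = [
    "how do i read a csv file in pandas",
    "how do i parse json in python",
    "how to merge two dataframes in pandas",
]

# 1. Preprocessing: tokenize each sentence.
tokenizer = WordPunctTokenizer()
tokenized = [tokenizer.tokenize(text) for text in toy_corpus]

# 2. N-gram building: pad with <s>/</s>, extract bigrams, build the vocabulary.
padded = [list(pad_both_ends(sent, n=2)) for sent in tokenized]
train_bigrams = [list(bigrams(sent)) for sent in padded]
voc = Vocabulary([word for sent in padded for word in sent], unk_cutoff=1)

# 3. Laplace smoothing: train a bigram model; score(w, [u]) = (count(u, w) + 1) / (count(u) + V).
model = Laplace(2, vocabulary=voc)
model.fit(train_bigrams)

# 4. Next-word prediction: score every vocabulary word after the prefix, keep the top three.
prefix = "in"
scores = {w: model.score(w, [prefix]) for w in voc if w not in ("<s>", "<UNK>")}
for word, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:3]:
    print(f"{word}: {score:.4f}")
```

Because Laplace smoothing adds one to every bigram count, even words never seen after the prefix still receive a small non-zero probability, so the model can always rank candidates.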
## 🌟 How to Run

1. **Install** the required dependencies:

```bash
pip install -r requirements.txt
```

2. **Run** the code:

```bash
python text_prediction.py
```

## 📺 Demo

![](https://media.giphy.com/media/v1.Y2lkPTc5MGI3NjExN3BndnM1M2tnaWhlbjkxczJmcndzenh2bnlhaWFkZWR2YWhqNDg0ZSZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw/NIQIoC9vc7xBEPOCPY/giphy.gif)

## 🤖 Author

Links: [louisbau](https://github.com/louisbau)

Text_Predication/assets/demo.PNG

5.13 KB

Text_Predication/assets/predict.jpg

170 KB
File renamed without changes.

Text_Predication/text_prediction.py

+20 −8
```diff
@@ -1,29 +1,28 @@
-import math
-
 import pandas as pd
 from nltk import bigrams
 from nltk.lm.preprocessing import pad_both_ends
-
 from nltk.tokenize import WordPunctTokenizer
-from nltk.probability import FreqDist
 from nltk.lm import Vocabulary
 from nltk.lm import Laplace


 def remove_html_tags(text):
+    # Function to remove HTML tags from text
     import re
     clean = re.compile('<.*?>')
     return re.sub(clean, '', text)


 def remap_corpus(path):
+    # Read CSV file, preprocess the 'Body' column, and tokenize the text
     df_corpus = pd.read_csv(path)
     df_corpus['Body'] = df_corpus['Body'].apply(lambda x: remove_html_tags(x))
     df_corpus['Body_tokenized'] = df_corpus['Body'].apply(lambda x: WordPunctTokenizer().tokenize(x))
     return df_corpus


 def padding_corpus(corpus):
+    # Pad both ends of each sentence in the corpus
     corpus_padding = []
     for sentence in corpus:
         corpus_padding.append(
@@ -33,13 +32,15 @@ def padding_corpus(corpus):


 def remap_bigram(corpus):
+    # Extract bigrams from each sentence in the corpus
     corpus_bigram = []
     for sentence in corpus:
         corpus_bigram.append(list(bigrams(sentence)))
     return corpus_bigram


 def vocab(corpus):
+    # Create a vocabulary list from the corpus
     voc_list = []
     for sentence in corpus:
         for word in sentence:
@@ -48,6 +49,7 @@ def vocab(corpus):


 def prediction(train, prefix):
+    # Perform next-word prediction using Laplace smoothing
     train = padding_corpus(train)
     voc = vocab(train)
     LaplaceModel = Laplace(2, vocabulary=voc)
@@ -62,10 +64,20 @@ def prediction(train, prefix):


 if __name__ == '__main__':
-    path_train = "./train.csv"
+    # Main execution
+    print("Text Prediction using Laplace Smoothing")
+    print("--------------------------------------")
+    print("Reading training data...")
+
+    path_train = "corpus/train.csv"
     corpus_train = remap_corpus(path_train)['Body_tokenized']
+    print("Training data is ready!")
+    print("--------------------------------------")
     user_input = input("Enter a prefix for next-word prediction: ")
-    prediction = prediction(corpus_train, user_input)
-    for i in prediction:
-        print(f"Next word predictionc can be : {i}")
+    print("--------------------------------------")
+    print("Performing next-word prediction...")

+    prediction_result = prediction(corpus_train, user_input)
+    print("------- Result of Prediction ---------")
+    for i in prediction_result:
+        print(f"Next word prediction can be : {i}")
```
