undertheseanlp
diff --git a/‎README.md
+6-3 b/‎README.md
+6-3
diff --git a/‎__init__.py b/‎__init__.py
diff --git a/‎cnn.py
+5-23 b/‎cnn.py
+5-23
diff --git a/‎model.h5
8.08 MB b/‎model.h5
8.08 MB
diff --git a/‎predict.py
+6-4 b/‎predict.py
+6-4
diff --git a/‎preprocess.py ‎preprocessing.py
+14-5 b/‎preprocess.py ‎preprocessing.py
+14-5
@@ -47,7 +47,7 @@ Mô hình được train bởi hàm tối ưu Adam với learning rate = 0.0001,
 
 *Dự đoán*
 
-Mỗi một sample được chia thành mỗi 250 frames, sử dụng phương pháp trích rút đặc trưng như mô tả ở trên, rồi đưa vào mạng CNN. Nhãn của file âm thanh được chọn bởi phương pháp Majority voting.
+Mỗi sample được chia thành các đoạn 250 frames; mỗi đoạn được trích rút đặc trưng như mô tả ở trên rồi đưa vào mạng CNN. Nhãn của file âm thanh được chọn bằng chiến thuật majority voting.
 
 
 # Cách sử dụng
@@ -62,10 +62,12 @@ pip install requirements.txt
 
 ## Huấn luyện mô hình 
 
-Để huấn luyện mô hình, chạy script `make_sample.py` và `train.py` 
+Để huấn luyện mô hình, chạy script `preprocessing.py` và `train.py` 
+
+Chú ý: Dữ liệu huấn luyện (folder `train`) cần được đặt vào thư mục `data`.
 
 ```
-python make_sample.py
+python preprocessing.py train
 python train.py
 ```
 
@@ -74,6 +76,7 @@ python train.py
 Để dự đoán, chạy script `predict.py` 
 
 ```
+python preprocessing.py test
 python predict.py 
 ``` 
 
@@ -34,8 +34,8 @@ def reverse_transform(self, y):
         return
 
 
-train_data = joblib.load("tmp/zalo_data/train_full.data.bin")
-test_data = joblib.load("tmp/zalo_data/test.data.bin")
+train_data = joblib.load("tmp/train_full.data.bin")
+test_data = joblib.load("tmp/test.data.bin")
 
 labels = []
 is_first = True
@@ -55,7 +55,7 @@ def reverse_transform(self, y):
 input_shape = X.shape[1:]
 num_classes = 6
 batch_size = 32
-epochs = 30
+epochs = 10
 
 model = Sequential()
 model.add(Conv2D(64, kernel_size=(7, 7), strides=(1, 1), activation='relu', input_shape=input_shape, padding='same'))
@@ -81,26 +81,8 @@ def reverse_transform(self, y):
                     validation_data=(X_test, y_test),
                     callbacks=[early_stopping])
 
-# Predictions
-import os
-prediction_filename = "submission.csv"
-try:
-    os.remove(prediction_filename)
-except Exception:
-    pass
-map_values = [(0, 1), (0, 0), (0, 2), (1, 1), (1, 0), (1, 2)]
-prediction_file = open(prediction_filename, "a")
-prediction_file.write("id,gender,accent\n")
-count_error_file = 0
-for label, X in test_data:
-    try:
-        value = np.bincount(np.argmax(model.predict(X), axis=1)).argmax()
-    except:
-        print(f"Cannot detect file {label}")
-        value = 0
-        count_error_file += 1
-    gender, accent = map_values[value]
-    prediction_file.write(f"{label},{gender},{accent}\n")
+model.save('model.h5')
+
 
 # evaluation("submission.csv", "data/public_test_gt.csv")
 
 
@@ -1,16 +1,17 @@
 import os
 import numpy as np
-
 import joblib
+from keras.models import load_model
+
 
 prediction_filename = "submission.csv"
 try:
     os.remove(prediction_filename)
 except Exception:
     pass
 
-model = None
-test_data = joblib.load("tmp/zalo_data/test.data.bin")
+model = load_model("model.h5")
+test_data = joblib.load("tmp/test.data.bin")
 
 map_values = [(0, 1), (0, 0), (0, 2), (1, 1), (1, 0), (1, 2)]
 prediction_file = open(prediction_filename, "a")
@@ -24,4 +25,5 @@
         value = 0
         count_error_file += 1
     gender, accent = map_values[value]
-    prediction_file.write(f"{label},{gender},{accent}\n")
+    prediction_file.write(f"{label},{gender},{accent}\n")
+print(f"Results is saved in file {prediction_filename}")
@@ -1,3 +1,4 @@
+import argparse
 from multiprocessing.pool import Pool
 from os import listdir
 import numpy as np
@@ -57,12 +58,13 @@ def make_train_data():
     n = len(files)
     features = list(tqdm.tqdm(p.imap(extract_features, files), total=n))
 
-    joblib.dump(features, "tmp/zalo_data/train_full.data.bin")
+    joblib.dump(features, "tmp/train_full.data.bin")
     print(len(features))
 
 
 def make_test_data():
-    TEST_FOLDER = "data/public_test"
+    # TEST_FOLDER = "data/public_test"
+    TEST_FOLDER = "/data"
     tmp = listdir(TEST_FOLDER)
     files = []
     for label in tmp:
@@ -72,9 +74,16 @@ def make_test_data():
     n = len(files)
     features = list(tqdm.tqdm(p.imap(extract_features, files), total=n))
 
-    joblib.dump(features, "tmp/zalo_data/test.data.bin")
+    joblib.dump(features, "tmp/test.data.bin")
     print(len(features))
 
 
-make_train_data()
-make_test_data()
+parser = argparse.ArgumentParser("preprocessing.py")
+parser.add_argument("option", nargs="+", choices=["train", "test"], help="train or test")
+
+# nargs="+" makes args.option a list, so one run may build either or both datasets.
+args = parser.parse_args()
+if "train" in args.option:
+    make_train_data()
+if "test" in args.option:
+    make_test_data()