# data_fasttext.py
import codecs
# Select the dataset: uncomment the matching path / other_path pair.
# path = 'webkb/'
# other_path = 'WEBKB'
# path = 'amazon/'
# other_path = 'amazon'
# path = 'subject/'
# other_path = 'subject'
# path = 'imdb/'
# other_path = 'IMDB'
# path = '20newsgroup/'
# other_path = '20NG'
path = 'reuters/'
other_path = 'reuters'
# Global word counts plus per-split labels and documents
words_frequency = {}
clean_train_documents = []
y_train = []
y_test = []
clean_test_documents = []
## Open the file with read only permission
f = codecs.open(path+'data/my_'+other_path+'_train.txt', "r", encoding="utf-8")
train = [x.strip('\n') for x in f.readlines()]
f.close()
num_documents = len(train)
for i in range(0, num_documents):
    # Each line holds "label<TAB>document text"
    line = train[i].split('\t')
    # Keep only documents that contain more than one token
    if len(line[1].split(" ")) > 1:
        y_train.append(line[0])
        # Update the global word-frequency counts
        for w in line[1].split(' '):
            if w not in words_frequency:
                words_frequency[w] = 1
            else:
                words_frequency[w] = words_frequency[w] + 1
        clean_train_documents.append(line[1])
## Open the file with read only permission
f = codecs.open(path+'data/my_'+other_path+'_test.txt', "r", encoding="utf-8")
test = [x.strip('\n') for x in f.readlines()]
f.close()
num_test_documents = len(test)
for i in range(0, num_test_documents):
    # Each line holds "label<TAB>document text"
    line = test[i].split('\t')
    # Keep only documents that contain more than one token
    if len(line[1].split(" ")) > 1:
        y_test.append(line[0])
        clean_test_documents.append(line[1])
# Write the training split in fastText format: "__label__<label> <text>"
f = codecs.open(path+'data/my_'+other_path+'_train_FASTTEXT.txt', "w", encoding="utf-8")
for i, doc in enumerate(clean_train_documents):
    s = doc.split(" ")
    # Skip documents made up of a single distinct token
    if len(set(s)) > 1:
        f.write("__label__"+str(y_train[i])+" "+doc+"\n")
f.close()
# Write the test split in the same fastText format
f = codecs.open(path+'data/my_'+other_path+'_test_FASTTEXT.txt', "w", encoding="utf-8")
for i, doc in enumerate(clean_test_documents):
    s = doc.split(" ")
    # Skip documents made up of a single distinct token
    if len(set(s)) > 1:
        f.write("__label__"+str(y_test[i])+" "+doc+"\n")
f.close()
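
# --- Optional: train and evaluate a fastText classifier on the files above ---
# A minimal sketch, assuming the `fasttext` Python package (pip install fasttext)
# is available; the hyperparameters below are illustrative, not taken from this script.
#
# import fasttext
#
# train_file = path + 'data/my_' + other_path + '_train_FASTTEXT.txt'
# test_file = path + 'data/my_' + other_path + '_test_FASTTEXT.txt'
#
# model = fasttext.train_supervised(input=train_file, epoch=25, lr=0.5, wordNgrams=2)
# n_samples, precision_at_1, recall_at_1 = model.test(test_file)
# print("P@1:", precision_at_1, "R@1:", recall_at_1)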