-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstats_script.py
92 lines (67 loc) · 2.4 KB
/
stats_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import re
import statistics
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from nltk import word_tokenize
from collections import Counter
from learning_dataset_generator import LEARNING_DATASET_TRAIN_PATH, LEARNING_DATASET_TEST_PATH, \
LEARNING_SMALL_DATASET_TRAIN_PATH, LEARNING_TINY_DATASET_TRAIN_PATH, LEARNING_TINY_DATASET_TEST_PATH, \
LEARNING_SMALL_DATASET_TEST_PATH
# Load the full lyrics dataset; every statistic below is computed from `df`.
# Only a missing file is handled here, so any unrelated failure (for example
# an undefined name or a malformed CSV) surfaces with its real traceback
# instead of being reported as a missing dataset.
# NOTE(review): FULL_DATASET_PATH is not among the names imported at the top
# of this file as shown -- confirm where it is supposed to come from.
try:
    df = pd.read_csv(FULL_DATASET_PATH)
except FileNotFoundError:
    print('Dataset.csv not found')
    # Non-zero status so a calling shell/pipeline can tell the run failed.
    sys.exit(1)
def word_counter(song):
    """Return the number of whitespace-separated words in ``song``.

    Args:
        song: Lyrics of one song as a single string (may contain newlines).

    Returns:
        The count of non-empty tokens obtained by splitting on whitespace.
        An empty or whitespace-only string yields 0.
    """
    # str.split() with no separator splits on runs of whitespace (newlines
    # included) and never yields empty strings, so no regex or filtering of
    # empty pieces is needed.
    return len(song.split())
# --- Per-song size metrics, stored as new columns on df ---
# number of characters per song
df['char_count'] = df['lyrics'].apply(len)
# number of words per song
df['word_count'] = df['lyrics'].apply(word_counter)
# number of lines per song (newline count + 1)
df['line_count'] = df.lyrics.str.count('\n') + 1

# --- Dataset-wide averages of the three metrics ---
# avg number of characters
avg_chars = df['char_count'].mean()
# avg number of words
avg_words = df['word_count'].mean()
# avg number of lines
avg_lines = df['line_count'].mean()

# --- Per-genre summary table ---
# Number of songs per genre. The counts Series is named explicitly before it
# becomes a frame: the column name produced by value_counts() differs between
# pandas versions ('genre' in 1.x, 'count' in 2.x), so renaming a column
# called 'genre' afterwards silently does nothing on newer pandas.
genres = df['genre'].value_counts().rename('num_songs').to_frame()

# Per-genre means of the per-song metrics, indexed by genre so they can be
# joined onto the song counts.
tmp = df[['genre', 'char_count', 'word_count',
          'line_count']].groupby('genre').mean()
tmp = tmp.rename(columns={'char_count': 'char_avg',
                          'word_count': 'word_avg', 'line_count': 'line_avg'})
genres = genres.join(tmp)

print(genres)
print('Dataset shape {}'.format(df.shape))

# Placeholders: word_series is not read anywhere in the visible code, and
# word_array is rebound before its first use further down. The explicit dtype
# avoids the default-dtype warning older pandas emits for an empty Series.
word_series = pd.Series(dtype=object)
word_array = []
def splitter(song):
    """Return the number of tokens on each line of ``song``.

    The lyrics are split on ``'\\n'`` and every resulting line (including
    empty ones) is tokenized with ``word_tokenize``; the token counts are
    returned as a NumPy array with one entry per line.
    """
    tokens_per_line = [len(word_tokenize(row)) for row in song.split('\n')]
    return np.array(tokens_per_line)
# --- Distribution of words per line across the whole dataset ---
# One array of per-line token counts for every song.
word_array_series = df['lyrics'].apply(splitter)

# Flatten all per-song arrays with a single concatenate call. No dummy seed
# element is used, so the distribution contains only real lines (a seed of
# [0] would add one phantom zero-word line), and the data is copied once
# instead of once per song.
per_song_counts = word_array_series.tolist()
if per_song_counts:
    word_array = np.concatenate(per_song_counts)
else:
    # np.concatenate rejects an empty sequence; keep an empty int array.
    word_array = np.array([], dtype=int)

# From here on word_array_series holds the flat per-line counts.
word_array_series = pd.Series(word_array)
# index = words on a line, value = how many lines have that many words
word_count_series = word_array_series.value_counts()
print('the most common number of words is: {}'.format(
    word_count_series[word_count_series == word_count_series.max()]))
# Sort by number of words so the bars appear in ascending x order.
word_count_series = word_count_series.sort_index()
word_count_series.plot.bar()
plt.show()

# --- Distribution of lines per song ---
# index = lines in a song, value = how many songs have that many lines
line_count_series = df['line_count'].value_counts()
print('the most common number of lines is: {}'.format(
    line_count_series[line_count_series == line_count_series.max()]))
line_count_series = line_count_series.sort_index()
line_count_series.plot.bar()
plt.show()