Skip to content

Commit 1a0ef81

Browse files
author
Abhishek Dutta
committed
Added Back end source code (Part 2)
Added Preprocessing (Extraction, Cleaning and Condensing), Machine Learning and Sentiment Analysis Model
1 parent 757e9ea commit 1a0ef81

35 files changed

+300246
-0
lines changed

accuracy_data.csv

+3,279
Large diffs are not rendered by default.

accuracy_measure.ipynb

+214
Large diffs are not rendered by default.

actual_predict_tags.ipynb

+653
Large diffs are not rendered by default.

actual_sentiment_analysis.ipynb

+5,054
Large diffs are not rendered by default.

clean.py

+89
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# clean.py -- fetch YouTube video statistics for a list of video ids and
# write one row per video to videoStats.csv.
# Python 2 script: uses urllib2 and csv files opened in binary mode.
import json
import csv
import urllib2
import re
import string

# SECURITY NOTE(review): API keys are hard-coded and committed to source
# control -- every key below should be revoked and the active key loaded
# from an environment variable or untracked config file instead.
api_key = 'AIzaSyC4C3gzSSErzmc2FeUTleQqZGzw8-z-d6w'
# Spare keys previously rotated through (all must be revoked):
# AIzaSyCrFWiPfGcb5IsyS-wpAMk6eaNdMaC8pXs
# AIzaSyDlZR2UhwQXeGw2IhCRnpoZB8LHZkagwI4
# AIzaSyCXqjs2ZPb0PQReIWiENMAAkSx0_tvd4nk
# AIzaSyCsE91PTD-XjTU3O_IZpY0PvVom2tw4Dr8
# AIzaSyArrhkh49b2GNlC8UdLodq3uSpKzcgdzeg
# AIzaSyCPcAKC74SzgQB8MSXKcPO6zIoVfqwlOig
# AIzaSyDBkoHdD1Iw6HooMhMoObbHFCXHFSwKzIU
# AIzaSyC4C3gzSSErzmc2FeUTleQqZGzw8-z-d6w

url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,statistics,recordingDetails&id='

# Strip punctuation characters in a single compiled pass.
regex = re.compile('[%s]' % re.escape(string.punctuation))
# Strip URLs.  Compiled once here; the original inlined the pattern at four
# call sites and wrote 'www.\S+' with an unescaped dot, which also matched
# strings like 'wwwX...'.
url_re = re.compile(r'http\S+|www\.\S+')

# tempList.txt: one video per line, comma separated: id,title,description
with open('tempList.txt', 'rb') as f:
    l = [line.split(',') for line in f]

with open('videoStats.csv', 'wb') as c:
    writer = csv.writer(c)
    writer.writerow(['Id', 'Title', 'Description', 'LikeCount', 'DislikeCount', 'Location (latitude, longitude)', 'Tags (; delimited string)'])

    for vid in l:
        try:
            stats = json.load(urllib2.urlopen(url + vid[0] + '&key=' + api_key))
            print(vid[0])
        except (urllib2.URLError, ValueError):
            # Quota exhausted or malformed response.  Record where we
            # stopped and abort: the original fell through here and reused
            # the PREVIOUS iteration's 'stats' (NameError on the first one),
            # and its l.index(vid[0]) always raised ValueError because l
            # holds lists, not id strings.
            print('API key ran out')
            print(l.index(vid))
            break

        if stats['items'] == []:
            # Video deleted/private: emit a placeholder row with zero counts.
            writer.writerow([vid[0], vid[1].encode('utf8'), vid[2].encode('utf8'), 0, 0, '', ''])
            continue

        s = stats['items'][0]
        LC = 0
        DC = 0
        loc = ''
        tags = ''

        # Counts are optional in the API response; default to 0 when absent.
        if 'likeCount' in s['statistics']:
            LC = s['statistics']['likeCount']
        if 'dislikeCount' in s['statistics']:
            DC = s['statistics']['dislikeCount']
        # recordingDetails / location may be missing entirely.
        if 'latitude' in s.get('recordingDetails', {}).get('location', {}):
            loc = str(s['recordingDetails']['location']['latitude']) + ';' + str(s['recordingDetails']['location']['longitude'])
        if 'tags' in s['snippet']:
            t = s['snippet']['tags']
            # Clean each tag: drop URLs first, then punctuation.
            for i in range(len(t)):
                t[i] = url_re.sub('', t[i])
                t[i] = regex.sub('', t[i])
            tags = ';'.join(t)

        # Same cleaning for title and description from tempList.txt.
        title = regex.sub('', url_re.sub('', vid[1]))
        descr = regex.sub('', url_re.sub('', vid[2]))

        # encode/decode chain forces the text down to plain ASCII
        # (unicode escapes resolved, non-ASCII dropped) for the CSV.
        writer.writerow([vid[0],
                         title.encode('utf8').decode('unicode_escape').encode('ascii', 'ignore'),
                         descr.encode('utf8').decode('unicode_escape').encode('ascii', 'ignore'),
                         LC, DC, loc,
                         tags.encode('utf8').decode('unicode_escape').encode('ascii', 'ignore')])

commentsFile.csv

+51
Large diffs are not rendered by default.

condense.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# condense.py -- post-process videoStats.csv into condensedStats.csv:
# strips embedded newlines from title/description and merges synonymous
# tags using the Synonyms table loaded from syns.txt.
# Python 2 script: csv files opened in binary mode.
import csv
from syn import Synonyms
import re

# Leftover 'href...' fragments that survived URL stripping upstream.
href_re = re.compile(r'href\S+')

with open('videoStats.csv', 'rb') as c:
    reader = csv.reader(c)
    next(reader, None)  # skip the header row

    with open('condensedStats.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(['Id', 'Title', 'Description', 'LikeCount', 'DislikeCount', 'Location (latitude, longitude)', 'Condensed Tags (; delimited string)'])

        # Close syns.txt after loading (the original leaked the handle).
        with open('syns.txt') as syn_file:
            s = Synonyms.load(syn_file)

        for row in reader:
            # Title and description must be single-line in the output.
            row[1] = row[1].replace('\n', '')
            row[2] = row[2].replace('\n', '')

            if row[6] != '':
                tags = row[6].split(';')

                for i in range(len(tags)):
                    # Strip both CR and LF individually: the original's
                    # replace('\r\n','') + replace('\n','') pair left a
                    # bare '\r' behind when it appeared on its own.
                    tags[i] = tags[i].replace('\r', '').replace('\n', '')
                    tags[i] = href_re.sub('', tags[i])
                    # Collapse synonymous tags onto a canonical form.
                    merge = s.match(tags[i], ignoreCase=True)
                    if merge:
                        tags[i] = merge

                # Deduplicate while preserving first-seen order; the
                # original used set(), which scrambled the tag order
                # nondeterministically between runs.  (Debug prints of the
                # intermediate value were also removed here.)
                unique = []
                for tag in tags:
                    if tag not in unique:
                        unique.append(tag)

                # Normalize whitespace around the ';' delimiters.
                row[6] = re.sub(r'\s*;\s*', ';', ';'.join(unique))

            writer.writerow(row)

condensedStats.csv

+101,405
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)