-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtwitter_api.py
More file actions
127 lines (89 loc) · 3.03 KB
/
twitter_api.py
File metadata and controls
127 lines (89 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# --- Imports and Twitter API authentication --------------------------------
from cgi import test  # NOTE(review): appears unused anywhere below, and the cgi module is deprecated (removed in Python 3.13) -- confirm and drop.
import tweepy
import configparser
import pandas as pd
import numpy as np
from emot.emo_unicode import UNICODE_EMOJI  # emoji -> ":description:" mapping used by convert_emojis()
import re
from nltk.tokenize import wordpunct_tokenize
#read configs
# Credentials live in config.ini under a [twitter] section; keep that file
# out of version control.
config = configparser.ConfigParser()
config.read('config.ini')
api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']
access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']
#authentication
# OAuth 1.0a user-context authentication: app keys first, then the
# user access token pair.
auth = tweepy.OAuth1UserHandler(api_key,api_key_secret)
auth.set_access_token(access_token,access_token_secret)
api = tweepy.API(auth)
#q = success also gives us the tweets to be searched
# Fetch up to n English tweets matching the query; "-filter:links" and
# "-filter:retweets" exclude link-bearing tweets and retweets, and
# tweet_mode="extended" returns the untruncated text in "full_text".
# NOTE(review): n=1,600,000 far exceeds standard API rate limits -- the
# cursor will stop (or stall on rate limiting) well before that; confirm
# the intended sample size.
n=1600000
tweets=tweepy.Cursor(api.search_tweets,q="peace -filter:links -filter:retweets",lang="en",tweet_mode="extended").items(n)
# Materialize the raw JSON payload of every tweet so it can be indexed
# repeatedly below.
tweets_json = [tweet._json for tweet in tweets]
x = len(tweets_json)
# To save the data as collected from twitter.
# BUG FIX: the original opened "basic_data.json" in "w" mode inside the
# loop, truncating the file on every iteration so only the LAST tweet was
# ever saved (and concatenated dumps would not be valid JSON anyway).
# Dump the whole list once instead.
import json
with open("basic_data.json", "w") as outfile:
    json.dump(tweets_json, outfile)
# Empty frame with the Sentiment140-style column layout, filled in below.
df = pd.DataFrame(columns = ['target','ids','date','flag','user','text'])
def convert_emojis(text, emoji_map=None):
    """Replace each emoji in *text* with its space-padded textual description.

    Commas and colons are stripped from the description and every underscore
    in the resulting text is turned into a space (e.g. ":red_heart:" becomes
    " red heart "), so descriptions tokenize as ordinary words downstream.

    Parameters
    ----------
    text : str
        Input string, possibly containing emoji characters.
    emoji_map : dict, optional
        Mapping of emoji -> ":description:" strings. Defaults to
        emot's UNICODE_EMOJI table (backward compatible).

    Returns
    -------
    str
        The text with emojis replaced and underscores converted to spaces.
    """
    if emoji_map is None:
        emoji_map = UNICODE_EMOJI
    for emoji in emoji_map:
        # Pad with spaces so the description does not fuse with neighbours.
        desc = emoji_map[emoji].replace(",", "").replace(":", "")
        text = text.replace(emoji, " " + desc + " ")
    # PERF FIX: the original ran this full-text replace once per emoji in
    # the map (thousands of redundant passes). A single pass at the end
    # produces the identical final string.
    return text.replace("_", " ")
# Column-wise extraction from the raw tweet payloads.
# NOTE(review): "user_id" actually holds the TWEET id ("id" field of the
# status, not of the user) -- name kept for compatibility; confirm intent.
cleaned_texts = [convert_emojis(t["full_text"]) for t in tweets_json]
tweet_ids = [t["id"] for t in tweets_json]
screen_names = [t["user"]["name"] for t in tweets_json]
# Rebuild the timestamp as "<weekday month day time> PDT <year>" by
# splicing out the "+0000" offset from Twitter's created_at string.
created = [t["created_at"][:-10] + " PDT " + t["created_at"][-5:] for t in tweets_json]
artext = pd.Series(cleaned_texts)
timing = pd.Series(created)
user_name = pd.Series(screen_names)
user_id = pd.Series(tweet_ids)
df["text"] = artext
df["ids"] = user_id
df["date"] = timing
df["user"] = user_name
# Constant placeholder column (Sentiment140 convention for "no query").
df["flag"] = "NO_QUERY"
# Work on a copy so the raw DataFrame is preserved.
test1 = df.copy()
test1 = test1.apply(lambda col: col.astype(str).str.lower())
#replace all the non-alphabetic elements with 1 and keep
# letters, dot and exclamation mark, because a lot of words are attached
# to ! or . as a form of expression.
# BUG FIX: the original class "[^A-za-z.!+ ]" used the range A-z, which
# also spans the ASCII punctuation [ \ ] ^ _ ` sitting between 'Z' and
# 'a', so those characters were wrongly kept. "A-Za-z" matches letters
# only; raw string avoids escape surprises.
test1["text"] = test1["text"].apply(lambda s: re.sub(r"[^A-Za-z.!+ ]", '1', s))
# Keep only purely alphabetic tokens, rejoined with single spaces; this
# drops the '1' placeholders and any token they touched.
test1["text"] = test1["text"].apply(
    lambda s: " ".join(w for w in wordpunct_tokenize(s.strip()) if w.isalpha())
)
x = len(test1["text"])
#TextBlob api
# Label each cleaned tweet with the Sentiment140 target encoding:
# 0 = negative, 2 = neutral (or empty text), 4 = positive, based on the
# mean sentence polarity reported by TextBlob.
from textblob import TextBlob
labels = []
for i in range(x):
    blob = TextBlob(test1["text"][i])
    sentences = blob.sentences
    if not sentences:
        # Nothing to score (text was emptied by cleaning) -> neutral.
        labels.append(2)
        continue
    mean_polarity = sum(s.sentiment.polarity for s in sentences) / len(sentences)
    if mean_polarity > 0:
        labels.append(4)
    elif mean_polarity < 0:
        labels.append(0)
    else:
        labels.append(2)
sent = pd.Series(labels, dtype="string")
test1["target"] = sent
print(test1)
test1.to_csv('testntrain.csv')