-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstockdata_etl_halfhour.py
62 lines (36 loc) · 2.1 KB
/
stockdata_etl_halfhour.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 22 21:28:04 2020
@author: michael
"""
import requests
import pandas as pd
import arrow
import datetime
import sqlalchemy
import yfinance as yf
# Get the data for the stock Apple by specifying the stock ticker, start date, and end date
engine = sqlalchemy.create_engine('', pool_recycle=3600)
def offset(x): return max(x-4,0)
commentdata= pd.read_sql("select * from reddit",engine)
commentdata['created'] = pd.to_datetime(commentdata['created'],unit = 's')
commentdata['comment_Date'] = commentdata['created'].dt.weekday
#Turn time to EDT, created is 8 hours from UTC and UTC is 4 hours from EDT
commentdata['Datetime'] = commentdata['created'] - pd.DateOffset(hours=8)
commentdata['Datetime'] = commentdata['Datetime'].dt.round('30min') + pd.DateOffset(hours=1)
commentdata['Word_Count'] = commentdata.comment.str.len()
commentdata.to_sql("comment_data",engine,if_exists = 'replace',index = False)
for symbol in ['^GSPC','^VIX','AAPL','DIS','TLSA','NFLX','BA','WMT','AMZN','NVDA']:
df = pd.DataFrame(yf.download(symbol, interval = "30m",period = "60d")).reset_index()
df['Datetime'] = pd.to_datetime(df['Datetime'],utc = True).dt.tz_localize(None)
#Change from EDT to UTC, 5 hours for daylight sayings time
df['Datetime'] = df['Datetime'].dt.round('30min')
datamerge = df.merge(commentdata,on = ['Datetime'],how='inner')
df['Datetime'] = df['Datetime'] - pd.DateOffset(minutes=60)
df.columns = ['Datetime', 'OPEN_current', 'HIGH_current', 'LOW_current', 'CLOSE_current','Adj_Close_current', 'VOLUME_current']
datamerge = datamerge.merge(df,on = ['Datetime'],how='inner')
df['Datetime'] = df['Datetime'] - pd.DateOffset(minutes=30)
df.columns = ['Datetime', 'OPEN_previous', 'HIGH_previous', 'LOW_previous', 'CLOSE_previous','Adj_Close_previous', 'VOLUME_previous']
datamerge = datamerge.merge(df,on = ['Datetime'],how='inner')
datamerge.to_sql(symbol.lower().replace("^","")+"_by_hour_etl_lag",engine,if_exists = 'replace',index = False)
print(symbol)