#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, date

#################################################### Data #####################################################
# Load the EWA and EWC price histories from csv files and keep only the Price column
hist_file = os.path.join('hist/', '%s.csv' % 'EWA US Equity')
ewa_price = pd.read_csv(hist_file, header=0, parse_dates=True, sep=',', index_col=0)
ewa_price = ewa_price['Price']
ewa_price.name = 'EWA US Equity'

hist_file = os.path.join('hist/', '%s.csv' % 'EWC US Equity')
ewc_price = pd.read_csv(hist_file, header=0, parse_dates=True, sep=',', index_col=0)
ewc_price = ewc_price['Price']
ewc_price.name = 'EWC US Equity'

# Align the two series on their common dates and drop rows with missing prices
data = pd.concat([ewa_price, ewc_price], axis=1)
# print(data[data.isnull().any(axis=1)])
data.dropna(axis=0, how='any', inplace=True)

from sklearn.linear_model import LinearRegression
# The next two lines do the regression of EWC on EWA; the slope is the hedge ratio
lm_model = LinearRegression(copy_X=True, fit_intercept=True)  # the 'normalize' argument was removed in newer scikit-learn
lm_model.fit(data['EWA US Equity'].values.reshape(-1, 1), data['EWC US Equity'].values)  # fit() expects a 2D array
print('parameters: %.7f, %.7f' % (lm_model.intercept_, lm_model.coef_[0]))
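
# A hedged cross-check, not part of the original script: the same regression can be run
# with statsmodels OLS, which also reports standard errors and t-statistics. This assumes
# the statsmodels package (imported further down for the ADF test) is available here too.
import statsmodels.api as sm
ols_result = sm.OLS(data['EWC US Equity'], sm.add_constant(data['EWA US Equity'])).fit()
print(ols_result.params)  # 'const' is the intercept, 'EWA US Equity' is the hedge ratio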

# Present the graphs: prices on the left, regression residual (the spread) on the right
fig, ax = plt.subplots(nrows=1, ncols=2)
ax[0].set_title('EWA vs EWC')
ax[0].plot(data)
yfit = lm_model.coef_[0] * data['EWA US Equity'] + lm_model.intercept_
y_residual = data['EWC US Equity'] - yfit  # residual = EWC - hedge_ratio * EWA - intercept
ax[1].set_title('Regression Residual')
ax[1].plot(y_residual)
plt.show()

from scipy.stats import pearsonr  # scipy.stats.stats is deprecated; import from scipy.stats
print('Pearson correlation coefficient: %.7f' % (pearsonr(data['EWA US Equity'], data['EWC US Equity'])[0]))

####################################### CADF #####################################################
import statsmodels.tsa.stattools as ts
print(ts.adfuller(y_residual, 1))  # maxlag = 1
# (-3.667485117146333,             # ADF test statistic
#  0.0045944586170011716,          # p-value
#  1,                              # number of lags used
#  4560,                           # number of observations
#  {'1%': -3.431784865122899,      # critical values
#   '5%': -2.8621740417619224,
#   '10%': -2.5671075035106954},
#  625.5003218990623)              # best information criterion (AIC)
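
# A hedged interpretation sketch, not in the original: adfuller() returns
# (statistic, p-value, lags used, number of observations, critical values, best IC).
# If the statistic is more negative than the 5% critical value, the unit-root null is
# rejected at the 5% level, i.e. the residual looks stationary and EWA/EWC appear cointegrated.
adf_stat, p_value, used_lag, n_obs, crit_values, best_ic = ts.adfuller(y_residual, 1)
print('ADF statistic %.6f vs 5%% critical value %.6f' % (adf_stat, crit_values['5%']))
if adf_stat < crit_values['5%']:
    print('Residual is stationary at the 5% level')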

# Repeat the CADF test with the regression in the other direction: EWA on EWC
lm_model = LinearRegression(copy_X=True, fit_intercept=True)
lm_model.fit(data['EWC US Equity'].values.reshape(-1, 1), data['EWA US Equity'].values)  # fit() expects a 2D array
print('parameters: %.7f, %.7f' % (lm_model.intercept_, lm_model.coef_[0]))
yfit = lm_model.coef_[0] * data['EWC US Equity'] + lm_model.intercept_
y_residual = data['EWA US Equity'] - yfit
print(ts.adfuller(y_residual, 1))  # maxlag = 1
# statistic = -3.797221868633519; this direction gives the more negative (stronger) statistic

####################################### Johansen #####################################################
from statsmodels.tsa.vector_ar.vecm import coint_johansen

jh_results = coint_johansen(data, 0, 1)  # det_order = 0 (constant term), k_ar_diff = 1 (lagged differences)
print(jh_results.lr1)   # dim = (n,)   trace statistics
print(jh_results.cvt)   # dim = (n, 3) critical value table (90%, 95%, 99%)
print(jh_results.evec)  # dim = (n, n) column-wise eigenvectors
v1 = jh_results.evec[:, 0]  # eigenvector for the largest eigenvalue -> cointegrating weights
v2 = jh_results.evec[:, 1]

# [21.44412674  3.64194243]    # trace statistics
# [[13.4294 15.4943 19.9349]   # r = 0  critical values
#  [ 2.7055  3.8415  6.6349]]  # r <= 1 critical values
# [[ 0.53474958  0.02398649]   # eigenvectors (columns)
#  [-0.45169106  0.12036402]]
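
# A hedged follow-up sketch, not in the original: compare the trace statistic against the
# 95% critical value (column 1 of cvt) and build the stationary spread from the first
# Johansen eigenvector; data.dot(v1) is the value of the cointegrating portfolio.
if jh_results.lr1[0] > jh_results.cvt[0, 1]:
    print('Trace test rejects r = 0 at the 95% level: at least one cointegrating relation')
spread = data.dot(v1)  # ~ 0.5347 * EWA - 0.4517 * EWC, using the weights printed above
print('ADF statistic of the Johansen spread: %.6f' % ts.adfuller(spread, 1)[0])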