Covariance and correlation

import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline
# Sample size shared by the two datasets
n = 10_000

# Alternative scenarios: uncomment one of the blocks below (and comment out
# the active one) to explore a different relation between x and y.
#
# Uncorrelated: x and y are drawn independently from the same gaussian
# mu, sigma = 0, 1
# x = np.random.normal(mu, sigma, n)
# y = np.random.normal(mu, sigma, n)
#
# Correlated: each y is drawn from a gaussian centred on the matching x
# mu, sigma = 0, 1
# x = np.random.normal(mu, sigma, n)
# y = np.random.normal(x, sigma, n)
#
# Anti-correlated: each y is drawn from a gaussian centred on -x
# mu, sigma = 0, 1
# x = np.random.normal(mu, sigma, n)
# y = np.random.normal(-x, sigma, n)

# Active scenario: strongly anti-correlated data. Each y is centred on -x
# with a tiny spread (0.0001), so the points fall almost exactly on y = -x.
mu, sigma = 0, 1
x = np.random.normal(loc=mu, scale=sigma, size=n)
y = np.random.normal(loc=-x, scale=0.0001, size=n)

# Scatter plot with the same scale on both axes and a fixed [-5, 5] window
plt.axes().set_aspect("equal")
plt.axis((-5, 5, -5, 5))
p = plt.scatter(x=x, y=y)
../_images/1c958627016e4211051a8ce3ff9458480bc066d2de652dd01fdfcc46a5c5890f.png

Compute the covariance matrix

# gather the two arrays as the columns "x" and "y" of a pandas DataFrame
import pandas as pd
data = pd.DataFrame({"x": x, "y": y})
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[3], line 2
      1 # paste the two arrays into a panda dataset
----> 2 import pandas as pd
      3 data = pd.DataFrame(dict(x=x, y=y))

ModuleNotFoundError: No module named 'pandas'
# compute and display the covariance matrix of the DataFrame columns x and y
print(data.cov())
          x         y
x  0.996748 -0.996749
y -0.996749  0.996750
# the diagonal elements of the covariance matrix are the variances of x and y;
# data.var() gives the same numbers directly, one entry per column
print(data.var())
x    0.996748
y    0.996750
dtype: float64
# the off-diagonal element(s) are the covariance of x and y: they quantify the linear dependence between the two
# the covariance matrix is symmetric, so both off-diagonal elements are equal

Compute the correlation coefficient

# matrix of correlation coefficients of x and y (the off-diagonal entry is the x-y correlation)
np.corrcoef(x,y)
array([[ 1.        , -0.99999999],
       [-0.99999999,  1.        ]])
# The correlation coefficient can also be computed by hand: divide the
# covariance of x and y by the square root of the product of their variances.
CovXY = data.cov()
# Select the variances by column label. Integer keys such as data.var()[0] on
# a Series labelled "x"/"y" rely on the deprecated positional fallback of
# Series.__getitem__.
Varx = data.var()["x"]
Vary = data.var()["y"]
print(f"Correlation = {CovXY.loc['x', 'y'] / math.sqrt(Varx * Vary)}")
Correlation = -0.9999999949905587