================================================================================
* $$$x_1,x_2,\cdots,x_D$$$: D independent variables
* Linear regression model: $$$\hat{y} \sim x_1+x_2+\cdots+x_D$$$
* Suppose the independent variable $$$x_1$$$ is a categorical (range-type) variable
* $$$x_1$$$ can take the value "man" or "woman"
* You can use dummy variable d
* x="man" to d=(1,0)
* x="woman" to d=(0,1)
* Model:
$$$\hat{y}=[\omega_{d1}d_1 + \omega_{d2}d_2] + \omega_2x_2+\cdots+\omega_Dx_D$$$
================================================================================
* Full rank dummy variable model
x="A blood type" -> d=(1,0,0,0)
x="AB blood type" -> d=(1,1,0,0)
x="B blood type" -> d=(1,0,1,0)
x="O blood type" -> d=(1,0,0,1)
* Reduced rank dummy variable model which uses one-hot-encoding
x="A blood type" -> d=(1,0,0,0)
x="AB blood type" -> d=(0,1,0,0)
x="B blood type" -> d=(0,0,1,0)
x="O blood type" -> d=(0,0,0,1)
================================================================================
# conda activate py36gputorch100 && \
# cd /mnt/1T-5e7/mycodehtml/ML_theory/DHKim/003_Regression_ananalysis_and_Timeseries_analysis/002_Basic_of_linear_regression/02.03_Range_datatype_independent_variable && \
# rm e.l && python main.py \
# 2>&1 | tee -a e.l && code e.l
# ================================================================================
from sklearn.datasets import make_regression
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import *
import datetime
from calendar import isleap
import scipy as sp
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk
from sklearn.datasets import load_boston
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
# ================================================================================
# * Linear regression on a model which has a categorical (range-type) independent variable
# * independent variable: month
# * dependent variable: temperature
# ================================================================================
# Load the Nottingham monthly average-temperature series (R "nottem"
# dataset, 1920-1939) as a DataFrame with columns "time" and "value".
nottem_dataset = sm.datasets.get_rdataset("nottem")
df_nottem = nottem_dataset.data
# print("df_nottem",df_nottem)
#          time  value
# 0  1920.000000   40.6
# 1  1920.083333   40.8
# ================================================================================
def convert_partial_year(number):
    """Convert a fractional year like 1920.5 into a datetime.

    The integer part selects the year; the fractional part is taken as
    the elapsed fraction of that year, using 366 days for leap years.
    """
    year = int(number)
    days_in_year = 365 + isleap(year)
    elapsed = datetime.timedelta(days=(number - year) * days_in_year)
    return datetime.datetime(year, 1, 1) + elapsed
# ================================================================================
# Convert the fractional-year "time" column into real datetimes.
# Series.apply on the single column replaces DataFrame.applymap, which
# is deprecated (renamed to DataFrame.map in pandas 2.1).
time_col_data = df_nottem["time"]
# print("time_col_data",time_col_data)
# 0    1920.000000
# 1    1920.083333
df_nottem["date0"] = time_col_data.apply(convert_partial_year)
# ================================================================================
# Clean up the converted dates: round to the nearest hour (removing
# float-precision jitter such as 11:59:59.999897), then shift one day.
date0_col_data = df_nottem["date0"]
# print("date0_col_data",date0_col_data)
# 0   1920-01-01 00:00:00.000000
# 1   1920-01-31 11:59:59.999897
dt_index = pd.DatetimeIndex(date0_col_data)
# print("dt_index",dt_index)
# DatetimeIndex([ '1920-01-01 00:00:00', '1920-01-31 11:59:59.999897',
#                 '1920-03-02 00:00:00.000103', '1920-04-01 12:00:00', ...
rounded = dt_index.round('60min')
# print("rounded",rounded)
# DatetimeIndex(['1920-01-01 00:00:00', '1920-01-31 12:00:00',
#                '1920-03-02 00:00:00', '1920-04-01 12:00:00', ...
one_day = datetime.timedelta(days=1)
df_nottem["date"] = rounded + one_day
# print(df_nottem["date"])
# DatetimeIndex(['1920-01-02 00:00:00', '1920-02-01 12:00:00',
#                '1920-03-03 00:00:00', '1920-04-02 12:00:00', ...
# ================================================================================
df_nottem["month"] = df_nottem["date"].dt.strftime("%m").astype('category')
del df_nottem["date0"], df_nottem["date"]
# ================================================================================
# print("df_nottem.tail()",df_nottem.tail())
# time value month
# 235 1939.583333 61.8 08
# 236 1939.666667 58.2 09
# ================================================================================
df_nottem.boxplot("value", "month")
plt.show()
# Meaning:
# 4,6,8,11 have small difference
# Temperature is highest at 6,7,8
# Temperature is lowerest at 1,2,12
# ================================================================================
sns.stripplot(x="month", y="value", data=df_nottem, jitter=True, alpha=.3)
# plt.show()
#
sns.pointplot(x="month", y="value", data=df_nottem, dodge=True, color='r')
# plt.show()
#
#
# ================================================================================
# Fit the linear regression with reduced-rank dummy encoding:
# "+ 0" drops the intercept, so every month keeps its own dummy and
# each coefficient is that month's mean temperature.
model = sm.OLS.from_formula("value ~ C(month) + 0", data=df_nottem)
result = model.fit()
# print(result.summary())
# OLS Regression Results
# ==============================================================================
# Dep. Variable: value R-squared: 0.930
# Model: OLS Adj. R-squared: 0.927
# Method: Least Squares F-statistic: 277.3
# Date: 화, 07 5월 2019 Prob (F-statistic): 2.96e-125
# Time: 13:14:36 Log-Likelihood: -535.82
# No. Observations: 240 AIC: 1096.
# Df Residuals: 228 BIC: 1137.
# Df Model: 11
# Covariance Type: nonrobust
# ================================================================================
# coef std err t P>|t| [0.025 0.975]
# --------------------------------------------------------------------------------
# C(month)[01] 39.6950 0.518 76.691 0.000 38.675 40.715
# C(month)[02] 39.1900 0.518 75.716 0.000 38.170 40.210
# C(month)[03] 42.1950 0.518 81.521 0.000 41.175 43.215
# C(month)[04] 46.2900 0.518 89.433 0.000 45.270 47.310
# C(month)[05] 52.5600 0.518 101.547 0.000 51.540 53.580
# C(month)[06] 58.0400 0.518 112.134 0.000 57.020 59.060
# C(month)[07] 61.9000 0.518 119.592 0.000 60.880 62.920
# C(month)[08] 60.5200 0.518 116.926 0.000 59.500 61.540
# C(month)[09] 56.4800 0.518 109.120 0.000 55.460 57.500
# C(month)[10] 49.4950 0.518 95.625 0.000 48.475 50.515
# C(month)[11] 42.5800 0.518 82.265 0.000 41.560 43.600
# C(month)[12] 39.5300 0.518 76.373 0.000 38.510 40.550
# ==============================================================================
# Omnibus: 5.430 Durbin-Watson: 1.529
# Prob(Omnibus): 0.066 Jarque-Bera (JB): 5.299
# Skew: -0.281 Prob(JB): 0.0707
# Kurtosis: 3.463 Cond. No. 1.00
# ==============================================================================
# Warnings:
# [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# ================================================================================
# Without "+0" the intercept stays in the design, giving the full-rank
# dummy model: one month becomes the baseline and the remaining
# coefficients are offsets from it.
model = sm.OLS.from_formula("value ~ C(month)", data=df_nottem)
result = model.fit()
# print(result.summary())
# ================================================================================
# * The Boston house-price data has a binary (0/1) range-type variable in
#   the CHAS column
# * Model equation
#   When CHAS=1,
#   $$$y = (w_0 + w_{\text{CHAS}}) + w_{\text{CRIM}} \text{CRIM} + w_{\text{ZN}} \text{ZN} + \cdots$$$
#   When CHAS=0,
#   $$$y = w_0 + w_{\text{CRIM}} \text{CRIM} + w_{\text{ZN}} \text{ZN} + \cdots$$$
# NOTE(review): load_boston was removed in scikit-learn 1.2; this block
# requires sklearn < 1.2 (or a port to fetch_openml(name="boston")).
boston=load_boston()
X_feat_data=boston.data
X_feat_names=boston.feature_names
# Feature matrix without and with an explicit constant (intercept) column.
X_wo_constant_term=pd.DataFrame(X_feat_data,columns=X_feat_names)
X_w_constant_term=sm.add_constant(X_wo_constant_term)
# Target: median house value, named MEDV.
y_data=boston.target
y_data=pd.DataFrame(y_data,columns=["MEDV"])
boston_data=pd.concat([X_w_constant_term,y_data],axis=1)
# Plain-matrix OLS (no formula): CHAS is already 0/1, so it acts as a
# dummy variable and shifts the intercept when CHAS=1.
model=sm.OLS(y_data,X_w_constant_term)
result=model.fit()
# print(result.summary())
#                             OLS Regression Results
# ==============================================================================
# Dep. Variable:                   MEDV   R-squared:                       0.741
# Model:                            OLS   Adj. R-squared:                  0.734
# Method:                 Least Squares   F-statistic:                     108.1
# Date:                화, 07  5월 2019   Prob (F-statistic):          6.72e-135
# Time:                        13:22:41   Log-Likelihood:                -1498.8
# No. Observations:                 506   AIC:                             3026.
# Df Residuals:                     492   BIC:                             3085.
# Df Model:                          13
# Covariance Type:            nonrobust
# ==============================================================================
#                  coef    std err          t      P>|t|      [0.025      0.975]
# ------------------------------------------------------------------------------
# const         36.4595      5.103      7.144      0.000      26.432      46.487
# CRIM          -0.1080      0.033     -3.287      0.001      -0.173      -0.043
# ZN             0.0464      0.014      3.382      0.001       0.019       0.073
# INDUS          0.0206      0.061      0.334      0.738      -0.100       0.141
# CHAS           2.6867      0.862      3.118      0.002       0.994       4.380
# NOX          -17.7666      3.820     -4.651      0.000     -25.272     -10.262
# RM             3.8099      0.418      9.116      0.000       2.989       4.631
# AGE            0.0007      0.013      0.052      0.958      -0.025       0.027
# DIS           -1.4756      0.199     -7.398      0.000      -1.867      -1.084
# RAD            0.3060      0.066      4.613      0.000       0.176       0.436
# TAX           -0.0123      0.004     -3.280      0.001      -0.020      -0.005
# PTRATIO       -0.9527      0.131     -7.283      0.000      -1.210      -0.696
# B              0.0093      0.003      3.467      0.001       0.004       0.015
# LSTAT         -0.5248      0.051    -10.347      0.000      -0.624      -0.425
# ==============================================================================
# Omnibus:                      178.041   Durbin-Watson:                   1.078
# Prob(Omnibus):                  0.000   Jarque-Bera (JB):              783.126
# Skew:                           1.521   Prob(JB):                    8.84e-171
# Kurtosis:                       8.281   Cond. No.                     1.51e+04
# ==============================================================================
# Warnings:
# [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# [2] The condition number is large, 1.51e+04. This might indicate that there are
# strong multicollinearity or other numerical problems.
# ================================================================================
# ContrastMatrix: supply a custom categorical encoding instead of the
# built-in full-rank / reduced-rank dummy codings.
demo_data_arr = demo_data("a", nlevels=3)
# print("demo_data_arr",demo_data_arr)
# {'a': ['a1', 'a2', 'a3', 'a1', 'a2', 'a3']}
df4 = pd.DataFrame(demo_data_arr)
# print("df4",df4)
#     a
# 0  a1
# 1  a2
# 2  a3
# 3  a1
# 4  a2
# 5  a3
# ================================================================================
# Merge a2 and a3 into a single encoded category: each row of the matrix
# is the code vector for one level (a1, a2, a3), and the suffixes label
# the resulting design-matrix columns.
contrast = ContrastMatrix(
    [[1, 0], [0, 1], [0, 1]],
    [":a1", ":a2a3"],
)
# dmatrix("C(a, contrast) + 0", df4)
# DesignMatrix with shape (6, 2)
#   C(a, contrast):a1  C(a, contrast):a2a3
#                   1                    0
#                   0                    1
#                   0                    1
#                   1                    0
#                   0                    1
#                   0                    1
#   Terms:
#     'C(a, contrast)' (columns 0:2)