bikesharingdemand
# @
# 자전거 수요 예측[3/4] 캐글 머신러닝 랜덤포레스트만으로 경진대회에 참여하기
# https://www.youtube.com/watch?v=g7EwIFXJntc&t=179s
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# You can fix issue of broken "-" in the graph,
# with following code
mpl.rcParams['axes.unicode_minus']=False
import warnings
warnings.filterwarnings('ignore')
# Load the Kaggle bike-sharing train/test sets; parse "datetime" up front so
# the .dt accessor works below.
train_dataset_dataframe=pd.read_csv("D://chromedown//kaggle-bike//train.csv",parse_dates=["datetime"])
train_dataset_dataframe.shape
# (10886,12)
test_dataset_dataframe=pd.read_csv("D://chromedown//kaggle-bike//test.csv",parse_dates=["datetime"])
test_dataset_dataframe.shape
# (6493,9)
# Feature engineering: expand the parsed "datetime" column into its parts.
# The original duplicated these seven assignments for train and test;
# one helper applied to both keeps them in sync.
def add_datetime_features(dataframe):
    """Add year/month/day/hour/minute/second/dayofweek columns derived from "datetime"."""
    for part_name in ["year","month","day","hour","minute","second","dayofweek"]:
        dataframe[part_name]=getattr(dataframe["datetime"].dt,part_name)
    return dataframe
train_dataset_dataframe=add_datetime_features(train_dataset_dataframe)
train_dataset_dataframe.shape
# (10886,19)
test_dataset_dataframe=add_datetime_features(test_dataset_dataframe)
test_dataset_dataframe.shape
# (6493,16)
# Visualize the windspeed distributions of train and test data.
# Many rows record windspeed == 0, which likely reflects missing or bad
# measurements, so those rows will be refined (imputed) later.
figure_object,subplot_object=plt.subplots(nrows=2)
figure_object.set_size_inches(18,10)
for one_axis,(one_dataframe,one_title) in zip(
        subplot_object,
        [(train_dataset_dataframe,"train windspeed"),
         (test_dataset_dataframe,"test windspeed")]):
    plt.sca(one_axis)
    plt.xticks(rotation=30,ha='right')  # rotate x tick labels for readability
    one_axis.set(ylabel='Count',title=one_title)
    sns.countplot(data=one_dataframe,x="windspeed",ax=one_axis)
# img ed376c9c-ceeb-4ccb-9db9-e00e68a2488f.png
# Split the train data into windspeed == 0 and windspeed != 0 subsets.
windspeed_is_zero_mask=train_dataset_dataframe['windspeed']==0
train_dataset_having_windspeed0=train_dataset_dataframe.loc[windspeed_is_zero_mask]
train_dataset_not_having_windspeed0=train_dataset_dataframe.loc[~windspeed_is_zero_mask]
# The "windspeed != 0" subset is much larger than the "windspeed == 0" one.
train_dataset_having_windspeed0.shape
# (0,19)  <- notebook output; presumably captured after refinement — verify
train_dataset_not_having_windspeed0.shape
# (9573,19)
from sklearn.ensemble import RandomForestClassifier
def predict_windspeed(data):
    """Impute windspeed==0 rows by predicting windspeed from weather features.

    Trains a RandomForestClassifier on the rows whose windspeed is non-zero
    (windspeed cast to str so the regressor target is treated as a class
    label), predicts windspeed for the zero rows, and returns the re-merged
    dataframe with a float windspeed column and a fresh RangeIndex.
    """
    # Work on copies: the original assigned into .loc slices of the caller's
    # dataframe, which both mutated shared data and triggered
    # SettingWithCopyWarning.
    data_having_windspeed0=data.loc[data['windspeed']==0].copy()
    data_not_having_windspeed0=data.loc[data['windspeed']!=0].copy()
    # Features used by the windspeed-predicting model
    weather_features_list=["season","weather","humidity","month","temp","year","atemp"]
    # Cast windspeed to string so the classifier treats it as class labels
    data_not_having_windspeed0["windspeed"]=data_not_having_windspeed0["windspeed"].astype("str")
    randomforest_classifier_object_for_predicting_windspeed=RandomForestClassifier()
    # Fit on the non-zero rows: weather features -> windspeed label
    randomforest_classifier_object_for_predicting_windspeed.fit(
        data_not_having_windspeed0[weather_features_list],
        data_not_having_windspeed0["windspeed"])
    # Predict a windspeed for every zero-windspeed row
    prediction_value_for_windspeed=randomforest_classifier_object_for_predicting_windspeed.predict(
        X=data_having_windspeed0[weather_features_list])
    data_having_windspeed0["windspeed"]=prediction_value_for_windspeed
    # DataFrame.append was deprecated and removed in pandas 2.0 — use pd.concat.
    merged_data=pd.concat([data_not_having_windspeed0,data_having_windspeed0])
    # Convert windspeed back to float (it was stringified for classification)
    merged_data["windspeed"]=merged_data["windspeed"].astype("float")
    # reset_index(drop=True) replaces the original reset + drop('index') pair
    merged_data.reset_index(drop=True,inplace=True)
    return merged_data
# Refine the train data: replace windspeed==0 rows with model-imputed values.
refined_train_dataset_dataframe=predict_windspeed(train_dataset_dataframe)
# test=predict_windspeed(test)
# Visualize the refined windspeed distribution.
figure_object,refined_windspeed_axis=plt.subplots()
figure_object.set_size_inches(18,6)
plt.sca(refined_windspeed_axis)
# Rotate the x-axis data labels by 30 degrees for readability
plt.xticks(rotation=30,ha='right')
refined_windspeed_axis.set(ylabel='Count',title="windspeed of refined train data")
sns.countplot(data=refined_train_dataset_dataframe,x="windspeed",ax=refined_windspeed_axis)
# The plot confirms that windspeed==0 rows have been eliminated.
# img 8b961666-30dc-44e1-b03f-087b7f1d3a84
# Feature selection notes:
# 1. Distinguish meaningful features from noise.
# 2. More features does not automatically mean better performance.
# 3. Add features one at a time while testing performance, and drop any
#    feature that turns out not to help.
# Candidate continuous features: temp, humidity, windspeed, atemp
# Candidate categorical features are listed below; they must be converted
# to the pandas "category" dtype.
categorical_feature_names_list=["season","holiday","workingday","weather","dayofweek","month","year","hour"]
for categorical_feature_name in categorical_feature_names_list:
    refined_train_dataset_dataframe[categorical_feature_name]=(
        refined_train_dataset_dataframe[categorical_feature_name].astype("category"))
    test_dataset_dataframe[categorical_feature_name]=(
        test_dataset_dataframe[categorical_feature_name].astype("category"))
# The full feature set used for training and prediction
entire_feature_names_list=[
    "season","weather","temp","atemp","humidity","windspeed",
    "year","hour","dayofweek","holiday","workingday",
]
# Build the final train design matrix from the selected features,
# using the preprocessing performed above.
X_traindata_final_dataframe=refined_train_dataset_dataframe.loc[:,entire_feature_names_list]
X_traindata_final_dataframe.shape
# (10886,11)
X_traindata_final_dataframe.head()
# season weather temp  atemp  humidity windspeed year hour dayofweek holiday workingday
# 0 1    2       9.84  12.880 75       6.0032    2011 5    5         0       0
# 1 1    1       15.58 19.695 76       16.9979   2011 10   5         0       0
# ...
# Build the matching test design matrix.
X_testdata_final_dataframe=test_dataset_dataframe.loc[:,entire_feature_names_list]
# (6493,11)
X_testdata_final_dataframe.head()
# season weather temp  atemp  humidity windspeed year hour dayofweek holiday workingday
# 0 1    1       10.66 11.365 56       26.0027   2011 0    3         0       1
# 1 1    1       10.66 13.635 56       0.0000    2011 1    3         0       1
# ...
# The "count" column is the label (y) we are predicting.
label_name_string="count"
y_label_of_traindata=refined_train_dataset_dataframe[label_name_string]
# (10886,)
y_label_of_traindata.head()
# 0  1
# 1  36
# 2  56
# 3  84
# 4  94
# Name: count, dtype: int64
from sklearn.metrics import make_scorer
def rmsle(predicted_values,actual_values):
    """Root Mean Squared Logarithmic Error (the bike-sharing contest metric).

    Implements the formula
    $$$\\sqrt{\\frac{1}{n} \\sum\\limits_{i=1}^{n}(\\log{(p_{i}+1)}-\\log{(a_{i}+1)})^{2}}$$$
    Lower is better; 0 means a perfect prediction.
    """
    # Work on numpy arrays so the arithmetic is vectorized
    predicted_values_nparray=np.array(predicted_values)
    actual_values_nparray=np.array(actual_values)
    # log1p(x) == log(x + 1), and is numerically more accurate near zero
    log_predict_value=np.log1p(predicted_values_nparray)
    log_actual_value=np.log1p(actual_values_nparray)
    difference_value=log_predict_value-log_actual_value
    # BUG FIX: the original squared the undefined name `difference`,
    # which raised NameError as soon as the scorer was invoked.
    squared_difference_value=np.square(difference_value)
    mean_of_squared_difference_value=squared_difference_value.mean()
    rmsle_score_value=np.sqrt(mean_of_squared_difference_value)
    return rmsle_score_value
# Wrap rmsle so scikit-learn's cross-validation can call it as a scorer.
# NOTE(review): make_scorer defaults to greater_is_better=True, so the raw
# RMSLE (an error, lower is better) is returned unnegated; that is fine for
# the manual reporting below, but sklearn model-selection utilities would
# try to maximize it — confirm before reusing this scorer elsewhere.
rmsle_scorer=make_scorer(rmsle)
# @
# Use KFold for cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# 10-fold CV with shuffling; random_state pins the split for reproducibility
k_fold_object=KFold(n_splits=10,shuffle=True,random_state=0)
# You will predict by random forest
from sklearn.ensemble import RandomForestRegressor
max_depth_list=[]
# Higher n_estimators makes better precision,
# but consuming more time to predict
randomforest_regressor_object=RandomForestRegressor(n_estimators=100,n_jobs=-1,random_state=0)
RandomForestRegressor(\
bootstrap=True,criterion='mse',max_depth=None,
max_features='auto',max_leaf_nodes=None,
min_impurity_decrease=0.0,min_impurity_split=None,
min_samples_leaf=1,min_samples_split=2,
min_weight_fraction_leaf=0.0,n_estimators=100,n_jobs=-1,
oob_score=False,random_state=0,verbose=0,warm_start=False)
%time score_value_from_cross_val_score=cross_val_score(\
randomforest_regressor_object\
,X_traindata_final_dataframe\
,y_label_of_traindata\
,cv=k_fold_object\
,scoring=rmsle_scorer)
mean_of_score_value_from_cross_val_score=score_value_from_cross_val_score.mean()
# Closer to 0, it's better data
"Score={0:.5f}".format(mean_of_score_value_from_cross_val_score)
# Wall time: 10.5 s
# 'Score= 0.33109'
# Train the final model on all training data,
# with features X_traindata_final_dataframe and label y_label_of_traindata.
randomforest_regressor_object.fit(X_traindata_final_dataframe,y_label_of_traindata)
# Predict ride counts for the test set with the trained model.
predictions_from_test_data=randomforest_regressor_object.predict(X_testdata_final_dataframe)
predictions_from_test_data.shape
# (6493,)
predictions_from_test_data[0:10]
# array([ 12.2 , 4.87, 4.19, 3.47, 3.03, 6.28, 38.1 ,105.49,
# 235.54,136.4 ])
# Compare the distribution of the training labels with the predictions.
figure_object,(subplot_object1,subplot_object2)= plt.subplots(ncols=2)
figure_object.set_size_inches(12,5)
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., stat="density", kde=True) is the documented replacement
# producing an equivalent density histogram with a KDE overlay.
sns.histplot(y_label_of_traindata,ax=subplot_object1,bins=50,stat="density",kde=True)
subplot_object1.set(title="train data of x features and y label")
sns.histplot(predictions_from_test_data,ax=subplot_object2,bins=50,stat="density",kde=True)
subplot_object2.set(title="predicted y value from test data of multiple x values")
# Similar distributions between "y_label_of_traindata" and
# "predictions_from_test_data" suggest the model's output is plausible.
# img 87b94e55-5c34-472d-9bd1-ce1ddf1b4bf8
# Build the submission: load the sample submission file,
# then overwrite its count column with the predictions.
file_to_be_submitted=pd.read_csv("D://chromedown//kaggle-bike//sampleSubmission.csv")
file_to_be_submitted["count"]=predictions_from_test_data
file_to_be_submitted.shape
# (6493,2)
print(file_to_be_submitted.head())
# datetime count
# 0 2011-01-20 00:00:00 12.20
# 1 2011-01-20 01:00:00 4.87
# 2 2011-01-20 02:00:00 4.19
# 3 2011-01-20 03:00:00 3.47
# 4 2011-01-20 04:00:00 3.03
# Embed the CV score in the filename so submissions stay identifiable.
file_to_be_submitted.to_csv("D://chromedown//kaggle-bike//Score_{0:.5f}_submission.csv".format(mean_of_score_value_from_cross_val_score),index=False)