004-005. lending club loan" data by pandas, pd.read_csv(), df.to_csv()
import pandas as pd
import numpy as np
df=pd.read_csv("/media/young/5e7be152-8ed5-483d-a8e8-b3fecfa221dc/lending-club-loan-data/loan.csv",sep=",")
df.shape
# (887379, 74)
df.columns
# Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
# 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
# 'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
# 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
# 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
# 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
# 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
# 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
# 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
# 'total_rec_int', 'total_rec_late_fee', 'recoveries',
# 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
# 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
# 'mths_since_last_major_derog', 'policy_code', 'application_type',
# 'annual_inc_joint', 'dti_joint', 'verification_status_joint',
# 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
# 'open_il_6m', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
# 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
# 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl',
# 'inq_last_12m'],
# dtype='object')
# @
# We will extract only several columns which we need,
# with creating new dataframe
df2=df[["loan_amnt","loan_status","grade","int_rate","term"]]
df2.head()
# loan_amnt loan_status grade int_rate term
# 0 5000.0 Fully Paid B 10.65 36 months
# 1 2500.0 Charged Off C 15.27 60 months
# 2 2400.0 Fully Paid C 15.96 36 months
# 3 10000.0 Fully Paid C 13.49 36 months
# 4 3000.0 Current B 12.69 60 months
df2.tail()
# loan_amnt loan_status grade int_rate term
# 887374 10000.0 Current B 11.99 36 months
# 887375 24000.0 Current B 11.99 36 months
# 887376 13000.0 Current D 15.99 60 months
# 887377 12000.0 Current E 19.99 60 months
# 887378 20000.0 Current B 11.99 36 months
# This dataset provides category data type
# We want to remove duplicated data
df2["loan_status"].unique()
df2["grade"].unique()
df2["term"].unique()
df.shape
# (887379, 74)
# It turned out there is no duplicated data
# You will delete entire row which contains at least one nan
df2=df2.dropna(how="any")
df.shape
# It turned out this dataset didn't contain any nan value
# @
term_to_loan_amnt_dict={}
# You select "term" column data,
# and select unique data
uniq_terms=df2["term"].unique()
for term in uniq_terms:
# You create mask(boolean array),
# from true row data,
# select "load_amnt" column data,
# # and sum them up
loan_amnt_sum=df2.loc[df2["term"]==term,"loan_amnt"].sum()
# You store loan_amnt_sum into term_to_loan_amnt_dict
term_to_loan_amnt_dict[term]=loan_amnt_sum
term_to_loan_amnt_dict
# {' 36 months': 7752507375.0, ' 60 months': 5341004575.0}
# You will convert dictionary into Series
term_to_loan_amnt_series=pd.Series(term_to_loan_amnt_dict)
# 36 months 7.752507e+09
# 60 months 5.341005e+09
# @
df2.head()
# loan_amnt loan_status grade int_rate term
# 0 5000.0 Fully Paid B 10.65 36 months
# 1 2500.0 Charged Off C 15.27 60 months
# 2 2400.0 Fully Paid C 15.96 36 months
# 3 10000.0 Fully Paid C 13.49 36 months
# 4 3000.0 Current B 12.69 60 months
# You select "loan_status" column data,
# and find unique data
total_status_category=df2["loan_status"].unique()
# array(['Fully Paid', 'Charged Off', 'Current', 'Default',
# 'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)',
# 'Does not meet the credit policy. Status:Fully Paid',
# 'Does not meet the credit policy. Status:Charged Off', 'Issued'],
# dtype=object)
# You will extract value of bad status category
bad_status_category=total_status_category[[1,3,4,5,6,8]]
# array(['Charged Off', 'Default', 'Late (31-120 days)', 'In Grace Period',
# 'Late (16-30 days)',
# 'Does not meet the credit policy. Status:Charged Off'],
# dtype=object)
# You will find if "loan_status" column data is in bad_status_category array
df2["bad_loan_status"]=df2["loan_status"].isin(bad_status_category)
df2["bad_loan_status"].head(5)
# 0 False
# 1 True
# 2 False
# 3 False
# 4 False
# You select row data with bad_status_category==True,
# from those rows,
# you select "grade" column data
bad_loan_status_to_grades=df2.loc[df2["bad_loan_status"]==True,"grade"].value_counts()
# C 19054
# D 15859
# B 13456
# E 9745
# F 4383
# A 3663
# G 1269
# @
# What's relation,
# between "total amount of loan"(loan_amnt) and "interest rate"(int_rate)?
# You can find it by coefficient analysis
df2.head()
df2["loan_amnt"].corr(df2["int_rate"])
# 0.14502309929886237
# It turned out low correlation
# @
# We can write data into csv file
# bad_loan_status.csv is file name
# sep="," is delimiter
bad_loan_status_to_grades.to_csv("bad_loan_status.csv",sep=",")