004-005. lending club loan" data by pandas, pd.read_csv(), df.to_csv() import pandas as pd import numpy as np df=pd.read_csv("/media/young/5e7be152-8ed5-483d-a8e8-b3fecfa221dc/lending-club-loan-data/loan.csv",sep=",") df.shape # (887379, 74) df.columns # Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', # 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', # 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', # 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', # 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', # 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq', # 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', # 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', # 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', # 'total_rec_int', 'total_rec_late_fee', 'recoveries', # 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', # 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med', # 'mths_since_last_major_derog', 'policy_code', 'application_type', # 'annual_inc_joint', 'dti_joint', 'verification_status_joint', # 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', # 'open_il_6m', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', # 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', # 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', # 'inq_last_12m'], # dtype='object') # @ # We will extract only several columns which we need, # with creating new dataframe df2=df[["loan_amnt","loan_status","grade","int_rate","term"]] df2.head() # loan_amnt loan_status grade int_rate term # 0 5000.0 Fully Paid B 10.65 36 months # 1 2500.0 Charged Off C 15.27 60 months # 2 2400.0 Fully Paid C 15.96 36 months # 3 10000.0 Fully Paid C 13.49 36 months # 4 3000.0 Current B 12.69 60 months df2.tail() # loan_amnt loan_status grade int_rate term # 887374 10000.0 Current B 11.99 36 months # 887375 24000.0 Current B 11.99 36 months # 887376 13000.0 Current D 15.99 60 months # 887377 12000.0 Current E 19.99 60 months # 887378 20000.0 Current B 11.99 36 months # This dataset provides category data type # We want to remove duplicated data df2["loan_status"].unique() df2["grade"].unique() df2["term"].unique() df.shape # (887379, 74) # It turned out there is no duplicated data # You will delete entire row which contains at least one nan df2=df2.dropna(how="any") df.shape # It turned out this dataset didn't contain any nan value # @ term_to_loan_amnt_dict={} # You select "term" column data, # and select unique data uniq_terms=df2["term"].unique() for term in uniq_terms: # You create mask(boolean array), # from true row data, # select "load_amnt" column data, # # and sum them up loan_amnt_sum=df2.loc[df2["term"]==term,"loan_amnt"].sum() # You store loan_amnt_sum into term_to_loan_amnt_dict term_to_loan_amnt_dict[term]=loan_amnt_sum term_to_loan_amnt_dict # {' 36 months': 7752507375.0, ' 60 months': 5341004575.0} # You will convert dictionary into Series term_to_loan_amnt_series=pd.Series(term_to_loan_amnt_dict) # 36 months 7.752507e+09 # 60 months 5.341005e+09 # @ df2.head() # loan_amnt loan_status grade int_rate term # 0 5000.0 Fully Paid B 10.65 36 months # 1 2500.0 Charged Off C 15.27 60 months # 2 2400.0 Fully Paid C 15.96 36 months # 3 10000.0 Fully Paid C 13.49 36 months # 4 3000.0 Current B 12.69 60 months # You select "loan_status" column data, # and find unique data total_status_category=df2["loan_status"].unique() # array(['Fully Paid', 'Charged Off', 'Current', 'Default', # 'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)', # 'Does not meet the credit policy. Status:Fully Paid', # 'Does not meet the credit policy. Status:Charged Off', 'Issued'], # dtype=object) # You will extract value of bad status category bad_status_category=total_status_category[[1,3,4,5,6,8]] # array(['Charged Off', 'Default', 'Late (31-120 days)', 'In Grace Period', # 'Late (16-30 days)', # 'Does not meet the credit policy. Status:Charged Off'], # dtype=object) # You will find if "loan_status" column data is in bad_status_category array df2["bad_loan_status"]=df2["loan_status"].isin(bad_status_category) df2["bad_loan_status"].head(5) # 0 False # 1 True # 2 False # 3 False # 4 False # You select row data with bad_status_category==True, # from those rows, # you select "grade" column data bad_loan_status_to_grades=df2.loc[df2["bad_loan_status"]==True,"grade"].value_counts() # C 19054 # D 15859 # B 13456 # E 9745 # F 4383 # A 3663 # G 1269 # @ # What's relation, # between "total amount of loan"(loan_amnt) and "interest rate"(int_rate)? # You can find it by coefficient analysis df2.head() df2["loan_amnt"].corr(df2["int_rate"]) # 0.14502309929886237 # It turned out low correlation # @ # We can write data into csv file # bad_loan_status.csv is file name # sep="," is delimiter bad_loan_status_to_grades.to_csv("bad_loan_status.csv",sep=",")