004-004. df["one"] and df.loc["b"], axis=0, axis=1, skipna, df.corr(), df.cov(), df.sort_index(), df.sort_values(), df.apply()
# @
import pandas as pd
import numpy as np
list_data_for_dataframe=[[1.4,np.nan],
[7.1,-4.5],
[np.nan,np.nan],
[0.75,-1.3]]
dataframe1=pd.DataFrame(list_data_for_dataframe,
                        columns=["one","two"],
                        index=["a","b","c","d"])
# one two
# a 1.40 NaN
# b 7.10 -4.5
# c NaN NaN
# d 0.75 -1.3
# When you call sum() with an axis argument,
# NaN values are excluded from the result by default
# axis=0 means you sum down each column
dataframe1.sum(axis=0)
# one 9.25
# two -5.80
# axis=1 means you sum across each row
dataframe1.sum(axis=1)
# a 1.40
# b 2.60
# c 0.00
# d -0.55
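# A related sketch: count() reports how many non-NaN values
# went into each of the sums above
dataframe1.count(axis=0)
# one 3
# two 2
dataframe1.count(axis=1)
# a 1
# b 2
# c 0
# d 2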
# You select "one" column data,
# and sum them
dataframe1["one"].sum()
# 9.25
# The difference between df["one"] and df.loc["b"] is that
# "one" is a column name while "b" is an index (row) label
# These therefore raise KeyError:
# dataframe1["b"]
# dataframe1.loc["one"]
# You select "b" row data,
# and sum them up
dataframe1.loc["b"].sum()
# 2.5999999999999996
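# A minimal sketch of combining both kinds of labels: .loc takes the
# row label first and then the column label, and .at does the same
# for a single scalar lookup
dataframe1.loc["b","one"]
# 7.1
dataframe1.at["b","one"]
# 7.1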
# @
# If you want NaN to propagate into the result,
# you can pass skipna=False
# You take the mean of each row
dataframe1.mean(axis=1,skipna=False)
# a NaN
# b 1.300
# c NaN
# d -0.275
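# For contrast, a quick sketch of the default behaviour (skipna=True):
# NaN is dropped before the mean is taken, so only the all-NaN
# row "c" stays NaN
dataframe1.mean(axis=1)
# a 1.400
# b 1.300
# c NaN
# d -0.275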
# @
# You take the mean of each column,
# then select the mean of the "one" column
one_mean=dataframe1.mean(axis=0)["one"]
# 3.0833333333333335
two_min=dataframe1.min(axis=0)["two"]
# -4.5
# You fill the NaN values in the "one" column with one_mean,
# and the NaN values in the "two" column with two_min
dataframe1["one"]=dataframe1["one"].fillna(value=one_mean)
dataframe1["two"]=dataframe1["two"].fillna(value=two_min)
# @
dataframe2=pd.DataFrame(np.random.randn(6,4),
                        columns=["A","B","C","D"],
                        index=pd.date_range("20160701",periods=6))
# A B C D
# 2016-07-01 -0.933246 -0.027614 -0.334567 0.159137
# 2016-07-02 -0.249595 -0.913389 0.636678 0.430772
# 2016-07-03 -0.865065 -0.575583 -0.374929 -0.471759
# 2016-07-04 -0.578484 -1.255931 -0.295924 1.871001
# 2016-07-05 -1.067445 0.261680 0.147572 -1.697567
# 2016-07-06 1.628652 0.616124 0.387144 -0.374426
# You select "A" column data and "B" column data,
# and find correlation between them
dataframe2["A"].corr(dataframe2["B"])
# 0.4263502156022492
dataframe2["B"].cov(dataframe2["C"])
# You can also compute the correlation and covariance
# for every pair of columns at once
dataframe2.corr()
# A B C D
# A 1.000000 0.426350 0.546656 0.040890
# B 0.426350 1.000000 0.224085 -0.748110
# C 0.546656 0.224085 1.000000 -0.217909
# D 0.040890 -0.748110 -0.217909 1.000000
dataframe2.cov()
# A B C D
# A 1.019469 0.310033 0.235794 0.048692
# B 0.310033 0.518692 0.068945 -0.635448
# C 0.235794 0.068945 0.182500 -0.109791
# D 0.048692 -0.635448 -0.109791 1.390975
# When you want to know how strongly one variable moves
# with another, the code above can be helpful
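# A small sketch with corrwith(): it correlates every column of the
# frame against a single Series in one call, which is handy when one
# column is the variable of interest
dataframe2.corrwith(dataframe2["A"])
# same numbers as the "A" row of dataframe2.corr() above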
# @
# pandas provides sorting by "index" and "value"
index_of_dataframe2=dataframe2.index
# DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',
# '2016-07-05', '2016-07-06'],
# dtype='datetime64[ns]', freq='D')
# We shuffle the index values
shuffled_index_value=np.random.permutation(index_of_dataframe2)
# array(['2016-07-03T00:00:00.000000000', '2016-07-01T00:00:00.000000000',
# '2016-07-02T00:00:00.000000000', '2016-07-06T00:00:00.000000000',
# '2016-07-05T00:00:00.000000000', '2016-07-04T00:00:00.000000000'],
# dtype='datetime64[ns]')
# We give dataframe2 the shuffled index and a new column order
dataframe2=dataframe2.reindex(index=shuffled_index_value,columns=["D","B","C","A"])
# D B C A
# 2016-07-03 -0.471759 -0.575583 -0.374929 -0.865065
# 2016-07-01 0.159137 -0.027614 -0.334567 -0.933246
# 2016-07-02 0.430772 -0.913389 0.636678 -0.249595
# 2016-07-06 -0.374426 0.616124 0.387144 1.628652
# 2016-07-05 -1.697567 0.261680 0.147572 -1.067445
# 2016-07-04 1.871001 -1.255931 -0.295924 -0.578484
# sort_index(axis=0) sorts the rows by index label in ascending order
dataframe2.sort_index(axis=0)
# D B C A
# 2016-07-01 0.159137 -0.027614 -0.334567 -0.933246
# 2016-07-02 0.430772 -0.913389 0.636678 -0.249595
# 2016-07-03 -0.471759 -0.575583 -0.374929 -0.865065
# 2016-07-04 1.871001 -1.255931 -0.295924 -0.578484
# 2016-07-05 -1.697567 0.261680 0.147572 -1.067445
# 2016-07-06 -0.374426 0.616124 0.387144 1.628652
# sort_index(axis=1) sorts the columns by column label in ascending order
dataframe2.sort_index(axis=1)
# A B C D
# 2016-07-03 -0.865065 -0.575583 -0.374929 -0.471759
# 2016-07-01 -0.933246 -0.027614 -0.334567 0.159137
# 2016-07-02 -0.249595 -0.913389 0.636678 0.430772
# 2016-07-06 1.628652 0.616124 0.387144 -0.374426
# 2016-07-05 -1.067445 0.261680 0.147572 -1.697567
# 2016-07-04 -0.578484 -1.255931 -0.295924 1.871001
# You can sort the rows by index label in descending order
dataframe2.sort_index(axis=0,ascending=False)
# You can sort the columns by column label in descending order
dataframe2.sort_index(axis=1,ascending=False)
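# Note that sort_index() returns a new, sorted DataFrame;
# dataframe2 itself stays in shuffled order
# A minimal sketch of keeping the sorted result is to reassign it
# (sorted_dataframe2 is a name made up here)
sorted_dataframe2=dataframe2.sort_index(axis=0)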
# You can sort by "D" column data
dataframe2.sort.values(by="D")
dataframe2.sort.values(by="B")
# You add two new columns ("E" and "F") and their data
dataframe2["E"]=np.random.randint(0,6,size=6)
dataframe2["F"]=["alpha","beta","gamma","gamma","alpha","gamma"]
# You can sort by the values of both "E" and "F"
dataframe2.sort_values(by=["E","F"])
# @
dataframe2["F"].unique()
dataframe2["F"].value_counts()
dataframe2["F"].isin(["alpha","beta"])
dataframe2.loc[dataframe2["F"].isin(["alpha","beta"])]
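# A small sketch of the opposite filter: ~ negates the boolean mask
# from isin(), keeping only the rows whose "F" value is
# neither "alpha" nor "beta"
dataframe2.loc[~dataframe2["F"].isin(["alpha","beta"])]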
# @
dataframe3=pd.DataFrame(np.random.randn(4,3),
                        columns=["b","d","e"],
                        index=["Seoul","Incheon","Busan","Daegu"])
# func computes the range (max minus min) of a Series
func=lambda x:x.max()-x.min()
# apply() runs func on each column (axis=0) or on each row (axis=1)
dataframe3.apply(func,axis=0)
dataframe3.apply(func,axis=1)
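# A sketch of apply() with a function that returns a Series:
# the result becomes a small DataFrame with one row per label
# returned by the function (min_max is a name made up here)
def min_max(x):
    return pd.Series([x.min(),x.max()],index=["min","max"])
dataframe3.apply(min_max,axis=0)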