# 006-002. Layered index
import pandas as pd
import numpy as np
# Passing two parallel label arrays as the index
# gives the Series a two-level (layered) index
series = pd.Series(
    data=np.random.randn(10),
    index=[list("aaabbbccdd"), [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]],
)
# Example output (the values are random, so yours will differ):
# a  1   -0.513108
#    2   -0.379450
#    3   -0.962825
# b  1   -1.373376
#    2   -1.239304
#    3   -0.858371
# c  1    0.711247
#    2    1.668837
# d  2    1.672055
#    3    0.572363
# 1st (outer) level labels: a,b,c,d
# 2nd (inner) level labels: 1,2,3
series.index
# MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
#            labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
# @
# Let's talk about how to select data through a layered index
# Rule: select on the outermost level first, then move inward
# A single outer label returns everything stored under it
series["b"]
# 1   -1.373376
# 2   -1.239304
# 3   -0.858371
# A slice of outer labels includes both end points
series["b":"c"]
# b  1   -1.373376
#    2   -1.239304
#    3   -0.858371
# c  1    0.711247
#    2    1.668837
# One label per level (a tuple key) picks out a single value
series["b", 3]
# -0.8583708266256991
# ":" keeps every outer label and selects inner label 3 only
series[:, 3]
# a   -0.962825
# b   -0.858371
# d    0.572363
# Let's create a new dataframe whose rows AND columns are both layered:
# index and columns are each given as a list of two label arrays
dataframe = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[["a", "a", "b", "b"], [1, 2, 1, 2]],
    columns=[["Seoul", "Seoul", "Busan"], ["Green", "Red", "Green"]],
)
#      Seoul     Busan
#      Green Red Green
# a 1      0   1     2
#   2      3   4     5
# b 1      6   7     8
#   2      9  10    11
# You can give a name to each level of the index and of the columns
dataframe.index.names = ["key1", "key2"]
dataframe.columns.names = ["city", "color"]
# city      Seoul     Busan
# color     Green Red Green
# key1 key2
# a    1        0   1     2
#      2        3   4     5
# b    1        6   7     8
#      2        9  10    11
# You can perform layered indexing on a dataframe
# [] selects columns: an outer column label returns all columns below it
dataframe["Seoul"]
# color      Green  Red
# key1 key2
# a    1         0    1
#      2         3    4
# b    1         6    7
#      2         9   10
# One label per column level selects a single column
dataframe["Seoul", "Green"]
# key1  key2
# a     1       0
#       2       3
# b     1       6
#       2       9
# .loc selects rows: an outer row label returns all rows below it
dataframe.loc["a"]
# city  Seoul     Busan
# color Green Red Green
# key2
# 1         0   1     2
# 2         3   4     5
# A tuple with one label per row level selects a single row
dataframe.loc[("a", 1)]
# city   color
# Seoul  Green    0
#        Red      1
# Busan  Green    2
# .loc[rows, columns] combines both kinds of selection
dataframe.loc["b", ("Seoul", "Red")]
# key2
# 1     7
# 2    10
dataframe.loc[("b", 2), "Busan"]
# color
# Green    11
dataframe.loc[("b", 1), ("Seoul", "Green")]
# 6
# @
# You can sort by the labels of a layered index
# axis=0 reorders the rows according to their row-index labels
# level=0 is the outermost row level, key1 (a,b,..)
# level=1 is the next row level, key2 (1,2,..)
# Sort the rows by key1
dataframe.sort_index(axis=0, level=0)
# city      Seoul     Busan
# color     Green Red Green
# key1 key2
# a    1        0   1     2
#      2        3   4     5
# b    1        6   7     8
#      2        9  10    11
# Sort the rows by key2
dataframe.sort_index(axis=0, level=1)
# city      Seoul     Busan
# color     Green Red Green
# key1 key2
# a    1        0   1     2
# b    1        6   7     8
# a    2        3   4     5
# b    2        9  10    11
# You can refer to the level by its name instead of its position
dataframe.sort_index(axis=0, level="key2")
# The code above is the same as dataframe.sort_index(axis=0,level=1)
# axis=1 does the same job for the columns:
# it reorders the columns according to their column labels
dataframe.sort_index(axis=1, level=0)
dataframe.sort_index(axis=1, level=1)
dataframe.sort_index(axis=1, level="color")
# @
# You can sort the rows by the values of one column;
# with layered columns the column is named by its full (city, color) tuple
dataframe.sort_values(by=("Busan", "Green"))
#      Seoul     Busan
#      Green Red Green
# a 1      0   1     2
#   2      3   4     5
# b 1      6   7     8
#   2      9  10    11
# You can compute statistics per level of a layered index
# NOTE: the level= argument of DataFrame.sum()/mean() was deprecated in
# pandas 1.3 and removed in pandas 2.0, so group on the level and then
# aggregate instead: df.sum(level=k) becomes df.groupby(level=k).sum()
# Add up the rows that share the same key1 label (level=0)
dataframe.groupby(level=0).sum()
# city  Seoul     Busan
# color Green Red Green
# key1
# a         3   5     7
# b        15  17    19
# Add up the rows that share the same key2 label (level=1)
dataframe.groupby(level=1).sum()
# city  Seoul     Busan
# color Green Red Green
# key2
# 1         6   8    10
# 2        12  14    16
# Average the columns that share the same color label:
# transpose so the column levels become the row index, group on "color",
# aggregate, then transpose back to the original orientation
dataframe.T.groupby(level="color").mean().T
# color      Green   Red
# key1 key2
# a    1       1.0   1.0
#      2       4.0   4.0
# b    1       7.0   7.0
#      2      10.0  10.0
# @
dataframe_2 = pd.DataFrame(
    {
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["one"] * 3 + ["two"] * 4,
        "d": [0, 1, 2, 0, 1, 2, 3],
    }
)
#    a  b    c  d
# 0  0  7  one  0
# 1  1  6  one  1
# 2  2  5  one  2
# 3  3  4  two  0
# 4  4  3  two  1
# 5  5  2  two  2
# 6  6  1  two  3
# You can turn columns "c" and "d" into the two levels of the row index
dataframe_3 = dataframe_2.set_index(keys=["c", "d"])
#        a  b
# c   d
# one 0  0  7
#     1  1  6
#     2  2  5
# two 0  3  4
#     1  4  3
#     2  5  2
#     3  6  1
# With drop=False, "c" and "d" also stay in place as ordinary columns
dataframe_2.set_index(keys=["c", "d"], drop=False)
#        a  b    c  d
# c   d
# one 0  0  7  one  0
#     1  1  6  one  1
#     2  2  5  one  2
# two 0  3  4  two  0
#     1  4  3  two  1
#     2  5  2  two  2
#     3  6  1  two  3
# reset_index() does the reverse: the index levels become columns again
# and the rows get a plain integer index
dataframe_3.reset_index()
#      c  d  a  b
# 0  one  0  0  7
# 1  one  1  1  6
# 2  one  2  2  5
# 3  two  0  3  4
# 4  two  1  4  3
# 5  two  2  5  2
# 6  two  3  6  1
# @
# You can reshape a dataframe
dataframe_4 = pd.DataFrame(
    data=np.arange(6).reshape((2, 3)),
    index=["Seoul", "Busan"],
    columns=["one", "two", "three"],
)
#        one  two  three
# Seoul    0    1      2
# Busan    3    4      5
# Give a name to the index and to the columns
dataframe_4.index.name = "city"
dataframe_4.columns.name = "number"
# number  one  two  three
# city
# Seoul     0    1      2
# Busan     3    4      5
# stack() moves the column labels "one two three"
# into the row index, as a new inner level below city
# (the result is a Series, despite the variable name)
dataframe_5 = dataframe_4.stack()
# city   number
# Seoul  one       0
#        two       1
#        three     2
# Busan  one       3
#        two       4
#        three     5
# You should distinguish between stack() and set_index()
# unstack() moves the innermost index level back into the columns
dataframe_5.unstack()
# number  one  two  three
# city
# Seoul     0    1      2
# Busan     3    4      5
# You can also move the outermost level into the columns,
# picking it either by position or by name
dataframe_5.unstack(level=0)
dataframe_5.unstack(level="city")
# @
# Two Series whose labels only partly overlap
s1 = pd.Series(data=[0, 1, 2, 3], index=list("abcd"))
s2 = pd.Series(data=[4, 5, 6], index=list("cde"))
# keys= adds an outer index level recording which input each value came from
s3 = pd.concat([s1, s2], keys=["one", "two"])
# unstack() spreads the inner labels over the columns;
# (key, label) pairs that do not exist in s3 show up as NaN
s3.unstack()
# @
# A more complex case: two columns built from the stacked data
# ("right" is simply "left" shifted up by 5)
df6 = pd.DataFrame(
    data={"left": dataframe_5, "right": dataframe_5 + 5},
    columns=["left", "right"],
)
df6.columns.name = "side"
# Move the "city" level of the row index into the columns ...
df6.unstack(level="city")
# ... then move the "side" level of the columns into the row index
df6.unstack(level="city").stack(level="side")
# @
# Once you start using a layered index,
# a dataframe can become quite complex.
# So I don't recommend using a layered index except in special cases.
# But you are likely to run into layered indexes
# when you work with dataframes,
# so you should at least understand how they work.