006-003. manipulate dataframe
import pandas as pd
import numpy as np
df=pd.DataFrame({'k1':['one']*3+['two']*4,
'k2':[1,1,2,3,3,4,4]})
# k1 k2
# 0 one 1
# 1 one 1
# 2 one 2
# 3 two 3
# 4 two 3
# 5 two 4
# 6 two 4
# You can see duplicated rows in the DataFrame above
df.duplicated()
# 0 False
# 1 True
# 2 False
# 3 False
# 4 True
# 5 False
# 6 True
# You can see True at each duplicated row
# in the boolean mask Series above
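# You can use the mask to select just the duplicated rows
df[df.duplicated()]
# k1 k2
# 1 one 1
# 4 two 3
# 6 two 4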
# You can use drop_duplicates()
# to remove the duplicated rows
df.drop_duplicates()
# k1 k2
# 0 one 1
# 2 one 2
# 3 two 3
# 5 two 4
# Now add a new column named "v1"
df["v1"]=np.arange(7)
# k1 k2 v1
# 0 one 1 0
# 1 one 1 1
# 2 one 2 2
# 3 two 3 3
# 4 two 3 4
# 5 two 4 5
# 6 two 4 6
# You can drop rows based on the "k1" column alone,
# keeping only the first occurrence of each value
df.drop_duplicates(["k1"])
# k1 k2 v1
# 0 one 1 0
# 3 two 3 3
# You can drop rows based on the "k1" and "k2" columns
# if they contain duplicated value pairs
# keep="last" keeps the last occurrence instead of the first
df.drop_duplicates(["k1","k2"],keep="last")
# k1 k2 v1
# 1 one 1 1
# 2 one 2 2
# 4 two 3 4
# 6 two 4 6
# You can use "first"(default)
# @
# Let's talk about data mapping
df2=pd.DataFrame({'food': ['bacon','pulled pork','bacon','Pastrami',
'corned beef','Bacon','pastrami','honey ham',
'nova lox'],
'ounces': [4,3,12,6,7.5,8,3,5,6]})
# food ounces
# 0 bacon 4.0
# 1 pulled pork 3.0
# 2 bacon 12.0
# 3 Pastrami 6.0
# 4 corned beef 7.5
# 5 Bacon 8.0
# 6 pastrami 3.0
# 7 honey ham 5.0
# 8 nova lox 6.0
# Create a dictionary that maps each food to its animal
meat_to_animal={
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
# You add "animal" column
# You first bring "food" column,
# and use apply() with passing lambda,
# which uses key of meat_to_animal
df2["animal"]=df2["food"].apply(lambda x: meat_to_animal[x.lower()])
df2
# food ounces animal
# 0 bacon 4.0 pig
# 1 pulled pork 3.0 pig
# 2 bacon 12.0 pig
# 3 Pastrami 6.0 cow
# 4 corned beef 7.5 cow
# 5 Bacon 8.0 pig
# 6 pastrami 3.0 cow
# 7 honey ham 5.0 pig
# 8 nova lox 6.0 salmon
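# As an alternative to apply(), map() looks each value up
# in the dict directly, and returns NaN for missing keys
# (where the lambda above would raise a KeyError)
df2["food"].str.lower().map(meat_to_animal)
# 0 pig
# 1 pig
# 2 pig
# 3 cow
# 4 cow
# 5 pig
# 6 cow
# 7 pig
# 8 salmon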
# @
s=pd.Series([1.,-999.,2.,-999.,-1000.,3.])
# 0 1.0
# 1 -999.0
# 2 2.0
# 3 -999.0
# 4 -1000.0
# 5 3.0
# You can use replace()
# -999 is the value you want to find
# np.nan is the replacement for -999
s2=s.replace(-999,np.nan)
# 0 1.0
# 1 NaN
# 2 2.0
# 3 NaN
# 4 -1000.0
# 5 3.0
# You can chain another replace() to turn the NaN values into 0
s2.replace(np.nan,0)
# 0 1.0
# 1 0.0
# 2 2.0
# 3 0.0
# 4 -1000.0
# 5 3.0
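# replace() also accepts a list of values to find,
# or a dict mapping each value to its own replacement
s.replace([-999,-1000],np.nan)
s.replace({-999:np.nan,-1000:0})
# the dict version gives:
# 0 1.0
# 1 NaN
# 2 2.0
# 3 NaN
# 4 0.0
# 5 3.0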
# @
# You can deal with categorical data
# because pandas provides a category data type
# Its benefits are reduced memory consumption
# and faster indexing performance
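# A minimal sketch of the memory benefit
# (s_obj/s_cat are just illustration names;
# exact byte counts vary by pandas version)
s_obj=pd.Series(["low","medium","high"]*100000)
s_cat=s_obj.astype("category")
s_obj.memory_usage(deep=True)  # object dtype stores every string
s_cat.memory_usage(deep=True)  # category stores int8 codes + 3 labels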
df3=pd.DataFrame({"id":[1,2,3,4,5,6],"raw_grade":['a','b','b','a','a','e']})
# id raw_grade
# 0 1 a
# 1 2 b
# 2 3 b
# 3 4 a
# 4 5 a
# 5 6 e
# You can convert the data type by using astype()
df3["grade"]=df3["raw_grade"].astype("category")
# id raw_grade grade
# 0 1 a a
# 1 2 b b
# 2 3 b b
# 3 4 a a
# 4 5 a a
# 5 6 e e
df3["grade"]
# 0 a
# 1 b
# 2 b
# 3 a
# 4 a
# 5 e
# Name: grade, dtype: category
# Categories (3, object): [a, b, e]
df3["grade"].cat.categories
# You can rename the category values
# (newer pandas requires rename_categories()
# instead of assigning to .cat.categories directly)
df3["grade"]=df3["grade"].cat.rename_categories(["very good","good","very bad"])
# You can change the number of categories
# from 3 categories to 5 categories
df3["grade"]=df3["grade"].cat.set_categories(["very bad","bad","medium","good","very good"])
# @
# You can sort in the order
# "very bad","bad","medium","good","very good"
df3.sort_values(by="grade")
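# Grouping by the column keeps all five categories,
# even the empty ones
# (the observed= keyword may be needed in newer pandas)
df3.groupby("grade",observed=False).size()
# grade
# very bad 1
# bad 0
# medium 0
# good 2
# very good 3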
# @
# You can categorize numerical data
ages=[20,22,25,27,21,23,37,31,61,45,41,32]
# These are the bin edges you will use
bins=[18,25,35,60,100]
cats=pd.cut(ages,bins)
# cats holds intervals such as (18, 25], (25, 35], ...
cats.codes
# array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
# You can label the bins with your own group names
group_names=["Youth","YoungAdult","MiddleAged","Senior"]
pd.cut(ages,bins,labels=group_names)
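# By default bins are closed on the right, like (18, 25]
# Pass right=False to close them on the left instead: [18, 25)
pd.cut(ages,bins,right=False)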
data=np.random.rand(20)
# This is a simpler way
# 4 is the number of equal-width bins
# precision=2 rounds the bin edges to 2 decimal places
pd.cut(data,4,precision=2)
data2=np.random.randn(1000)
# qcut() creates the bins from sample quantiles (here quartiles),
# so each bin holds about the same number of values
cats=pd.qcut(data2,4)
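# Each quartile bin holds exactly a quarter of the data
pd.value_counts(cats)
# each of the 4 bins contains 250 of the 1000 values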