# 006-004. groupby()
# Sample frame used throughout these notes: two string key columns and two
# columns of random normal values (so the numbers differ on every run).
df = pd.DataFrame(
    dict(
        key1=list("aabba"),
        key2="one two one two one".split(),
        data1=np.random.randn(5),
        data2=np.random.randn(5),
    )
)
# data1 data2 key1 key2
# 0 1.268888 -0.305794 a one
# 1 2.027708 0.735461 a two
# 2 -0.154353 0.917801 b one
# 3 -1.603911 -2.158357 b two
# 4 -1.144526 0.420716 a one
# df["data1"] selects the "data1" column, which is a Series.
# Group that Series by the labels found in df["key1"].
grouped = df["data1"].groupby(by=df["key1"])
# Aggregate: one mean value per distinct key1 label.
grouped.mean()
# key1
# a 0.717357
# b -0.879132
# Name: data1, dtype: float64
# Above result was from 2 groups(a and b)
# Group "data1" by two keys at once; the result is a Series whose index
# has two levels (key1, key2).
means = (
    df["data1"]
    .groupby([df["key1"], df["key2"]])
    .mean()
)
# key1 key2
# a one 0.062181
# two 2.027708
# b one -0.154353
# two -1.603911
# Name: data1, dtype: float64
# Pivot the innermost index level (key2) into columns.
means.unstack(level=-1)
# key2 one two
# key1
# a 0.062181 2.027708
# b -0.154353 -1.603911
# groupby() can also be called on the DataFrame itself, passing the name
# (or a list of names) of the column(s) to group by.
# numeric_only=True is needed when grouping by "key1" alone: the remaining
# string column "key2" cannot be averaged, and pandas >= 2.0 raises
# TypeError instead of silently dropping it.
df.groupby("key1").mean(numeric_only=True)
df.groupby("key1").count()
# Grouping by both key columns leaves only the numeric columns to aggregate.
df.groupby(["key1", "key2"]).mean()
df.groupby(["key1", "key2"]).count()
# @
# Pick a single column out of the grouped frame before aggregating it.
df.groupby(["key1", "key2"])["data2"].mean()
# @
# Iterating over a GroupBy object yields (group name, sub-frame) pairs,
# which lets you inspect each group.
for name, group in df.groupby("key1"):
    print(name)
    print(group)

# With several grouping keys the group name is a tuple, unpacked here
# into its (key1, key2) components.
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print(k1, k2)
    print(group)
# Collect the groups into a plain dict keyed by the key1 label, so a single
# group's sub-frame can be looked up directly.
pieces = {label: frame for label, frame in df.groupby("key1")}
pieces
pieces["b"]
# @
# 5x5 frame of random normal values: rows are people, columns are 'a'..'e'.
df2 = pd.DataFrame(
    np.random.randn(5, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
)
# Mapping from column label to group label. 'f' matches no column of df2;
# unused mapping keys are simply ignored by groupby.
map_dict = {'a': 'red', 'b': 'red', 'c': 'blue',
            'd': 'blue', 'e': 'red', 'f': 'orange'}
# Group the COLUMNS by colour and sum within each colour.
# groupby(..., axis=1) is deprecated in recent pandas releases; the
# documented replacement is to transpose, group the rows, and transpose back.
df2.T.groupby(map_dict).sum().T
# The same mapping expressed as a Series works as a grouping key too;
# use it here (the original built map_s but passed map_dict again).
map_s = pd.Series(map_dict)
df2.T.groupby(map_s).count().T
# @
# Rebind `grouped` to a whole-frame grouping on the "key1" column.
grouped = df.groupby(by="key1")
# You can define your own aggregation function and pass it to agg().
def peak_to_peak(arr):
    """Return the spread of *arr*: its maximum minus its minimum.

    *arr* must provide ``max()`` and ``min()`` methods whose results
    support subtraction (e.g. a numeric pandas Series or NumPy array).
    """
    return arr.max() - arr.min()
# Restrict the aggregation to the numeric columns: `grouped` still contains
# the string column "key2", and pandas >= 2.0 raises TypeError instead of
# silently dropping columns the function cannot handle.
grouped[["data1", "data2"]].agg(peak_to_peak)
# Built-in aggregations can be requested by name.
grouped[["data1", "data2"]].agg("std")
# Summary statistics per group.
grouped.describe()