006-001. merge series, merge dataframe # You can see one dataset which is composed of mulitple files # When you can encounter this case, # you should merge mulitple dataframes from multiple files into one dataframe # Task of merging 2 dataframes into one dataframe can be devided into 2 ways # 1. concatenating: # You attatch one dataframe onto other dataframe consecutively # 1. merging: # You first find one column which is commonly contained in 2 dataframes, # and then, you merge row based on common column import pandas as pd import numpy as np # @ # Let's talk about merging dataframe dataframe_1=pd.DataFrame({"key":list("bbacaab"),"data1":range(7)}) # data1 key # 0 0 b # 1 1 b # 2 2 a # 3 3 c # 4 4 a # 5 5 a # 6 6 b dataframe_2=pd.DataFrame({"key":list("abd"),"data2":range(3)}) # data2 key # 0 0 a # 1 1 b # 2 2 d # Above 2 dataframes have "key" column commonly # "key" column has categoriacal data ranged from a to d # You will merge 2 dataframes based on "key" column pd.merge(dataframe_1,dataframe_2,on="key") # data1 key data2 # 0 0 b 1 # 1 1 b 1 # 2 6 b 1 # 3 2 a 0 # 4 4 a 0 # 5 5 a 0 # Since c and d are only contained in dataframe_1, # rows which contain c or d aren't contained in merged dataframe # Above merge is similar to inner join # img dacefb71-a4ae-42aa-8b74-25f1f71ef2d7 # You can use other options to use outter join # Following rows will be also contained into merged dataframe # 3.0 c NaN # NaN d 2.0 pd.merge(dataframe_1,dataframe_2,on="key",how="outer") # data1 key data2 # 0 0.0 b 1.0 # 1 1.0 b 1.0 # 2 6.0 b 1.0 # 3 2.0 a 0.0 # 4 4.0 a 0.0 # 5 5.0 a 0.0 # 6 3.0 c NaN # 7 NaN d 2.0 # img ee57ed32-e1f7-4241-8331-c3de193dc675 # You can use left join, # which merges based on dataframe_1(left, 1st argument) # You first find row whose key is "b" in "data2", # and you attatch value onto merged dataframe pd.merge(dataframe_1,dataframe_2,on="key",how="left") # data1 key data2 # 0 0 b 1.0 # 1 1 b 1.0 # 2 2 a 0.0 # 3 3 c NaN # 4 4 a 0.0 # 5 5 a 0.0 # 6 6 b 1.0 # img 92f95106-8061-4e75-b9ec-96cf0135f327 pd.merge(dataframe_1,dataframe_2,on="key",how="right") # data1 key data2 # 0 0.0 b 1 # 1 1.0 b 1 # 2 6.0 b 1 # 3 2.0 a 0 # 4 4.0 a 0 # 5 5.0 a 0 # 6 NaN d 2 # img 702bc179-d5ca-4946-9874-3ba382d0bb77 # @ # inner(default), outter, left, right dataframe_3=pd.DataFrame({"lkey":list("bbacaab"),"data1": range(7)}) # data1 lkey # 0 0 b # 1 1 b # 2 2 a # 3 3 c # 4 4 a # 5 5 a # 6 6 b dataframe_4=pd.DataFrame({"rkey":list("abd"),"data2": range(3)}) # data2 rkey # 0 0 a # 1 1 b # 2 2 d pd.merge(dataframe_3,dataframe_4,left_on="lkey",right_on="rkey") # data1 lkey data2 rkey # 0 0 b 1 b # 1 1 b 1 b # 2 6 b 1 b # 3 2 a 0 a # 4 4 a 0 a # 5 5 a 0 a # img 4cea1664-62d0-4774-b1e9-823b326f8639 # @ # You can merge dataframes based on index value dataframe_left_1=pd.DataFrame({'key':['a','b','a','a','b','c'],'value': range(6)}) # key value # 0 a 0 # 1 b 1 # 2 a 2 # 3 a 3 # 4 b 4 # 5 c 5 dataframe_right_1=pd.DataFrame({'group_val':[3.5,7]},index=['a','b']) # group_val # a 3.5 # b 7.0 pd.merge(dataframe_left_1,dataframe_right_1,left_on="key",right_index=True) # key value group_val # 0 a 0 3.5 # 2 a 2 3.5 # 3 a 3 3.5 # 1 b 1 7.0 # 4 b 4 7.0 dataframe_left_2=pd.DataFrame([[1.,2.],[3.,4.],[5.,6.]]\ ,index=['a','c','e']\ ,columns=['Seoul','Incheon']) # Seoul Incheon # a 1.0 2.0 # c 3.0 4.0 # e 5.0 6.0 dataframe_right_2=pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[13,14]]\ ,index=['b','c','d','e']\ ,columns=['Daegu','Ulsan']) # Daegu Ulsan # b 7.0 8.0 # c 9.0 10.0 # d 11.0 12.0 # e 13.0 14.0 pd.merge(dataframe_left_2,dataframe_right_2,how="outer",left_index=True,right_index=True) # Seoul Incheon Daegu Ulsan # a 1.0 2.0 NaN NaN # b NaN NaN 7.0 8.0 # c 3.0 4.0 9.0 10.0 # d NaN NaN 11.0 12.0 # e 5.0 6.0 13.0 14.0 # @ # Let's talk about concatenating DataFrame # You create 3 Serieses series_1=pd.Series([0,1],index=["a","b"]) # a 0 # b 1 series_2=pd.Series([2,3,4],index=["c","d","e"]) # c 2 # d 3 # e 4 series_3=pd.Series([5,6],index=["f","g"]) # f 5 # g 6 # You concatenate 3 Serieses pd.concat([series_1,series_2,series_3]) # a 0 # b 1 # c 2 # d 3 # e 4 # f 5 # g 6 # img dec586f3-b22a-42bb-9899-be536113a9ac # You can craete dataframe pd.concat([series_1,series_2,series_3],axis=1) # 0 1 2 # a 0.0 NaN NaN # b 1.0 NaN NaN # c NaN 2.0 NaN # d NaN 3.0 NaN # e NaN 4.0 NaN # f NaN NaN 5.0 # g NaN NaN 6.0 # img d29467c4-eeed-4d46-b237-721f6ab402ba # You create Series series_4=pd.concat([series_1*5,series_3]) # a 0 # b 5 # f 5 # g 6 pd.concat([series_1,series_4],axis=1) # 0 1 # a 0.0 0 # b 1.0 5 # f NaN 5 # g NaN 6 # You give name for concatenated dataframe's index pd.concat([series_1,series_2,series_3],axis=1,keys=["one","two","three"]) # one two three # a 0.0 NaN NaN # b 1.0 NaN NaN # c NaN 2.0 NaN # d NaN 3.0 NaN # e NaN 4.0 NaN # f NaN NaN 5.0 # g NaN NaN 6.0 # You perform concatenate with dataframes dataframe_1=pd.DataFrame(np.arange(6).reshape(3,2)\ ,index=['a','b','c']\ ,columns=['one','two']) # one two # a 0 1 # b 2 3 # c 4 5 dataframe_2=pd.DataFrame(5+np.arange(4).reshape(2,2)\ ,index=['a','c']\ ,columns=['three','four']) # three four # a 5 6 # c 7 8 pd.concat([dataframe_1,dataframe_2],axis=1) # one two three four # a 0 1 5.0 6.0 # b 2 3 NaN NaN # c 4 5 7.0 8.0 # @ # You create dataframes dataframe_3=pd.DataFrame(np.random.randn(3,4),columns=['a','b','c','d']) # a b c d # 0 1.195624 -0.824417 0.244959 0.329864 # 1 -0.128856 -1.185168 -1.087919 -0.780488 # 2 -1.392841 -0.302125 1.597788 0.241045 dataframe_4=pd.DataFrame(np.random.randn(2,3),columns=['b','d','a']) # b d a # 0 0.845702 -0.108371 -0.073878 # 1 0.952161 -0.012805 0.773525 # You first try this one pd.concat([dataframe_3,dataframe_4]) # a b c d # 0 1.195624 -0.824417 0.244959 0.329864 # 1 -0.128856 -1.185168 -1.087919 -0.780488 # 2 -1.392841 -0.302125 1.597788 0.241045 # 0 -0.073878 0.845702 NaN -0.108371 # 1 0.773525 0.952161 NaN -0.012805 # index is 0 1 2 0 1 pd.concat([dataframe_3,dataframe_4],ignore_index=True) # a b c d # 0 1.195624 -0.824417 0.244959 0.329864 # 1 -0.128856 -1.185168 -1.087919 -0.780488 # 2 -1.392841 -0.302125 1.597788 0.241045 # 3 -0.073878 0.845702 NaN -0.108371 # 4 0.773525 0.952161 NaN -0.012805 # index is 0 1 2 3 4