005-003. Analyze and visualize the "Game of Thrones" dataset with matplotlib and pandas
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Load the two CSV files of the dataset: battles.csv and character-deaths.csv.
# NOTE(review): the paths are absolute and machine-specific; adjust them before running elsewhere.
battles_csv_path = "/media/young/5e7be152-8ed5-483d-a8e8-b3fecfa221dc/code/PythonPractice/game-of-thrones/battles.csv"
deaths_csv_path = "/media/young/5e7be152-8ed5-483d-a8e8-b3fecfa221dc/code/PythonPractice/game-of-thrones/character-deaths.csv"
battles_dataframe = pd.read_csv(battles_csv_path, sep=",")
deaths_dataframe = pd.read_csv(deaths_csv_path, sep=",")

# Quick look at the battles table: (rows, columns), column labels, first rows.
battles_dataframe.shape
# Recorded output:
# (38, 25)
battles_dataframe.columns
# Recorded output:
# Index(['name', 'year', 'battle_number', 'attacker_king', 'defender_king',
# 'attacker_1', 'attacker_2', 'attacker_3', 'attacker_4', 'defender_1',
# 'defender_2', 'defender_3', 'defender_4', 'attacker_outcome',
# 'battle_type', 'major_death', 'major_capture', 'attacker_size',
# 'defender_size', 'attacker_commander', 'defender_commander', 'summer',
# 'location', 'region', 'note'],
# dtype='object')
battles_dataframe.head()

# Same quick look at the character-deaths table.
deaths_dataframe.shape
# Recorded output:
# (917, 13)
deaths_dataframe.columns
# Recorded output:
# Index(['Name', 'Allegiances', 'Death Year', 'Book of Death', 'Death Chapter',
# 'Book Intro Chapter', 'Gender', 'Nobility', 'GoT', 'CoK', 'SoS', 'FfC',
# 'DwD'],
# dtype='object')
deaths_dataframe.head()
# @
# Visualize how many deaths are recorded for each "Book of Death" value.
# value_counts() tallies the rows per distinct value (most frequent first):
# 3.0 97
# 2.0 73
# 5.0 61
# 1.0 49
# 4.0 27
# sort_index() then reorders the tally by the value itself:
# 1.0 49
# 2.0 73
# 3.0 97
# 4.0 27
# 5.0 61
# e.g. "1.0 49" means 49 rows have "Book of Death" equal to 1.
number_of_death_per_episode_series = (
    deaths_dataframe["Book of Death"]
    .value_counts()
    .sort_index()
)
# Series.plot() draws a line plot and returns the matplotlib Axes it drew on.
# Black ("k") dashed line with a circle marker at every data point.
ax1 = number_of_death_per_episode_series.plot(
    linestyle="--",
    marker="o",
    color="k",
)
# Tick marks on the x axis at 1, 2, 3, 4, 5.
ax1.set_xticks(np.arange(1, 6))
# x axis spans 0..6, leaving a margin on both sides of the ticks.
ax1.set_xlim((0, 6))
# y axis spans 0..120.
ax1.set_ylim((0, 120))
# img be9013c7-ec05-4e46-b258-71ef58b2228f
# @
# Visualize the army sizes of the large battles.
# Make the "name" column the row index, so each bar below is labelled
# with its battle name.
battles_dataframe = battles_dataframe.set_index(["name"])
# Combined army size per battle (attacker_size + defender_size).
# If either size is missing the sum is NaN, the comparison below is False,
# and that battle is left out.
total_battle_size = battles_dataframe["attacker_size"] + battles_dataframe["defender_size"]
# Boolean mask: True for battles whose combined size exceeds 10000.
large_battles_mask = total_battle_size > 10000
# Keep only the masked rows and the two size columns.
# .copy() gives an independent frame: new columns are assigned to it further
# down, and assigning onto a bare .loc selection of battles_dataframe
# triggers pandas' SettingWithCopyWarning.
# (Despite its name this is a DataFrame, not a Series; the name is kept so
# other cells that refer to it keep working.)
large_battles_series = battles_dataframe.loc[
    large_battles_mask, ["attacker_size", "defender_size"]
].copy()
# Recorded output:
# attacker_size defender_size
# name
# Battle of the Golden Tooth 15000.0 4000.0
# Battle of Riverrun 15000.0 10000.0
# Battle of the Green Fork 18000.0 20000.0
# Battle of the Camps 6000.0 12625.0
# Battle of Oxcross 6000.0 10000.0
# Siege of Storm's End 5000.0 20000.0
# Battle of the Fords 20000.0 10000.0
# Battle of the Blackwater 21000.0 7250.0
# Battle of Castle Black 100000.0 1240.0
# Siege of Winterfell 5000.0 8000.0
# Horizontal stacked bars: one bar per battle, attacker and defender sizes
# placed end to end.
ax2 = large_battles_series.plot(kind="barh", stacked=True, fontsize=8)
# img 6144bc34-5645-4fb9-a1e6-a114cd602d84
# Add each side's share of the combined army size.
# The values are fractions between 0 and 1 (not multiplied by 100), so the
# two columns add up to 1 for every battle.
large_battles_total_size = (
    large_battles_series["attacker_size"] + large_battles_series["defender_size"]
)
large_battles_series["attacker_percentage"] = (
    large_battles_series["attacker_size"] / large_battles_total_size
)
large_battles_series["defender_percentage"] = (
    large_battles_series["defender_size"] / large_battles_total_size
)
# Horizontal stacked bars of the two share columns only.
ax3 = large_battles_series[["attacker_percentage", "defender_percentage"]].plot(
    kind="barh", stacked=True, fontsize=8
)
# img 9ab54145-850f-4b8f-8f6f-2ce7174e8bdb
# @
# Visualize how often each family takes part in battles, as attacker or defender.
# Select the eight participant columns by name. A positional slice such as
# columns[4:12] only hits these columns while "name" has been moved to the
# index and the CSV column order stays unchanged.
columns_of_attacker_and_defender = pd.Index([
    "attacker_1", "attacker_2", "attacker_3", "attacker_4",
    "defender_1", "defender_2", "defender_3", "defender_4",
])
# Index(['attacker_1', 'attacker_2', 'attacker_3', 'attacker_4', 'defender_1',
# 'defender_2', 'defender_3', 'defender_4'],
# dtype='object')
# Take those columns, replace NaN with the string "None" (so every cell is a
# string and np.unique can sort them), and extract the underlying 2-D array.
family_names = battles_dataframe[columns_of_attacker_and_defender].fillna("None").values
family_names[:5]
# Recorded output:
# array([['Lannister', 'None', 'None', 'None', 'Tully', 'None', 'None',
# 'None'],
# ['Lannister', 'None', 'None', 'None', 'Baratheon', 'None', 'None',
# 'None'],
# ['Lannister', 'None', 'None', 'None', 'Tully', 'None', 'None',
# 'None'],
# ['Stark', 'None', 'None', 'None', 'Lannister', 'None', 'None',
# 'None'],
# ['Stark', 'Tully', 'None', 'None', 'Lannister', 'None', 'None',
# 'None']], dtype=object)
# Flatten the array and keep each distinct name once (sorted).
duplicate_removed_family_names = np.unique(family_names)
# Recorded output:
# array(['Baratheon', 'Blackwood', 'Bolton', 'Bracken', 'Brave Companions',
# 'Brotherhood without Banners', 'Darry', 'Free folk', 'Frey',
# 'Giants', 'Glover', 'Greyjoy', 'Karstark', 'Lannister',
# 'Mallister', 'Mormont', "Night's Watch", 'None', 'Stark', 'Thenns',
# 'Tully', 'Tyrell'], dtype=object)
# Drop the "None" placeholder with a boolean mask.
none_removed_family_names = duplicate_removed_family_names[
    duplicate_removed_family_names != "None"
]
# Recorded output:
# array(['Baratheon', 'Blackwood', 'Bolton', 'Bracken', 'Brave Companions',
# 'Brotherhood without Banners', 'Darry', 'Free folk', 'Frey',
# 'Giants', 'Glover', 'Greyjoy', 'Karstark', 'Lannister',
# 'Mallister', 'Mormont', "Night's Watch", 'Stark', 'Thenns',
# 'Tully', 'Tyrell'], dtype=object)
# Start a counter Series: one entry per family name, all initialised to 0.
family_names_intervented_to_battles_series = pd.Series(0, index=none_removed_family_names)
# Recorded output:
# Baratheon 0
# Blackwood 0
# Bolton 0
# Bracken 0
# Brave Companions 0
# Brotherhood without Banners 0
# Darry 0
# Free folk 0
# Frey 0
# Giants 0
# Glover 0
# Greyjoy 0
# Karstark 0
# Lannister 0
# Mallister 0
# Mormont 0
# Night's Watch 0
# Stark 0
# Thenns 0
# Tully 0
# Tyrell 0
# Add each participant column's per-family tally to the running counter.
for col in columns_of_attacker_and_defender:
    # fill_value=0: a family that does not occur in this column counts as 0
    # for it, instead of turning that family's running total into NaN.
    family_names_intervented_to_battles_series = (
        family_names_intervented_to_battles_series.add(
            battles_dataframe[col].value_counts(), fill_value=0
        )
    )
# Histogram of the per-family totals; bins=10 splits the range of totals into
# 10 equal-width intervals.
# NOTE(review): this shows how many families fall into each range of
# appearance counts, not one bar per family; a per-family chart would need
# .plot(kind="bar") instead. Confirm which one is intended.
# NOTE(review): with no ax argument, Series.hist draws on the current axes,
# so run this in its own cell/figure to avoid drawing over an earlier plot.
ax4 = family_names_intervented_to_battles_series.hist(bins=10)
# img 3246d2c8-70b9-4007-801c-7022a1364958