008-002. IO data, load csv, create csv, export csv
@
# Pandas can make dataframe by loading data file in various formats
CSV
Excel
HTML
JSON
HDF5
SAS
STATA
SQL
@
# You can create text file to make csv file by doing following magic command
%%writefile sample1.csv
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three
# Writing sample1.csv
@
# You can create dataframe object by loading csv file
pd.read_csv('sample1.csv')
# c1 c2 c3
# 0 1 1.11 one
# 1 2 2.22 two
# 2 3 3.33 three
# We designated columns index
# But we didn't for row index, so that, integer row index is automatically inserted
@
# I create csv file without both row index and column index
%%writefile sample2.csv
1, 1.11, one
2, 2.22, two
3, 3.33, three
# Writing sample2.csv
# If you want to add column index, you can add it
pd.read_csv('sample2.csv', names=['c1', 'c2', 'c3'])
# c1 c2 c3
# 0 1 1.11 one
# 1 2 2.22 two
# 2 3 3.33 three
@
# If you want to use specific column index as row index,
pd.read_csv('sample1.csv', index_col='c1')
# c2 c3
# c1
# 1 1.11 one
# 2 2.22 two
# 3 3.33 three
@
%%writefile sample3.txt
c1 c2 c3 c4
0.179181 -1.538472 1.347553 0.43381
1.024209 0.087307 -1.281997 0.49265
0.417899 -2.002308 0.255245 -1.10515
# Writing sample3.txt
# If you run into none-csv file without comma delimeter, you can designate comma delimeter by sep argument
# If delimiter is unfixed length whitespace, you can use regular expression patter \s+
pd.read_table('sample3.txt', sep='\s+')
# c1 c2 c3 c4
# 0 0.179181 -1.538472 1.347553 0.43381
# 1 1.024209 0.087307 -1.281997 0.49265
# 2 0.417899 -2.002308 0.255245 -1.10515
@
%%writefile sample4.txt
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three
# Writing sample4.txt
# If you want to skip some rows in data file,
pd.read_csv('sample4.txt', skiprows=[0, 1])
# c1 c2 c3
# 2 3 3.33 three
@
%%writefile sample5.csv
c1, c2, c3
1, 1.11, one
2,, two
empty, 3.33, three
# If you want to treat specific value as NaN,
# you put specific value you want to treat as NaN into na_values argument
df = pd.read_csv('sample5.csv', na_values=['empty'])
df
# c1 c2 c3
# 0 1.0 1.11 one
# 1 2.0 NaN two
# 2 NaN 3.33 three
@
%%writefile sample6.txt
c1,c2,c3
0,1.0,1.11,one
1,2.0,two
2,3.33,three
# If you want to export dataframe in form of csv file,
# you can use to_csv()
df.to_csv('sample6.csv')
# You can check contents of that csv file by using cat shell command
# In windows, you can use type command
# Exclamation mark(!) is magic command to use shell comman in IPython
!cat sample6.csv # In windows, you can use !type command
# ,c1, c2, c3
# 0,1.0,1.11, one
# 1,2.0,, two
# 2,,3.33, three
@
# When you export file, you can use sep argument to change delimeter
df.to_csv('sample7.txt', sep='|')
# !cat sample7.txt
# |c1| c2| c3
# 0|1.0|1.11| one
# 1|2.0|| two
# 2||3.33| three
@
# You can change NaN denotation by using na_rep argument
df.to_csv('sample8.csv', na_rep='empty')
# !cat sample8.csv
# ,c1, c2, c3
# 0,1.0,1.11, one
# 1,2.0,empty, two
# 2,empty,3.33, three
@
# You can designate column or row to show data table
df.index = ["a", "b", "c"]
# or
df.index = ["c1", "c2", "c3"]
df
# c1 c2 c3
# a 1.0 1.11 one
# b 2.0 NaN two
# c NaN 3.33 three
# You can designate wheter printing column or row to show
df.to_csv('sample9.csv', index=False, header=False)
# !cat sample9.csv
# 1.0,1.11, one
# 2.0,, two
# ,3.33, three
@
# You can find various data file in form of csv
# When you use read_csv(), you can use url instead of path of csv file
df = pd.read_csv('https://raw.githubusercontent.com/datascienceschool/docker_rpython/master/data/titanic.csv')
df
# If data is too massive, dataframe shows partial table
# survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
# 0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
# 1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
# 2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
# 3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
# 4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
# 5 0 3 male NaN 0 0 8.4583 Q Third man True NaN Queenstown no True
# 6 0 1 male 54.0 0 0 51.8625 S First man True E Southampton no True
# 7 0 3 male 2.0 3 1 21.0750 S Third child False NaN Southampton no False
# 8 1 3 female 27.0 0 2 11.1333 S Third woman False NaN Southampton yes False
# 9 1 2 female 14.0 1 0 30.0708 C Second child False NaN Cherbourg yes False
# 10 1 3 female 4.0 1 1 16.7000 S Third child False G Southampton yes False
# 11 1 1 female 58.0 0 0 26.5500 S First woman False C Southampton yes True
# 12 0 3 male 20.0 0 0 8.0500 S Third man True NaN Southampton no True
# 13 0 3 male 39.0 1 5 31.2750 S Third man True NaN Southampton no False
# 14 0 3 female 14.0 0 0 7.8542 S Third child False NaN Southampton no True
# 15 1 2 female 55.0 0 0 16.0000 S Second woman False NaN Southampton yes True
# 16 0 3 male 2.0 4 1 29.1250 Q Third child False NaN Queenstown no False
# 17 1 2 male NaN 0 0 13.0000 S Second man True NaN Southampton yes True
# 18 0 3 female 31.0 1 0 18.0000 S Third woman False NaN Southampton no False
# 19 1 3 female NaN 0 0 7.2250 C Third woman False NaN Cherbourg yes True
# 20 0 2 male 35.0 0 0 26.0000 S Second man True NaN Southampton no True
# 21 1 2 male 34.0 0 0 13.0000 S Second man True D Southampton yes True
# 22 1 3 female 15.0 0 0 8.0292 Q Third child False NaN Queenstown yes True
# 23 1 1 male 28.0 0 0 35.5000 S First man True A Southampton yes True
# 24 0 3 female 8.0 3 1 21.0750 S Third child False NaN Southampton no False
# 25 1 3 female 38.0 1 5 31.3875 S Third woman False NaN Southampton yes False
# 26 0 3 male NaN 0 0 7.2250 C Third man True NaN Cherbourg no True
# 27 0 1 male 19.0 3 2 263.0000 S First man True C Southampton no False
# 28 1 3 female NaN 0 0 7.8792 Q Third woman False NaN Queenstown yes True
# 29 0 3 male NaN 0 0 7.8958 S Third man True NaN Southampton no True
# ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
# 861 0 2 male 21.0 1 0 11.5000 S Second man True NaN Southampton no False
# 862 1 1 female 48.0 0 0 25.9292 S First woman False D Southampton yes True
# 863 0 3 female NaN 8 2 69.5500 S Third woman False NaN Southampton no False
# 864 0 2 male 24.0 0 0 13.0000 S Second man True NaN Southampton no True
# 865 1 2 female 42.0 0 0 13.0000 S Second woman False NaN Southampton yes True
# 866 1 2 female 27.0 1 0 13.8583 C Second woman False NaN Cherbourg yes False
# 867 0 1 male 31.0 0 0 50.4958 S First man True A Southampton no True
# 868 0 3 male NaN 0 0 9.5000 S Third man True NaN Southampton no True
# 869 1 3 male 4.0 1 1 11.1333 S Third child False NaN Southampton yes False
# 870 0 3 male 26.0 0 0 7.8958 S Third man True NaN Southampton no True
# 871 1 1 female 47.0 1 1 52.5542 S First woman False D Southampton yes False
# 872 0 1 male 33.0 0 0 5.0000 S First man True B Southampton no True
# 873 0 3 male 47.0 0 0 9.0000 S Third man True NaN Southampton no True
# 874 1 2 female 28.0 1 0 24.0000 C Second woman False NaN Cherbourg yes False
# 875 1 3 female 15.0 0 0 7.2250 C Third child False NaN Cherbourg yes True
# 876 0 3 male 20.0 0 0 9.8458 S Third man True NaN Southampton no True
# 877 0 3 male 19.0 0 0 7.8958 S Third man True NaN Southampton no True
# 878 0 3 male NaN 0 0 7.8958 S Third man True NaN Southampton no True
# 879 1 1 female 56.0 0 1 83.1583 C First woman False C Cherbourg yes False
# 880 1 2 female 25.0 0 1 26.0000 S Second woman False NaN Southampton yes False
# 881 0 3 male 33.0 0 0 7.8958 S Third man True NaN Southampton no True
# 882 0 3 female 22.0 0 0 10.5167 S Third woman False NaN Southampton no True
# 883 0 2 male 28.0 0 0 10.5000 S Second man True NaN Southampton no True
# 884 0 3 male 25.0 0 0 7.0500 S Third man True NaN Southampton no True
# 885 0 3 female 39.0 0 5 29.1250 Q Third woman False NaN Queenstown no False
# 886 0 2 male 27.0 0 0 13.0000 S Second man True NaN Southampton no True
# 887 1 1 female 19.0 0 0 30.0000 S First woman False B Southampton yes True
# 888 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False
# 889 1 1 male 26.0 0 0 30.0000 C First man True C Cherbourg yes True
# 890 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True
# 891 rows × 15 columns
@
# If you want to see specific number of data from front or back area, you use head() or tail()
# You can designate the number of row or column to show as argument
df.head()
# survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
# 0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
# 1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
# 2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
# 3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
# 4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
df.tail(2)
# survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
# 889 1 1 male 26.0 0 0 30.00 C First man True C Cherbourg yes True
# 890 0 3 male 32.0 0 0 7.75 Q Third man True NaN Queenstown no True
@
# If you use DataReader of pandas_datareader package, you can directly and automatically load data of some internet site
# To use pandas_datareader package, you should install separately regardless of pandas
# Following is site examples of pandas_datareader provding
# Some site can require register and payment
# FRED
# Fama/French
# World Bank
# OECD
# Eurostat
# EDGAR Index
# TSP Fund Data
# Oanda currency historical rate
# Nasdaq Trader Symbol Definitions
# For more detail, you can see following site
# https://pandas-datareader.readthedocs.io/en/latest/index.html
@
from pandas_datareader.data import DataReader
# You can designate datetime by using datetime package or in form of string
# If you use string for datetime, internally it uses dateutil package
import datetime
dt_start = datetime.datetime(2015, 1, 1)
dt_end = "2016, 6, 30"
# You designate web site name as argument to load data
# You should find code of data you want to explore in following sites
# https://fred.stlouisfed.org/series/GDP
# https://fred.stlouisfed.org/series/CPIAUCSL
# https://fred.stlouisfed.org/series/CPILFESL
# Following is example of loading GDP of USA
gdp = DataReader("GDP", "fred", dt_start, dt_end)
gdp.tail()
# GDP
# DATE
# 2015-04-01 18093.224
# 2015-07-01 18227.689
# 2015-10-01 18287.226
# 2016-01-01 18325.187
# 2016-04-01 18538.039
# If you put list for code, you can bring multiple data at the same time
# CPIAUCSL is consumer price index
# CPILFESL is consumer price index excluded food and energy
inflation = DataReader(["CPIAUCSL", "CPILFESL"], "fred", dt_start, dt_end)
inflation.tail()
# CPIAUCSL CPILFESL
# DATE
# 2016-02-01 237.808 245.845
# 2016-03-01 238.078 246.062
# 2016-04-01 238.908 246.517
# 2016-05-01 239.362 247.026
# 2016-06-01 239.842 247.389