008-002. IO data, load csv, create csv, export csv
@
# Pandas can make dataframe by loading data file in various formats
CSV
Excel
HTML
JSON
HDF5
SAS
STATA
SQL
@
# You can create text file to make csv file by doing following magic command
%%writefile sample1.csv
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three
# Writing sample1.csv
@
# You can create dataframe object by loading csv file
pd.read_csv('sample1.csv')
# c1	c2	c3
# 0	1	1.11	one
# 1	2	2.22	two
# 2	3	3.33	three
# We designated columns index
# But we didn't for row index, so that, integer row index is automatically inserted
@
# I create csv file without both row index and column index
%%writefile sample2.csv
1, 1.11, one
2, 2.22, two
3, 3.33, three
# Writing sample2.csv
# If you want to add column index, you can add it 
pd.read_csv('sample2.csv', names=['c1', 'c2', 'c3'])
# c1	c2	c3
# 0	1	1.11	one
# 1	2	2.22	two
# 2	3	3.33	three
@
# If you want to use specific column index as row index,
pd.read_csv('sample1.csv', index_col='c1')
# c2	c3
# c1		
# 1	1.11	one
# 2	2.22	two
# 3	3.33	three
@
%%writefile sample3.txt
c1        c2        c3        c4 
0.179181 -1.538472  1.347553  0.43381
1.024209  0.087307 -1.281997  0.49265
0.417899 -2.002308  0.255245 -1.10515
# Writing sample3.txt
# If you run into none-csv file without comma delimeter, you can designate comma delimeter by sep argument
# If delimiter is unfixed length whitespace, you can use regular expression patter \s+
pd.read_table('sample3.txt', sep='\s+')
# c1	c2	c3	c4
# 0	0.179181	-1.538472	1.347553	0.43381
# 1	1.024209	0.087307	-1.281997	0.49265
# 2	0.417899	-2.002308	0.255245	-1.10515
@
%%writefile sample4.txt
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three
# Writing sample4.txt
# If you want to skip some rows in data file,
pd.read_csv('sample4.txt', skiprows=[0, 1])
# c1	c2	c3
# 2	3	3.33	three
@
%%writefile sample5.csv
c1, c2, c3
1, 1.11, one
2,, two
empty, 3.33, three
# If you want to treat specific value as NaN, 
# you put specific value you want to treat as NaN into na_values argument
df = pd.read_csv('sample5.csv', na_values=['empty'])
df
# c1	c2	c3
# 0	1.0	1.11	one
# 1	2.0	NaN	two
# 2	NaN	3.33	three
@
%%writefile sample6.txt
c1,c2,c3
0,1.0,1.11,one
1,2.0,two
2,3.33,three
# If you want to export dataframe in form of csv file,
# you can use to_csv()
df.to_csv('sample6.csv')
# You can check contents of that csv file by using cat shell command
# In windows, you can use type command
# Exclamation mark(!) is magic command to use shell comman in IPython
!cat sample6.csv  # In windows, you can use !type command
# ,c1, c2, c3
# 0,1.0,1.11, one
# 1,2.0,, two
# 2,,3.33, three
@
# When you export file, you can use sep argument to change delimeter
df.to_csv('sample7.txt', sep='|')
# !cat sample7.txt
# |c1| c2| c3
# 0|1.0|1.11| one
# 1|2.0|| two
# 2||3.33| three
@
# You can change NaN denotation by using na_rep argument
df.to_csv('sample8.csv', na_rep='empty')
# !cat sample8.csv
# ,c1, c2, c3
# 0,1.0,1.11, one
# 1,2.0,empty, two
# 2,empty,3.33, three
@
# You can designate column or row to show data table
df.index = ["a", "b", "c"]
# or
df.index = ["c1", "c2", "c3"]
df
# c1	c2	c3
# a	1.0	1.11	one
# b	2.0	NaN	two
# c	NaN	3.33	three
# You can designate wheter printing column or row to show
df.to_csv('sample9.csv', index=False, header=False)
# !cat sample9.csv
# 1.0,1.11, one
# 2.0,, two
# ,3.33, three
@
# You can find various data file in form of csv
# When you use read_csv(), you can use url instead of path of csv file
df = pd.read_csv('https://raw.githubusercontent.com/datascienceschool/docker_rpython/master/data/titanic.csv')
df
# If data is too massive, dataframe shows partial table
# survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
# 0	0	3	male	22.0	1	0	7.2500	S	Third	man	True	NaN	Southampton	no	False
# 1	1	1	female	38.0	1	0	71.2833	C	First	woman	False	C	Cherbourg	yes	False
# 2	1	3	female	26.0	0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
# 3	1	1	female	35.0	1	0	53.1000	S	First	woman	False	C	Southampton	yes	False
# 4	0	3	male	35.0	0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True
# 5	0	3	male	NaN	0	0	8.4583	Q	Third	man	True	NaN	Queenstown	no	True
# 6	0	1	male	54.0	0	0	51.8625	S	First	man	True	E	Southampton	no	True
# 7	0	3	male	2.0	3	1	21.0750	S	Third	child	False	NaN	Southampton	no	False
# 8	1	3	female	27.0	0	2	11.1333	S	Third	woman	False	NaN	Southampton	yes	False
# 9	1	2	female	14.0	1	0	30.0708	C	Second	child	False	NaN	Cherbourg	yes	False
# 10	1	3	female	4.0	1	1	16.7000	S	Third	child	False	G	Southampton	yes	False
# 11	1	1	female	58.0	0	0	26.5500	S	First	woman	False	C	Southampton	yes	True
# 12	0	3	male	20.0	0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True
# 13	0	3	male	39.0	1	5	31.2750	S	Third	man	True	NaN	Southampton	no	False
# 14	0	3	female	14.0	0	0	7.8542	S	Third	child	False	NaN	Southampton	no	True
# 15	1	2	female	55.0	0	0	16.0000	S	Second	woman	False	NaN	Southampton	yes	True
# 16	0	3	male	2.0	4	1	29.1250	Q	Third	child	False	NaN	Queenstown	no	False
# 17	1	2	male	NaN	0	0	13.0000	S	Second	man	True	NaN	Southampton	yes	True
# 18	0	3	female	31.0	1	0	18.0000	S	Third	woman	False	NaN	Southampton	no	False
# 19	1	3	female	NaN	0	0	7.2250	C	Third	woman	False	NaN	Cherbourg	yes	True
# 20	0	2	male	35.0	0	0	26.0000	S	Second	man	True	NaN	Southampton	no	True
# 21	1	2	male	34.0	0	0	13.0000	S	Second	man	True	D	Southampton	yes	True
# 22	1	3	female	15.0	0	0	8.0292	Q	Third	child	False	NaN	Queenstown	yes	True
# 23	1	1	male	28.0	0	0	35.5000	S	First	man	True	A	Southampton	yes	True
# 24	0	3	female	8.0	3	1	21.0750	S	Third	child	False	NaN	Southampton	no	False
# 25	1	3	female	38.0	1	5	31.3875	S	Third	woman	False	NaN	Southampton	yes	False
# 26	0	3	male	NaN	0	0	7.2250	C	Third	man	True	NaN	Cherbourg	no	True
# 27	0	1	male	19.0	3	2	263.0000	S	First	man	True	C	Southampton	no	False
# 28	1	3	female	NaN	0	0	7.8792	Q	Third	woman	False	NaN	Queenstown	yes	True
# 29	0	3	male	NaN	0	0	7.8958	S	Third	man	True	NaN	Southampton	no	True
# ...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
# 861	0	2	male	21.0	1	0	11.5000	S	Second	man	True	NaN	Southampton	no	False
# 862	1	1	female	48.0	0	0	25.9292	S	First	woman	False	D	Southampton	yes	True
# 863	0	3	female	NaN	8	2	69.5500	S	Third	woman	False	NaN	Southampton	no	False
# 864	0	2	male	24.0	0	0	13.0000	S	Second	man	True	NaN	Southampton	no	True
# 865	1	2	female	42.0	0	0	13.0000	S	Second	woman	False	NaN	Southampton	yes	True
# 866	1	2	female	27.0	1	0	13.8583	C	Second	woman	False	NaN	Cherbourg	yes	False
# 867	0	1	male	31.0	0	0	50.4958	S	First	man	True	A	Southampton	no	True
# 868	0	3	male	NaN	0	0	9.5000	S	Third	man	True	NaN	Southampton	no	True
# 869	1	3	male	4.0	1	1	11.1333	S	Third	child	False	NaN	Southampton	yes	False
# 870	0	3	male	26.0	0	0	7.8958	S	Third	man	True	NaN	Southampton	no	True
# 871	1	1	female	47.0	1	1	52.5542	S	First	woman	False	D	Southampton	yes	False
# 872	0	1	male	33.0	0	0	5.0000	S	First	man	True	B	Southampton	no	True
# 873	0	3	male	47.0	0	0	9.0000	S	Third	man	True	NaN	Southampton	no	True
# 874	1	2	female	28.0	1	0	24.0000	C	Second	woman	False	NaN	Cherbourg	yes	False
# 875	1	3	female	15.0	0	0	7.2250	C	Third	child	False	NaN	Cherbourg	yes	True
# 876	0	3	male	20.0	0	0	9.8458	S	Third	man	True	NaN	Southampton	no	True
# 877	0	3	male	19.0	0	0	7.8958	S	Third	man	True	NaN	Southampton	no	True
# 878	0	3	male	NaN	0	0	7.8958	S	Third	man	True	NaN	Southampton	no	True
# 879	1	1	female	56.0	0	1	83.1583	C	First	woman	False	C	Cherbourg	yes	False
# 880	1	2	female	25.0	0	1	26.0000	S	Second	woman	False	NaN	Southampton	yes	False
# 881	0	3	male	33.0	0	0	7.8958	S	Third	man	True	NaN	Southampton	no	True
# 882	0	3	female	22.0	0	0	10.5167	S	Third	woman	False	NaN	Southampton	no	True
# 883	0	2	male	28.0	0	0	10.5000	S	Second	man	True	NaN	Southampton	no	True
# 884	0	3	male	25.0	0	0	7.0500	S	Third	man	True	NaN	Southampton	no	True
# 885	0	3	female	39.0	0	5	29.1250	Q	Third	woman	False	NaN	Queenstown	no	False
# 886	0	2	male	27.0	0	0	13.0000	S	Second	man	True	NaN	Southampton	no	True
# 887	1	1	female	19.0	0	0	30.0000	S	First	woman	False	B	Southampton	yes	True
# 888	0	3	female	NaN	1	2	23.4500	S	Third	woman	False	NaN	Southampton	no	False
# 889	1	1	male	26.0	0	0	30.0000	C	First	man	True	C	Cherbourg	yes	True
# 890	0	3	male	32.0	0	0	7.7500	Q	Third	man	True	NaN	Queenstown	no	True
# 891 rows × 15 columns
@
# If you want to see specific number of data from front or back area, you use head() or tail()
# You can designate the number of row or column to show as argument
df.head()
# survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
# 0	0	3	male	22.0	1	0	7.2500	S	Third	man	True	NaN	Southampton	no	False
# 1	1	1	female	38.0	1	0	71.2833	C	First	woman	False	C	Cherbourg	yes	False
# 2	1	3	female	26.0	0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
# 3	1	1	female	35.0	1	0	53.1000	S	First	woman	False	C	Southampton	yes	False
# 4	0	3	male	35.0	0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True
df.tail(2)
# survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
# 889	1	1	male	26.0	0	0	30.00	C	First	man	True	C	Cherbourg	yes	True
# 890	0	3	male	32.0	0	0	7.75	Q	Third	man	True	NaN	Queenstown	no	True
@
# If you use DataReader of pandas_datareader package, you can directly and automatically load data of some internet site
# To use pandas_datareader package, you should install separately regardless of pandas
# Following is site examples of pandas_datareader provding
# Some site can require register and payment
# FRED
# Fama/French
# World Bank
# OECD
# Eurostat
# EDGAR Index
# TSP Fund Data
# Oanda currency historical rate
# Nasdaq Trader Symbol Definitions
# For more detail, you can see following site
# https://pandas-datareader.readthedocs.io/en/latest/index.html
@
from pandas_datareader.data import DataReader
# You can designate datetime by using datetime package or in form of string
# If you use string for datetime, internally it uses dateutil package
import datetime
dt_start = datetime.datetime(2015, 1, 1)
dt_end = "2016, 6, 30"
# You designate web site name as argument to load data
# You should find code of data you want to explore in following sites
# https://fred.stlouisfed.org/series/GDP
# https://fred.stlouisfed.org/series/CPIAUCSL
# https://fred.stlouisfed.org/series/CPILFESL
# Following is example of loading GDP of USA
gdp = DataReader("GDP", "fred", dt_start, dt_end)
gdp.tail()
# GDP
# DATE	
# 2015-04-01	18093.224
# 2015-07-01	18227.689
# 2015-10-01	18287.226
# 2016-01-01	18325.187
# 2016-04-01	18538.039
# If you put list for code, you can bring multiple data at the same time
# CPIAUCSL is consumer price index
# CPILFESL is consumer price index excluded food and energy
inflation = DataReader(["CPIAUCSL", "CPILFESL"], "fred", dt_start, dt_end)
inflation.tail()
# CPIAUCSL	CPILFESL
# DATE		
# 2016-02-01	237.808	245.845
# 2016-03-01	238.078	246.062
# 2016-04-01	238.908	246.517
# 2016-05-01	239.362	247.026
# 2016-06-01	239.842	247.389