# Google machine learning course 001: pandas
import pandas as pd
pd.__version__
# pandas is built around two core classes: Series (a single column) and DataFrame (a table made of Series)
# Series1 = [1,2,3,...]
# Series2 = [4,5,6,...]
# Dataframe=
# series1 series2
# 1 4
# 2 5
# 3 6
# @
# A Series is built by handing a list of values to the Series constructor
pd.Series(data=['San Francisco', 'San Jose', 'Sacramento'])
# A DataFrame is built from a dict that maps each column name to a Series
city_names = pd.Series(data=['San Francisco', 'San Jose', 'Sacramento'])
population = pd.Series(data=[852469, 1015785, 485199])
print(pd.DataFrame(data={'City name': city_names, 'Population': population}))
# City name Population
# 0 San Francisco 852469
# 1 San Jose 1015785
# 2 Sacramento 485199
# Load california_housing_train.csv as a DataFrame.
# A copy downloaded next to this script is preferred; when it is missing,
# the file is read straight from the course's public URL instead.
# (Reading both unconditionally would download the data only to overwrite it,
# and would raise FileNotFoundError whenever no local copy exists.)
try:
    california_housing_dataframe = pd.read_csv("california_housing_train.csv", sep=",")
except FileNotFoundError:
    california_housing_dataframe = pd.read_csv(
        "https://storage.googleapis.com/mledu-datasets/california_housing_train.csv",
        sep=",",
    )
# describe() summarizes every column: count, mean, std, min, quartiles and max
california_housing_dataframe.describe()
# longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
# count 17000.000000 17000.000000 17000.000000 17000.000000 17000.000000 17000.000000 17000.000000 17000.000000 17000.000000
# mean -119.562108 35.625225 28.589353 2643.664412 539.410824 1429.573941 501.221941 3.883578 207300.912353
# std 2.005166 2.137340 12.586937 2179.947071 421.499452 1147.852959 384.520841 1.908157 115983.764387
# min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
# 25% -121.790000 33.930000 18.000000 1462.000000 297.000000 790.000000 282.000000 2.566375 119400.000000
# 50% -118.490000 34.250000 29.000000 2127.000000 434.000000 1167.000000 409.000000 3.544600 180400.000000
# 75% -118.000000 37.720000 37.000000 3151.250000 648.250000 1721.000000 605.250000 4.767000 265000.000000
# max -114.310000 41.950000 52.000000 37937.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000
# head() shows rows from the start of the DataFrame, tail() rows from the end
# The default of 5 rows is spelled out explicitly here
california_housing_dataframe.head(n=5)
# longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
# 0 -114.31 34.19 15.0 5612.0 1283.0 1015.0 472.0 1.4936 66900.0
# 1 -114.47 34.40 19.0 7650.0 1901.0 1129.0 463.0 1.8200 80100.0
# 2 -114.56 33.69 17.0 720.0 174.0 333.0 117.0 1.6509 85700.0
# 3 -114.57 33.64 14.0 1501.0 337.0 515.0 226.0 3.1917 73400.0
# 4 -114.57 33.57 20.0 1454.0 326.0 624.0 262.0 1.9250 65500.0
# You can visualize a DataFrame
# housing_median_age: the column drawn as a histogram
# %matplotlib inline
california_housing_dataframe.hist(column='housing_median_age')
# img 29dde448-e3d7-47e4-9361-66cfe9c39d57
# array([[]],
# dtype=object)
# @
# DataFrame data can be accessed with the familiar dict / list operations
cities = pd.DataFrame({ 'City name': city_names, 'Population': population })
# Indexing with a column name returns that column as a Series
print(type(cities['City name']))
# <class 'pandas.core.series.Series'>
cities['City name']
# 0 San Francisco
# 1 San Jose
# 2 Sacramento
# Name: City name, dtype: object
# Indexing that Series with a row label returns a single value
print(type(cities['City name'][1]))
# <class 'str'>
cities['City name'][1]
# 'San Jose'
# Slicing the DataFrame selects rows and returns another DataFrame
print(type(cities[0:2]))
# <class 'pandas.core.frame.DataFrame'>
cities[0:2]
# City name Population
# 0 San Francisco 852469
# 1 San Jose 1015785
# @
# Arithmetic on a Series is applied element by element
# Series.div(x) is the method spelling of `series / x`
population.div(1000.)
# 0 852.469
# 1 1015.785
# 2 485.199
# dtype: float64
# @
# A pandas Series can be passed directly to most NumPy functions
# Here np.log is applied to every element of population; the result shown
# below is again a Series (dtype float64)
import numpy as np
np.log(population)
# 0 13.655892
# 1 13.831172
# 2 13.092314
# dtype: float64
# @
# Series.apply() handles more complex, element-wise transformations
# Like Python's map(), it accepts a lambda function as its argument
# The call below produces a new boolean Series answering, for each city,
# "is the population greater than 1000000?"
population.apply(lambda headcount: 1000000 < headcount)
# 0 False
# 1 True
# 2 False
# dtype: bool
# @
# A DataFrame can be modified in place
# The code below adds two columns to the existing DataFrame:
# the area of each city, and the population density derived from it
cities['Area square miles'] = pd.Series(data=[46.87, 176.53, 97.92])
cities['Population density'] = cities['Population'].div(cities['Area square miles'])
print(cities)
# < City name Population Area square miles Population density
# < 0 San Francisco 852469 46.87 18187.945381
# < 1 San Jose 1015785 176.53 5754.177760
# < 2 Sacramento 485199 97.92 4955.055147
# Practice 1
# Add a boolean column to the city table that is True only when both of the
# following statements hold:
# 1. The city is named after a saint (its name starts with 'San')
# 2. The city's area is larger than 130 square kilometers
# The table stores areas in square miles, so the area is converted to square
# kilometers (1 square mile = 2.589988 square kilometers) before comparing.
cities['Is wide and has saint name'] = (
    (cities['Area square miles'] * 2.589988 > 130)
    & cities['City name'].str.startswith('San')
)
print(cities)
# < City name Population Area square miles Population density Is wide and has saint name
# < 0 San Francisco 852469 46.87 18187.945381 False
# < 1 San Jose 1015785 176.53 5754.177760 True
# < 2 Sacramento 485199 97.92 4955.055147 False
# @
# Index
# Series and DataFrame have index attribute
# By default, pandas assigns integer index values that reflect the original row order
# Index values are stable
# They do not change even when the rows are reordered
# Print the index of the Series and of the DataFrame, one per line
print(city_names.index, cities.index, sep='\n')
# RangeIndex(start=0, stop=3, step=1)
# RangeIndex(start=0, stop=3, step=1)
# DataFrame.reindex() lets you rearrange the order of rows by hand
# 0 2
# 1 -> 0
# 2 1
# For example, the order used below has the same effect as sorting the cities by city name
print(cities.reindex(index=[2, 0, 1]))
# < City name Population Area square miles Population density Is wide and has saint name
# < 2 Sacramento 485199 97.92 4955.055147 False
# < 0 San Francisco 852469 46.87 18187.945381 False
# < 1 San Jose 1015785 176.53 5754.177760 True
# reindex() is also a convenient way to shuffle data
# numpy's random.permutation() returns the index values in random order,
# and reindex() then arranges the rows to follow that shuffled order
cities.reindex(index=np.random.permutation(cities.index))
# < City name Population Area square miles Population density Is wide and has saint name
# < 0 San Francisco 852469 46.87 18187.945381 False
# < 2 Sacramento 485199 97.92 4955.055147 False
# < 1 San Jose 1015785 176.53 5754.177760 True
# Index values do not have to be integers; strings work as well
index = pd.Index(data=list('edab'))
print(index)
# Index(['e', 'd', 'a', 'b'], dtype='object')
# The `in` operator tests whether a label is present in the index
print('d' in index)
# True
# An index can be built from a range of integers, and both the row index
# and the column index can be given a name
index = pd.Index([*range(5)], name='rows')
columns = pd.Index(list('ABC'), name='cols')
# Fill a len(index) x len(columns) table (5 x 3) with random numbers
df = pd.DataFrame(
    data=np.random.randn(len(index), len(columns)),
    index=index,
    columns=columns,
)
print(df)
# cols A B C
# rows
# 0 randomnumber randomnumber randomnumber
# 1
# 2
# 3
# 4
# @
# Practice 2
# reindex() accepts index values that do not exist in the DataFrame;
# the rows for those values are filled with NaN
cities.reindex(index=[0, 4, 5, 2])
# < City name Population Area square miles Population density Is wide and has saint name
# < 0 San Francisco 852469.0 46.87 18187.945381 False
# < 4 NaN NaN NaN NaN NaN
# < 5 NaN NaN NaN NaN NaN
# < 2 Sacramento 485199.0 97.92 4955.055147 False