011. Basics of CSV
# @
# CSV
# 1000,soap,300
# 1001,glove,150
# 1002,mask,230
# @
# SSV(space separated values)
# 1000 soap 300
# 1001 glove 150
# 1002 mask 230
# @
# TSV (tab separated values): fields are separated by a tab character,
# shown here as a single space
# 1000 soap 300
# 1001 glove 150
# 1002 mask 230
# @
# Properties of CSV:
# 1. Each line holds one record
# 1. The attributes (fields) of each record are separated by commas
# 1. The first line can be used as a header
# ID,name,price
# 1000,soap,300 : data1
# 1001,glove,150 : data2
# 1002,mask,230 : data3
# @
# Comparison of XML, JSON, and CSV
# size: xml > json > csv
# expressiveness: xml > json > csv
# readability: json > xml > csv
# @
# Poisonous mushroom data
# https://archive.ics.uci.edu/ml/datasets/mushroom
# go to data folder
# download agaricus-lepiota.data
# @
# Analyzing csv
# Sample rows from the mushroom dataset. The backslash right after the opening
# quotes and the one before the closing quotes suppress the leading and
# trailing newline, so the text is exactly three lines.
# NOTE: the name `csv` is rebound by the `import csv` further down the file.
csv = """\
p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m\
"""
# To break a string apart at a particular separator, use split().
# First, split the text into lines at "\n".
splitted = csv.split("\n")
# The separator is the first argument of split(); when it is the only
# argument you pass, its keyword name can be omitted:
# split(",") is the same as split(sep=",")
for item in splitted:
    # Split the current line (not the whole text) into its fields.
    list_mushroom = item.split(",")
    print(list_mushroom)
# After the loop, list_mushroom holds the fields of the last line.
# output:
# ['p', 'x', 's', 'n', 't', 'p', 'f', 'c', 'n', 'k', 'e', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 's', 'u']
# ['e', 'x', 's', 'y', 't', 'a', 'f', 'c', 'b', 'k', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'n', 'n', 'g']
# ['e', 'b', 's', 'w', 't', 'l', 'f', 'c', 'b', 'n', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'n', 'n', 'm']
# @
# How to work with a Python list
# NOTE: in a script these bare expressions are evaluated and their results are
# discarded. The "output" values listed below are the element at that index in
# each of the three sample rows -- presumably from evaluating the expression
# once per row (inside the loop or in an interactive session); TODO confirm.
# 1. Selecting a single element
# Index 0 is the first element
list_mushroom[0]
# output:
# p
# e
# e
# Index 1 is the second element
list_mushroom[1]
# output:
# x
# x
# b
# The last element of the list
list_mushroom[-1]
# The second element from the end
list_mushroom[-2]
# 1. Selecting a range (slice)
# Elements at index 1, 2 and 3 (the end index 4 is excluded)
list_mushroom[1:4]
# From index 5 to the end of the list
list_mushroom[5:]
# The first five elements (index 0 to 4)
list_mushroom[:5]
# @
# CSV files are commonly created with Excel,
# and many CSV files exported from Excel are EUC-KR encoded.
# If you can export the CSV file as UTF-8, you should do so.
# @
# codecs is a module that handles encoding conversion
import codecs
import csv
filename = "test.csv"
# "r" opens the file read-only; "euc-kr" is the encoding used to decode it.
# To read a file as EUC-KR this way, use the codecs module.
# The with-statement closes the file even if reading fails part-way.
with codecs.open(filename, "r", "euc-kr") as file:
    # csv.reader arguments:
    #   1st argument: an iterable of lines (here, the open file)
    #   delimiter: the character that separates fields
    #   quotechar (optional, default '"'): the character that wraps quoted
    #     fields, e.g. quotechar="'" for data like 'a','b','c'
    # Keep the returned reader object; iterating it yields one list per row.
    reader = csv.reader(file, delimiter=",")
    for cells in reader:
        # Blank lines come back as [] and short rows lack the 2nd/3rd column,
        # so skip them instead of failing with IndexError.
        if len(cells) < 3:
            continue
        print(cells[1], cells[2])
# @
# In practice, the csv module is not used much for reading CSV files;
# pandas is usually used to read CSV files instead.