Data Analytics - Week 2
The Four Steps of Analyzing Data
Call -> Examine -> Process -> Visualize
Different types of graphs
Bar Graphs
Line Graphs
Pie Chart
Heat Map
Map
1. Call
import pandas as pd
import matplotlib.pyplot as plt
# Load ./data/commercial.csv (path relative to the current working directory)
# into the DataFrame `cm`, which every later step in these notes reads from.
cm = pd.read_csv('./data/commercial.csv')
2. Examine
# Each bare expression below only shows a value; presumably each one is run as
# the last line of its own notebook cell so the result is displayed.
# The whole DataFrame.
cm
# The last five rows.
cm.tail(5)
# The first five rows.
cm.head(5)
# The column names, and how many columns there are.
list(cm), len(list(cm))
# Template: put a column name in each empty string -- the column to group by,
# then the column to count -- to get per-group counts from smallest to largest.
cm.groupby('')[''].count().sort_values(ascending=True)
# Template: replace the string with one of the column names from list(cm).
cm['one of the items in list(cm)']
# Template: put a column name in the empty string to collect its distinct
# values, then show them together with how many there are.
cmset = set(cm[''])
cmset, len(cmset)
3. Process
# Preview of the split: each row becomes a list of string pieces.
cm['roadname'].str.split(' ', n=2)
# n=2 means "split at most twice", on spaces, so each road name yields at most
# three parts; everything after the second space stays together in the last part.
# expand=True returns the parts as separate columns, stored here as
# 'city', 'region' and 'street'.
cm[['city', 'region', 'street']] = cm['roadname'].str.split(' ', n=2, expand=True)
# Keep only the rows whose 'city' value is exactly 'seoul'.
seoul = cm[cm['city'] == 'seoul']
# Sanity check: the set of distinct 'city' values in the filtered data should
# contain a single entry.
city = set(seoul['city'])
city, len(city)
# Details: count the chicken stores in each region of the Seoul subset.
# The column name "store's type" contains an apostrophe, so it has to be
# written with double quotes; 'store's type' closes the string right after
# "store" and is a SyntaxError.
seoulchicken = seoul[seoul["store's type"] == 'chicken']
region = seoulchicken.groupby('region')
category = region["store's type"]
# Number of rows (chicken stores) per region, indexed by region.
count = category.count()
# Shows the regions with their number of chicken stores; ascending=False
# orders them from highest to lowest. The result is only displayed, not
# stored, so `count` itself keeps its original order.
count.sort_values(ascending=False)
4. Visualize
Basic Bar Graph
# One bar per entry of `count`: index labels on the x axis, counts as heights.
plt.figure(figsize=(10, 5))
plt.bar(x=count.index, height=count)
plt.title('chicken')
plt.show()
folium / json
If you do not have folium installed, run the following command in a terminal (it is a shell command, not Python code), and then import the two packages below:
conda install -c conda-forge folium
import folium
import json
Pulling json file
# Read the GeoJSON inside a with-block so the file handle is closed as soon
# as parsing finishes; json.load(open(...)) leaves the file open.
with open('./data/seoul_geo.json', encoding='utf-8') as geo_file:
    geo_data = json.load(geo_file)
Creating Map
# Base map at latitude 37.5502, longitude 126.982, zoom level 11.
# NOTE: the name `map` shadows Python's builtin map() from this point on.
map = folium.Map(zoom_start=11, location=[37.5502, 126.982])

# Shade each GeoJSON feature by the value in `count` whose index label equals
# the feature's properties.name.
chicken_layer = folium.Choropleth(
    geo_data=geo_data,
    data=count,
    columns=[count.index, count],
    fill_color='PuRd',
    key_on='feature.properties.name',
)
chicken_layer.add_to(map)
Repeat the same four steps with another dataset
# 1. Call: load ./data/population07.csv into the DataFrame `population`.
population = pd.read_csv('./data/population07.csv')
# 2. Examine: the column names, and how many columns there are.
list(population), len(list(population))
# Template: put a column name in the empty string to list its distinct values.
set(population[''])
# 3. Process: add up the '유동인구수' column for each distinct '군구' value.
# The result is a Series indexed by '군구'.
count_sum = population.groupby('군구')['유동인구수'].sum()
# 4. Visualize
# Use the "Malgun Gothic" font family for the figures that follow
# (presumably so Korean labels on the axis render correctly -- confirm).
plt.rcParams['font.family'] = "Malgun Gothic"
# Echo the setting back to check that it was applied.
plt.rcParams['font.family']

# One bar per '군구' value, with its summed '유동인구수' as the height.
plt.figure(figsize=(8, 5))
plt.bar(x=count_sum.index, height=count_sum)
plt.xlabel('region')
plt.ylabel('population')
plt.title('population')
# Tilt the tick labels so neighbouring names do not overlap.
plt.xticks(rotation=-45)
plt.show()
# Extra Processing
# Order the per-'군구' totals from smallest to largest (ascending is the
# default for sort_values) and draw them as a bar chart in that order.
sorted_sum = count_sum.sort_values()

plt.figure(figsize=(8, 5))
plt.bar(x=sorted_sum.index, height=sorted_sum)
plt.xlabel('region')
plt.ylabel('population')
plt.title('population')
plt.xticks(rotation=-45)
plt.show()
# Line chart of the daily population total for the rows whose 'region' is
# 'gangnam'.
# NOTE(review): the groupby earlier in these notes addresses this same
# DataFrame with the column names '군구' and '유동인구수'. Confirm that
# 'region', 'day' and 'population' (and the value 'gangnam') match the real
# column names and values in population07.csv.
gangnam = population[population['region'] == 'gangnam']
# A groupby needs an aggregation before it can be plotted: without .sum()
# this is only a GroupBy object, which has no .index and no values to draw.
daily_gangnam = gangnam.groupby('day')['population'].sum()

plt.figure(figsize=(10, 5))
# Turn each day label into a string so the x axis shows it verbatim instead
# of treating it as a number.
date = []
for day in daily_gangnam.index:
    print(day)
    date.append(str(day))
plt.plot(date, daily_gangnam)
plt.title('population')
# The x axis holds days here, not regions.
plt.xlabel('day')
plt.ylabel('population')
plt.xticks(rotation=-45)
plt.show()
# Choropleth of the per-'군구' population totals on a fresh base map.
# NOTE: `map` shadows Python's builtin map(), and `seoul` rebinds the name
# used earlier in these notes for the filtered Seoul DataFrame -- rerun that
# filter before using `seoul` as a DataFrame again.
# NOTE(review): the 'stamentoner' tile set may not be available in newer
# folium releases -- confirm against the installed version.
map = folium.Map(location=[37.5502, 126.982], zoom_start=11, tiles='stamentoner')
# Read the GeoJSON inside a with-block so the file handle is closed.
with open('./data/seoul_geo.json', encoding='utf-8') as geo_file:
    seoul = json.load(geo_file)
# key_on has to start with 'feature.'; it is the path inside each GeoJSON
# feature that is matched against the index labels of the data.
folium.Choropleth(geo_data=seoul,
                  data=count_sum,
                  columns=[count_sum.index, count_sum],
                  fill_color='PuRd',
                  key_on='feature.properties.name').add_to(map)
Combination of datasets
# Combine the chicken-store counts with the population totals, region by
# region, and chart the population per chicken store.
# Column names below follow from how `count` and `count_sum` are built
# earlier in these notes: `count` is indexed by 'region' and named
# "store's type"; `count_sum` is indexed by '군구' and named '유동인구수'.
new_count = pd.DataFrame(count).reset_index()
count_sum = pd.DataFrame(count_sum).reset_index()
# `on` must name a column of new_count; its region column is 'region'
# ('군' is not a column of either frame). The other frame is matched
# through its '군구' index.
gu_chicken = new_count.join(count_sum.set_index('군구'), on='region')
# Population divided by the number of chicken stores in the same region.
gu_chicken['population_per_store'] = gu_chicken['유동인구수'] / gu_chicken["store's type"]
# sort_values returns a new frame, so keep the result; otherwise the bar
# chart below is drawn in the original, unsorted order.
gu_chicken = gu_chicken.sort_values(by='population_per_store', ascending=True)
plt.figure(figsize=(8, 5))
plt.bar(gu_chicken['region'], gu_chicken['population_per_store'])
plt.xticks(rotation=90)
plt.show()
Combination of two Graphs
# Draw two daily series as two lines on the same figure.
# NOTE(review): daily_04 and daily_07 are not defined anywhere in these
# notes. Each is presumably a per-day total built the same way as the other
# daily series in this file -- define them before running this cell.
plt.figure(figsize=(20, 5))
# Day labels are converted to strings so the x axis shows them verbatim.
date1 = [str(day) for day in daily_04.index]
plt.plot(date1, daily_04)
date2 = [str(day) for day in daily_07.index]
plt.plot(date2, daily_07)
# The tick rotation applies to the whole axes, so setting it once is enough.
plt.xticks(rotation=90)
plt.show()