Data Analytics - Week 2

·

2 min read

The Four Steps of Analyzing Data

Call -> Examine -> Process -> Visualize

Different types of graphs

Bar Graphs

image.png

Line Graphs

image.png

Pie Chart

image.png

Heat Map

image.png

Map

image.png

1. Call

import pandas as pd
import matplotlib.pyplot as plt

# Load commercial.csv (path is relative to the working directory) into the DataFrame `cm`.
cm = pd.read_csv('./data/commercial.csv')

2. Examine

# Inspect the whole DataFrame, then its last five and first five rows.
cm
cm.tail(5)
cm.head(5)
# Column names, and how many columns there are.
list(cm), len(list(cm))

# Non-null count per group, smallest first.
# The '' strings are placeholders: substitute real column names taken from list(cm).
cm.groupby('')[''].count().sort_values(ascending=True)
# Select a single column (replace the placeholder text with an actual column name).
cm['one of the items in list(cm)']
# Distinct values of one column ('' is again a placeholder) and how many there are.
cmset = set(cm[''])
cmset, len(cmset)

3. Process

# Preview the split: n=2 allows at most two splits on spaces,
# so every road name yields up to three parts.
cm['roadname'].str.split(' ', n=2)

# expand=True returns the parts as separate columns, stored as city / region / street.
cm[['city', 'region', 'street']] = cm['roadname'].str.split(' ', n=2, expand=True)
# Keep only the rows whose city value is 'seoul'.
seoul = cm[cm['city'] == 'seoul']

# Check accuracy: the filtered frame should contain exactly one distinct city.
city = set(seoul['city'])
city, len(city)

# Details
# The column name contains an apostrophe, so it has to be written with double
# quotes; 'store's type' would end the string early and raise a SyntaxError.
seoulchicken = seoul[seoul["store's type"] == 'chicken']

region = seoulchicken.groupby('region')
category = region["store's type"]
count = category.count()
count.sort_values(ascending=False)
# Shows each region with its number of chicken stores,
# ordered from highest to lowest because ascending=False.

4. Visualize

Basic Bar Graph

# Bar chart of `count`: one bar per index label, bar height = the counted value.
plt.figure(figsize=(10, 5))
plt.bar(x=count.index, height=count)
plt.title(label='chicken')
plt.show()

folium / json

If folium is not installed yet, run the following command first, and then import the two packages below.

conda install -c conda-forge folium

import folium
import json

Loading the JSON file

# Read the GeoJSON inside a `with` block so the file handle is closed
# as soon as parsing finishes instead of being left open.
with open('./data/seoul_geo.json', encoding='utf-8') as geo_file:
    geo_data = json.load(geo_file)

Creating Map

# Base map centred on [37.5502, 126.982] with an initial zoom level of 11.
# NOTE(review): the name `map` shadows Python's built-in map() for the rest of the session.
map = folium.Map(location=[37.5502,126.982], zoom_start = 11)

# Shade the GeoJSON features with the 'PuRd' palette according to the values in `count`.
# key_on names the GeoJSON property used to look each feature up in the data;
# presumably the index labels of `count` match properties.name in the file — confirm.
folium.Choropleth(geo_data=geo_data,
              data=count,
              columns=[count.index, count],
              fill_color='PuRd',
              key_on='feature.properties.name').add_to(map)

Repeat the Same Steps with Another Dataset

# 1. Call
# 1. Call
# Load population07.csv into the DataFrame `population`.
population = pd.read_csv('./data/population07.csv')

# 2. Examine
# Column names and how many columns there are.
list(population), len(list(population))
# Distinct values of one column ('' is a placeholder for a real column name).
set(population[''])

# 3. Process
# Sum of the '유동인구수' column for each distinct value of '군구'.
# (presumably "floating population count" per "district" — confirm against the CSV)
count_sum = population.groupby('군구')['유동인구수'].sum()

# 4. Visualize
# Use a font that can render the Korean index labels on the x-axis.
# NOTE(review): assumes "Malgun Gothic" is installed on this machine — confirm.
plt.rcParams['font.family'] = "Malgun Gothic"
# Read the setting back to check that it was applied.
plt.rcParams['font.family']
plt.figure(figsize=(8,5))
# One bar per '군구' value, bar height = summed '유동인구수'.
plt.bar(count_sum.index, count_sum)
plt.title('population')
plt.xlabel('region')
plt.ylabel('population')
# Tilt the tick labels by -45 degrees so neighbouring labels do not overlap.
plt.xticks(rotation=-45)
plt.show()

# Extra Processing
# Order the totals from smallest to largest (ascending is the default sort
# direction) and draw them as a bar chart.
sorted_sum = count_sum.sort_values()

plt.figure(figsize=(8, 5))
plt.bar(x=sorted_sum.index, height=sorted_sum)
plt.title(label='population')
plt.xlabel(xlabel='region')
plt.ylabel(ylabel='population')
plt.xticks(rotation=-45)
plt.show()

# Keep only the rows for the 'gangnam' region.
# NOTE(review): this cell uses the column names 'region', 'day' and 'population',
# while the groupby on `population` earlier in the file uses '군구' / '유동인구수'.
# Confirm which names the CSV really has.
gangnam = population[population['region']=='gangnam']
# Aggregate to one total per day. Without an aggregation the result is a
# SeriesGroupBy object, which has no .index attribute and cannot be plotted.
daily_gangnam = gangnam.groupby('day')['population'].sum()

plt.figure(figsize=(10,5))

# Convert the day labels to strings so they are plotted as categorical ticks;
# each label is also printed for a quick visual check.
date = []
for day in daily_gangnam.index:
    print(day)
    date.append(str(day))

plt.plot(date, daily_gangnam)
plt.title('population')
# The x-axis shows days, not regions.
plt.xlabel('day')
plt.ylabel('population')
plt.xticks(rotation=-45)
plt.show()


# Base map centred on [37.5502, 126.982], zoom level 11, with the 'stamentoner' tile set.
# NOTE(review): check that this tile name still resolves in the installed folium version.
# NOTE(review): `map` shadows the built-in map().
map = folium.Map(location=[37.5502, 126.982], zoom_start=11, tiles='stamentoner')
# Load the GeoJSON inside a `with` block so the file handle is closed after parsing.
# NOTE(review): this rebinds `seoul`, which held the filtered DataFrame earlier in the file.
with open('./data/seoul_geo.json', encoding='utf-8') as geo_file:
    seoul = json.load(geo_file)
# key_on must start with 'feature' (folium requirement); this also matches
# the first choropleth in this file.
folium.Choropleth(geo_data=seoul,
                  data=count_sum,
                  columns=[count_sum.index, count_sum],
                  fill_color='PuRd',
                  key_on='feature.properties.name').add_to(map)

Combination of datasets

# Turn the per-region chicken-store counts into a DataFrame;
# reset_index() moves the 'region' index into an ordinary column.
new_count = pd.DataFrame(count).reset_index()

# Same for the population totals: '군구' becomes an ordinary column.
# NOTE(review): this rebinds `count_sum` from a Series to a DataFrame, so cells
# that expect the Series must be run before this one.
count_sum = pd.DataFrame(count_sum).reset_index()

# Attach the population total to each region's store count.
# The left key has to be a column of new_count, which is 'region' after
# reset_index(); the right side is matched through its '군구' index.
gu_chicken = new_count.join(count_sum.set_index('군구'), on='region')

# The '' strings below are placeholders: substitute the new ratio column and the
# two existing columns to divide (see list(gu_chicken) for the available names).
gu_chicken[''] =gu_chicken[''] / gu_chicken['']
gu_chicken.sort_values(by='', ascending=True)

plt.figure(figsize=(8,5))
# Placeholders again: the first is the x-axis (label) column, the second the bar heights.
plt.bar(gu_chicken[''], gu_chicken[''])
plt.xticks(rotation=90)
plt.show()

Combination of two Graphs

# Draw two series as lines on the same figure.
plt.figure(figsize=(20,5))
# String versions of each series' index labels, used as the x positions.
date1 = [str(day) for day in daily_04.index]
date2 = [str(day) for day in daily_07.index]
plt.plot(date1, daily_04)
plt.plot(date2, daily_07)
# Both lines share one set of axes, so the tick rotation only needs to be set once.
plt.xticks(rotation=90)
plt.show()

image.png