import pandas as pd

# Obtained from https://simplemaps.com/data/canada-cities
Canadian_Cities_CSV = '/home/jovyan/work/shared/datasets/canadacities.csv'

# https://github.com/icyrockcom/country-capitals/blob/master/data/country-list.csv
#World_Capitals_CSV = '/home/jovyan/work/shared/datasets/country-list.csv'

cities = pd.read_csv(Canadian_Cities_CSV)

cities.head()

cities.shape

(1738, 12)

cities.describe()

Working with Columns¶

# Columns are a series of strings.
cities.columns

Index(['city', 'city_ascii', 'province_id', 'province_name', 'lat', 'lng',
       'population', 'density', 'timezone', 'ranking', 'postal', 'id'],
      dtype='object')

# Extract a single column as a series
cities['city_ascii']

0                 Toronto
1                Montreal
2               Vancouver
3                 Calgary
4                Edmonton
              ...        
1733           Durham-Sud
1734            Melbourne
1735      Nipawin No. 487
1736    Duck Lake No. 463
1737                 Oyen
Name: city_ascii, Length: 1738, dtype: object

cities[['city', 'population']]

#
# Data transformation
# - Pandas encourages functional programming style of
#   data analysis
# - The preferred API is read-only to the original dataframe
#

#
# Create a new column, call it 'country'
# - DataFrame.insert(new_position, column_name, series) modifies the original dataframe
# - DataFrame['new_column_name'] = ...

cities['country'] = 'Canada'

cities.head(2)

#
# Deleting columns
#
# 1. in-place: original df will be modified
# 2. out-place: original df is unmodified, and a transformed df is returned.
#

# out-place:
# DataFrame.drop(columns=[...])
cities.drop(columns=['country'])

#
# This drops the column in the original dataframe as well.
#
cities.drop(columns=['country'], inplace=True)

# Note:
# Equivalent to:
# del cities['country']

cities.head(2)

#
# Rename the columns, by default inplace=False
#
cities.rename(columns={'lat': 'latitude', 'lng': 'longitude'}, inplace=True)

cities.head(2)

# Rename the columns back
cities.rename(columns={'latitude': 'lat', 'longitude': 'lng'}, inplace=True)

Working with Indexes¶

cities.index

RangeIndex(start=0, stop=1738, step=1)

#
# We can reassign the index to any series.
#
cities.index = cities['id']

cities.head(4)[['city', 'province_name']]

#
# Restore the index to the original 0, 1, 2, 3, ...
# by default, inplace=False
#
cities.reset_index(drop=True, inplace=True)

#
# Now the index is dropped (restored)
#
cities.head()

#
# The preferred way to set the index is using
# DataFrame.set_index(..., inplace=..)
# default inplace=False
#

cities.set_index('city_ascii', inplace=True)

# Note:
# - The column called 'city' is now GONE.
# - The index has the name 'city'.
# - This is different from cities.index = cities['city']

#
# Original dataframe
#
cities

#
# DataFrame.loc[...] is an accessor by index values.
#
cities.loc['Oshawa']

city                          Oshawa
province_id                       ON
province_name                Ontario
lat                             43.9
lng                           -78.85
population                    166000
density                         1027
timezone             America/Toronto
ranking                            2
postal           L1L L1H L1J L1K L1G
id                        1124541904
Name: Oshawa, dtype: object

#
# Recall the slicing syntax from NumPy.
# That works here too.
#
# Returns all cities that come AFTER "Oshawa"
cities.loc['Oshawa':]

#
# Task: find all cities that start with "V"
#
cities[cities.index.str.startswith('V')].sort_index().tail()

Working with Values¶

Compute new values from existing values
Retrieve values
Query values based on some condition
Aggregate

Retrieve values¶

cities[["city", "province_name", "population"]].head()

#
# Retrieve values by index
#
cities.loc["Oshawa":"Vancouver"]

# Empty because "Vancouver" occurs before Oshawa.

cities.sort_index(inplace=True)

cities.loc["Oshawa":"Vancouver"]

Compute new values from existing¶

#
# Derive new columns
#
import numpy as np

cities['pop_million'] = np.round(cities['population'] * 1E-6, 2)

cities.loc['Toronto']

city                                                       Toronto
province_id                                                     ON
province_name                                              Ontario
lat                                                        43.7417
lng                                                       -79.3733
population                                             5.42952e+06
density                                                     4334.4
timezone                                           America/Toronto
ranking                                                          1
postal           M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5...
id                                                      1124279679
pop_million                                                   5.43
Name: Toronto, dtype: object

#
# Functional programming using Series.apply
#

def city_size_by_population(p):
    if p > 1e4:
        return 'large'
    else:
        return 'not large'

#
# Get the series for population, and apply function to compute city_size
#
city_sizes = cities['population'].apply(city_size_by_population)
city_sizes['Toronto':]

city_ascii
Toronto            large
Tracadie           large
Trail          not large
Trecesson      not large
Trent Hills        large
                 ...    
Yellowhead     not large
Yellowknife        large
Yorkton            large
Youbou         not large
Zorra          not large
Name: population, Length: 153, dtype: object

#
# Get series, and compute a series
#
cities['population'].apply(lambda x: pd.Series([city_size_by_population(x), x], index=['size', 'pop']))

#
# Try DataFrame.apply
#
cities.apply(lambda series: series.max())

city                           Zorra
province_id                       YT
province_name                  Yukon
lat                          72.6808
lng                           -52.68
population               5.42952e+06
density                       5492.6
timezone         America/Yellowknife
ranking                            4
postal                           Y1A
id                        1124995979
pop_million                     5.43
dtype: object

#
# Try dataframe, apply to rows
#
cities.apply(lambda city: pd.Series([city['population'], city_size_by_population(city['population'])]), axis=1)

	city	city_ascii	province_id	province_name	lat	lng	population	density	timezone	ranking	postal	id
0	Toronto	Toronto	ON	Ontario	43.7417	-79.3733	5429524.0	4334.4	America/Toronto	1	M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5...	1124279679
1	Montréal	Montreal	QC	Quebec	45.5089	-73.5617	3519595.0	3889.0	America/Montreal	1	H1X H1Y H1Z H1P H1R H1S H1T H1V H1W H1H H1J H1...	1124586170
2	Vancouver	Vancouver	BC	British Columbia	49.2500	-123.1000	2264823.0	5492.6	America/Vancouver	1	V6Z V6S V6R V6P V6N V6M V6L V6K V6J V6H V6G V6...	1124825478
3	Calgary	Calgary	AB	Alberta	51.0500	-114.0667	1239220.0	1501.1	America/Edmonton	1	T1Y T2H T2K T2J T2L T2N T2A T2C T2B T2E T2G T2...	1124690423
4	Edmonton	Edmonton	AB	Alberta	53.5344	-113.4903	1062643.0	1360.9	America/Edmonton	1	T5X T5Y T5Z T5P T5R T5S T5T T5V T5W T5H T5J T5...	1124290735

	lat	lng	population	density	ranking	id
count	1738.000000	1738.000000	1.738000e+03	1738.000000	1738.000000	1.738000e+03
mean	47.846277	-83.943544	2.310799e+04	265.713011	3.173188	1.124317e+09
std	3.612196	18.925380	1.777508e+05	530.142598	0.511734	3.371888e+05
min	42.066700	-139.389300	1.001000e+03	0.000000	1.000000	1.124000e+09
25%	45.502075	-99.262200	1.617500e+03	13.150000	3.000000	1.124001e+09
50%	46.679000	-76.000000	2.909000e+03	45.600000	3.000000	1.124199e+09
75%	49.613925	-71.654175	8.221000e+03	320.000000	3.000000	1.124616e+09
max	72.680800	-52.680000	5.429524e+06	5492.600000	4.000000	1.124996e+09

	city	city_ascii	province_id	province_name	lat	lng	population	density	timezone	ranking	postal	id
0	Toronto	Toronto	ON	Ontario	43.7417	-79.3733	5429524.0	4334.4	America/Toronto	1	M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5...	1124279679
1	Montréal	Montreal	QC	Quebec	45.5089	-73.5617	3519595.0	3889.0	America/Montreal	1	H1X H1Y H1Z H1P H1R H1S H1T H1V H1W H1H H1J H1...	1124586170
2	Vancouver	Vancouver	BC	British Columbia	49.2500	-123.1000	2264823.0	5492.6	America/Vancouver	1	V6Z V6S V6R V6P V6N V6M V6L V6K V6J V6H V6G V6...	1124825478
3	Calgary	Calgary	AB	Alberta	51.0500	-114.0667	1239220.0	1501.1	America/Edmonton	1	T1Y T2H T2K T2J T2L T2N T2A T2C T2B T2E T2G T2...	1124690423
4	Edmonton	Edmonton	AB	Alberta	53.5344	-113.4903	1062643.0	1360.9	America/Edmonton	1	T5X T5Y T5Z T5P T5R T5S T5T T5V T5W T5H T5J T5...	1124290735

	city	province_id	province_name	lat	lng	population	density	timezone	ranking	postal	id
city_ascii
Toronto	Toronto	ON	Ontario	43.7417	-79.3733	5429524.0	4334.4	America/Toronto	1	M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5...	1124279679
Montreal	Montréal	QC	Quebec	45.5089	-73.5617	3519595.0	3889.0	America/Montreal	1	H1X H1Y H1Z H1P H1R H1S H1T H1V H1W H1H H1J H1...	1124586170
Vancouver	Vancouver	BC	British Columbia	49.2500	-123.1000	2264823.0	5492.6	America/Vancouver	1	V6Z V6S V6R V6P V6N V6M V6L V6K V6J V6H V6G V6...	1124825478
Calgary	Calgary	AB	Alberta	51.0500	-114.0667	1239220.0	1501.1	America/Edmonton	1	T1Y T2H T2K T2J T2L T2N T2A T2C T2B T2E T2G T2...	1124690423
Edmonton	Edmonton	AB	Alberta	53.5344	-113.4903	1062643.0	1360.9	America/Edmonton	1	T5X T5Y T5Z T5P T5R T5S T5T T5V T5W T5H T5J T5...	1124290735
...	...	...	...	...	...	...	...	...	...	...	...
Durham-Sud	Durham-Sud	QC	Quebec	45.6667	-72.3333	1008.0	10.8	America/Montreal	4	J0H	1124105436
Melbourne	Melbourne	QC	Quebec	45.5800	-72.1700	1004.0	5.8	America/Montreal	3	J0B	1124850489
Nipawin No. 487	Nipawin No. 487	SK	Saskatchewan	53.2881	-104.0544	1004.0	1.1	America/Regina	4	S0E	1124001339
Duck Lake No. 463	Duck Lake No. 463	SK	Saskatchewan	52.9596	-106.2089	1004.0	1.0	America/Regina	4	S0K S6V	1124001661
Oyen	Oyen	AB	Alberta	51.3522	-110.4739	1001.0	189.6	America/Edmonton	3	T0J	1124000494

	city	province_id	province_name	lat	lng	population	density	timezone	ranking	postal	id
city_ascii
Oshawa	Oshawa	ON	Ontario	43.9000	-78.8500	166000.0	1027.0	America/Toronto	2	L1L L1H L1J L1K L1G	1124541904
Sherbrooke	Sherbrooke	QC	Quebec	45.4000	-71.9000	161323.0	456.0	America/Montreal	2	J1N J1L J1M J1J J1K J1H J1G J1E J1C J1R	1124559506
Saguenay	Saguenay	QC	Quebec	48.4167	-71.0667	144746.0	128.5	America/Montreal	2	G8A G7N G7H G7K G7T G7Z G7G G7B G7J G7P G7S G7...	1124001930
Levis	Lévis	QC	Quebec	46.8000	-71.1833	143414.0	319.4	America/Montreal	2	G7A G6J G6K G6C G6Z G6X G6Y G6V G6W	1124958950
Kelowna	Kelowna	BC	British Columbia	49.8881	-119.4956	142146.0	601.3	America/Vancouver	2	V1X V1Y V1P V1W V1V	1124080626
...	...	...	...	...	...	...	...	...	...	...	...
Durham-Sud	Durham-Sud	QC	Quebec	45.6667	-72.3333	1008.0	10.8	America/Montreal	4	J0H	1124105436
Melbourne	Melbourne	QC	Quebec	45.5800	-72.1700	1004.0	5.8	America/Montreal	3	J0B	1124850489
Nipawin No. 487	Nipawin No. 487	SK	Saskatchewan	53.2881	-104.0544	1004.0	1.1	America/Regina	4	S0E	1124001339
Duck Lake No. 463	Duck Lake No. 463	SK	Saskatchewan	52.9596	-106.2089	1004.0	1.0	America/Regina	4	S0K S6V	1124001661
Oyen	Oyen	AB	Alberta	51.3522	-110.4739	1001.0	189.6	America/Edmonton	3	T0J	1124000494

Index

DataFrames

Data Analysis

Working with Columns¶

Working with Indexes¶

Working with Values¶

Retrieve values¶

Compute new values from existing¶

	city	province_id	province_name	lat	lng	population	density	timezone	ranking	postal	id
city_ascii
View Royal	View Royal	BC	British Columbia	48.4517	-123.4339	10408.0	724.8	America/Vancouver	3	V9B	1124001985
Viking	Viking	AB	Alberta	53.0953	-111.7769	1083.0	292.5	America/Edmonton	3	T0B	1124502081
Ville-Marie	Ville-Marie	QC	Quebec	47.3333	-79.4333	2595.0	424.8	America/Montreal	3	J9V	1124001938
Virden	Virden	MB	Manitoba	49.8508	-100.9317	3114.0	370.2	America/Winnipeg	3	R0M	1124620072
Vulcan	Vulcan	AB	Alberta	50.4000	-113.2500	1917.0	302.3	America/Edmonton	3	T0L	1124607765

	size	pop
city_ascii
Abbotsford	large	141397.0
Acton Vale	not large	7664.0
Addington Highlands	not large	2323.0
Adelaide-Metcalfe	not large	2990.0
Adjala-Tosorontio	large	10975.0
...	...	...
Yellowhead	not large	2011.0
Yellowknife	large	19569.0
Yorkton	large	16343.0
Youbou	not large	1086.0
Zorra	not large	8138.0