{:draft ["true"]}

Index

DataFrames

Data Analysis

In [14]:
import pandas as pd
In [15]:
# Obtained from https://simplemaps.com/data/canada-cities
Canadian_Cities_CSV = '/home/jovyan/work/shared/datasets/canadacities.csv'

# https://github.com/icyrockcom/country-capitals/blob/master/data/country-list.csv
#World_Capitals_CSV = '/home/jovyan/work/shared/datasets/country-list.csv'
In [16]:
cities = pd.read_csv(Canadian_Cities_CSV)
In [17]:
cities.head()
Out[17]:
city city_ascii province_id province_name lat lng population density timezone ranking postal id
0 Toronto Toronto ON Ontario 43.7417 -79.3733 5429524.0 4334.4 America/Toronto 1 M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5... 1124279679
1 Montréal Montreal QC Quebec 45.5089 -73.5617 3519595.0 3889.0 America/Montreal 1 H1X H1Y H1Z H1P H1R H1S H1T H1V H1W H1H H1J H1... 1124586170
2 Vancouver Vancouver BC British Columbia 49.2500 -123.1000 2264823.0 5492.6 America/Vancouver 1 V6Z V6S V6R V6P V6N V6M V6L V6K V6J V6H V6G V6... 1124825478
3 Calgary Calgary AB Alberta 51.0500 -114.0667 1239220.0 1501.1 America/Edmonton 1 T1Y T2H T2K T2J T2L T2N T2A T2C T2B T2E T2G T2... 1124690423
4 Edmonton Edmonton AB Alberta 53.5344 -113.4903 1062643.0 1360.9 America/Edmonton 1 T5X T5Y T5Z T5P T5R T5S T5T T5V T5W T5H T5J T5... 1124290735
In [21]:
cities.shape
Out[21]:
(1738, 12)
In [22]:
cities.describe()
Out[22]:
lat lng population density ranking id
count 1738.000000 1738.000000 1.738000e+03 1738.000000 1738.000000 1.738000e+03
mean 47.846277 -83.943544 2.310799e+04 265.713011 3.173188 1.124317e+09
std 3.612196 18.925380 1.777508e+05 530.142598 0.511734 3.371888e+05
min 42.066700 -139.389300 1.001000e+03 0.000000 1.000000 1.124000e+09
25% 45.502075 -99.262200 1.617500e+03 13.150000 3.000000 1.124001e+09
50% 46.679000 -76.000000 2.909000e+03 45.600000 3.000000 1.124199e+09
75% 49.613925 -71.654175 8.221000e+03 320.000000 3.000000 1.124616e+09
max 72.680800 -52.680000 5.429524e+06 5492.600000 4.000000 1.124996e+09

Working with Columns

In [18]:
# Columns are a series of strings.
cities.columns
Out[18]:
Index(['city', 'city_ascii', 'province_id', 'province_name', 'lat', 'lng',
       'population', 'density', 'timezone', 'ranking', 'postal', 'id'],
      dtype='object')
In [23]:
# Extract a single column as a series
cities['city_ascii']
Out[23]:
0                 Toronto
1                Montreal
2               Vancouver
3                 Calgary
4                Edmonton
              ...        
1733           Durham-Sud
1734            Melbourne
1735      Nipawin No. 487
1736    Duck Lake No. 463
1737                 Oyen
Name: city_ascii, Length: 1738, dtype: object
In [25]:
cities[['city', 'population']]
Out[25]:
city population
0 Toronto 5429524.0
1 Montréal 3519595.0
2 Vancouver 2264823.0
3 Calgary 1239220.0
4 Edmonton 1062643.0
... ... ...
1733 Durham-Sud 1008.0
1734 Melbourne 1004.0
1735 Nipawin No. 487 1004.0
1736 Duck Lake No. 463 1004.0
1737 Oyen 1001.0

1738 rows × 2 columns

In [37]:
#
# Data transformation
# - Pandas encourages functional programming style of
#   data analysis
# - The preferred API is read-only to the original dataframe
#

#
# Create a new column, call it 'country'
# - DataFrame.insert(new_position, column_name, series) modifies the original dataframe
# - DataFrame['new_column_name'] = ...

cities['country'] = 'Canada'
In [40]:
cities.head(2)
Out[40]:
city city_ascii province_id province_name lat lng population density timezone ranking postal id country
0 Toronto Toronto ON Ontario 43.7417 -79.3733 5429524.0 4334.4 America/Toronto 1 M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5... 1124279679 Canada
1 Montréal Montreal QC Quebec 45.5089 -73.5617 3519595.0 3889.0 America/Montreal 1 H1X H1Y H1Z H1P H1R H1S H1T H1V H1W H1H H1J H1... 1124586170 Canada
In [41]:
#
# Deleting columns
#
# 1. in-place: original df will be modified
# 2. out-place: original df is unmodified, and a transformed df is returned.
#
In [ ]:
# out-place:
# DataFrame.drop(columns=[...])
cities.drop(columns=['country'])
In [ ]:
#
# This drops the column in the original dataframe as well.
#
cities.drop(columns=['country'], inplace=True)

# Note:
# Equivalent to:
# del cities['country']
In [51]:
cities.head(2)
Out[51]:
city city_ascii province_id province_name lat lng population density timezone ranking postal id
0 Toronto Toronto ON Ontario 43.7417 -79.3733 5429524.0 4334.4 America/Toronto 1 M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5... 1124279679
1 Montréal Montreal QC Quebec 45.5089 -73.5617 3519595.0 3889.0 America/Montreal 1 H1X H1Y H1Z H1P H1R H1S H1T H1V H1W H1H H1J H1... 1124586170
In [54]:
#
# Rename the columns, by default inplace=False
#
cities.rename(columns={'lat': 'latitude', 'lng': 'longitude'}, inplace=True)
In [55]:
cities.head(2)
Out[55]:
city city_ascii province_id province_name latitude longitude population density timezone ranking postal id
0 Toronto Toronto ON Ontario 43.7417 -79.3733 5429524.0 4334.4 America/Toronto 1 M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5... 1124279679
1 Montréal Montreal QC Quebec 45.5089 -73.5617 3519595.0 3889.0 America/Montreal 1 H1X H1Y H1Z H1P H1R H1S H1T H1V H1W H1H H1J H1... 1124586170
In [56]:
# Rename the columns back
cities.rename(columns={'latitude': 'lat', 'longitude': 'lng'}, inplace=True)

Working with Indexes

In [57]:
cities.index
Out[57]:
RangeIndex(start=0, stop=1738, step=1)
In [58]:
#
# We can reassign the index to any series.
#
cities.index = cities['id']
In [61]:
cities.head(4)[['city', 'province_name']]
Out[61]:
city province_name
id
1124279679 Toronto Ontario
1124586170 Montréal Quebec
1124825478 Vancouver British Columbia
1124690423 Calgary Alberta
In [69]:
#
# Restore the index to the original 0, 1, 2, 3, ...
# by default, inplace=False
#
cities.reset_index(drop=True, inplace=True)
In [71]:
#
# Now the index is dropped (restored)
#
cities.head()
Out[71]:
city city_ascii province_id province_name lat lng population density timezone ranking postal id
0 Toronto Toronto ON Ontario 43.7417 -79.3733 5429524.0 4334.4 America/Toronto 1 M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5... 1124279679
1 Montréal Montreal QC Quebec 45.5089 -73.5617 3519595.0 3889.0 America/Montreal 1 H1X H1Y H1Z H1P H1R H1S H1T H1V H1W H1H H1J H1... 1124586170
2 Vancouver Vancouver BC British Columbia 49.2500 -123.1000 2264823.0 5492.6 America/Vancouver 1 V6Z V6S V6R V6P V6N V6M V6L V6K V6J V6H V6G V6... 1124825478
3 Calgary Calgary AB Alberta 51.0500 -114.0667 1239220.0 1501.1 America/Edmonton 1 T1Y T2H T2K T2J T2L T2N T2A T2C T2B T2E T2G T2... 1124690423
4 Edmonton Edmonton AB Alberta 53.5344 -113.4903 1062643.0 1360.9 America/Edmonton 1 T5X T5Y T5Z T5P T5R T5S T5T T5V T5W T5H T5J T5... 1124290735
In [75]:
#
# The preferred way to set the index is using
# DataFrame.set_index(..., inplace=..)
# default inplace=False
#

cities.set_index('city_ascii', inplace=True)

# Note:
# - The column called 'city' is now GONE.
# - The index has the name 'city'.
# - This is different from cities.index = cities['city']
In [76]:
#
# Original dataframe
#
cities
Out[76]:
city province_id province_name lat lng population density timezone ranking postal id
city_ascii
Toronto Toronto ON Ontario 43.7417 -79.3733 5429524.0 4334.4 America/Toronto 1 M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5... 1124279679
Montreal Montréal QC Quebec 45.5089 -73.5617 3519595.0 3889.0 America/Montreal 1 H1X H1Y H1Z H1P H1R H1S H1T H1V H1W H1H H1J H1... 1124586170
Vancouver Vancouver BC British Columbia 49.2500 -123.1000 2264823.0 5492.6 America/Vancouver 1 V6Z V6S V6R V6P V6N V6M V6L V6K V6J V6H V6G V6... 1124825478
Calgary Calgary AB Alberta 51.0500 -114.0667 1239220.0 1501.1 America/Edmonton 1 T1Y T2H T2K T2J T2L T2N T2A T2C T2B T2E T2G T2... 1124690423
Edmonton Edmonton AB Alberta 53.5344 -113.4903 1062643.0 1360.9 America/Edmonton 1 T5X T5Y T5Z T5P T5R T5S T5T T5V T5W T5H T5J T5... 1124290735
... ... ... ... ... ... ... ... ... ... ... ...
Durham-Sud Durham-Sud QC Quebec 45.6667 -72.3333 1008.0 10.8 America/Montreal 4 J0H 1124105436
Melbourne Melbourne QC Quebec 45.5800 -72.1700 1004.0 5.8 America/Montreal 3 J0B 1124850489
Nipawin No. 487 Nipawin No. 487 SK Saskatchewan 53.2881 -104.0544 1004.0 1.1 America/Regina 4 S0E 1124001339
Duck Lake No. 463 Duck Lake No. 463 SK Saskatchewan 52.9596 -106.2089 1004.0 1.0 America/Regina 4 S0K S6V 1124001661
Oyen Oyen AB Alberta 51.3522 -110.4739 1001.0 189.6 America/Edmonton 3 T0J 1124000494

1738 rows × 11 columns

In [77]:
#
# DataFrame.loc[...] is an accessor by index values.
#
cities.loc['Oshawa']
Out[77]:
city                          Oshawa
province_id                       ON
province_name                Ontario
lat                             43.9
lng                           -78.85
population                    166000
density                         1027
timezone             America/Toronto
ranking                            2
postal           L1L L1H L1J L1K L1G
id                        1124541904
Name: Oshawa, dtype: object
In [82]:
#
# Recall the slicing syntax from NumPy.
# That works here too.
#
# Returns all cities that come AFTER "Oshawa"
cities.loc['Oshawa':]
Out[82]:
city province_id province_name lat lng population density timezone ranking postal id
city_ascii
Oshawa Oshawa ON Ontario 43.9000 -78.8500 166000.0 1027.0 America/Toronto 2 L1L L1H L1J L1K L1G 1124541904
Sherbrooke Sherbrooke QC Quebec 45.4000 -71.9000 161323.0 456.0 America/Montreal 2 J1N J1L J1M J1J J1K J1H J1G J1E J1C J1R 1124559506
Saguenay Saguenay QC Quebec 48.4167 -71.0667 144746.0 128.5 America/Montreal 2 G8A G7N G7H G7K G7T G7Z G7G G7B G7J G7P G7S G7... 1124001930
Levis Lévis QC Quebec 46.8000 -71.1833 143414.0 319.4 America/Montreal 2 G7A G6J G6K G6C G6Z G6X G6Y G6V G6W 1124958950
Kelowna Kelowna BC British Columbia 49.8881 -119.4956 142146.0 601.3 America/Vancouver 2 V1X V1Y V1P V1W V1V 1124080626
... ... ... ... ... ... ... ... ... ... ... ...
Durham-Sud Durham-Sud QC Quebec 45.6667 -72.3333 1008.0 10.8 America/Montreal 4 J0H 1124105436
Melbourne Melbourne QC Quebec 45.5800 -72.1700 1004.0 5.8 America/Montreal 3 J0B 1124850489
Nipawin No. 487 Nipawin No. 487 SK Saskatchewan 53.2881 -104.0544 1004.0 1.1 America/Regina 4 S0E 1124001339
Duck Lake No. 463 Duck Lake No. 463 SK Saskatchewan 52.9596 -106.2089 1004.0 1.0 America/Regina 4 S0K S6V 1124001661
Oyen Oyen AB Alberta 51.3522 -110.4739 1001.0 189.6 America/Edmonton 3 T0J 1124000494

1706 rows × 11 columns

In [92]:
#
# Task: find all cities that start with "V"
#
cities[cities.index.str.startswith('V')].sort_index().tail()
Out[92]:
city province_id province_name lat lng population density timezone ranking postal id
city_ascii
View Royal View Royal BC British Columbia 48.4517 -123.4339 10408.0 724.8 America/Vancouver 3 V9B 1124001985
Viking Viking AB Alberta 53.0953 -111.7769 1083.0 292.5 America/Edmonton 3 T0B 1124502081
Ville-Marie Ville-Marie QC Quebec 47.3333 -79.4333 2595.0 424.8 America/Montreal 3 J9V 1124001938
Virden Virden MB Manitoba 49.8508 -100.9317 3114.0 370.2 America/Winnipeg 3 R0M 1124620072
Vulcan Vulcan AB Alberta 50.4000 -113.2500 1917.0 302.3 America/Edmonton 3 T0L 1124607765

Working with Values

  1. Compute new values from existing values
  2. Retrieve values
  3. Query values based on some condition
  4. Aggregate

Retrieve values

In [93]:
cities[["city", "province_name", "population"]].head()
Out[93]:
city province_name population
city_ascii
Toronto Toronto Ontario 5429524.0
Montreal Montréal Quebec 3519595.0
Vancouver Vancouver British Columbia 2264823.0
Calgary Calgary Alberta 1239220.0
Edmonton Edmonton Alberta 1062643.0
In [96]:
#
# Retrieve values by index
#
cities.loc["Oshawa":"Vancouver"]

# Empty because "Vancouver" occurs before Oshawa.
Out[96]:
city province_id province_name lat lng population density timezone ranking postal id
city_ascii
In [97]:
cities.sort_index(inplace=True)
In [98]:
cities.loc["Oshawa":"Vancouver"]
Out[98]:
city province_id province_name lat lng population density timezone ranking postal id
city_ascii
Oshawa Oshawa ON Ontario 43.9000 -78.8500 166000.0 1027.0 America/Toronto 2 L1L L1H L1J L1K L1G 1124541904
Osler Osler SK Saskatchewan 52.3700 -106.5400 1237.0 796.5 America/Regina 3 S0K 1124000037
Osoyoos Osoyoos BC British Columbia 49.0325 -119.4661 5085.0 598.2 America/Vancouver 3 V0H 1124713973
Otonabee-South Monaghan Otonabee-South Monaghan ON Ontario 44.2333 -78.2333 6670.0 19.2 America/Toronto 3 K9J K0L 1124000517
Ottawa Ottawa ON Ontario 45.4247 -75.6950 989567.0 334.0 America/Montreal 1 K4P K4M K4A K4B K4C K7S K1S K1R K1P K1W K1V K1... 1124399363
... ... ... ... ... ... ... ... ... ... ... ...
Valcourt Valcourt QC Quebec 45.5000 -72.3167 2349.0 467.3 America/Montreal 3 J0E 1124334549
Valemount Valemount BC British Columbia 52.8284 -119.2659 1021.0 197.6 America/Vancouver 4 V0E 1124899599
Vallee-Jonction Vallée-Jonction QC Quebec 46.3667 -70.9167 1940.0 76.6 America/Montreal 4 G0S 1124672986
Valleyview Valleyview AB Alberta 55.0686 -117.2683 1863.0 199.9 America/Edmonton 3 T0H 1124211786
Vancouver Vancouver BC British Columbia 49.2500 -123.1000 2264823.0 5492.6 America/Vancouver 1 V6Z V6S V6R V6P V6N V6M V6L V6K V6J V6H V6G V6... 1124825478

687 rows × 11 columns

Compute new values from existing

In [99]:
#
# Derive new columns
#
import numpy as np

cities['pop_million'] = np.round(cities['population'] * 1E-6, 2)
In [101]:
cities.loc['Toronto']
Out[101]:
city                                                       Toronto
province_id                                                     ON
province_name                                              Ontario
lat                                                        43.7417
lng                                                       -79.3733
population                                             5.42952e+06
density                                                     4334.4
timezone                                           America/Toronto
ranking                                                          1
postal           M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5...
id                                                      1124279679
pop_million                                                   5.43
Name: Toronto, dtype: object
In [108]:
#
# Functional programming using Series.apply
#

def city_size_by_population(p):
    if p > 1e4:
        return 'large'
    else:
        return 'not large'

#
# Get the series for population, and apply function to compute city_size
#
city_sizes = cities['population'].apply(city_size_by_population)
city_sizes['Toronto':]
Out[108]:
city_ascii
Toronto            large
Tracadie           large
Trail          not large
Trecesson      not large
Trent Hills        large
                 ...    
Yellowhead     not large
Yellowknife        large
Yorkton            large
Youbou         not large
Zorra          not large
Name: population, Length: 153, dtype: object
In [110]:
#
# Get series, and compute a series
#
cities['population'].apply(lambda x: pd.Series([city_size_by_population(x), x], index=['size', 'pop']))
Out[110]:
size pop
city_ascii
Abbotsford large 141397.0
Acton Vale not large 7664.0
Addington Highlands not large 2323.0
Adelaide-Metcalfe not large 2990.0
Adjala-Tosorontio large 10975.0
... ... ...
Yellowhead not large 2011.0
Yellowknife large 19569.0
Yorkton large 16343.0
Youbou not large 1086.0
Zorra not large 8138.0

1738 rows × 2 columns

In [112]:
#
# Try DataFrame.apply
#
cities.apply(lambda series: series.max())
Out[112]:
city                           Zorra
province_id                       YT
province_name                  Yukon
lat                          72.6808
lng                           -52.68
population               5.42952e+06
density                       5492.6
timezone         America/Yellowknife
ranking                            4
postal                           Y1A
id                        1124995979
pop_million                     5.43
dtype: object
In [114]:
#
# Try dataframe, apply to rows
#
cities.apply(lambda city: pd.Series([city['population'], city_size_by_population(city['population'])]), axis=1)
Out[114]:
0 1
city_ascii
Abbotsford 141397.0 large
Acton Vale 7664.0 not large
Addington Highlands 2323.0 not large
Adelaide-Metcalfe 2990.0 not large
Adjala-Tosorontio 10975.0 large
... ... ...
Yellowhead 2011.0 not large
Yellowknife 19569.0 large
Yorkton 16343.0 large
Youbou 1086.0 not large
Zorra 8138.0 not large

1738 rows × 2 columns

In [ ]: