{:draft ["true"], :rank ["aggregation" "groupby" "merge"]}

Index

Aggregation, Groupby and Merge

Aggregation

In [1]:
Canadian_Cities_CSV = '/home/jovyan/work/shared/datasets/canadacities.csv'
import pandas as pd
In [2]:
cities = pd.read_csv(Canadian_Cities_CSV)
In [3]:
cities.head()
Out[3]:
city city_ascii province_id province_name lat lng population density timezone ranking postal id
0 Toronto Toronto ON Ontario 43.7417 -79.3733 5429524.0 4334.4 America/Toronto 1 M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5... 1124279679
1 Montréal Montreal QC Quebec 45.5089 -73.5617 3519595.0 3889.0 America/Montreal 1 H1X H1Y H1Z H1P H1R H1S H1T H1V H1W H1H H1J H1... 1124586170
2 Vancouver Vancouver BC British Columbia 49.2500 -123.1000 2264823.0 5492.6 America/Vancouver 1 V6Z V6S V6R V6P V6N V6M V6L V6K V6J V6H V6G V6... 1124825478
3 Calgary Calgary AB Alberta 51.0500 -114.0667 1239220.0 1501.1 America/Edmonton 1 T1Y T2H T2K T2J T2L T2N T2A T2C T2B T2E T2G T2... 1124690423
4 Edmonton Edmonton AB Alberta 53.5344 -113.4903 1062643.0 1360.9 America/Edmonton 1 T5X T5Y T5Z T5P T5R T5S T5T T5V T5W T5H T5J T5... 1124290735
In [6]:
cities['population'].sum()
Out[6]:
40161685.0
In [8]:
cities[['lat', 'lng']].mean()
Out[8]:
lat    47.846277
lng   -83.943544
dtype: float64
In [12]:
cities['postal'].apply(lambda postal_code: len(postal_code.split())).sum()
Out[12]:
2972
In [ ]:
 

Groupby

In [1]:
Canadian_Cities_CSV = '/home/jovyan/work/shared/datasets/canadacities.csv'
import pandas as pd
In [2]:
cities = pd.read_csv(Canadian_Cities_CSV)
In [3]:
cities
Out[3]:
city city_ascii province_id province_name lat lng population density timezone ranking postal id
0 Toronto Toronto ON Ontario 43.7417 -79.3733 5429524.0 4334.4 America/Toronto 1 M5T M5V M5P M5S M5R M5E M5G M5A M5C M5B M5M M5... 1124279679
1 Montréal Montreal QC Quebec 45.5089 -73.5617 3519595.0 3889.0 America/Montreal 1 H1X H1Y H1Z H1P H1R H1S H1T H1V H1W H1H H1J H1... 1124586170
2 Vancouver Vancouver BC British Columbia 49.2500 -123.1000 2264823.0 5492.6 America/Vancouver 1 V6Z V6S V6R V6P V6N V6M V6L V6K V6J V6H V6G V6... 1124825478
3 Calgary Calgary AB Alberta 51.0500 -114.0667 1239220.0 1501.1 America/Edmonton 1 T1Y T2H T2K T2J T2L T2N T2A T2C T2B T2E T2G T2... 1124690423
4 Edmonton Edmonton AB Alberta 53.5344 -113.4903 1062643.0 1360.9 America/Edmonton 1 T5X T5Y T5Z T5P T5R T5S T5T T5V T5W T5H T5J T5... 1124290735
... ... ... ... ... ... ... ... ... ... ... ... ...
1733 Durham-Sud Durham-Sud QC Quebec 45.6667 -72.3333 1008.0 10.8 America/Montreal 4 J0H 1124105436
1734 Melbourne Melbourne QC Quebec 45.5800 -72.1700 1004.0 5.8 America/Montreal 3 J0B 1124850489
1735 Nipawin No. 487 Nipawin No. 487 SK Saskatchewan 53.2881 -104.0544 1004.0 1.1 America/Regina 4 S0E 1124001339
1736 Duck Lake No. 463 Duck Lake No. 463 SK Saskatchewan 52.9596 -106.2089 1004.0 1.0 America/Regina 4 S0K S6V 1124001661
1737 Oyen Oyen AB Alberta 51.3522 -110.4739 1001.0 189.6 America/Edmonton 3 T0J 1124000494

1738 rows × 12 columns

In [6]:
#
# Project to columns, group by, aggregate
#

cities[['province_id', 'population']].groupby(['province_id']).sum()
Out[6]:
population
province_id
AB 3488355.0
BC 6101869.0
MB 1196667.0
NB 743629.0
NL 389968.0
NS 678507.0
NT 31958.0
NU 29274.0
ON 16672868.0
PE 80445.0
QC 9865133.0
SK 856552.0
YT 26460.0
In [9]:
#
# Groupby, project to columns, aggregate
#
cities.groupby(['province_id'])['population'].sum()
Out[9]:
province_id
AB     3488355.0
BC     6101869.0
MB     1196667.0
NB      743629.0
NL      389968.0
NS      678507.0
NT       31958.0
NU       29274.0
ON    16672868.0
PE       80445.0
QC     9865133.0
SK      856552.0
YT       26460.0
Name: population, dtype: float64
In [11]:
#
# Groupby, aggregate, project to columns
#
cities.groupby(['province_id']).sum()[['population']]
Out[11]:
population
province_id
AB 3488355.0
BC 6101869.0
MB 1196667.0
NB 743629.0
NL 389968.0
NS 678507.0
NT 31958.0
NU 29274.0
ON 16672868.0
PE 80445.0
QC 9865133.0
SK 856552.0
YT 26460.0

Task 2. Find the number of different timezones for each province

In [17]:
cities.groupby('province_name')['timezone'].nunique()
Out[17]:
province_name
Alberta                      1
British Columbia             5
Manitoba                     1
New Brunswick                3
Newfoundland and Labrador    2
Northwest Territories        2
Nova Scotia                  2
Nunavut                      4
Ontario                      7
Prince Edward Island         1
Quebec                       5
Saskatchewan                 4
Yukon                        1
Name: timezone, dtype: int64
In [30]:
#
# What are the different timezones in each province
#
cities[['province_name','timezone']].drop_duplicatesates()
Out[30]:
province_name timezone
0 Ontario America/Toronto
1 Quebec America/Montreal
2 British Columbia America/Vancouver
3 Alberta America/Edmonton
5 Ontario America/Montreal
7 Manitoba America/Winnipeg
14 Nova Scotia America/Halifax
23 Saskatchewan America/Regina
48 Newfoundland and Labrador America/St_Johns
49 New Brunswick America/Moncton
50 Ontario America/Thunder_Bay
58 Nova Scotia America/Glace_Bay
114 Prince Edward Island America/Halifax
143 Saskatchewan America/Edmonton
167 Yukon America/Whitehorse
194 British Columbia America/Dawson_Creek
205 Northwest Territories America/Yellowknife
210 British Columbia America/Edmonton
247 Saskatchewan America/Swift_Current
261 Ontario America/Winnipeg
324 Quebec America/Halifax
440 Newfoundland and Labrador America/Goose_Bay
455 Nunavut America/Iqaluit
588 British Columbia America/Fort_Nelson
593 British Columbia America/Creston
609 Saskatchewan America/Winnipeg
670 New Brunswick America/New_York
809 Northwest Territories America/Inuvik
831 New Brunswick America/Montreal
882 Nunavut America/Rankin_Inlet
903 Ontario America/Atikokan
998 Quebec America/Toronto
1235 Nunavut America/Cambridge_Bay
1289 Ontario America/Nipigon
1380 Nunavut America/Pangnirtung
1387 Quebec America/Moncton
1626 Quebec America/Blanc-Sablon
1699 Ontario America/Detroit

Task: how many quebec residence live according to the Halifax timezone?

In [34]:
cities.groupby(['province_name', 'timezone'])[['population']].sum()
Out[34]:
population
province_name timezone
Alberta America/Edmonton 3488355.0
British Columbia America/Creston 5351.0
America/Dawson_Creek 40140.0
America/Edmonton 46407.0
America/Fort_Nelson 5393.0
America/Vancouver 6004578.0
Manitoba America/Winnipeg 1196667.0
New Brunswick America/Moncton 736088.0
America/Montreal 3126.0
America/New_York 4415.0
Newfoundland and Labrador America/Goose_Bay 18360.0
America/St_Johns 371608.0
Northwest Territories America/Inuvik 3243.0
America/Yellowknife 28715.0
Nova Scotia America/Glace_Bay 94285.0
America/Halifax 584222.0
Nunavut America/Cambridge_Bay 5610.0
America/Iqaluit 13533.0
America/Pangnirtung 1481.0
America/Rankin_Inlet 8650.0
Ontario America/Atikokan 2753.0
America/Detroit 1047.0
America/Montreal 1008893.0
America/Nipigon 1642.0
America/Thunder_Bay 107909.0
America/Toronto 15507044.0
America/Winnipeg 43580.0
Prince Edward Island America/Halifax 80445.0
Quebec America/Blanc-Sablon 1118.0
America/Halifax 12010.0
America/Moncton 1472.0
America/Montreal 9848148.0
America/Toronto 2385.0
Saskatchewan America/Edmonton 36348.0
America/Regina 797013.0
America/Swift_Current 16604.0
America/Winnipeg 6587.0
Yukon America/Whitehorse 26460.0
In [35]:
# If we don't want hierarchical indexes, just
# use DataFrame.reset_index()

cities.groupby(['province_name', 'timezone'])[['population']].sum().reset_index()
Out[35]:
province_name timezone population
0 Alberta America/Edmonton 3488355.0
1 British Columbia America/Creston 5351.0
2 British Columbia America/Dawson_Creek 40140.0
3 British Columbia America/Edmonton 46407.0
4 British Columbia America/Fort_Nelson 5393.0
5 British Columbia America/Vancouver 6004578.0
6 Manitoba America/Winnipeg 1196667.0
7 New Brunswick America/Moncton 736088.0
8 New Brunswick America/Montreal 3126.0
9 New Brunswick America/New_York 4415.0
10 Newfoundland and Labrador America/Goose_Bay 18360.0
11 Newfoundland and Labrador America/St_Johns 371608.0
12 Northwest Territories America/Inuvik 3243.0
13 Northwest Territories America/Yellowknife 28715.0
14 Nova Scotia America/Glace_Bay 94285.0
15 Nova Scotia America/Halifax 584222.0
16 Nunavut America/Cambridge_Bay 5610.0
17 Nunavut America/Iqaluit 13533.0
18 Nunavut America/Pangnirtung 1481.0
19 Nunavut America/Rankin_Inlet 8650.0
20 Ontario America/Atikokan 2753.0
21 Ontario America/Detroit 1047.0
22 Ontario America/Montreal 1008893.0
23 Ontario America/Nipigon 1642.0
24 Ontario America/Thunder_Bay 107909.0
25 Ontario America/Toronto 15507044.0
26 Ontario America/Winnipeg 43580.0
27 Prince Edward Island America/Halifax 80445.0
28 Quebec America/Blanc-Sablon 1118.0
29 Quebec America/Halifax 12010.0
30 Quebec America/Moncton 1472.0
31 Quebec America/Montreal 9848148.0
32 Quebec America/Toronto 2385.0
33 Saskatchewan America/Edmonton 36348.0
34 Saskatchewan America/Regina 797013.0
35 Saskatchewan America/Swift_Current 16604.0
36 Saskatchewan America/Winnipeg 6587.0
37 Yukon America/Whitehorse 26460.0
In [ ]:
 

Merge

In [2]:
Canadian_cities = '/home/jovyan/work/shared/datasets/canadacities.csv'
Country_list = '/home/jovyan/work/shared/datasets/country-list.csv'
import pandas as pd
In [3]:
cities = pd.read_csv(Canadian_cities)
countries = pd.read_csv(Country_list)
In [6]:
countries.set_index('country', inplace=True)
In [17]:
cities.set_index('city').loc['London']
Out[17]:
city_ascii                                                  London
province_id                                                     ON
province_name                                              Ontario
lat                                                        42.9836
lng                                                       -81.2497
population                                                  383822
density                                                      913.1
timezone                                           America/Toronto
ranking                                                          2
postal           N5Z N5X N5Y N5V N5W N6A N6P N6G N6E N6C N6N N6...
id                                                      1124469960
Name: London, dtype: object
In [20]:
#
# The first way is merge by index.
#

cities.set_index('city_ascii', inplace=True)
In [23]:
#
# Have countries indexed by capital
#
countries = countries.reset_index().set_index('capital')
In [25]:
pd.merge(cities, countries, left_index=True, right_index=True)[['province_name', 'population', 'country']]
Out[25]:
province_name population country
Athens Ontario 3013.0 Greece
Douglas New Brunswick 6154.0 Isle of Man
Hamilton Ontario 693645.0 Bermuda
Kingston New Brunswick 2913.0 Jamaica
Kingston New Brunswick 2913.0 Norfolk Island
London Ontario 383822.0 United Kingdom; England
Ottawa Ontario 989567.0 Canada
St. George's Newfoundland and Labrador 1203.0 Grenada
St. John's Newfoundland and Labrador 108860.0 Antigua and Barbuda
Stanley Manitoba 9038.0 Falkland Islands
Victoria British Columbia 335696.0 Seychelles
Victoria Newfoundland and Labrador 1800.0 Seychelles
Victoria Manitoba 1514.0 Seychelles
Wellington New Brunswick 3079.0 New Zealand
Wellington Ontario 1932.0 New Zealand

Another way of merge is on columns instead of indexes

In [26]:
cities = pd.read_csv(Canadian_cities)
countries = pd.read_csv(Country_list)
In [28]:
pd.merge(cities, countries, left_on=['city_ascii'], right_on=['capital'])[['city', 'province_name', 'country']]
Out[28]:
city province_name country
0 Ottawa Ontario Canada
1 Hamilton Ontario Bermuda
2 London Ontario United Kingdom; England
3 Victoria British Columbia Seychelles
4 Victoria Newfoundland and Labrador Seychelles
5 Victoria Manitoba Seychelles
6 St. John's Newfoundland and Labrador Antigua and Barbuda
7 Stanley Manitoba Falkland Islands
8 Douglas New Brunswick Isle of Man
9 Wellington New Brunswick New Zealand
10 Wellington Ontario New Zealand
11 Athens Ontario Greece
12 Kingston New Brunswick Jamaica
13 Kingston New Brunswick Norfolk Island
14 St. George's Newfoundland and Labrador Grenada
In [ ]: