{:draft ["true"], :rank ["aggregation" "groupby" "merge"]}
import pandas as pd
#
# Load the cities CSV and compute a few whole-table aggregates.
#
Canadian_Cities_CSV = '/home/jovyan/work/shared/datasets/canadacities.csv'
cities = pd.read_csv(Canadian_Cities_CSV)
cities.head()
#
# Aggregate a single column: one scalar result
#
cities['population'].sum()
#
# Aggregate several columns at once: one result per column
#
cities[['lat', 'lng']].mean()
#
# Total number of whitespace-separated entries in the 'postal' column
# (presumably one entry per postal code -- verify against the dataset).
# The vectorized .str accessor leaves missing values as NaN, which sum()
# skips; calling str.split() inside apply() would instead raise
# AttributeError on the first missing value.
#
cities['postal'].str.split().str.len().sum()
import pandas as pd

#
# Read the cities CSV into a DataFrame and show the whole table
#
Canadian_Cities_CSV = '/home/jovyan/work/shared/datasets/canadacities.csv'
cities = pd.read_csv(filepath_or_buffer=Canadian_Cities_CSV)
cities
#
# Three orderings of the same operation: total population per province_id.
#
# 1. Project to columns, group by, aggregate
#
cities[['province_id', 'population']].groupby(['province_id']).sum()
#
# 2. Groupby, project to columns, aggregate
#    (selecting a single column name yields a Series, not a DataFrame)
#
cities.groupby(['province_id'])['population'].sum()
#
# 3. Groupby, aggregate, project to columns
#    Here every column is aggregated before the projection, so restrict
#    the sum to numeric columns: on pandas >= 2.0 the default would also
#    try to "sum" the non-numeric columns, only for that work to be
#    thrown away by the projection.
#
cities.groupby(['province_id']).sum(numeric_only=True)[['population']]
#
# How many distinct timezones does each province have
#
cities.groupby('province_name')['timezone'].nunique()
#
# What are the different timezones in each province
# (one row per distinct province_name/timezone pair)
#
cities[['province_name', 'timezone']].drop_duplicates()
#
# Grouping by two columns gives a hierarchical (province_name, timezone)
# index on the result
#
cities.groupby(['province_name', 'timezone'])[['population']].sum()
# If we don't want hierarchical indexes, just
# use DataFrame.reset_index() to turn the group keys back into columns
cities.groupby(['province_name', 'timezone'])[['population']].sum().reset_index()
import pandas as pd

Canadian_cities = '/home/jovyan/work/shared/datasets/canadacities.csv'
Country_list = '/home/jovyan/work/shared/datasets/country-list.csv'

#
# Load both tables; countries is indexed by its 'country' column
#
cities = pd.read_csv(Canadian_cities)
countries = pd.read_csv(Country_list).set_index('country')
#
# Label-based lookup of 'London' on a copy of cities indexed by 'city'.
# set_index() returns a new frame here, so `cities` itself is unchanged.
#
cities.set_index(keys='city').loc['London']
#
# First approach: merge on the index of both tables.
# Index cities by 'city_ascii' ...
#
cities = cities.set_index('city_ascii')
#
# ... and countries by 'capital' (reset_index() first turns the
# current index back into an ordinary column)
#
countries = countries.reset_index().set_index('capital')
#
# Inner merge on the two indexes, then keep three columns of the result
#
cities.merge(countries, how='inner', left_index=True, right_index=True)[['province_name', 'population', 'country']]
#
# Second approach: merge on columns instead of indexes.
# Reload both tables so they have their default indexes again, then
# match cities' 'city_ascii' against countries' 'capital'.
#
cities = pd.read_csv(Canadian_cities)
countries = pd.read_csv(Country_list)
cities.merge(countries, how='inner', left_on='city_ascii', right_on='capital')[['city', 'province_name', 'country']]