{:draft ["true"]}
import pandas as pd
# Dataset obtained from https://simplemaps.com/data/canada-cities
Canadian_Cities_CSV = '/home/jovyan/work/shared/datasets/canadacities.csv'
# Alternative dataset (currently unused):
# https://github.com/icyrockcom/country-capitals/blob/master/data/country-list.csv
#World_Capitals_CSV = '/home/jovyan/work/shared/datasets/country-list.csv'
# Load the CSV into a DataFrame; every later step works on `cities`.
cities = pd.read_csv(Canadian_Cities_CSV)
# Quick look at the data: first rows, (rows, columns) shape, summary statistics.
cities.head()
cities.shape
cities.describe()
# The column labels are exposed as a pandas Index (not a Series).
cities.columns
# Extract a single column as a Series
cities['city_ascii']
# Selecting with a list of labels returns a DataFrame with just those columns
cities[['city', 'population']]
#
# Data transformation
# - Pandas encourages a functional programming style of
#   data analysis
# - The preferred API leaves the original dataframe unmodified
#   and returns a new one
#
#
# Create a new column, call it 'country'. Two ways to do it, both of
# which modify the original dataframe:
# - DataFrame.insert(new_position, column_name, series)
# - DataFrame['new_column_name'] = ...
# Assigning a scalar broadcasts it to every row.
cities['country'] = 'Canada'
cities.head(2)
#
# Deleting columns
#
# 1. in-place: the original df is modified
# 2. out-of-place: the original df is unmodified, and a transformed df is returned.
#
# out-of-place:
# DataFrame.drop(columns=[...])
# The result is not assigned here, so `cities` still has 'country'.
cities.drop(columns=['country'])
#
# With inplace=True the column is dropped from the original dataframe itself.
#
cities.drop(columns=['country'], inplace=True)
# Note:
# Equivalent to:
# del cities['country']
cities.head(2)
#
# Rename columns. rename() defaults to inplace=False, so inplace=True
# is passed here to modify `cities` directly.
#
cities.rename(columns={'lat': 'latitude', 'lng': 'longitude'}, inplace=True)
cities.head(2)
# Rename the columns back to their original names
cities.rename(columns={'latitude': 'lat', 'longitude': 'lng'}, inplace=True)
# Inspect the row index
cities.index
#
# We can reassign the index to any series.
# Direct assignment leaves 'id' in place as a regular column too.
#
cities.index = cities['id']
cities.head(4)[['city', 'province_name']]
#
# Restore the index to the original 0, 1, 2, 3, ...
# reset_index() defaults to inplace=False; drop=True discards the
# old index values instead of adding them back as a column.
#
cities.reset_index(drop=True, inplace=True)
#
# Now the 'id' index is gone and the default integer index is back
#
cities.head()
#
# The preferred way to set the index is using
# DataFrame.set_index(..., inplace=..)
# default inplace=False
#
cities.set_index('city_ascii', inplace=True)
# Note:
# - The column called 'city_ascii' is now GONE (it was moved into the index).
# - The index has the name 'city_ascii'.
# - This is different from cities.index = cities['city_ascii'],
#   which would keep the column as well.
#
# The same dataframe, modified in place, now indexed by 'city_ascii'
#
cities
#
# DataFrame.loc[...] is an accessor by index values.
#
cities.loc['Oshawa']
#
# Recall the slicing syntax from NumPy.
# That works here too.
#
# Returns the row labelled "Oshawa" and every row that follows it
# in the current row order (label slices include their endpoints).
cities.loc['Oshawa':]
#
# Task: find all cities that start with "V"
#
# Filter with a boolean mask over the index, sort by name, show the last rows
cities[cities.index.str.startswith('V')].sort_index().tail()
cities[["city", "province_name", "population"]].head()
#
# Retrieve values by index
#
cities.loc["Oshawa":"Vancouver"]
# Empty because "Vancouver" occurs before "Oshawa" in the current row order.
# NOTE(review): this relies on the row order of the CSV -- confirm.
# After sorting the index, the label slice follows alphabetical order.
cities.sort_index(inplace=True)
cities.loc["Oshawa":"Vancouver"]
#
# Derive new columns
#
import numpy as np
# Population expressed in millions, rounded to two decimals
cities['pop_million'] = np.round(cities['population'] * 1E-6, 2)
cities.loc['Toronto']
#
# Functional programming using Series.apply
#
def city_size_by_population(p, threshold=1e4):
    """Classify a city as 'large' or 'not large' by its population.

    Args:
        p: Population of the city.
        threshold: Population a city must exceed to count as large.
            Defaults to 1e4, the cutoff that was previously hard-coded.

    Returns:
        'large' when ``p`` is strictly greater than ``threshold``,
        otherwise 'not large'. NaN compares false against any number,
        so a missing population is reported as 'not large'.
    """
    return 'large' if p > threshold else 'not large'
#
# Get the series for population, and apply function to compute city_size
#
city_sizes = cities['population'].apply(city_size_by_population)
# Label slice: from 'Toronto' to the end of the index
city_sizes['Toronto':]
#
# Get a series, and compute several values per element.
# The function returns a Series, so the result is a DataFrame with
# one column per label ('size', 'pop').
#
cities['population'].apply(lambda x: pd.Series([city_size_by_population(x), x], index=['size', 'pop']))
#
# Try DataFrame.apply
# With the default axis=0 the function receives each column as a Series.
#
cities.apply(lambda series: series.max())
#
# Try dataframe, apply to rows
# axis=1 passes each row (one city) to the function as a Series.
#
cities.apply(lambda city: pd.Series([city['population'], city_size_by_population(city['population'])]), axis=1)