Hands-on session for Week 2

Author

Junjun Yin

Published

September 4, 2025

DATS 2102 – Week 2: Language of Graphs

Session 2 Hands‑On: Seaborn (Categorical vs Continuous) + Altair (Grammar of Graphics)

Learning objectives:
- Apply tidy data reshaping to prepare data for plotting
- Load the Gapminder dataset and perform a quick EDA
- Use Seaborn for categorical vs. continuous plots
- Use Altair for a declarative grammar of graphics and interactivity
- Practice multiple encodings (color, size, shape) and reflect on readability

1) Setup

pip install pandas numpy matplotlib seaborn altair plotly

import seaborn as sns
import pandas as pd

2) Tidy Reshaping (pd.melt)

Demo: converting a messy (wide) DataFrame into a tidy (long) DataFrame. For details on the pandas melt function, see: https://pandas.pydata.org/docs/reference/api/pandas.melt.html

# A "messy" (wide) table: every year is its own column instead of being a
# value in a single year column.
messy_data = pd.DataFrame({
    "Country": ["USA", "China", "UK"],
    "2000": [280, 1260, 59],
    "2005": [295, 1320, 60],
    "2010": [310, 1390, 63]
})

# Three ways to show the same DataFrame from a notebook cell.
# NOTE(review): display() is not imported here — presumably the
# IPython/Jupyter built-in; it will not exist in a plain Python script.
display(messy_data)  # explicit table rendering
print(messy_data)    # plain-text representation
messy_data           # a bare last expression is rendered automatically

	Country	2000	2005	2010
0	USA	280	295	310
1	China	1260	1320	1390
2	UK	59	60	63

  Country  2000  2005  2010
0     USA   280   295   310
1   China  1260  1320  1390
2      UK    59    60    63

	Country	2000	2005	2010
0	USA	280	295	310
1	China	1260	1320	1390
2	UK	59	60	63

# Reshape the wide table to long form: the year columns become rows, the old
# column names go into 'Year' and the cell values go into 'Population'.
tidy_data = messy_data.melt(id_vars='Country', value_vars=['2000', '2005','2010'], var_name='Year', value_name='Population')

# The melted 'Year' values come from column names (strings), so cast them to integers.
tidy_data['Year'] = tidy_data['Year'].astype(int)

tidy_data

	Country	Year	Population
0	USA	2000	280
1	China	2000	1260
2	UK	2000	59
3	USA	2005	295
4	China	2005	1320
5	UK	2005	60
6	USA	2010	310
7	China	2010	1390
8	UK	2010	63

One more example

# Wide layout: one row per student, one column per subject.
df_wide = pd.DataFrame(
    {
        "name": ["Alice", "Bob"],
        "math": [90, 80],
        "english": [85, 78],
    }
)

# Long (tidy) layout: one row per (student, subject) pair. With no value_vars
# given, every column other than 'name' is melted.
df_tidy = pd.melt(
    df_wide,
    id_vars="name",
    var_name="subject",
    value_name="score",
)

Load Gapminder dataset

We can use the copy hosted on Plotly’s GitHub for this session: https://raw.githubusercontent.com/plotly/datasets/master/gapminderDataFiveYear.csv

# Load the Gapminder CSV directly from its URL into a DataFrame
url = "https://raw.githubusercontent.com/plotly/datasets/master/gapminderDataFiveYear.csv"
gapminder = pd.read_csv(url)

# Preview the first rows to check the columns that were read
gapminder.head()

	country	year	pop	continent	lifeExp	gdpPercap
0	Afghanistan	1952	8425333.0	Asia	28.801	779.445314
1	Afghanistan	1957	9240934.0	Asia	30.332	820.853030
2	Afghanistan	1962	10267083.0	Asia	31.997	853.100710
3	Afghanistan	1967	11537966.0	Asia	34.020	836.197138
4	Afghanistan	1972	13079460.0	Asia	36.088	739.981106

3) Quick EDA

# Print column names, non-null counts, dtypes and memory usage
gapminder.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   year       1704 non-null   int64  
 2   pop        1704 non-null   float64
 3   continent  1704 non-null   object 
 4   lifeExp    1704 non-null   float64
 5   gdpPercap  1704 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 80.0+ KB

# Summary statistics for every column (numeric and non-numeric), transposed
# so that each row of the result describes one column of gapminder
gapminder.describe(include='all').T.head(10)

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
country	1704	142	Afghanistan	12	NaN	NaN	NaN	NaN	NaN	NaN	NaN
year	1704.0	NaN	NaN	NaN	1979.5	17.26533	1952.0	1965.75	1979.5	1993.25	2007.0
pop	1704.0	NaN	NaN	NaN	29601212.325117	106157896.746828	60011.0	2793664.0	7023595.5	19585221.75	1318683096.0
continent	1704	5	Africa	624	NaN	NaN	NaN	NaN	NaN	NaN	NaN
lifeExp	1704.0	NaN	NaN	NaN	59.474439	12.917107	23.599	48.198	60.7125	70.8455	82.603
gdpPercap	1704.0	NaN	NaN	NaN	7215.327081	9857.454543	241.165876	1202.060309	3531.846988	9325.462346	113523.1329

# First ten distinct years (sorted) and the distinct continent labels,
# returned together as a tuple
sorted(gapminder['year'].unique())[:10], gapminder['continent'].unique()

([np.int64(1952),
  np.int64(1957),
  np.int64(1962),
  np.int64(1967),
  np.int64(1972),
  np.int64(1977),
  np.int64(1982),
  np.int64(1987),
  np.int64(1992),
  np.int64(1997)],
 array(['Asia', 'Europe', 'Africa', 'Americas', 'Oceania'], dtype=object))

4) Seaborn: Categorical vs Continuous


import seaborn as sns
import matplotlib.pyplot as plt

# Categorical x-axis: spread of life expectancy within each continent
plt.figure(figsize=(8, 6))
sns.boxplot(data=gapminder, x="continent", y="lifeExp")
plt.title("Life Expectancy by Continent")
plt.show()

# Continuous x-axis: GDP per capita against life expectancy, colored by continent
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=gapminder,
    x="gdpPercap",
    y="lifeExp",
    hue="continent",
    s=60,
    alpha=0.8,
)
plt.xscale("log")  # log-scaled x-axis, as the title states
plt.title("GDP per Capita vs Life Expectancy (log x)")
plt.show()

5) Seaborn: Multiple Encodings

Example 1

# 2007 rows only, with two extra encodings on top of x/y position:
# marker size = population, color = continent
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=gapminder[gapminder["year"] == 2007],
    x="gdpPercap",
    y="lifeExp",
    hue="continent",
    size="pop",
    sizes=(20, 400),
    alpha=0.7,
)
plt.xscale("log")
plt.title("2007: GDP vs LifeExp (size = population, color = continent)")
plt.show()

Example 2

# Load seaborn's "tips" example dataset and preview the first rows
tips = sns.load_dataset("tips")
tips.head()

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

# Several encodings in one plot: position = total_bill / tip, color = day,
# marker style = time, marker size = the 'size' column.
# No plt.figure() call precedes this, so no explicit figure size is set here.
sns.scatterplot(data=tips,x="total_bill", 
                y="tip",
                hue="day", 
                style="time", 
                size="size", 
                palette="deep", 
                sizes=(20, 200))

# Same multi-encoding scatter as before, drawn on an explicitly sized 12x9 figure
plt.figure(figsize=(12, 9))
sns.scatterplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="day",
    style="time",
    size="size",
    palette="deep",
    sizes=(20, 200),
)
plt.show()

6) Altair: Grammar of Graphics + Interactivity

import altair as alt

# Declarative chart: each encode() argument maps a column to a visual channel,
# and the ":Q" / ":N" suffix states the field type (quantitative / nominal).
# The tooltip lists the columns shown when hovering over a point.
alt.Chart(gapminder).mark_circle().encode(
    x='gdpPercap:Q',
    y='lifeExp:Q',
    color='continent:N',
    size='pop:Q',
    tooltip=['country','year','lifeExp','gdpPercap']
).interactive()

import altair as alt

# Lift Altair's default 5,000-row limit on datasets embedded in a chart.
# gapminder has 1,704 rows, so this only matters for larger datasets.
alt.data_transformers.disable_max_rows()

# Same scatter as before, now with a log-scaled x-axis, a titled size legend,
# explicitly typed tooltip fields and a fixed chart size.
base = alt.Chart(gapminder).mark_circle().encode(
    x=alt.X('gdpPercap:Q', scale=alt.Scale(type='log')),
    y='lifeExp:Q',
    color='continent:N',
    size=alt.Size('pop:Q', title='Population'),
    tooltip=['country:N','continent:N','year:O','lifeExp:Q','gdpPercap:Q','pop:Q']
).properties(width=600, height=480)

base.interactive()

Altair’s Grammar of Graphics

Altair Type Shorthands

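Each variable you encode in Altair has a data type. Altair can usually infer it from a pandas DataFrame, but declaring it explicitly with a shorthand suffix makes your intent clear and lets you override the default. There are four main types: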

Q = Quantitative

Numeric, continuous values (GDP, population, life expectancy)

Example: x='gdpPercap:Q' → treat GDP per capita as a number with a continuous scale.

N = Nominal

Categorical, unordered labels (continent, country names, gender)

Example: color='continent:N' → treat continent as categories, map to distinct colors.

O = Ordinal

Ordered categories (education levels, small/medium/large, survey responses 1–5)

Example: x='year:O' if you want to treat years as ordered categories instead of continuous.

T = Temporal

Time/date values (year, month, datetime)

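Example: x='year:T' → treat year as a time axis. Note: the temporal type expects real date/datetime values. A plain integer year such as 1952 should be converted first, e.g. pd.to_datetime(gapminder['year'].astype(str), format='%Y'); otherwise the number is not read as a calendar year.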

Altair Selections (Brush + Filter)


# Interval selection bound to both the x and y channels
brush = alt.selection_interval(encodings=['x','y'])

# Scatter plot that owns the brush: selected points keep their continent
# color, all other points are drawn in light gray.
points = alt.Chart(gapminder).mark_point().encode(
    x=alt.X('gdpPercap:Q', scale=alt.Scale(type='log'), title='GDP per Capita (log)'),
    y=alt.Y('lifeExp:Q', title='Life Expectancy'),
    color=alt.condition(brush, 'continent:N', alt.value('lightgray')),
    tooltip=['country','year','lifeExp','gdpPercap','pop']
).add_params(brush).properties(width=350, height=280)

# Histogram of life expectancy (up to 30 bins), filtered by the same brush so
# it only counts the rows selected in the scatter plot.
hist = alt.Chart(gapminder).mark_bar().encode(
    x=alt.X('lifeExp:Q', bin=alt.Bin(maxbins=30)),
    y='count()',
    color='continent:N'
).transform_filter(brush).properties(width=350, height=280)

# Place the two linked charts side by side
(points | hist)

7) Guided Exercise

Create a chart with multiple encodings using gapminder[gapminder['year'] == 2007]:
- x = GDP per capita (log)
- y = Life expectancy
- color = continent
- size = population

Questions to reflect on:
- Which encoding contributes most to readability?
- Is any encoding redundant or distracting?
- Try swapping encodings (e.g., shape for continent, size for GDP).


# Independent copy of the 2007 rows to experiment on
year_df = gapminder[gapminder["year"] == 2007].copy()

# TODO: your plot here (Seaborn or Altair)
# Example scaffold (Seaborn):
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(9, 6))
sns.scatterplot(
    data=year_df,
    x="gdpPercap",
    y="lifeExp",
    hue="continent",
    size="pop",
    sizes=(20, 400),
    alpha=0.75,
)
plt.xscale("log")
plt.title("2007: GDP vs LifeExp (your encodings)")
plt.show()

8) Challenges 1

Starting from gapminder, compute the top 10 countries by GDP per capita in 2007.
Make an Altair bar chart with tooltips.
Then switch to Seaborn and try a horizontal bar chart.
Reflect: which was faster to write and more readable?


# The ten 2007 rows with the highest GDP per capita, then sorted ascending
# (lowest of the ten first).
top10_2007 = (
    gapminder[gapminder['year']==2007]
    .nlargest(10, 'gdpPercap')
    .sort_values('gdpPercap')
)
# head() shows only the first five rows; after the ascending sort these are
# the five lowest of the ten.
top10_2007[['country','continent','gdpPercap']].head()

	country	continent	gdpPercap
695	Iceland	Europe	36180.78919
251	Canada	Americas	36319.23501
1091	Netherlands	Europe	36797.93332
1487	Switzerland	Europe	37506.41907
671	Hong Kong, China	Asia	39724.97867


import altair as alt
# Horizontal bar chart: GDP per capita on x, one bar per country on y,
# with the countries ordered by their x value and colored by continent.
alt.Chart(top10_2007).mark_bar().encode(
    x=alt.X('gdpPercap:Q', title='GDP per Capita'),
    y=alt.Y('country:N', sort='x'),
    color='continent:N',
    tooltip=['country','continent','gdpPercap']
).properties(width=450, height=300)


import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal bars (categorical variable on y); dodge=False keeps a single
# full-width bar per country even though hue is set.
plt.figure(figsize=(6, 4))
sns.barplot(
    data=top10_2007,
    x="gdpPercap",
    y="country",
    hue="continent",
    dodge=False,
)
plt.title("Top 10 GDP per Capita (2007)")
plt.tight_layout()
plt.show()

9) Challenges 2

A scatterplot of GDP per capita vs. life expectancy using plotly
Encode continent as color and year as an animation frame.
What does each encoding reveal?
Which encoding is most effective at showing inequality?
How does animation enhance or hinder interpretation?

import plotly.express as px

# Plotly Express ships its own Gapminder table; it is used here instead of the
# `gapminder` DataFrame loaded earlier.
# NOTE(review): presumably the same data as the CSV above — confirm.
gap = px.data.gapminder()

# Animated scatter: one frame per year, color = continent, size = population.
px.scatter(
    gap,
    x="gdpPercap",
    y="lifeExp",
    color="continent",
    size="pop",
    hover_name="country",
    animation_frame="year",
    # Track each country across frames so its marker moves smoothly.
    animation_group="country",
    log_x=True,
    # Fix both axes so points cannot leave the view between frames. The bounds
    # cover the ranges in the EDA summary (GDP ~241-113,523; lifeExp ~23.6-82.6).
    range_x=[100, 200_000],
    range_y=[20, 90],
)