pip install altair

import altair as alt

import numpy as np
import pandas as pd
from scipy.stats import linregress

# Load the manufacturer dataset from a CSV file given by a relative path.
manufacturer_epa_path = "manufacturer_epa.csv"
manufacturer_epa_df = pd.read_csv(manufacturer_epa_path)

# Preview the first rows of the raw data.
manufacturer_epa_df.head()

# Report the dtype pandas assigned to each column on load.
print("Manufacturer Dataset Data Types:\n", manufacturer_epa_df.dtypes)

Manufacturer Dataset Data Types:
 Manufacturer                  object
Model Year                    object
Regulatory Class              object
Real-World MPG                object
Real-World MPG_City           object
Real-World MPG_Hwy            object
Real-World CO2 (g/mi)         object
Real-World CO2_City (g/mi)    object
Real-World CO2_Hwy (g/mi)     object
Weight (lbs)                  object
Horsepower (HP)               object
Footprint (sq. ft.)           object
dtype: object

# Columns that must be numeric before any aggregation or plotting.
num_columns = [
    "Model Year",
    "Real-World MPG",
    "Real-World MPG_City",
    "Real-World MPG_Hwy",
    "Real-World CO2 (g/mi)",
    "Real-World CO2_City (g/mi)",
    "Real-World CO2_Hwy (g/mi)",
    "Weight (lbs)",
    "Horsepower (HP)",
    "Footprint (sq. ft.)",
]

# errors="coerce" turns entries that cannot be parsed as numbers into NaN
# instead of raising.
manufacturer_epa_df[num_columns] = manufacturer_epa_df[num_columns].apply(
    lambda series: pd.to_numeric(series, errors="coerce")
)

# Store the year with the "Int64" dtype rather than as a float.
manufacturer_epa_df["Model Year"] = manufacturer_epa_df["Model Year"].astype("Int64")

print(manufacturer_epa_df.dtypes)

Manufacturer                   object
Model Year                      Int64
Regulatory Class               object
Real-World MPG                float64
Real-World MPG_City           float64
Real-World MPG_Hwy            float64
Real-World CO2 (g/mi)         float64
Real-World CO2_City (g/mi)    float64
Real-World CO2_Hwy (g/mi)     float64
Weight (lbs)                  float64
Horsepower (HP)               float64
Footprint (sq. ft.)           float64
dtype: object

# Keep only rows whose model year is 2010 or later.
manufacturer_epa_df = manufacturer_epa_df[manufacturer_epa_df["Model Year"] >= 2010]

manufacturer_epa_df.head()

# Distinct manufacturer labels remaining after the year filter.
car_manufacturers = manufacturer_epa_df["Manufacturer"].unique()

print(car_manufacturers)

['All' 'BMW' 'Ford' 'GM' 'Honda' 'Hyundai' 'Kia' 'Mazda' 'Mercedes'
 'Nissan' 'Stellantis' 'Subaru' 'Tesla' 'Toyota' 'VW']

# Drop the rows labelled "All" so only individual manufacturers remain.
# NOTE(review): "All" looks like an industry-wide aggregate row — confirm
# against the dataset documentation.
manufacturer_epa_df = manufacturer_epa_df[manufacturer_epa_df["Manufacturer"] != "All"]

manufacturer_epa_df.head()

# Pairwise correlation matrix of the columns left after removing the
# identifier columns (manufacturer, year, regulatory class).
manufacturer_epa_df_corr = manufacturer_epa_df.drop(columns=['Manufacturer', 'Model Year', 'Regulatory Class']).corr()
manufacturer_epa_df_corr

import warnings

# Ignore every FutureWarning raised from this point on.
# NOTE(review): presumably to keep the notebook output clean — confirm which
# library emits the warning before relying on this blanket filter.
warnings.simplefilter(action='ignore', category=FutureWarning)

def categorize_correlation(value):
    """Map a correlation coefficient to a strength label.

    Only the magnitude of ``value`` matters; the sign is discarded. Each
    band includes its lower bound and excludes its upper bound:

    * below 0.2 -> 'Very Low'
    * 0.2 up to 0.4 -> 'Low'
    * 0.4 up to 0.6 -> 'Standard (Strong enough)'
    * 0.6 up to 0.8 -> 'Strong'
    * anything else -> 'Very Strong'

    A NaN input fails every comparison and therefore also ends up as
    'Very Strong'.
    """
    magnitude = abs(value)
    # (exclusive upper bound, label) pairs, checked from weakest to strongest.
    bands = (
        (0.2, 'Very Low'),
        (0.4, 'Low'),
        (0.6, 'Standard (Strong enough)'),
        (0.8, 'Strong'),
    )
    for upper_bound, label in bands:
        if magnitude < upper_bound:
            return label
    return 'Very Strong'

# Replace every coefficient in the correlation matrix with its strength label.
correlation_categories = manufacturer_epa_df_corr.map(categorize_correlation)

# Reshape to long form: one row per (variable, variable) pair with its label.
corr_long = correlation_categories.reset_index().melt(id_vars='index')
corr_long.columns = ['Variable 1', 'Variable 2', 'Correlation Category']

# Fixed label -> colour mapping so the legend runs from weakest to strongest.
color_scale = alt.Scale(
    domain=['Very Low', 'Low', 'Standard (Strong enough)', 'Strong', 'Very Strong'],
    range=['#ffffb2', '#fed976', '#fd8d3c', '#e31a1c', '#800026']
)

# One rectangle per variable pair, coloured by correlation strength.
# NOTE(review): both axes are sorted by "Correlation Category", which holds
# string labels rather than numbers — confirm this yields the intended order.
heatmap = alt.Chart(corr_long).mark_rect().encode(
    x=alt.X(
        'Variable 1:N', 
        title='Variable 1', 
        sort=alt.EncodingSortField(field="Correlation Category", order='descending')
    ),
    y=alt.Y(
        'Variable 2:N',
        title='Variable 2',
        sort=alt.EncodingSortField(field="Correlation Category", order='descending')
    ),
    color=alt.Color(
        'Correlation Category:N',
        scale=color_scale, 
        legend=alt.Legend(title="Correlation Strength")
    ),
    tooltip=['Variable 1', 'Variable 2', 'Correlation Category']
)

heatmap = heatmap.properties(title="Categorized Correlation Heatmap", width=350, height=350)

heatmap

# Basic line chart: mean Real-World MPG per model year, drawn as a single
# fixed-colour line with point markers.
line_chart = alt.Chart(manufacturer_epa_df).mark_line(point=True).encode(
    x=alt.X('Model Year:O', title="Model Year"),
    y=alt.Y('mean(Real-World MPG):Q', title="Average MPG"),
    color=alt.value("#1f77b4")
)

line_chart = line_chart.properties(title="Trend of Real-World Average MPG Over Model Years", height=300, width=900)

line_chart

# Same line chart as before, now with a tooltip that shows the model year and
# the plotted average (formatted to three decimals).
line_chart = alt.Chart(manufacturer_epa_df).mark_line(point=True).encode(
    x=alt.X('Model Year:O', title="Model Year"),
    y=alt.Y('mean(Real-World MPG):Q', title="Average MPG"),
    color=alt.value("#1f77b4"),
    tooltip=[
    alt.Tooltip('Model Year:O', title="Model Year"),
    alt.Tooltip('mean(Real-World MPG):Q', title="Average MPG", format=".3f")
    ]
)

line_chart = line_chart.properties(title="Trend of Real-World Average MPG Over Model Years", height = 300, width = 900)

line_chart

# One line per manufacturer: mean Real-World MPG for each model year.
line_chart = alt.Chart(manufacturer_epa_df).mark_line(point=True).encode(
    x=alt.X('Model Year:O', title="Model Year"),
    y=alt.Y('mean(Real-World MPG):Q', title="Average MPG"),
    color=alt.Color('Manufacturer:N', title="Manufacturer"),
    tooltip=[
        alt.Tooltip('Manufacturer:N', title="Manufacturer"),
        alt.Tooltip('Model Year:O', title="Model Year"),
        # Use the same aggregate as the y channel. A raw (un-aggregated)
        # field here would be treated as an extra group-by key and the
        # tooltip would no longer describe the plotted average.
        alt.Tooltip('mean(Real-World MPG):Q', title="Average MPG", format=".3f")
    ]
)

line_chart = line_chart.properties(
    title="Trend of Real-World Average MPG Over Model Years",
    height=300,
    width=900
)

line_chart

# Violin plot: a density estimate of Real-World MPG is computed separately for
# each model year over the fixed range 10-35, then drawn as a horizontal area.
violin_plot = alt.Chart(manufacturer_epa_df).transform_density(
    'Real-World MPG',
    as_=['Real-World MPG', 'density'],
    extent = [10, 35],
    groupby=['Model Year']
).mark_area(orient='horizontal').encode(
    # stack='center' mirrors the density around the axis, giving the violin shape;
    # axis labels are hidden because the density value itself is not of interest.
    alt.X('density:Q', stack='center', impute=None, title=None, axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True)),
    alt.Y('Real-World MPG:Q', title="Real-World MPG"),
    alt.Color('Model Year:N', title="Model Year"),
    # One facet column per model year, packed together with the label underneath.
    alt.Column('Model Year:N', spacing=0, header=alt.Header(titleOrient='bottom', labelOrient='bottom', labelPadding=0)),
    tooltip=[
        alt.Tooltip('Model Year:O', title="Model Year"),
        alt.Tooltip('density:Q', title="Density"),
        alt.Tooltip('Real-World MPG:Q', title="Real-World MPG", format=".3f")
    ]
)

# width applies to each facet column, not to the whole figure.
violin_plot = violin_plot.properties(
    title=alt.TitleParams(
        text="Trend of Real-World MPG Over Model Years",
        anchor="middle"
    ),
    width = 100    
)

# Remove the border drawn around every facet view.
violin_plot = violin_plot.configure_view(stroke=None)

violin_plot

# Scatter plot of horsepower against Real-World MPG, one semi-transparent
# circle per row, coloured by manufacturer.
# NOTE(review): the tooltip lists Weight but not Horsepower, the field on the
# y axis — confirm whether that is intentional.
scatter_plot = alt.Chart(manufacturer_epa_df).mark_circle(size=50, opacity=0.5).encode(
    x=alt.X('Real-World MPG:Q', title="Real-World MPG"),
    y=alt.Y(
        'Horsepower (HP):Q', title="Horsepower (HP)"
    ),
    color=alt.Color('Manufacturer:N', title="Car Brands", legend=alt.Legend(title="Car Brands")),
    tooltip=[
        alt.Tooltip('Manufacturer:N', title="Car Brand"),
        alt.Tooltip('Model Year:O', title="Model Year"),
        alt.Tooltip('Weight (lbs):Q', title="Vehicle Weight (lbs)", format=".3f"),
        alt.Tooltip('Real-World MPG:Q', title="Real-World MPG", format=".3f")
    ]
)

scatter_plot = scatter_plot.properties(title="Relationship Between Horsepower (HP) and MPG", height=300, width=900)

scatter_plot

# Remove Tesla rows from the shared dataframe; every later cell sees this filter.
manufacturer_epa_df = manufacturer_epa_df[manufacturer_epa_df["Manufacturer"] != "Tesla"]

# Rebuild the horsepower vs. MPG scatter plot on the filtered data.
# NOTE(review): the tooltip lists Weight but not Horsepower, the field on the
# y axis — confirm whether that is intentional.
scatter_plot = alt.Chart(manufacturer_epa_df).mark_circle(size=50, opacity=0.5).encode(
    x=alt.X('Real-World MPG:Q', title="Real-World MPG"),
    y=alt.Y(
        'Horsepower (HP):Q', title="Horsepower (HP)"
    ),
    color=alt.Color('Manufacturer:N', title="Car Brands", legend=alt.Legend(title="Car Brands")),
    tooltip=[
        alt.Tooltip('Manufacturer:N', title="Car Brand"),
        alt.Tooltip('Model Year:O', title="Model Year"),
        alt.Tooltip('Weight (lbs):Q', title="Vehicle Weight (lbs)", format=".3f"),
        alt.Tooltip('Real-World MPG:Q', title="Real-World MPG", format=".3f")
    ]
)

scatter_plot = scatter_plot.properties(title="Relationship Between Horsepower (HP) and MPG", height=300, width=900)

scatter_plot

# Exclude Tesla rows (a no-op if they were already removed earlier).
manufacturer_epa_df = manufacturer_epa_df[manufacturer_epa_df["Manufacturer"] != "Tesla"]

# Define the dropdown selection for filtering by manufacturer
car_manufacturers = ["All Brands"] + manufacturer_epa_df["Manufacturer"].unique().tolist()

# Create a dropdown menu to select car brands
dropdown = alt.binding_select(options=car_manufacturers, name="Select Brand: ")
selection = alt.param(name="Manufacturer", bind=dropdown, value="All Brands")

# Scatter layer: CO2 emissions against MPG, coloured by manufacturer.
scatter = alt.Chart(manufacturer_epa_df).mark_circle(size=60, opacity=0.5).encode(
    x=alt.X('Real-World MPG:Q', title="Real-World MPG", scale=alt.Scale(domain=[18, 32])),
    y=alt.Y('Real-World CO2 (g/mi):Q', title="CO2 Emissions (g/mi)"),
    color=alt.Color('Manufacturer:N', title="Car Brands", legend=alt.Legend(title="Car Brands")),
    tooltip=[
        alt.Tooltip('Manufacturer:N', title="Car Brand"),
        alt.Tooltip('Real-World MPG:Q', title="Real-World MPG", format=".3f"),
        alt.Tooltip('Real-World CO2 (g/mi):Q', title="CO2 Emissions (g/mi)", format=".3f"),
        alt.Tooltip('Weight (lbs):Q', title="Weight (lbs)", format=".3f"),
        alt.Tooltip('Horsepower (HP):Q', title="Horsepower (HP)", format=".3f"),
        # The dots in this column name must be escaped; unescaped, they are
        # read as nested-property access and the tooltip cannot find the
        # column.
        alt.Tooltip(r'Footprint (sq\. ft\.):Q', title="Footprint (sq. ft.)", format=".3f")
    ]
)

# Show every brand when "All Brands" is chosen, otherwise only the chosen one.
scatter = scatter.add_params(selection).transform_filter(
    alt.expr.if_(selection == "All Brands", True, alt.datum.Manufacturer == selection)
)

# Linear regression of CO2 on MPG. This layer does not carry the brand
# filter, so the line is fitted on all rows of the dataframe.
trend_line = alt.Chart(manufacturer_epa_df).transform_regression('Real-World MPG', 'Real-World CO2 (g/mi)', method='linear')

trend_line = trend_line.mark_line(color='black', opacity=0.8).encode(
    x='Real-World MPG:Q',
    y='Real-World CO2 (g/mi):Q'
)

# Interval selection bound to the scales gives pan/zoom on the layered chart.
zoom_selection = alt.selection_interval(bind='scales')

multi_layer_chart_interactive = (scatter + trend_line).properties(
    title="CO2 Emissions vs MPG (With Trend Line)",
    height=300,
    width=900
)

multi_layer_chart_interactive = multi_layer_chart_interactive.add_params(zoom_selection)

multi_layer_chart_interactive

# Discard rows that lack either the MPG or the CO2 value, so the regression
# runs on complete pairs only.
manufacturer_epa_df = manufacturer_epa_df.dropna(
    subset=['Real-World MPG', 'Real-World CO2 (g/mi)']
)

# Linear fit with MPG as the x variable and CO2 emissions as the y variable.
slope, intercept, r_value, p_value, std_err = linregress(
    manufacturer_epa_df['Real-World MPG'],
    manufacturer_epa_df['Real-World CO2 (g/mi)'],
)

# Collect the fit statistics as (metric, value) records for display.
regression_results = pd.DataFrame(
    [
        ("Slope", slope),
        ("Intercept", intercept),
        ("R-value", r_value),
        ("R-squared", r_value**2),
        ("P-value", p_value),
        ("Std Error", std_err),
    ],
    columns=["Metric", "Value"],
)

regression_results

# Dropdown options: the sentinel "All Years" followed by the sorted model years.
model_years = ["All Years"] + sorted(manufacturer_epa_df["Model Year"].unique().tolist())
# Rebinds `dropdown`, which previously held the brand selector.
dropdown = alt.binding_select(options=model_years, name="Select Model Year: ")
year_selection = alt.param(name="Year", bind=dropdown, value="All Years")

# Box plot of Real-World MPG within 400-lb vehicle-weight bins, coloured by model year.
box_plot = alt.Chart(manufacturer_epa_df).mark_boxplot().encode(
    x=alt.X('Weight (lbs):Q', title="Vehicle Weight (lbs)", bin=alt.Bin(step=400)),
    y=alt.Y('Real-World MPG:Q', title="Real-World MPG"),
    color=alt.Color('Model Year:N', title="Model Year", legend=alt.Legend(title="Model Year"))
)

# Keep rows matching the chosen year, or every row when "All Years" is selected.
box_plot = box_plot.add_params(year_selection).transform_filter((alt.datum["Model Year"] == year_selection) | (year_selection == "All Years"))

box_plot = box_plot.properties(title="Distribution of MPG Across Vehicle Weights", height=300, width=900)

box_plot

# Interval brush restricted to the x axis; it is drawn on the histogram below.
brush = alt.selection_interval(encodings=['x'])
# Shared base: same data and model-year dropdown filter for both sub-charts.
base = alt.Chart(manufacturer_epa_df).add_params(year_selection).transform_filter((alt.datum["Model Year"] == year_selection) | (year_selection == "All Years"))

# Histogram of vehicle weights (200-lb bins) that hosts the brush.
brush_chart = base.mark_bar().encode(
    x=alt.X('Weight (lbs):Q', title="Vehicle Weight (lbs)", bin=alt.Bin(step=200)),
    y=alt.Y('count()', title="Number of Records")
)

brush_chart = brush_chart.add_params(brush)
brush_chart = brush_chart.properties(height=100,width=900,title="Brush Filter")

# Box plot of MPG per 400-lb weight bin; rebinds `box_plot` from the earlier cell.
box_plot = base.mark_boxplot().encode(
    x=alt.X('Weight (lbs):Q', title="Vehicle Weight (lbs)", bin=alt.Bin(step=400)),
    y=alt.Y('Real-World MPG:Q', title="Real-World MPG"),
    color=alt.Color('Model Year:N', title="Model Year")
)

# Only rows inside the brushed weight range feed the box plot.
box_plot = box_plot.transform_filter(brush)
box_plot = box_plot.properties(height=300, width=900, title="Distribution of MPG Across Vehicle Weights (Filtered by Brush)")

# Stack the box plot above the brush histogram.
final_chart = alt.vconcat(box_plot, brush_chart)
final_chart

# Descriptive statistics of Real-World MPG for each model year, with the
# quartile/extreme columns given readable names.
summary_stats = (
    manufacturer_epa_df
    .groupby("Model Year")["Real-World MPG"]
    .describe(percentiles=[0.25, 0.5, 0.75])
    .reset_index()
    .rename(
        columns={
            "min": "Min of Average MPG",
            "25%": "Q1 of Average MPG",
            "50%": "Median of Average MPG",
            "75%": "Q3 of Average MPG",
            "max": "Max of Average MPG",
        }
    )
)

summary_stats

Feature	Altair	Matplotlib	Seaborn	Plotly
Code Complexity	Low (declarative)	High (procedural)	Medium	Medium
Interactivity	Built-in	None (static)	None (static)	High
Customization	Moderate	High	Medium	High
Best Use Case	Dashboards, Exploratory Analysis	Static Reports	Statistical Charts	Web Apps

	Manufacturer	Model Year	Regulatory Class	Real-World MPG	Real-World MPG_City	Real-World MPG_Hwy	Real-World CO2 (g/mi)	Real-World CO2_City (g/mi)	Real-World CO2_Hwy (g/mi)	Weight (lbs)	Horsepower (HP)	Footprint (sq. ft.)
0	All	1975	All	13.05970	12.01552	14.61167	680.59612	739.73800	608.31160	4060.399	137.3346	-
1	All	1976	All	14.22136	13.18117	15.73946	625.02238	674.34147	564.74348	4079.198	135.0839	-
2	All	1977	All	15.06743	14.00580	16.60587	589.99880	634.71366	535.34732	3981.818	135.9847	-
3	All	1978	All	15.83777	14.68193	17.52390	561.62442	605.82637	507.59981	3715.238	129.0248	-
4	All	1979	All	15.91271	14.87711	17.39245	559.69495	598.63764	512.09833	3655.465	123.5922	-

	Manufacturer	Model Year	Regulatory Class	Real-World MPG	Real-World MPG_City	Real-World MPG_Hwy	Real-World CO2 (g/mi)	Real-World CO2_City (g/mi)	Real-World CO2_Hwy (g/mi)	Weight (lbs)	Horsepower (HP)	Footprint (sq. ft.)
35	All	2010	All	22.59206	19.11219	26.18930	393.65429	465.33221	339.58148	4001.323	213.6361	48.54913
36	All	2011	All	22.28844	18.83713	25.86317	398.99558	472.11781	343.83319	4125.934	229.9718	49.54439
37	All	2012	All	23.56593	19.94669	27.30319	377.31888	445.79746	325.65960	3978.812	221.7796	48.81134
38	All	2013	All	24.17888	20.49116	27.97717	367.53789	433.74031	317.59572	4002.973	225.8506	49.08053
39	All	2014	All	24.11047	20.44020	27.88816	368.65513	434.90361	318.67820	4059.639	230.2484	49.72043

	Manufacturer	Model Year	Regulatory Class	Real-World MPG	Real-World MPG_City	Real-World MPG_Hwy	Real-World CO2 (g/mi)	Real-World CO2_City (g/mi)	Real-World CO2_Hwy (g/mi)	Weight (lbs)	Horsepower (HP)	Footprint (sq. ft.)
85	BMW	2010	All	22.11894	18.26597	26.30477	403.87825	489.03931	339.63393	3899.421	254.9807	45.78178
86	BMW	2011	All	22.64959	18.71925	26.91229	394.58177	477.44425	332.07147	4045.147	262.2890	46.78290
87	BMW	2012	All	23.53401	19.74701	27.51463	380.20532	453.05973	325.24498	4070.738	264.2034	47.33868
88	BMW	2013	All	24.30114	20.21850	28.66816	366.24242	440.17913	310.46561	4012.190	267.1041	47.37450
89	BMW	2014	All	26.10456	21.73755	30.76751	340.99678	409.76701	289.11748	4016.625	264.8449	47.83606

	Real-World MPG	Real-World MPG_City	Real-World MPG_Hwy	Real-World CO2 (g/mi)	Real-World CO2_City (g/mi)	Real-World CO2_Hwy (g/mi)	Weight (lbs)	Horsepower (HP)	Footprint (sq. ft.)
Real-World MPG	1.000000	0.999087	0.999248	-0.942325	-0.929937	-0.951147	0.224892	0.624383	0.142867
Real-World MPG_City	0.999087	1.000000	0.996743	-0.931691	-0.919647	-0.940200	0.233201	0.631641	0.151653
Real-World MPG_Hwy	0.999248	0.996743	1.000000	-0.949318	-0.936128	-0.958948	0.214880	0.616615	0.132379
Real-World CO2 (g/mi)	-0.942325	-0.931691	-0.949318	1.000000	0.997956	0.997771	-0.037447	-0.437486	0.020574
Real-World CO2_City (g/mi)	-0.929937	-0.919647	-0.936128	0.997956	1.000000	0.991467	-0.023020	-0.414921	0.029797
Real-World CO2_Hwy (g/mi)	-0.951147	-0.940200	-0.958948	0.997771	0.991467	1.000000	-0.052340	-0.459132	0.010860
Weight (lbs)	0.224892	0.233201	0.214880	-0.037447	-0.023020	-0.052340	1.000000	0.846337	0.858628
Horsepower (HP)	0.624383	0.631641	0.616615	-0.437486	-0.414921	-0.459132	0.846337	1.000000	0.696805
Footprint (sq. ft.)	0.142867	0.151653	0.132379	0.020574	0.029797	0.010860	0.858628	0.696805	1.000000

1. Introduction¶

1.1 Visualization Library¶

Why Use Altair?¶

Advantages of Altair¶

Comparison: Altair vs. Other Visualization Libraries¶

Limitations of Altair¶

Installation Instructions¶

2. Import Libraries¶

3. Dataset Cleaning and Preprocessing¶

3.1 Overview¶

3.2 Dataset Details¶

3.3 Data Cleaning Overview¶

4. Data Cleaning Steps¶

5. Creating a Correlation Heatmap¶

5.1 Key Insights from the Correlation Table¶

6. Plotting the Data¶

Visualization #1: Trend of Real-World Average MPG Over Model Years¶

Step 1: Creating a Basic Line Chart¶

Step 2: Enhancing with Interactivity¶

Step 3: Adding Color Encoding¶

Step 4: Creating the Violin Plot¶

Conclusion:¶

Visualization #2: Relationship Between MPG and Horsepower (HP)¶

Step 1: Creating the Initial Scatter Plot¶

Step 2: Filtering Out Tesla Vehicles¶

Conclusion:¶

Visualization #3: Fuel Economy vs. CO₂ Emissions¶

Step 1: Filtering Out Tesla Vehicles¶

Step 3: Building the Scatter Plot with Tooltips¶

Step 4: Adding a Trend Line¶

Step 5: Adding Zoom and Finalizing the Chart¶

Conclusion:¶

Visualization #4: Distribution of MPG Across Vehicle Weights¶

Step 2: Creating the Box Plot¶

Step 3: Creating a Bar Chart for Weight Range¶

Conclusion:¶

Conclusion: How the Visualizations Relate to Each Other¶

	Metric	Value
0	Slope	-1.481132e+01
1	Intercept	7.297596e+02
2	R-value	-9.914404e-01
3	R-squared	9.829540e-01
4	P-value	4.201752e-161
5	Std Error	1.453792e-01

	Model Year	count	mean	std	Min of Average MPG	Q1 of Average MPG	Median of Average MPG	Q3 of Average MPG	Max of Average MPG
0	2010	13.0	23.252292	2.648653	18.94624	21.26286	23.43180	24.92316	27.00315
1	2011	13.0	23.108409	2.420629	19.10806	21.02658	23.75675	24.75829	26.89448
2	2012	13.0	24.364857	2.434075	20.07502	22.66097	25.03342	26.23225	28.04818
3	2013	13.0	25.133734	2.580860	20.87185	22.23815	25.88618	27.19169	28.98618
4	2014	13.0	25.360903	2.368688	20.73541	23.05124	26.08121	26.97831	29.01930
5	2015	13.0	25.784959	2.522853	21.77917	23.39981	26.06393	28.01994	29.20490
6	2016	13.0	25.845334	2.623183	21.54052	23.64929	26.23074	28.06045	29.55620
7	2017	13.0	25.912703	2.719810	21.14576	23.04015	26.37891	28.50861	29.40652
8	2018	13.0	25.967881	2.735232	21.72275	23.50904	25.98177	28.55494	29.98078
9	2019	13.0	25.896834	2.611461	21.23923	23.67964	26.16839	28.06764	28.88833
10	2020	13.0	25.976643	2.590614	21.29298	23.42644	26.98401	27.93080	29.08240
11	2021	13.0	25.960418	2.825803	21.25992	23.63750	27.10319	28.54817	28.75033
12	2022	13.0	26.010073	2.680600	21.31014	23.71639	27.04424	27.93890	29.11395
13	2023	13.0	26.951973	2.738880	21.82984	27.04400	27.57798	28.37400	30.35988

1. Introduction¶

1.1 Visualization Library¶

Why Use Altair?¶

Advantages of Altair¶

Comparison: Altair vs. Other Visualization Libraries¶

Limitations of Altair¶

Installation Instructions¶

2. Import Libraries¶

3. Dataset Cleaning and Preprocessing¶

3.1 Overview¶

3.2 Dataset Details¶

3.3 Data Cleaning Overview¶

4. Data Cleaning Steps¶

5. Creating a Correlation Heatmap¶

5.1 Key Insights from the Correlation Table¶

6. Plotting the Data¶

Visualization #1: Trend of Real-World Average MPG Over Model Years¶

Step 1: Creating a Basic Line Chart¶

Step 2: Enhancing with Interactivity¶

Step 3: Adding Color Encoding¶

Step 4: Creating the Violin Plot¶

Conclusion:¶

Visualization #2: Relationship Between MPG and Horsepower (HP)¶

Step 1: Creating the Initial Scatter Plot¶

Step 2: Filtering Out Tesla Vehicles¶

Conclusion:¶

Visualization #3: Fuel Economy vs. CO₂ Emissions¶

Step 1: Filtering Out Tesla Vehicles¶

Step 2: Adding a Dropdown for Brand Selection¶

Step 3: Building the Scatter Plot with Tooltips¶

Step 4: Adding a Trend Line¶

Step 5: Adding Zoom and Finalizing the Chart¶

Conclusion:¶

Visualization #4: Distribution of MPG Across Vehicle Weights¶

Step 1: Adding a Dropdown for Model Year¶

Step 2: Creating the Box Plot¶

Step 3: Creating a Bar Chart for Weight Range¶

Conclusion:¶

Conclusion: How the Visualizations Relate to Each Other¶