What are the weights of different health data in predicting median household income in California?
Can we use a model trained on California data to predict median household income in Colorado from health data?
import json
import numpy as np
import pandas as pd
import sklearn.preprocessing
import sklearn.linear_model
import holoviews as hv
import colorcet as cc
from health import *
hv.extension('bokeh')
red = cc.b_glasbey_hv[1]
blue = cc.b_glasbey_hv[0]
Drop the 'Limited access to healthy foods' column, since 15% of its values are missing.
health_file = 'CA_health_data_by_census_tract.txt'
health_data = load_health_data(health_file).drop(['Limited access to healthy foods'], axis=1)
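As a quick check on that 15% figure (a sketch; it assumes `load_health_data` returns a pandas DataFrame containing this column):
# Fraction of missing values in the dropped column, computed before dropping it.
load_health_data(health_file)['Limited access to healthy foods'].isna().mean()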
income_data_file = 'CA_household_income_by_census_tract.csv'
income_data = load_hinc_data(income_data_file)
df = pd.merge(health_data, income_data[['median income', 'stcotr_fips']])
Let's drop rows with missing values.
n = len(df)
print(f'Original number of data points: {n}.')
df.dropna(inplace=True)
print(f'Number of rows dropped: {n - len(df)}, {100 * (n - len(df)) / n:.1f}%.')
Original number of data points: 6075.
Number of rows dropped: 407, 6.7%.
Income distributions tend to be log-normal, so let's look at correlations of the health data with household income and with $\log_{10}$(household income).
df['log10(median income)'] = np.log10(np.array(df['median income']))
df['median income (thousands)'] = df['median income'] / 1000
column_renaming = {column_name: column_name.replace(' ', '\n') for column_name in df.columns}
ds = hv.Dataset(
data=df.drop(
columns=['median income', 'stcotr_fips']
).rename(
columns=column_renaming
)
)
(
hv.operation.gridmatrix(ds, chart_type=hv.Points)[:, 'median\nincome\n(thousands)'] +
hv.operation.gridmatrix(ds, chart_type=hv.Points)[:, 'log10(median\nincome)']
).cols(
1
)
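For a numeric complement to the scatter matrix, the Pearson correlation of each health metric with the two income columns can be tabulated directly (a sketch using the columns of `df` defined above):
# Correlation of every health metric with median income and log10(median income).
income_cols = ['median income (thousands)', 'log10(median income)']
health_cols = [c for c in df.columns if c not in income_cols + ['median income', 'stcotr_fips']]
df[health_cols + income_cols].corr().loc[health_cols, income_cols]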
We will use ridge regression with built-in cross-validation, provided by scikit-learn's RidgeCV.
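For reference, ridge regression fits the weight vector $w$ by minimizing the penalized least-squares objective
$$\lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_2^2,$$
and RidgeCV selects the regularization strength $\alpha$ by cross-validation over the grid of candidate values we supply.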
X = df.drop(
['median income', 'median income (thousands)', 'log10(median income)', 'stcotr_fips'], axis=1
)
y = np.log10(
np.array(df['median income (thousands)']).reshape(-1, 1)
)
transformer = sklearn.preprocessing.RobustScaler()
transformer.fit(X)
X_scaled = transformer.transform(X)
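RobustScaler centers each column by its median and scales it by its interquartile range, which keeps the fitted coefficients comparable across metrics even in the presence of outliers. A minimal check of that equivalence (a sketch; it assumes the default settings and no column with a zero interquartile range):
# With default settings, RobustScaler subtracts the column median and divides
# by the interquartile range (75th minus 25th percentile).
q1, med, q3 = np.percentile(X, [25, 50, 75], axis=0)
np.allclose(X_scaled, (X - med) / (q3 - q1))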
model = sklearn.linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
model.fit(X_scaled, y)
RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]))
model.alpha_
1.0
train_score = model.score(X_scaled, y)
train_score
0.9490166406907407
y_pred = model.predict(X_scaled)
plot = plot_prediction_vs_actual(y_pred, y, 'median income (thousands)', log10=True)
plot.opts(
hv.opts.Layout(title=f'score: {train_score:.2f}'),
hv.opts.Points(alpha=0.4)
)
We tend to overpredict high incomes. This may be because the highest income category in the data is reported as >\$250,000, which we truncated to \$250,000.
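A quick way to see how many tracts sit at that cap (a sketch; it assumes the capped values are stored as exactly \$250,000 in the 'median income' column):
# Number of census tracts whose reported median income is at the $250,000 cap.
(df['median income'] >= 250_000).sum()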
metrics = X.columns
# model.coef_ has shape (1, n_features) because y is 2-D, so flatten it,
# then sort the features by their fitted weights.
coefs = model.coef_.ravel()
order = np.argsort(coefs)
metrics_sorted = np.array(metrics)[order].tolist()
weights = coefs[order].tolist()
df_weights = pd.DataFrame({'metrics': metrics_sorted, 'weights': weights})
feature_weights = hv.Bars(df_weights, kdims='metrics')
feature_weights.opts(
frame_width=800,
frame_height=400,
xrotation=45,
)
# Rank the features by the magnitude of their coefficients.
abs_coefs = np.abs(model.coef_.ravel())
abs_order = np.argsort(abs_coefs)
metrics_abs_sorted = np.array(X.columns)[abs_order].tolist()
abs_weights = abs_coefs[abs_order].tolist()
df_abs_weights = pd.DataFrame({'metrics': metrics_abs_sorted, 'weights (absolute value)': abs_weights})
feature_abs_weights = hv.Bars(df_abs_weights, kdims='metrics')
feature_abs_weights.opts(
frame_width=800,
frame_height=400,
xrotation=45,
)
Can we use what we learned in California to predict median income in Colorado?
test_health_file = 'CO_health_data_by_census_tract.txt'
test_health_data = load_health_data(test_health_file).drop(['Limited access to healthy foods'], axis=1)
test_income_data_file = 'CO_household_income_by_census_tract.csv'
test_income_data = load_hinc_data(test_income_data_file)
test_df = pd.merge(test_health_data, test_income_data[['median income', 'stcotr_fips']])
test_df['log10(median income)'] = np.log10(np.array(test_df['median income']))
test_df['median income (thousands)'] = test_df['median income'] / 1000
n = len(test_df)
print(f'Original number of data points: {n}.')
test_df.dropna(inplace=True)
print(f'Number of rows dropped: {n - len(test_df)}, {100 * (n - len(test_df)) / n:.1f}%.')
Original number of data points: 804.
Number of rows dropped: 111, 13.8%.
test_ds = hv.Dataset(
data=test_df.drop(
columns=['median income', 'stcotr_fips']
).rename(
columns=column_renaming
)
)
(
hv.operation.gridmatrix(test_ds, chart_type=hv.Points)[:, 'median\nincome\n(thousands)'] +
hv.operation.gridmatrix(test_ds, chart_type=hv.Points)[:, 'log10(median\nincome)']
).opts(
hv.opts.Points(color=red),
hv.opts.Histogram(color=red),
).cols(
1
)
X_test = test_df.drop(
['median income', 'median income (thousands)', 'log10(median income)', 'stcotr_fips'], axis=1
)
y_test_actual = np.log10(
np.array(test_df['median income (thousands)']).reshape(-1, 1)
)
X_test_scaled = transformer.transform(X_test)
y_test_pred = model.predict(X_test_scaled)
test_score = model.score(X_test_scaled, y_test_actual)
test_score
0.8969461153830847
plot_test = plot_prediction_vs_actual(y_test_pred, y_test_actual, label='median income', log10=True)
plot_test.opts(
hv.opts.Layout(title=f'score: {test_score:.2f}'),
hv.opts.Points(color=red, alpha=0.4),
hv.opts.Distribution(color=red)
)
The model scores noticeably worse on the Colorado data than it did on the California training data, but it looks like there might be some systematic error.
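One quick way to quantify such an offset (a sketch) is the mean residual of the Colorado predictions:
# Mean prediction error in log10(median income); a positive value means the
# model systematically overpredicts Colorado incomes.
float(np.mean(y_test_pred - y_test_actual))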
(
hv.Distribution(y, kdims=['log10(income)'], label='California') * hv.Distribution(y_test_actual, kdims=['log10(income)'], label='Colorado')
).opts(
legend_position='right',
frame_width=400,
frame_height=300,
)
Indeed, the whole distribution of Colorado incomes is shifted to the left of the California distribution, which helps explain why our model consistently predicts higher incomes than the true values.
colorado_state_median = np.median(y_test_actual)
colorado_state_median
1.8251858585595182
california_state_median = np.median(y)
california_state_median
1.8504991994262454
delta = colorado_state_median - california_state_median
y_test_adj = y_test_actual - delta
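Since delta is the Colorado median minus the California median (a negative number here), subtracting it shifts the Colorado values up until the two medians coincide:
$$y^{\mathrm{adj}}_{\mathrm{CO}} = y_{\mathrm{CO}} - \left(\operatorname{median}(y_{\mathrm{CO}}) - \operatorname{median}(y_{\mathrm{CA}})\right).$$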
(
hv.Distribution(y, kdims=['log10(income)'], label='California') * hv.Distribution(y_test_adj, kdims=['log10(income)'], label='Colorado (adjusted to California median)')
).opts(
legend_position='right',
frame_width=400,
frame_height=300,
)
test_score = model.score(X_test_scaled, y_test_adj)
test_score
0.9343895929208701
plot_test_adj = plot_prediction_vs_actual(y_test_pred, y_test_adj, label='median income', log10=True)
plot_test_adj.opts(
hv.opts.Layout(title=f'score: {test_score:.2f}'),
hv.opts.Points(color=red, alpha=0.4),
hv.opts.Distribution(color=red)
)
Adjusting for this overall shift in median income substantially improves the agreement between the model's predictions and the actual Colorado values.