Geospatial ML Challenges: A prospectivity analysis example

# import basic libraries
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

# set some default plotting parameters for nicer looking plots
mpl.rcParams.update({"axes.grid":True, "grid.color":"gray", "grid.linestyle":'--','figure.figsize':(10,10)})
# set path to data directory
data_dir = r'/mnt/c/working/transform_2022/data/'

# set paths to key data sets
point_fn = os.path.join(data_dir, 'sn_w_minoccs.gpkg')
raster_fns = [os.path.join(data_dir, x) for x in os.listdir(data_dir) if '.tif' in x and 'xml' not in x]
point_fn, raster_fns
('/mnt/c/working/transform_2022/data/sn_w_minoccs.gpkg', ['/mnt/c/working/transform_2022/data/tasgrav_IR.tif', '/mnt/c/working/transform_2022/data/tasgrav_IR_1VD.tif', '/mnt/c/working/transform_2022/data/tasmag_TMI.tif', '/mnt/c/working/transform_2022/data/tasmag_TMI_1VD.tif', '/mnt/c/working/transform_2022/data/tasrad_K_pct.tif', '/mnt/c/working/transform_2022/data/tasrad_Th_ppm.tif', '/mnt/c/working/transform_2022/data/tasrad_U_ppm.tif'])
# import import geopandas
import geopandas as gpd

# read the mineral occurences
df = gpd.read_file(point_fn)
df.head(2)
Loading...
# import rasterio library for raster reading
import rasterio

# loop through geotiff files and read bands into an array
data, names = [], []
for fn in raster_fns:
  with rasterio.open(fn, 'r') as src:
    # read the coordinate transform
    transf = src.transform
    # create an extent tuple containing (xmin, xmax, ymin, ymax) for the rasters
    region = (src.bounds.left, src.bounds.right, src.bounds.bottom, src.bounds.top)
    # read the data, derive a nodata mask and nullify nodata pixels
    d = src.read(1)
    nodata_mask = d == src.nodata
    d[nodata_mask] = np.nan
    # append data to data list and append file names to names list (remove .tif file extension)
    data.append(d)
    names.append(os.path.basename(fn).split('.')[0])

# combine 2D raster arrays into 3D stack, print some details
data = np.stack(data)
print ('3D (Nbands, Nycolumns, Nxcolumns) data cube shape: {}'.format(data.shape))
print ('Geophysical data sets in cube:\n', names)
3D (Nbands, Nycolumns, Nxcolumns) data cube shape: (7, 2633, 1876)
Geophysical data sets in cube:
 ['tasgrav_IR', 'tasgrav_IR_1VD', 'tasmag_TMI', 'tasmag_TMI_1VD', 'tasrad_K_pct', 'tasrad_Th_ppm', 'tasrad_U_ppm']
# plot the data sets
fig, axes = plt.subplots(3, 3, figsize=(12,18), constrained_layout=True)
for i, ax in enumerate(axes.flatten()):
  if i < data.shape[0]:
    ax.imshow(data[i], extent=region, vmin=np.nanpercentile(data[i], 5), vmax=np.nanpercentile(data[i], 95))
    df.plot(ax=ax, markersize=150, facecolor='r', color='k', marker='*', linewidth=1)
    ax.set(title=names[i])
  else:
    ax.axis('off')
plt.show()
<Figure size 864x1296 with 9 Axes>
# import the rasterize module
from rasterio.features import rasterize

# rasterise the occurence points such that pixels within 500m are labelled '1', and all other are labelled '0'
labels = rasterize(shapes=((geom,1.) for geom in df.buffer(500).geometry), out_shape=data[0].shape, fill=0., transform=transf)

# apply the no data mask
labels[nodata_mask] = np.nan

# plot the predictions
fig, ax = plt.subplots(figsize=(8,8))
ax.imshow(labels, extent=region, interpolation='nearest')
plt.show()
<Figure size 576x576 with 1 Axes>
# reshape arrays to derive a X (Npixels, Nfeatures) array and y (Npixels) array
X_pix = data.reshape((data.shape[0], data.shape[1] * data.shape[2])).T
y_pix = labels.flatten()
print ('X_pix data array shape is {}, y_pix labels array shape is {}'.format(X_pix.shape, y_pix.shape))

# remove nodata pixels from both data sets
X = X_pix[~np.isnan(y_pix)]
y = y_pix[~np.isnan(y_pix)]
print ('X data array shape is {}, y labels array shape is {}'.format(X.shape, y.shape))

# summarise the data set
fig, ax = plt.subplots(figsize=(5,5))
ax.bar(0, y[y==0].shape, label='Distal pixels')
ax.bar(1, y[y==1].shape, label='Proximal pixels')
ax.legend()
plt.show()
X_pix data array shape is (4939508, 7), y_pix labels array shape is (4939508,)
X data array shape is (4002948, 7), y labels array shape is (4002948,)
<Figure size 360x360 with 1 Axes>
# visualise histograms
fig, axes = plt.subplots(3, 3, figsize=(12,12), constrained_layout=True)
for i, ax in enumerate(axes.flatten()):
  if i < data.shape[0]:
    bins = np.linspace(np.percentile(X[:,i], 5), np.percentile(X[:,i], 95), 50)
    for j, label in zip([0,1], ['Distal','Proximal']):
      ax.hist(X[y==j, i], bins=bins, density=True, alpha=0.5, label=label)
    ax.legend()
    ax.set(title=names[i], ylabel='Probability Density', xlabel='Value')
  else:
    ax.axis('off')
plt.show()
<Figure size 864x864 with 9 Axes>

Train Models

# import some sci-kit learn modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# build a random training and testing split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# define model, fit it to the data
model1 = RandomForestClassifier(n_estimators=15, n_jobs=-1)
model1.fit(X_train, y_train)
RandomForestClassifier(n_estimators=15, n_jobs=-1)
# import reciever operating characteristic curve and area under the curve metrics
from sklearn.metrics import roc_curve, auc

# evaluate the model on the test data set
y_preds = model1.predict(X_test)
y_proba = model1.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_proba[:,1])
roc_auc = auc(fpr, tpr)

# visualise this
fig, ax = plt.subplots(figsize=(5,5))
ax.plot(fpr, tpr, label='AUC=%0.2f' % roc_auc)
ax.plot([0,1], [0,1], 'r--')
ax.set(title='Reciever Operating Characteristic', 
       ylabel='True Positive Rate', xlabel='False Positive Rate')
ax.legend()
plt.show()
<Figure size 360x360 with 1 Axes>
# define get probability map function
def get_proba_map(data_cube, nodata_mask, model):
    # reshape data cube to 2d and remove nans
    X = data_cube.reshape((data_cube.shape[0], data_cube.shape[1]*data_cube.shape[2])).T
    X = X[~nodata_mask.flatten()]
    # generate prediction probabilities for class 1
    predictions = model.predict_proba(X)[:,1]
    # create an output array based on flattened mask raster
    pred_ar = np.zeros(shape=nodata_mask.flatten().shape, dtype='float32')
    # insert predictions, reshape to spatial and nullify nodata area
    pred_ar[~nodata_mask.flatten()] = predictions
    pred_ar = pred_ar.reshape(nodata_mask.shape)
    pred_ar[nodata_mask] = np.nan
    return pred_ar

# use function to generate a prediction raster
pred_ar1 = get_proba_map(data, nodata_mask, model1)

# plot the predictions
fig, ax = plt.subplots(figsize=(13,13))
im = ax.imshow(pred_ar1, extent=region, cmap='plasma')
fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
# df.plot(ax=ax, markersize=250, facecolor='w', color='k', marker='*', linewidth=1)
plt.show()
/tmp/ipykernel_3073/758455179.py:23: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
<Figure size 936x936 with 2 Axes>
# import random undersampling module from imbalanced learn library
from imblearn.under_sampling import RandomUnderSampler 

# stratify the classes with a random undersampler
rus = RandomUnderSampler(random_state=42)
X_strat, y_strat = rus.fit_resample(X, y)
print ('Before random undersampling:\n\t{} class 0 samples vs. {} class 1 samples'.format(len(y_train[y_train==0]), len(y_train[y_train==1])))
print ('After random undersampling:\n\t{} class 0 samples vs. {} class 1 samples'.format(len(y_strat[y_strat==0]), len(y_strat[y_strat==1])))

# build a random training and testing split
X_train, X_test, y_train, y_test = train_test_split(X_strat, y_strat, test_size=0.33, random_state=42)

# define model, fit it to the stratified data data
model2 = RandomForestClassifier(n_estimators=15, n_jobs=-1)
model2.fit(X_train, y_train)
Before random undersampling:
	2653929 class 0 samples vs. 28046 class 1 samples
After random undersampling:
	41895 class 0 samples vs. 41895 class 1 samples
RandomForestClassifier(n_estimators=15, n_jobs=-1)
# evaluate the model on the test data set
y_preds = model2.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_preds[:,1])
roc_auc = auc(fpr, tpr)

# visualise this
fig, ax = plt.subplots(figsize=(5,5))
ax.plot(fpr, tpr, label='AUC=%0.2f' % roc_auc)
ax.plot([0,1], [0,1], 'r--')
ax.set(title='Reciever Operating Characteristic', 
       ylabel='True Positive Rate', xlabel='False Positive Rate')
ax.legend()
plt.show()
<Figure size 360x360 with 1 Axes>
# get spatial results
pred_ar2 = np.zeros_like(data[0].flatten())
pred_ar2[~nodata_mask.flatten()] = model2.predict_proba(X)[:,1]
pred_ar2[nodata_mask.flatten()] = np.nan
pred_ar2 = pred_ar2.reshape(data[0].shape)

# plot the predictions
fig, ax = plt.subplots(figsize=(13,13))
im = ax.imshow(pred_ar2, extent=region, cmap='plasma')
fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
df.plot(ax=ax, markersize=250, facecolor='w', color='k', marker='*', linewidth=1)
plt.show()
/tmp/ipykernel_28321/1785768611.py:10: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
<Figure size 936x936 with 2 Axes>

This is better, but the workflow suffers from a critical issue that makes us overestimate model purformance - spatial autocorrelation.

# define checkerboard function 
def make_checkerboard(boardsize, squaresize):
  '''
  props to stackoverflow user Blubberguy22, posted March 17, 2020 at 19:00
  https://stackoverflow.com/questions/2169478/how-to-make-a-checkerboard-in-numpy
  '''
  return np.fromfunction(lambda i, j: (i//squaresize[0])%2 != (j//squaresize[1])%2, boardsize).astype(int)

# make a checkerboard, plot it
checker = make_checkerboard(data[0].shape, (500,500))
fig, ax = plt.subplots(figsize=(8,8))
ax.imshow(checker, extent=region, interpolation='nearest')
df.plot(ax=ax, markersize=150, facecolor='w', color='k', marker='*', linewidth=1)
ax.set(title='Pixel Sampling Checkerboard')
plt.show()
<Figure size 576x576 with 1 Axes>
# split these data into two using the checkerboard
X_check0 = X_pix[checker.flatten()==0]
y_check0 = y_pix[checker.flatten()==0]

X_check1 = X_pix[checker.flatten()==1]
y_check1 = y_pix[checker.flatten()==1]

# remove nans
X_check0 = X_check0[~np.isnan(y_check0)]
y_check0 = y_check0[~np.isnan(y_check0)]

X_check1 = X_check1[~np.isnan(y_check1)]
y_check1 = y_check1[~np.isnan(y_check1)]

# print some details
print ('Checker 0: X data array shape is {}, y labels array shape is {}'.format(X_check0.shape, y_check0.shape))
print ('Checker 1: X data array shape is {}, y labels array shape is {}'.format(X_check1.shape, y_check1.shape))

# summarise the data set
fig, (ax0, ax1) = plt.subplots(1,2,figsize=(10,5))
ax0.bar(0, y_check0[y_check0==0].shape, label='Distal pixels')
ax0.bar(1, y_check0[y_check0==1].shape, label='Proximal pixels')
ax1.bar(0, y_check1[y_check1==0].shape, label='Distal pixels')
ax1.bar(1, y_check1[y_check1==1].shape, label='Proximal pixels')
[ax.legend() for ax in [ax0, ax1]]
[ax.set(title=t) for ax, t in zip([ax0, ax1], ['Checker 0 Data', 'Checker 1 Data'])]
plt.show()
Checker 0: X data array shape is (2009157, 7), y labels array shape is (2009157,)
Checker 1: X data array shape is (1993791, 7), y labels array shape is (1993791,)
<Figure size 720x360 with 2 Axes>
# stratify the checker data sets
X_check0, y_check0 = rus.fit_resample(X_check0, y_check0)
X_check1, y_check1 = rus.fit_resample(X_check1, y_check1)

# define models fit them to different checkerboard data selections
model3 = RandomForestClassifier(n_estimators=15, n_jobs=-1)
model3.fit(X_check0, y_check0)

model4 = RandomForestClassifier(n_estimators=15, n_jobs=-1)
model4.fit(X_check1, y_check1)

# evaluate the models on checker data unseen in training
roc_data = []
for model, X_check, y_check in zip([model3, model4], [X_check1, X_check0], [y_check1, y_check0]):
  y_pred = model.predict_proba(X_check)
  fpr, tpr, threshold = roc_curve(y_check, y_pred[:,1])
  roc_auc = auc(fpr, tpr)
  roc_data.append((fpr, tpr, roc_auc))

# visualise this
fig, ax = plt.subplots(figsize=(5,5))
for i, (fpr, tpr, roc_auc) in enumerate(roc_data):
  ax.plot(fpr, tpr, label='Checker {}\nAUC={}'.format(i, round(roc_auc,2)))
  ax.plot([0,1], [0,1], 'r--')
ax.set(title='Reciever Operating Characteristic', 
       ylabel='True Positive Rate', xlabel='False Positive Rate')
ax.legend()
plt.show()
<Figure size 360x360 with 1 Axes>
# get spatial results
pred_ars = []
for i, model in enumerate([model3, model4]):
  pred_ar = np.zeros_like(data[0].flatten())
  pred_ar[~nodata_mask.flatten()] = model.predict_proba(X)[:,1]
  pred_ar[nodata_mask.flatten()] = np.nan
  pred_ars.append(pred_ar.reshape(data[0].shape))

# plot the predictions
fig, axes = plt.subplots(1, 2, figsize=(16,12))
for i, (ax, t) in enumerate(zip(axes, ['Checker 0', 'Checker 1'])):
  im = ax.imshow(pred_ars[i], extent=region, cmap='plasma')
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
  df.plot(ax=ax, markersize=150, facecolor='w', color='k', marker='*', linewidth=1)
  ax.set(title=t)
plt.show()
/tmp/ipykernel_28321/267784532.py:13: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
/tmp/ipykernel_28321/267784532.py:13: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
<Figure size 1152x864 with 4 Axes>

Mineral Field holdout

# import kmeans clustering module
from sklearn.cluster import KMeans

# run clustering on the coordinates of the occurences, 8 clusters is about right
occ_xypts = [[geom.x, geom.y] for geom in df.geometry]
kmeans_obj = KMeans(n_clusters=8, random_state=42).fit(occ_xypts)
df['cluster_labels'] = kmeans_obj.labels_ + 1

# visualise the clusters
fig, ax = plt.subplots(figsize=(10,10))
ax.imshow(data[0], extent=region, vmin=np.nanpercentile(data[0], 5), vmax=np.nanpercentile(data[0], 95), alpha=0.2)
for i in np.unique(df.cluster_labels):
  df[df.cluster_labels==i].plot(ax=ax, markersize=150, marker='*', label='C{}'.format(i))
ax.set(title='Spatially Clustered Occurrences\non Isostatic Residual Bouguer')
ax.legend(loc=(1.05,0.4))
plt.show()
<Figure size 720x720 with 1 Axes>
# rasterize the occurence points by cluster
clustermap = rasterize(shapes=((geom,c) for c, geom in zip(df.cluster_labels, df.buffer(500).geometry)), 
                       out_shape=data[0].shape, fill=0., transform=transf)

# apply the no data mask
clustermap[nodata_mask] = np.nan

# plot the predictions
fig, ax = plt.subplots(figsize=(8,8))
im = ax.imshow(clustermap, extent=region, interpolation='nearest', cmap='turbo')
fig.colorbar(im, shrink=0.5, label='Cluste Label ID')
plt.show()
/tmp/ipykernel_28321/2164095668.py:11: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, shrink=0.5, label='Cluste Label ID')
<Figure size 576x576 with 2 Axes>
# create a data selection function
def cluster_pixel_selection(clustermap, data_cube, class_1_list):
  X = data_cube.reshape((data_cube.shape[0], data_cube.shape[1] * data_cube.shape[2])).T
  y = clustermap.flatten() 
  X = X[~np.isnan(y)]
  y = y[~np.isnan(y)]
  y[np.isin(y, class_1_list)] = 1
  y[y!=1] = 0
  return X, y

# create a function to fit a model to input data
def fit_stratifiedrandomforest(X, y):
  X, y = rus.fit_resample(X, y)
  model = RandomForestClassifier(n_estimators=15, n_jobs=-1)
  return model.fit(X, y)

# define a function to determine performance on holdout occurence clusters
def holdout_roc_auc(clustermap, data_cube, holdout_cluster_list, model_cluster_list, model):
  X = data_cube.reshape((data_cube.shape[0], data_cube.shape[1] * data_cube.shape[2])).T
  y = clustermap.flatten() 
  X = X[~np.isnan(y)]
  y = y[~np.isnan(y)]
  X = X[~np.isin(y, model_cluster_list)]
  y = y[~np.isin(y, model_cluster_list)]
  y[np.isin(y, holdout_cluster_list)] = 1
  # predict onto X
  y_pred = model.predict_proba(X)
  fpr, tpr, threshold = roc_curve(y, y_pred[:,1])
  roc_auc = auc(fpr, tpr)
  return fpr, tpr, roc_auc

# define a function to generate probability maps from a model
def get_proba_map(nodata_mask, data_cube, model):
  X = data_cube.reshape((data_cube.shape[0], data_cube.shape[1] * data_cube.shape[2])).T
  X = X[~nodata_mask.flatten()]
  pred_ar = np.zeros_like(data_cube[0].flatten())
  pred_ar[~nodata_mask.flatten()] = model.predict_proba(X)[:,1]
  pred_ar[nodata_mask.flatten()] = np.nan
  pred_ar = pred_ar.reshape(data_cube[0].shape)
  return pred_ar
# train a model on cluster 1
X, y = cluster_pixel_selection(clustermap, data, [1])
model = fit_stratifiedrandomforest(X, y)
fpr, tpr, roc_auc = holdout_roc_auc(clustermap, data, [2,3,4,5,6,7,8], [1], model)

# visualise this
fig, ax = plt.subplots(figsize=(5,5))
ax.plot(fpr, tpr, label='AUC={}'.format(round(roc_auc,2)))
ax.plot([0,1], [0,1], 'r--')
ax.set(title='Reciever Operating Characteristic', 
       ylabel='True Positive Rate', xlabel='False Positive Rate')
ax.legend()
plt.show()
<Figure size 360x360 with 1 Axes>
# import progress bar
from tqdm import tqdm

# loop through clusters
models, holdout_clusters = [], []
fprs, tprs, roc_aucs = [], [], []
for i in tqdm(range(1,9)):
  X, y = cluster_pixel_selection(clustermap, data, [j for j in range(1,9) if j!=i])
  model = fit_stratifiedrandomforest(X, y)
  fpr, tpr, roc_auc = holdout_roc_auc(clustermap, data, [i], [j for j in range(1,9) if j!=i], model)
  holdout_clusters.append(i)
  models.append(model)
  fprs.append(fpr)
  tprs.append(tpr)
  roc_aucs.append(roc_auc)
100%|██████████████████████████████████████████████████████████████| 8/8 [00:12<00:00,  1.57s/it]
# plot ROC curves for each holdout model
fig, ax = plt.subplots(figsize=(7,7))
for fpr, tpr, roc_auc, hc in zip(fprs, tprs, roc_aucs, holdout_clusters):
  ax.plot(fpr, tpr, label='Cluster{}\nAUC={}'.format(hc, round(roc_auc,2)))
  ax.plot([0,1], [0,1], 'r--')
  ax.set(title='Reciever Operating Characteristic', 
         ylabel='True Positive Rate', xlabel='False Positive Rate')
  ax.legend(loc=(1.05,0.1))
plt.show()
<Figure size 504x504 with 1 Axes>
# loop through models to generate pridction maps
prob_maps = []
for m in models:
  prob_maps.append(get_proba_map(nodata_mask, data, m))
prob_maps[0].shape
(2633, 1876)
# import matplotlib Rectangle to highlight holdout area
from matplotlib.patches import Rectangle

# plot these probability maps
fig, axes = plt.subplots(4, 2, figsize=(13,30), constrained_layout=True)
for i, ax in enumerate(axes.flatten()):
  if i < len(holdout_clusters):
    # plot the image
    im = ax.imshow(prob_maps[i], vmin=0, vmax=1, cmap='plasma', extent=region)
    fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
    df[df.cluster_labels!=holdout_clusters[i]].plot(ax=ax, markersize=25, color='k', marker='*', label='Training Occurences')
    # get holdout bounds
    xmin, ymin, xmax, ymax = df[df.cluster_labels==holdout_clusters[i]].total_bounds
    xlen, ylen = abs(xmin-xmax), abs(ymin-ymax)
    # draw the rectangle
    ax.add_patch(Rectangle((xmin, ymin), xlen, ylen, linewidth=3, edgecolor='k', facecolor='none', label='Holdout Area'))
    ax.legend(loc='upper left')
    ax.set(title='Occurence Cluster {} Holdout'.format(i+1))
  else:
    ax.axis('off')
plt.show()
/tmp/ipykernel_28321/2177181897.py:10: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
/tmp/ipykernel_28321/2177181897.py:10: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
/tmp/ipykernel_28321/2177181897.py:10: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
/tmp/ipykernel_28321/2177181897.py:10: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
/tmp/ipykernel_28321/2177181897.py:10: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
/tmp/ipykernel_28321/2177181897.py:10: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
/tmp/ipykernel_28321/2177181897.py:10: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
/tmp/ipykernel_28321/2177181897.py:10: MatplotlibDeprecationWarning: Auto-removal of grids by pcolor() and pcolormesh() is deprecated since 3.5 and will be removed two minor releases later; please call grid(False) first.
  fig.colorbar(im, ax=ax, shrink=0.5, label='Probability Proximal')
<Figure size 936x2160 with 16 Axes>
# feature importance
fig, ax = plt.subplots(figsize=(10,4))
feat_imps = np.array([m.feature_importances_ for m in models])
ax.boxplot(feat_imps, labels=names)
ax.set(title='Cluster Holdout Model Feature Importance')
plt.xticks(rotation=45)
plt.show()
<Figure size 720x288 with 1 Axes>