Random Forest Model for Crop Type and Land Classification
Using RandomForest Classifier for crop type mapping with data from Cloud Optimized Geotiffs (COGs).
Using RandomForest Classifier for crop type mapping for SERVIR Sat ML training. This notebook showcase how we can read satellite imagery (Sentinal-2) from AWS public S3 through AWS Public Data Program for crop type mapping.
-
If you want to run this whole notebook on Google Colab, here is the link.
-
If you would like to replicate the workflow with the same data on your local machine, please download the data from our shared Google Drive folder.
Besides mapping crop type with RandomForestClassifier, we also prepared notebooks that use LightGBM and SVM.
- Find our notebook for LightGBM on Google Colab;
- Find our notebook for SVM on Google Colab.
from os import path as op
import pickle
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import rasterio
from rasterio.features import rasterize
from rasterstats.io import bounds_window
import rasterstats
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from treeinterpreter import treeinterpreter as ti
# read in training data polygons that created as geojson
training_vectors = gpd.read_file('training_data.geojson')
training_vectors.head()
# find all unique values of training data names to use as classes
classes = np.unique(training_vectors.name)
classes
# create a dictionary to convert class names into integers for modeling
class_dict = dict(zip(classes, range(len(classes))))
class_dict
# raster information
raster_file = 'Trans_nzoia_2019_05-02.tif'
# a custom function for getting each value from the raster
def all_values(x):
return x
# this larger cell reads data from a raster file for each training vector
X_raw = []
y_raw = []
with rasterio.open(raster_file, 'r') as src:
for (label, geom) in zip(training_vectors.name, training_vectors.geometry):
# read the raster data matching the geometry bounds
window = bounds_window(geom.bounds, src.transform)
# store our window information
window_affine = src.window_transform(window)
fsrc = src.read(window=window)
# rasterize the geometry into the larger shape and affine
mask = rasterize(
[(geom, 1)],
out_shape=fsrc.shape[1:],
transform=window_affine,
fill=0,
dtype='uint8',
all_touched=True
).astype(bool)
# for each label pixel (places where the mask is true)
label_pixels = np.argwhere(mask)
for (row, col) in label_pixels:
# add a pixel of data to X
data = fsrc[:,row,col]
one_x = np.nan_to_num(data, nan=1e-3)
X_raw.append(one_x)
# add the label to y
y_raw.append(class_dict[label])
# convert the training data lists into the appropriate numpy array shape and format for scikit-learn
X = np.array(X_raw)
y = np.array(y_raw)
(X.shape, y.shape)
# helper function for calculating ND*I indices (bands in the final dimension)
def band_index(arr, a, b):
return np.expand_dims((arr[..., a] - arr[..., b]) / (arr[..., a] + arr[..., b]), axis=1)
ndvi = band_index(X, 3, 2)
ndwi = band_index(X, 1, 3)
X = np.concatenate([X, ndvi, ndwi], axis=1)
X.shape
# split the data into test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# calculate class weights to allow for training on inbalanced training samples
labels, counts = np.unique(y_train, return_counts=True)
class_weight_dict = dict(zip(labels, 1 / counts))
class_weight_dict
# initialize a RandomForestClassifier
clf = RandomForestClassifier(
n_estimators=200,
class_weight=class_weight_dict,
max_depth=6,
n_jobs=-1,
verbose=1,
random_state=0)
# fit the model to the data (training)
clf.fit(X, y)
# predict on X_test to evaluate the model
preds = clf.predict(X_test)
cm = confusion_matrix(y_test, preds, labels=labels)
# (optional) save the trained model as pickle file
model_name = 'random_forest.sav'
with open(model_name, 'wb') as modelfile:
pickle.dump(clf, modelfile)
# plot the confusion matrix
%matplotlib inline
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
yticks=np.arange(cm.shape[0]),
# ... and label them with the respective list entries
xticklabels=classes, yticklabels=classes,
title='Normalized Confusion Matrix',
ylabel='True label',
xlabel='Predicted label')
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
# Loop over data dimensions and create text annotations.
fmt = '.2f'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
ax.text(j, i, format(cm[i, j], fmt),
ha="center", va="center",
color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()
# predict again with the tree interpreter to see how much each band contributes to the classification
sample = 100
prediction, bias, contributions = ti.predict(clf, X_test[:sample])
c = np.sum(contributions, axis=0)
# plot the contributions
band_names = ['Blue', 'Green', 'Red', 'NIR', 'SWIR1', 'SWIR2', 'NDVI', 'NDWI']
gdf = gpd.GeoDataFrame(c, columns=classes, index=band_names)
gdf.style.background_gradient(cmap='viridis')
# in this case, we predict over the entire input image
# (only small portions were used for training)
new_image = raster_file
output_image = "classification.tif"
with rasterio.open(new_image, 'r') as src:
profile = src.profile
profile.update(
dtype=rasterio.uint8,
count=1,
)
with rasterio.open(output_image, 'w', **profile) as dst:
# perform prediction on each small image patch to minimize required memory
patch_size = 500
for i in range((src.shape[0] // patch_size) + 1):
for j in range((src.shape[1] // patch_size) + 1):
# define the pixels to read (and write) with rasterio windows reading
window = rasterio.windows.Window(
j * patch_size,
i * patch_size,
# don't read past the image bounds
min(patch_size, src.shape[1] - j * patch_size),
min(patch_size, src.shape[0] - i * patch_size))
# read the image into the proper format
data = src.read(window=window)
# adding indices if necessary
img_swp = np.moveaxis(data, 0, 2)
img_flat = img_swp.reshape(-1, img_swp.shape[-1])
img_ndvi = norm_inds(img_flat, 3, 2)
img_ndwi = norm_inds(img_flat, 1, 3)
img_w_ind = np.concatenate([img_flat, img_ndvi, img_ndwi], axis=1)
# remove no data values, store the indices for later use
m = np.ma.masked_invalid(img_w_ind)
to_predict = img_w_ind[~m.mask].reshape(-1, img_w_ind.shape[-1])
# skip empty inputs
if not len(to_predict):
continue
# predict
img_preds = clf.predict(to_predict)
# add the prediction back to the valid pixels (using only the first band of the mask to decide on validity)
# makes the assumption that all bands have identical no-data value arrangements
output = np.zeros(img_flat.shape[0])
output[~m.mask[:, 0]] = img_preds.flatten()
# resize to the original image dimensions
output = output.reshape(*img_swp.shape[:-1])
# create our final mask
mask = (~m.mask[:, 0]).reshape(*img_swp.shape[:-1])
# write to the final files
dst.write(output.astype(rasterio.uint8), 1, window=window)
dst.write_mask(mask, window=window)