# CropIdentification.py

# -*- coding: utf-8 -*-
"""Minor Project.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1PInxqep7X4UJYSnDouM0A4bFmtl_x2UK
"""

!pip install ipyleaflet

!pip install pystac

!pip install pystac_client

!pip install odc-stac

from google.colab import drive
drive.mount('/content/drive/')

"""### Cloning the repository for leafmap"""

!git clone /~https://github.com/giswqs/leafmap.git

!pip install planetary-computer

!planetarycomputer configure

# import planetary_computer as pc
# import pystac_client

# from pystac_client import Client
# import planetary_computer, requests
# api = Client.open(
#    'https://planetarycomputer.microsoft.com/api/stac/v1',
#    modifier=planetary_computer.sign_inplace,
# )

!pip install Rich

!python -m rich

"""## Loading the dependencies"""

## We have to get access to planetary_computer API

from odc import stac
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization
import ipyleaflet
import matplotlib.pyplot as plt
from IPython.display import Image
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd

# Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score,classification_report,confusion_matrix

# Planetary Computer Tools
import pystac
import pystac_client
# import odc-stac
from pystac_client import Client
from pystac.extensions.eo import EOExtension as eo
from odc.stac import stac_load
import planetary_computer as pc
# SECURITY NOTE(review): this subscription key is hard-coded in source, so
# anyone who can read the file can use it. Rotate the key and load it from
# configuration instead (e.g. an environment variable or the settings written
# by `planetarycomputer configure`).
pc.settings.set_subscription_key('8037994b42c1450e9a4daca46c196464')

# Others
import requests
import rich.table

from itertools import cycle
from tqdm import tqdm
tqdm.pandas()

## Curated the data from a certain region of Vietnam in 2020
# Load the labelled crop locations from the mounted Drive. Later cells read
# the 'Latitude and Longitude' column and the 'Class of Land' label from it.
crop_presence_data = pd.read_csv("/content/drive/MyDrive/Crop_Location_Data_20221201.csv")
# Notebook display of the first rows (no effect when run as a plain script).
crop_presence_data.head()

"""Sentinel-1 radar data penetrates through the clouds, thus helping us to get the band values with minimal atmospheric attenuation. Band values such as VV and VH help us in distinguishing between the rice and non rice crops. Hence we are choosing VV and VH as predictor variables for this experiment.

VV - gamma naught values of signal transmitted with vertical polarization and received with vertical polarization with radiometric terrain correction applied.

VH - gamma naught values of signal transmitted with vertical polarization and received with horizontal polarization with radiometric terrain correction applied.
"""

# !pip install odc
import odc

def get_sentinel_data(latlong,time_slice,assets):
    '''
    Returns the VH and VV values of one Sentinel-1 RTC pixel near a location.

    Searches the Planetary Computer STAC API for "sentinel-1-rtc" items that
    intersect a small box around the point within time_slice, loads the
    requested bands of the first item returned by the search, and reads the
    first pixel of the loaded window.

    Attributes:
    latlong - The location, either as "(latitude, longitude)" text or as a
              sequence whose first two elements are latitude and longitude.
    time_slice - Timeframe to search, passed as the STAC `datetime` filter
                 (e.g. "2020-03-20/2020-03-21").
    assets - A list of bands to be loaded; must contain 'vh' and 'vv'.

    Returns:
    A (vh, vv) tuple of floats.

    Raises:
    ValueError - latlong does not hold two numeric values.
    IndexError - the search returned no items for the box and timeframe.
    '''
    #latlong = (10.013942985253381, 105.67361318732796)
    # Parse and validate the location first so bad input fails before any
    # network request is made.
    if isinstance(latlong, str):
        # Drop parentheses and spaces, then split "lat,long".
        parts = latlong.translate(str.maketrans('', '', '() ')).split(',')
    else:
        parts = list(latlong)
    if len(parts) < 2:
        raise ValueError(
            f"latlong must hold a latitude and a longitude, got {latlong!r}"
        )
    latitude, longitude = float(parts[0]), float(parts[1])

    box_size_deg = 0.0004 # Surrounding box in degrees, yields approximately 5x5 pixel region
    half_box = box_size_deg/2
    # STAC bounding boxes are ordered (min_lon, min_lat, max_lon, max_lat).
    bbox_of_interest = (
        longitude-half_box,
        latitude-half_box,
        longitude+half_box,
        latitude+half_box,
    )

    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1"
    )
    search = catalog.search(
        collections=["sentinel-1-rtc"], bbox=bbox_of_interest, datetime=time_slice
    )
    items = list(search.get_all_items())
    if not items:
        raise IndexError(
            f"No sentinel-1-rtc items found for bbox {bbox_of_interest} "
            f"and datetime {time_slice!r}"
        )

    resolution = 10  # meters per pixel
    # Convert metres to degrees for EPSG:4326 (roughly 111320 m per degree).
    scale = resolution / 111320.0
    # Use the bands requested by the caller (the `assets` argument).
    data = stac_load(
        [items[0]],
        bands=assets,
        patch_url=pc.sign,
        crs="EPSG:4326",
        resolution=scale,
        bbox=bbox_of_interest,
    )

    # First time step, first row, first column of each band.
    vh = float(data["vh"].astype("float").values[0][0][0])
    vv = float(data["vv"].astype("float").values[0][0][0])
    return vh,vv

# xx = odc.stac.load(...crs="EPSG:3857",resolution=10)
# yy = odc.stac.load(...crs="EPSG:4326",resolution=0.00009009)

## Sentinel-radiometric data
# catalog = pystac_client.Client.open(
#     "https://planetarycomputer.microsoft.com/api/stac/v1",
#     modifier=pc.sign_inplace,
# )

# ## Just searching for assets over Panama in order to 
# bbox = [-80.11, 8.71, -79.24, 9.38]
# search = catalog.search(
#     collections=["sentinel-1-rtc"], bbox=bbox, datetime="2022-05-02/2022-05-09"
# )
# items = search.item_collection()
# print(f"Found {len(items)} items")
# item = items[0]

# Image(url=item.assets["rendered_preview"].href)

# table = rich.table.Table("key", "value")
# for k, v in sorted(item.properties.items()):
#     table.add_row(k, str(v))

# table

# time_slice = "2020-03-20/2020-03-21"
time_slice = '2020-03-20/2021-03-21'
# NOTE: the spelling of `assests` is kept as-is because the name may be
# referenced elsewhere in this file.
assests = ['vh','vv']
# One (vh, vv) pair per labelled location, with a tqdm progress bar.
vh_vv = [
    get_sentinel_data(coordinates, time_slice, assests)
    for coordinates in tqdm(crop_presence_data['Latitude and Longitude'])
]
vh_vv_data = pd.DataFrame(vh_vv, columns=['vh', 'vv'])

# latlong = '(10.323727047081501, 105.2516346045924)'	
# latlong=latlong.replace('(','').replace(')','').replace(' ','').split(',')
# bbox_of_interest = [float(latlong[1]) , float(latlong[0]), float(latlong[1])+100.00 , float(latlong[0])]
# bbox_of_interest

def combine_two_datasets(dataset1,dataset2):
    '''
    Returns the two datasets joined side by side (column-wise).
    The columns of dataset2 are placed after the columns of dataset1.
    Attributes:
    dataset1 - Dataset whose columns come first
    dataset2 - Dataset whose columns are appended
    '''
    frames = [dataset1, dataset2]
    return pd.concat(frames, axis='columns')

# Attach the extracted vh/vv columns to the labelled locations.
crop_data = combine_two_datasets(crop_presence_data,vh_vv_data)
# Notebook display of the first rows (no effect when run as a plain script).
crop_data.head()

### Radar Vegetation Index (RVI)
#mean = d.mean(dim=['latitude','longitude']).compute()

# crop_data.to_csv('/content/drive/MyDrive/crop_data.csv')

"""## Model Building"""

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
# Keep only the two radar features and the label column.
crop_data = crop_data[['vh','vv','Class of Land']]

# Label vector and (vh, vv) feature matrix as NumPy arrays.
y = crop_data['Class of Land'].values
X = crop_data[['vh','vv']].values
# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=40,
    stratify=y,
)

# poly = PolynomialFeatures(degree = 3, interaction_only=True)
# X_train = poly.fit_transform(X_train)
# X_test = poly.fit_transform(X_test)

# Fit the scaler on the training split only, then apply that same
# transformation to both splits.
sc = RobustScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# ## Explore MINMAX Scaler
# ## Max Absolute Scaling
# ## Robust Scaling
# from sklearn.preprocessing import MinMaxScaler
# sc = MinMaxScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

"""## Training"""

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
# Gradient boosting over depth-1 trees with a fixed seed.
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0)
model.fit(X_train,y_train)

# In-sample check: predictions on the data the model was fitted on.
insample_predictions = model.predict(X_train)
# scikit-learn metrics take (y_true, y_pred); with the arguments reversed the
# report would show precision and recall swapped.
print("Insample Accuracy {0:.2f}%".format(100*accuracy_score(y_train,insample_predictions)))
print(classification_report(y_train,insample_predictions))

#  ## Trying out Gaussian Naive Bayes
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# y_pred = gnb.fit(X_train, y_train).predict(X_test)
# print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
# accuracy = (X_test.shape[0] - (y_test != y_pred).sum())/X_test.shape[0]
# print("Insample Accuracy {0:.2f}%".format(100*accuracy))
# # print(classification_report(insample_predictions,y_train))


def plot_confusion_matrix(true_value,predicted_value,title,labels):
    '''
    Plots a confusion matrix as an annotated heatmap.
    Attributes:
    true_value - The ground truth value for comparision.
    predicted_value - The values predicted by the model.
    title - Title of the plot.
    labels - The x and y tick labels of the plot. When every class value
             found in the data appears in labels, the matrix rows and columns
             are arranged in this order; otherwise labels are applied
             positionally to the default (sorted) class order.
    '''
    # confusion_matrix sorts the classes unless an explicit order is given,
    # while the tick labels below are applied by position. Pinning the order
    # to `labels` keeps each tick next to the class it names.
    observed = set(true_value).union(predicted_value)
    if observed and observed.issubset(labels):
        label_order = list(labels)
    else:
        # `labels` are display names only; keep the default ordering.
        label_order = None

    cm = confusion_matrix(true_value, predicted_value, labels=label_order)
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap='Blues')
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    ax.xaxis.set_ticklabels(labels)
    ax.yaxis.set_ticklabels(labels)

# NOTE(review): confirm this label order matches the class order that
# plot_confusion_matrix uses for the matrix axes.
plot_confusion_matrix(
    y_train,
    insample_predictions,
    "Model Level 1: Gradient Boosting Method In-Sample Results",
    ['Rice', 'Non Rice'],
)

"""## Outsample Evaluation"""

# Score the fitted model on the held-out test split.
outsample_predictions = model.predict(X_test)
outsample_accuracy = accuracy_score(outsample_predictions, y_test)
print("Accuracy {0:.2f}%".format(100*outsample_accuracy))
print(classification_report(y_test, outsample_predictions))

plot_confusion_matrix(
    y_test,
    outsample_predictions,
    "Model Level 1: Gradient Boosting Method Out-Sample Results",
    ['Rice', 'Non Rice'],
)

# Submission template read from the mounted Drive.
test_file = pd.read_csv('/content/drive/MyDrive/challenge_1_submission_template.csv')
test_file.head()

from sklearn.utils import all_estimators
def getClassifiers(classifier):
    '''
    Instantiates every scikit-learn estimator of the given type with its
    default arguments.
    Attributes:
    classifier - The type filter passed to sklearn.utils.all_estimators
                 (e.g. 'classifier').
    Returns a list of (name, estimator instance) tuples. Estimators whose
    constructor fails without arguments are skipped and reported.
    '''
    classifierModels = []
    for name, ClassifierClass in all_estimators(type_filter=classifier):
        try:
            estimator = ClassifierClass()
        except Exception as error:
            # Best-effort discovery: some estimators cannot be built without
            # arguments. Report the skip instead of hiding it.
            print(f'Skipping {name}: {error}')
            continue
        classifierModels.append((name, estimator))
    return classifierModels

classifierModels = getClassifiers('classifier')
print(len(classifierModels))
# Evaluate in batches of ten. The last batch is open-ended so that no
# classifier is dropped when more than 34 are available.
model1 = classifierModels[:10]
model2 = classifierModels[10:20]
model3 = classifierModels[20:30]
model4 = classifierModels[30:]

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, r2_score

def evaluate(model, XTrain, yTrain, XTest, yTest):
    '''
    Fits the model on the training data and reports how it performs on the
    test data.
    Attributes:
    model - An estimator exposing fit and predict.
    XTrain, yTrain - Training features and labels.
    XTest, yTest - Test features and labels.
    Prints the accuracy and the classification report, and returns the
    accuracy score.
    '''
    model.fit(XTrain, yTrain)
    yPred = model.predict(XTest)
    accuracy = accuracy_score(yTest, yPred)
    print(f'Model Name = {type(model).__name__}, accuracy score = {accuracy}')
    # Report against the labels passed in, not a module-level variable.
    print(classification_report(yTest, yPred))
    return accuracy

def tryModels(models):
    '''
    Fits and evaluates each model on the module-level train/test split.
    Attributes:
    models - A list of (name, estimator) tuples.
    A model that raises is reported and skipped, so one failure does not stop
    the remaining models.
    '''
    for entry in models:
        estimator = entry[1]
        model_name = type(estimator).__name__
        print(f'Running {model_name}')
        try:
            # evaluate prints the accuracy and the classification report.
            evaluate(estimator, X_train, y_train, X_test, y_test)
        except Exception as error:
            # Best-effort sweep: say which model failed and why.
            print(f'{model_name} failed: {error}')

# Run the sweep batch by batch, in order.
for model_batch in (model1, model2, model3, model4):
    tryModels(model_batch)

from sklearn.semi_supervised import LabelPropagation
# Label propagation with a 7-nearest-neighbour kernel.
model0 = LabelPropagation(kernel='knn', n_neighbors=7, max_iter=1000)
model0.fit(X_train,y_train)

#Grid Search method for Hyperparameter tuning
# from sklearn.model_selection import GridSearchCV
# lp = LabelPropagation(kernel='knn',max_iter=1000)
# params = {
#     'n_neighbors' : [3,5,7,9]
# }
# grid = GridSearchCV(lp,params,cv=5,scoring='accuracy')
# grid.fit(X_train, y_train)
# model = grid.best_estimator_
# predicted_labels1 = model.predict(X_test)
# print("Outsample Accuracy {0:.2f}%".format(100*accuracy_score(predicted_labels1,y_test)))
# print(classification_report(predicted_labels1,y_test))
# grid.best_params_

# NOTE: despite the variable name, these are predictions on the test split.
insample_predictions0 = model0.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); with the arguments reversed the
# report would show precision and recall swapped.
print("Outsample Accuracy {0:.2f}%".format(100*accuracy_score(y_test,insample_predictions0)))
print(classification_report(y_test,insample_predictions0))

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# 3-fold grid search over forest size, split criterion and tree depth.
rf = RandomForestClassifier()
params = {
    'n_estimators' : [100,150,200,250],
    'criterion' : ['gini','entropy','log_loss'],
    'max_depth' : [1,2,3,4,5,6,7]
}
grid = GridSearchCV(rf,params,scoring='accuracy',cv=3)
grid.fit(X_train,y_train)
rf_model = grid.best_estimator_
predicted_labels2 = rf_model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); with the arguments reversed the
# report would show precision and recall swapped.
print("Outsample Accuracy {0:.2f}%".format(100*accuracy_score(y_test,predicted_labels2)))
print(classification_report(y_test,predicted_labels2))
grid.best_params_

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
knn_model = KNeighborsClassifier()
# knn_model.fit(X_train, y_train)
# predicted_labels3 = knn_model.predict(X_test)
# print("Outsample Accuracy {0:.2f}%".format(100*accuracy_score(predicted_labels3,y_test)))
# print(classification_report(predicted_labels3,y_test))

# 3-fold grid search over neighbour count, weighting, search algorithm and
# Minkowski power.
params = {
    'n_neighbors' : [5,6,7],
    'weights' : ['uniform','distance'],
    'algorithm' : ['auto','ball_tree','kd_tree','brute'],
    'p' : [1,2]
}
grid = GridSearchCV(knn_model,params,cv=3,scoring='accuracy')
grid.fit(X_train, y_train)
# NOTE: this rebinds `model`, replacing the estimator assigned earlier.
model = grid.best_estimator_
predicted_labels3 = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); with the arguments reversed the
# report would show precision and recall swapped.
print("Outsample Accuracy {0:.2f}%".format(100*accuracy_score(y_test,predicted_labels3)))
print(classification_report(y_test,predicted_labels3))
grid.best_params_

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel
gpc = GaussianProcessClassifier()
# param_grid = {
#     'kernel' : [1*RBF(), 1*DotProduct(), 1*Matern(), 1*RationalQuadratic(), 1*WhiteKernel()]
# }
# grid = GridSearchCV(gpc, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
# 5-fold grid search over the Gaussian process classifier settings.
param_grid = {
    'max_iter_predict' : [100,150,200],
    'n_restarts_optimizer' : [1,2],
    'multi_class' : ['one_vs_rest', 'one_vs_one']
}
grid = GridSearchCV(gpc, param_grid, scoring='accuracy', cv=5)
grid.fit(X_train, y_train)
# NOTE: this rebinds `model`, replacing the estimator assigned earlier.
model = grid.best_estimator_
predicted_labels4 = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); with the arguments reversed the
# report would show precision and recall swapped.
print("Outsample Accuracy {0:.2f}%".format(100*accuracy_score(y_test,predicted_labels4)))
print(classification_report(y_test,predicted_labels4))
# model = grid.best_estimator_
# predicted_labels = model.predict(X_test)
# print("Outsample Accuracy {0:.2f}%".format(100*accuracy_score(predicted_labels,y_test)))
# print(classification_report(predicted_labels,y_test))
# print('Best Mean Accuracy: %.3f' % grid.best_score_)
grid.best_params_