# Source code for tlseparation.classification.gmm

# Copyright (c) 2017-2019, Matheus Boni Vicari, TLSeparation Project
# All rights reserved.
#
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.


__author__ = "Matheus Boni Vicari"
__copyright__ = "Copyright 2017-2019, TLSeparation Project"
__credits__ = ["Matheus Boni Vicari"]
__license__ = "GPL3"
__version__ = "1.3.2"
__maintainer__ = "Matheus Boni Vicari"
__email__ = "matheus.boni.vicari@gmail.com"
__status__ = "Development"

import numpy as np
from sklearn.mixture import GaussianMixture as GMM


def classify(variables, n_classes):

    """
    Classify a dataset using sklearn's Gaussian Mixture Models fitted with
    Expectation Maximization.

    A mixture with n_classes components is fitted to the input variables
    and the same variables are then labelled with the fitted model.

    Parameters
    ----------
    variables : array
        N-dimensional array (m x n) containing a set of parameters (n)
        over a set of observations (m).
    n_classes : int
        Number of classes (mixture components) to assign the input
        variables to.

    Returns
    -------
    classes : array
        Class label of each observation from the input variables, as
        returned by the fitted model's predict method.
    means : array
        N-dimensional array (c x n) of each class (c) parameter space means
        (n).
    probability : array
        Probability of samples belonging to every class in the classification.
        Sum of sample-wise probability should be 1.

    """

    # Initialize a GMM classifier with n_classes components and fit the
    # input variables to it.
    gmm = GMM(n_components=n_classes)
    gmm.fit(variables)

    # Label every observation and retrieve its per-class membership
    # probabilities from the fitted model.
    classes = gmm.predict(variables)
    probability = gmm.predict_proba(variables)

    return classes, gmm.means_, probability


def class_select_ref(classes, cm, classes_ref):

    """
    Relabel classified observations with the index of the closest reference
    class.

    Every row of the classes means (cm) is compared, by Euclidean distance,
    to every row of the reference values (classes_ref). Each class is then
    mapped to the index of its nearest reference row and this mapping is
    applied to the labels of all observations.

    Parameters
    ----------
    classes : list or array
        Integer class label of each observation from the input variables.
        Every label is used as a row index into cm.
    cm : array
        N-dimensional array (c x n) of each class (c) parameter space mean
        values (n).
    classes_ref : array
        Reference classes values, one row per reference class.

    Returns
    -------
    new_classes : array
        1D float array with one entry per observation, holding the row
        index in classes_ref of the reference closest to the observation's
        class mean. Classes for which no reference is strictly closer than
        infinity (e.g. empty classes_ref) are mapped to 0.

    """

    # Coercing inputs to arrays so that lists, as documented, are accepted.
    classes = np.asarray(classes)
    cm = np.asarray(cm)
    classes_ref = np.asarray(classes_ref)

    # Initializing array of class ids (class index -> reference index).
    class_ids = np.zeros([cm.shape[0]])

    # Looping over each class mean. The number of classes and references
    # is small, so a plain loop is kept here to preserve the strict
    # "smaller than" comparison (first reference wins on ties).
    for c, class_mean in enumerate(cm):
        # Setting initial minimum distance value.
        mindist = np.inf
        # Looping over the reference values.
        for i, ref in enumerate(classes_ref):
            # Calculating distance between current class mean parameters
            # and current reference parameters.
            d = np.linalg.norm(class_mean - ref)
            # If current distance is smaller than the previous minimum,
            # assign current reference index to current class index.
            if d < mindist:
                class_ids[c] = i
                mindist = d

    # An empty label set cannot be used as an index array, so it is
    # handled explicitly.
    if classes.shape[0] == 0:
        return np.zeros([0])

    # Assigning final classes values to all observations at once instead
    # of looping over every observation in Python.
    return class_ids[classes]


def class_select_abs(classes, cm, nbrs_idx, feature=5, threshold=0.5):

    """
    Select from GMM classification results which classes are wood and which
    are leaf based on an absolute value threshold applied to the classes
    means in the parameter space.

    The class whose mean has the largest value in the selected feature
    column is taken as wood, provided that the largest row-wise sum of the
    classes means reaches the threshold.

    Parameters
    ----------
    classes : list or array
        Classes labels for each observation from the input variables.
    cm : array
        N-dimensional array (c x n) of each class (c) parameter space mean
        values (n).
    nbrs_idx : array
        Nearest Neighbors indices relative to every point of the array that
        originated the classes labels. Currently not used by this function;
        kept in the signature for compatibility with existing callers.
    feature : int
        Column index of the feature used to pick the wood class.
    threshold : float
        Minimum value that the largest sum of a class' mean features must
        reach for any class to be selected.

    Returns
    -------
    mask : array or list
        Boolean array where True represents wood points and False
        represents leaf points. If the threshold is not reached, an empty
        list is returned instead.

    """

    # Coercing to array so that column indexing works for nested lists.
    cm = np.asarray(cm)

    # Summing all features of each class mean and checking if the largest
    # sum reaches the threshold.
    # NOTE(review): this gate uses the sum over all features, not only the
    # selected `feature` column - confirm this is the intended criterion.
    if np.max(np.sum(cm, axis=1)) >= threshold:

        # The class with the largest mean in the selected feature is wood.
        class_id = np.argmax(cm[:, feature])

        # Masking classes based on the criteria set above. Mask will
        # present True for wood points and False for leaf points. The
        # labels are coerced to an array so that a list input is compared
        # element-wise rather than as a single object.
        mask = np.asarray(classes) == class_id

    else:
        mask = []

    return mask