Source code for tlseparation.classification.wlseparation

# Copyright (c) 2017-2019, Matheus Boni Vicari, TLSeparation Project
# All rights reserved.
#
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.


__author__ = "Matheus Boni Vicari"
__copyright__ = "Copyright 2017-2019, TLSeparation Project"
__credits__ = ["Matheus Boni Vicari"]
__license__ = "GPL3"
__version__ = "1.3.2"
__maintainer__ = "Matheus Boni Vicari"
__email__ = "matheus.boni.vicari@gmail.com"
__status__ = "Development"


import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from ..utility.knnsearch import set_nbrs_knn
from ..classification.point_features import knn_features
from ..classification.gmm import (classify, class_select_abs,
                                  class_select_ref)


def fill_class(arr1, arr2, noclass, k):

    """
    Assigns noclass entries to either arr1 or arr2, based on a
    neighborhood majority analysis.

    Parameters
    ----------
    arr1 : array
        Point coordinates for entries of the first class.
    arr2 : array
        Point coordinates for entries of the second class.
    noclass : array
        Point coordinates for noclass entries.
    k : int
        Number of neighbors to use in the neighborhood majority analysis.

    Returns
    -------
    arr1 : array
        Point coordinates for entries of the first class.
    arr2 : array
        Point coordinates for entries of the second class.

    """

    # Stacking arr1 and arr2. This stack will be fitted in the
    # NearestNeighbors search in order to define the local majority and
    # assign classes to noclass.
    arr = np.vstack((arr1, arr2))

    # Generating class labels with the same shapes as arr1, arr2 and,
    # after stacking, arr.
    class_1 = np.full(arr1.shape[0], 1, dtype=int)
    class_2 = np.full(arr2.shape[0], 2, dtype=int)
    classes = np.hstack((class_1, class_2)).T

    # Performing the NearestNeighbors search to detect local sets of points.
    nbrs = NearestNeighbors(leaf_size=25, n_jobs=-1).fit(arr)
    indices = nbrs.kneighbors(noclass, n_neighbors=k,
                              return_distance=False)

    # Allocating the output variable.
    new_class = np.zeros(noclass.shape[0])

    # Selecting the subset of classes based on the neighborhoods expressed
    # by indices.
    class_ = classes[indices]

    # Looping over all points in indices.
    for i in range(len(indices)):
        # Counting the number of occurrences of each value in the ith
        # instance of class_.
        unique, count = np.unique(class_[i, :], return_counts=True)
        # Assigning the majority class to the output variable.
        new_class[i] = unique[np.argmax(count)]

    # Stacking the newly assigned points onto arr1 and arr2.
    arr1 = np.vstack((arr1, noclass[new_class == 1]))
    arr2 = np.vstack((arr2, noclass[new_class == 2]))

    # Making sure all points were processed and assigned a class.
    assert ((arr1.shape[0] + arr2.shape[0]) ==
            (arr.shape[0] + noclass.shape[0]))

    return arr1, arr2
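
# Usage sketch for fill_class: the arrays below are random stand-ins for
# real point coordinates, and k=10 is an illustrative assumption only.
#
# >>> import numpy as np
# >>> wood = np.random.rand(100, 3)      # points already assigned to class 1
# >>> leaf = np.random.rand(80, 3)       # points already assigned to class 2
# >>> unlabeled = np.random.rand(20, 3)  # points still without a class
# >>> wood, leaf = fill_class(wood, leaf, unlabeled, k=10)
# >>> wood.shape[0] + leaf.shape[0]      # every point now holds a class
# 200
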
def wlseparate_ref_voting(arr, knn_lst, class_file, n_classes=3):

    """
    Classifies a point cloud (arr) into two main classes, wood and leaf,
    using a voting scheme over multiple knn values.

    Although this function does not output a noclass category, it reports
    the mean classification probability of the winning class for each
    point, which can be used later to filter out low-confidence results.

    The final class selection is based on a voting scheme applied to an
    approach similar to wlseparate_ref. The function iterates over a
    series of knn values and applies the reference distance criteria to
    select wood and leaf classes. Each knn classification result is
    accumulated as one vote and, in the end, the majority class is set as
    the final class of each point. Class selection masks points according
    to their class mean distance to the reference classes: the closest
    reference class is assigned to each intermediate class.

    Parameters
    ----------
    arr : array
        Three-dimensional point cloud of a single tree to perform the
        wood-leaf separation. This should be an n-dimensional array
        (m x n) containing a set of coordinates (n) over a set of
        points (m).
    knn_lst : list
        List of knn values to use in the search to constitute local
        subsets of points around each point in 'arr'. A single knn value
        may also be passed; it will be wrapped in a list.
    class_file : pandas dataframe or str
        Dataframe or path to the reference classes file.
    n_classes : int
        Number of classes to use in the Gaussian Mixture Classification.

    Returns
    -------
    class_dict : dict
        Dictionary containing indices for all classes in class_ref.
        Classes are labeled according to the class names in class_file.
    count_dict : dict
        Dictionary containing vote counts for all classes in class_ref.
        Classes are labeled according to the class names in class_file.
    prob_dict : dict
        Dictionary containing probabilities for all classes in class_ref.
        Classes are labeled according to the class names in class_file.

    """

    # Making sure 'knn_lst' is of list type.
    if not isinstance(knn_lst, list):
        knn_lst = [knn_lst]

    # Initializing the voting accumulator and class probability arrays.
    vt = np.full([arr.shape[0], len(knn_lst)], -1, dtype=int)
    prob = np.full([arr.shape[0], len(knn_lst)], -1, dtype=float)

    # Generating a base set of indices around each point. This step uses
    # the largest value in knn_lst to make further searches, with smaller
    # values of knn, more efficient.
    idx_base = set_nbrs_knn(arr, arr, np.max(knn_lst), return_dist=False)

    # Reading in the class reference values from file.
    if isinstance(class_file, str):
        class_table = pd.read_csv(class_file)
    elif isinstance(class_file, pd.core.frame.DataFrame):
        class_table = class_file
    else:
        raise TypeError('class_file should be a pandas dataframe or a '
                        'file path.')
    class_ref = np.asarray(class_table.iloc[:, 1:]).astype(float)

    # Looping over values of knn in knn_lst.
    for i, k in enumerate(knn_lst):
        # Subsetting indices based on the initial knn search and the
        # current knn value (k).
        idx_1 = idx_base[:, :k + 1]

        # Calculating the geometric descriptors.
        gd_1 = knn_features(arr, idx_1)

        # Classifying the points based on the geometric descriptors and
        # rescaling the class means, feature-wise, to the [0, 1] range.
        classes_1, cm_1, proba_1 = classify(gd_1, n_classes)
        cm_1 = ((cm_1 - np.min(cm_1, axis=0)) /
                (np.max(cm_1, axis=0) - np.min(cm_1, axis=0)))

        # Selecting which classes represent entries in the classes
        # reference file.
        new_classes = class_select_ref(classes_1, cm_1, class_ref)

        # Appending the results to the voting and probability accumulators.
        vt[:, i] = new_classes.astype(int)
        prob[:, i] = np.max(proba_1, axis=1)

    # Performing the voting scheme (majority selection) for each point.
    # Initializing the final_* variables to store class number, vote count
    # and class probability.
    final_class = np.full([arr.shape[0]], -1, dtype=int)
    final_count = np.full([arr.shape[0]], -1, dtype=int)
    final_prob = np.full([arr.shape[0]], -1, dtype=float)

    # Iterating over the class votes (vt) and their probabilities (prob).
    for i, (v, p) in enumerate(zip(vt, prob)):
        # Counting the votes of each class.
        unique, count = np.unique(v, return_counts=True)
        # Storing the most voted class and the total number of votes it
        # received.
        final_class[i] = unique[np.argmax(count)]
        final_count[i] = count[np.argmax(count)]
        # Masking entries that voted for the most voted class.
        final_class_mask = v == final_class[i]
        # Averaging the classification probabilities over all votes of the
        # most voted class.
        final_prob[i] = np.mean(p[final_class_mask])

    # Selecting class labels from entries in class_ref.
    # Generating an indices array to help in future indexing.
    idx = np.arange(arr.shape[0], dtype=int)

    # Initializing dictionaries for the output variables.
    class_dict = {}
    count_dict = {}
    prob_dict = {}

    # Looping over each unique class in final_class.
    for c in np.unique(final_class).astype(int):
        # Selecting all indices of points classified as belonging to the
        # current class.
        class_idx = idx[final_class == c]
        # Selecting the vote counts of the most voted class for those
        # points.
        class_count = final_count[final_class == c]
        # Selecting the classification probabilities of the most voted
        # class for those points.
        class_prob = final_prob[final_class == c]
        # Assigning the current class indices, vote counts and
        # probabilities to the output dictionaries, keyed by the class
        # name selected from class_table.
        class_dict[class_table.iloc[c, :]['class']] = class_idx
        count_dict[class_table.iloc[c, :]['class']] = class_count
        prob_dict[class_table.iloc[c, :]['class']] = class_prob

    return class_dict, count_dict, prob_dict
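
# Usage sketch for wlseparate_ref_voting, under stated assumptions: the
# random array stands in for a real tree point cloud and the reference
# table is hypothetical. Per the code above, the first column holds the
# class name (read back through class_table's 'class' column) and the
# remaining columns hold reference feature values; their count must match
# the geometric descriptors produced by knn_features (three are shown
# purely for illustration).
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> arr = np.random.rand(10000, 3)
# >>> class_table = pd.DataFrame({'class': ['wood', 'leaf'],
# ...                             'f1': [0.9, 0.1],
# ...                             'f2': [0.8, 0.2],
# ...                             'f3': [0.1, 0.9]})
# >>> class_dict, count_dict, prob_dict = wlseparate_ref_voting(
# ...     arr, knn_lst=[40, 50, 80, 100], class_file=class_table)
# >>> wood_idx = class_dict['wood']    # indices of points voted as wood
# >>> wood_votes = count_dict['wood']  # votes received by the winning class
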
def wlseparate_abs(arr, knn, knn_downsample=1, n_classes=3):

    """
    Classifies a point cloud (arr) into two main classes, wood and leaf.

    The final class selection is based on the absolute value of the last
    geometric feature (see point_features module). Class selection will
    mask points with a feature value larger than a given threshold as
    wood and the remaining points as leaf.

    Parameters
    ----------
    arr : array
        Three-dimensional point cloud of a single tree to perform the
        wood-leaf separation. This should be an n-dimensional array
        (m x n) containing a set of coordinates (n) over a set of
        points (m).
    knn : int
        Number of nearest neighbors to search for in order to constitute
        the local subset of points around each point in 'arr'.
    knn_downsample : float
        Downsample factor (0, 1) for the knn parameter. If less than 1, a
        sample of size (knn * knn_downsample) will be selected from the
        nearest neighbors indices. This option aims to maintain the
        spatial representation of the local subsets of points while
        reducing overhead in memory and processing time.
    n_classes : int
        Number of classes to use in the Gaussian Mixture Classification.

    Returns
    -------
    class_indices : dict
        Dictionary containing indices for the wood and leaf classes.
    class_probability : dict
        Dictionary containing probabilities for the wood and leaf classes.

    """

    # Generating the indices array of the 'k' nearest neighbors (knn) for
    # all points in arr.
    idx_1 = set_nbrs_knn(arr, arr, knn, return_dist=False)

    # If the downsample fraction is lower than 1, applying downsampling on
    # the knn indices. The first index (the point itself) is kept and the
    # remaining neighbors are sampled without replacement.
    if knn_downsample < 1:
        n_samples = int(idx_1.shape[1] * knn_downsample)
        idx_f = np.zeros([idx_1.shape[0], n_samples + 1])
        idx_f[:, 0] = idx_1[:, 0]
        for i in range(idx_f.shape[0]):
            idx_f[i, 1:] = np.random.choice(idx_1[i, 1:], n_samples,
                                            replace=False)
        idx_1 = idx_f.astype(int)

    # Calculating the geometric descriptors.
    gd_1 = knn_features(arr, idx_1)

    # Classifying the points based on the geometric descriptors.
    classes_1, cm_1, proba_1 = classify(gd_1, n_classes)

    # Selecting which classes represent wood and leaf. Wood classes are
    # masked as True and leaf classes as False.
    mask_1 = class_select_abs(classes_1, cm_1, idx_1)

    # Generating the set of indices of entries in arr. This will be part
    # of the output.
    arr_ids = np.arange(0, arr.shape[0], 1, dtype=int)

    # Creating the output class indices and class probabilities
    # dictionaries. mask_1 represents wood points and ~mask_1 represents
    # leaf points. If either selection fails, the respective class is
    # returned empty.
    class_indices = {}
    class_probability = {}
    try:
        class_indices['wood'] = arr_ids[mask_1]
        class_probability['wood'] = np.max(proba_1, axis=1)[mask_1]
    except Exception:
        class_indices['wood'] = []
        class_probability['wood'] = []
    try:
        class_indices['leaf'] = arr_ids[~mask_1]
        class_probability['leaf'] = np.max(proba_1, axis=1)[~mask_1]
    except Exception:
        class_indices['leaf'] = []
        class_probability['leaf'] = []

    return class_indices, class_probability
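
# Usage sketch for wlseparate_abs, assuming an (m x 3) point cloud of a
# single tree; the random array and the parameter values are illustrative
# only.
#
# >>> import numpy as np
# >>> arr = np.random.rand(10000, 3)
# >>> class_indices, class_probability = wlseparate_abs(
# ...     arr, knn=100, knn_downsample=0.5)
# >>> wood_points = arr[class_indices['wood']]  # coordinates of wood points
# >>> leaf_points = arr[class_indices['leaf']]  # coordinates of leaf points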