# Source code for tlseparation.utility.filtering

# Copyright (c) 2017-2019, Matheus Boni Vicari, TLSeparation Project
# All rights reserved.
#
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.


__author__ = "Matheus Boni Vicari"
__copyright__ = "Copyright 2017-2019, TLSeparation Project"
__credits__ = ["Matheus Boni Vicari"]
__license__ = "GPL3"
__version__ = "1.3.2"
__maintainer__ = "Matheus Boni Vicari"
__email__ = "matheus.boni.vicari@gmail.com"
__status__ = "Development"

import numpy as np
from knnsearch import (set_nbrs_knn, set_nbrs_rad)
from data_utils import (get_diff, remove_duplicates)
from shortpath import (array_to_graph, extract_path_info)
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from ..classification.point_features import (svd_evals, knn_features,
                                             curvature)

def cluster_size(arr, labels, min_size):

    """
    Filters a set of connected components by maximum size on any dimension.

    Parameters
    ----------
    arr : array
        Three-dimensional (m x n) array of a point cloud, where the
        coordinates are represented in the columns (n) and the points are
        represented in the rows (m).
    labels : array
        1D array with cluster labels assigned to each point from the input
        point cloud.
    min_size : int/float
        Minimum size, on any dimension, for a cluster to be set as
        valid (True).

    Returns
    -------
    filter_mask : array
        1D mask array setting True for valid points in 'arr' and False
        otherwise.

    """

    # Initializes boolean mask (avoids the int/bool round-trip and does not
    # shadow this function's name with a local variable).
    filter_mask = np.zeros(labels.shape[0], dtype=bool)

    # Loops over each cluster label.
    for label in np.unique(labels):
        # Masks indices of current cluster.
        mask = labels == label
        # Selects points from current cluster.
        cluster_points = arr[mask]
        # Calculates the extent (max - min) of the current cluster in all
        # dimensions. Cast to float so integer inputs subtract safely.
        extent = (np.max(cluster_points, axis=0).astype(float) -
                  np.min(cluster_points, axis=0))
        # Checks if cluster has at least one dimension with size larger than
        # min_size and, if so, marks all points of the current cluster as
        # valid.
        if np.max(extent) > min_size:
            filter_mask[mask] = True

    return filter_mask
def cluster_features(arr, labels, feature_threshold, min_pts=10):

    """
    Filters a set of connected components by a geometric feature threshold.
    This feature (n 2 in the separation methodology) is used to describe
    elongated shapes. If the shape of the cluster is elongated enough (i.e.
    feature value larger than threshold) the points belonging to this cluster
    are masked as True.

    Parameters
    ----------
    arr : array
        Three-dimensional (m x n) array of a point cloud, where the
        coordinates are represented in the columns (n) and the points are
        represented in the rows (m).
    labels : array
        1D array with cluster labels assigned to each point from the input
        point cloud.
    feature_threshold : float
        Minimum feature value for the cluster to be set as elongated (True).
    min_pts : int
        Minimum number of points for the cluster to be set as valid (True).

    Returns
    -------
    filter_mask : array
        1D mask array setting True for valid points in 'arr' and False
        otherwise.

    """

    # Initializes array of eigenvalue ratios. Clusters that are skipped
    # below keep ratios of 0 and therefore a feature value of 0.
    evals_ratio = np.zeros([arr.shape[0], 3])

    # Loops over each cluster label.
    for label in np.unique(labels):
        # Masks indices of current cluster and selects its points.
        mask = labels == label
        cluster_points = arr[mask]
        # At least 3 points are required to produce 3 eigenvalues; min_pts
        # further restricts which clusters are considered. This single
        # check replaces the original redundant nested '>= 3' and
        # '>= min_pts' tests.
        if cluster_points.shape[0] >= max(min_pts, 3):
            # Calculating centroid coordinates of points in
            # 'cluster_points'.
            centroid = np.average(cluster_points, axis=0)
            # Running SVD on centered points from 'cluster_points'.
            _, evals, _ = np.linalg.svd(cluster_points - centroid,
                                        full_matrices=False)
            # Calculating eigenvalues ratio and assigning to the
            # respective indices of current points in evals_ratio.
            evals_ratio[mask] = evals / np.sum(evals)

    # Calculating geometric feature (elongation) and masking feature values
    # larger than or equal to threshold as True.
    feature = evals_ratio[:, 0] - evals_ratio[:, 1]

    return feature >= feature_threshold
def feature_filter(arr, feature_id, threshold, knn):

    """
    Filters a point cloud based on a given feature threshold. Only points
    with selected feature values higher than threshold are kept as valid.

    Parameters
    ----------
    arr : array
        Three-dimensional (m x n) array of a point cloud, where the
        coordinates are represented in the columns (n) and the points are
        represented in the rows (m).
    feature_id : int
        Column index of feature selected as criteria to filter. Column
        indices follow Python notation [0 - (n_columns - 1)].
    threshold : float
        Minimum feature value for valid points.
    knn : int
        Number of neighbors to select around each point. Used to describe
        local point arrangement.

    Returns
    -------
    mask_feature : numpy.ndarray
        Boolean mask with valid points entries set as True.

    """

    # Neighborhood indices (knn nearest points) for every point in 'arr'.
    neighbor_ids = set_nbrs_knn(arr, arr, knn, False)

    # Geometric features describing each point's local neighborhood.
    point_features = knn_features(arr, neighbor_ids)

    # Valid points are those whose selected feature reaches the threshold.
    mask_feature = point_features[:, feature_id] >= threshold

    return mask_feature
def plane_filter(arr, rad, threshold):

    """
    Filters a point cloud based on its points planicity. Removes points that
    are part of a neighborhood with planar spatial arrangement (low
    curvature).

    Parameters
    ----------
    arr : array
        Three-dimensional (m x n) array of a point cloud, where the
        coordinates are represented in the columns (n) and the points are
        represented in the rows (m).
    rad : float
        Search radius distance around each point. Used to describe
        local point arrangement.
    threshold : float
        Minimum curvature value for valid points.

    Returns
    -------
    mask_plane : numpy.ndarray
        Boolean mask with valid points entries set as True.

    """

    # Radius-based neighborhood search around each point in 'arr'.
    neighbor_ids = set_nbrs_rad(arr, arr, rad, False)

    # Curvature of each point's neighborhood; points in (near) planar
    # neighborhoods have low curvature and are filtered out.
    point_curvature = curvature(arr, neighbor_ids)

    mask_plane = point_curvature >= threshold

    return mask_plane
def cluster_filter(arr, max_dist, eval_threshold):

    """
    Applies a cluster filter to a point cloud 'arr'. This filter aims to
    remove small, isolated, clusters of points.

    Parameters
    ----------
    arr : array
        Point cloud of shape n points x m dimensions to be filtered.
    max_dist : float
        Maximum distance between points to considered part of the same
        cluster.
    eval_threshold : float
        Minimum value for largest eigenvalue for a valid cluster. This value
        is an indication of cluster shape, in which the higher the eigenvalue,
        more elongated is the cluster. Points from clusters that have
        eigenvalue smaller then eval_threshold are filtered out.

    Returns
    -------
    mask : array
        Boolean mask of filtered points. Entries are set as True if belonging
        to a valid cluster and False otherwise.

    """

    # Initializing and fitting DBSCAN clustering to input array 'arr'.
    # 'max_dist' is DBSCAN's eps parameter.
    clusterer = DBSCAN(eps=max_dist).fit(arr)
    labels = clusterer.labels_

    # Initializing array of final eigenvalues for each cluster.
    final_evals = np.zeros([labels.shape[0], 3])

    # Looping over each unique cluster label.
    for L in np.unique(labels):
        # Obtaining indices for all entries in 'arr' that are part of current
        # cluster.
        ids = np.where(labels == L)[0]
        # Skipping noise (label == -1) and clusters with fewer than 3 points
        # (too few for 3 eigenvalues).
        # BUGFIX: the original condition '(L != -1) & len(ids) >= 3' parsed
        # as '((L != -1) & len(ids)) >= 3' because '&' binds tighter than
        # '>=' -- the left side is at most 1, so the branch never ran and
        # no eigenvalues were ever computed.
        if L != -1 and len(ids) >= 3:
            # Calculating eigenvalues for current cluster and assigning them
            # to the indices of all its points in final_evals.
            final_evals[ids] = svd_evals(arr[ids])

    # Calculating eigenvalues ratio. This standardizes all rows (eigenvalues
    # of each point) to an interval between 0 and 1, with each row summing
    # to 1. Rows that were never assigned (noise / tiny clusters) stay zero;
    # the guarded divide avoids 0/0 NaN warnings for those rows.
    sums = final_evals.sum(axis=1, keepdims=True)
    ratio = np.divide(final_evals, sums, out=np.zeros_like(final_evals),
                      where=sums != 0)

    # Masking points by largest eigenvalue ratio (column 0).
    return ratio[:, 0] >= eval_threshold
def radius_filter(arr, radius, min_points):

    """
    Applies a radius search filter, which removes isolated points/clusters of
    points.

    Parameters
    ----------
    arr : array
        Point cloud of shape n points x m dimensions to be filtered.
    radius : float
        Search radius around each point to form a neighborhood.
    min_points : int
        Minimum number of points in a neighborhood for it to be considered
        valid, i.e not filtered out.

    Returns
    -------
    mask : array
        Array of bools masking valid points as True and "noise" points as
        False.

    """

    # Neighborhood indices for every point in 'arr' within 'radius'.
    neighborhoods = set_nbrs_rad(arr, arr, radius, return_dist=False)

    # Counting the size of each point's neighborhood in one pass.
    n_points = np.array([nbhd.shape[0] for nbhd in neighborhoods])

    # Points with enough neighbors are kept as valid.
    return n_points >= min_points
def continuity_filter(wood, leaf, rad=0.05):

    """
    Function to apply a continuity filter to a point cloud that contains
    gaps defined as points from a second point cloud.
    This function works assuming that the continuous variable is the
    wood portion of a tree point cloud and the gaps in it are empty space
    or missclassified leaf data. In this sense, this function tries to
    correct gaps where leaf points are present.

    Parameters
    ----------
    wood : array
        Wood point cloud to be filtered.
    leaf : array
        Leaf point cloud, with points that may be causing discontinuities in
        the wood point cloud.
    rad : float
        Radius to search for neighboring points in the iterative process.

    Returns
    -------
    wood : array
        Filtered wood point cloud.
    not_wood : array
        Remaining point clouds after the filtering.

    """

    # Stacking wood and leaf arrays. Rows 0..wood.shape[0]-1 are wood,
    # the rest are leaf.
    arr = np.vstack((wood, leaf))

    # Getting root index (base_id) from point cloud 'arr' (lowest z point).
    base_id = np.argmin(arr[:, 2])

    # Calculating shortest path graph over sampled array.
    # NOTE(review): the numeric arguments (3, 100, 0.05, 0.02, 0.5) are
    # undocumented graph-construction parameters of array_to_graph --
    # their semantics cannot be confirmed from this file.
    G = array_to_graph(arr, base_id, 3, 100, 0.05, 0.02, 0.5)
    node_ids, dist = extract_path_info(G, base_id, return_path=False)
    node_ids = np.array(node_ids)

    # Obtaining wood point cloud indices.
    # NOTE(review): '<=' also admits index wood.shape[0], which is the
    # first *leaf* row of 'arr' -- looks like an off-by-one; '<' seems
    # intended. TODO confirm against upstream tlseparation sources.
    wood_id = node_ids[node_ids <= wood.shape[0]]

    # Generating nearest neighbors search for the point cloud restricted to
    # the graph nodes (arr[node_ids]).
    # NOTE(review): indices returned by this searcher refer to positions
    # within arr[node_ids], yet below they are used to index 'arr' and
    # 'dist' directly. This is only consistent if node_ids covers all
    # points in original order -- verify with extract_path_info's contract.
    nbrs = NearestNeighbors(algorithm='kd_tree', leaf_size=10,
                            n_jobs=-1).fit(arr[node_ids])

    # Converting dist variable to array, as it is originaly a list.
    dist = np.asarray(dist)

    # Selecting points and accummulated distance for all wood points in arr.
    gp = arr[wood_id]
    d = dist[wood_id]

    # Preparing control variables to iterate over. idbase will be all initial
    # wood ids and pts all initial wood points. These variables are the ones
    # to use in search of possible missclassified neighbors.
    idbase = wood_id
    pts = gp

    # Setting threshold variables for the iterative process: iterate until
    # fewer than e_threshold new wood points are found in a pass.
    e = 9999999
    e_threshold = 3

    # Iterating until threshold is met.
    while e > e_threshold:
        # Obtaining the neighbor indices of current set of points (pts).
        idx2 = nbrs.radius_neighbors(pts, radius=rad,
                                     return_distance=False)

        # Initializing temporary variable id1.
        id1 = []
        # Looping over nn search indices and comparing their respective
        # distances to center point distance. If nearest neighbor distance
        # (to point cloud base) is smaller than center point distance, then
        # ith point is also wood (i.e. it lies on the path towards the base).
        for i in range(idx2.shape[0]):
            for i_ in idx2[i]:
                if dist[i_] <= (d[i]):
                    id1.append(i_)

        # Uniquifying id1.
        id1 = np.unique(id1)

        # Comparing original idbase to new wood ids (id1); keep only ids not
        # seen before for the next iteration.
        comp = np.in1d(id1, idbase)
        diff = id1[np.where(~comp)[0]]
        idbase = np.unique(np.hstack((idbase, id1)))

        # Passing new wood points to pts and recalculating e value
        # (number of newly found wood points).
        pts = arr[diff]
        e = pts.shape[0]

        # Passing accummulated distances from new points to d.
        d = dist[diff]

        # Stacking new points to accumulated wood points and removing
        # duplicates.
        gp = np.vstack((gp, pts))
        gp = remove_duplicates(gp)

    # Removing duplicates from final wood points and obtaining not_wood
    # points from the difference between final wood points and full point
    # cloud.
    wood = remove_duplicates(gp)
    not_wood = get_diff(wood, arr)

    return wood, not_wood
def array_majority(arr_1, arr_2, **kwargs):

    """
    Applies majority filter on two arrays.

    Parameters
    ----------
    arr_1 : array
        n-dimensional array of points to filter.
    arr_2 : array
        n-dimensional array of points to filter.
    **knn : int or float
        Number neighbors to select around each point in arr in order to apply
        the majority criteria.
    **rad : int or float
        Search radius arount each point in arr to select neighbors in order
        to apply the majority criteria.

    Returns
    -------
    c_maj_1 : array
        Boolean mask of filtered entries of same class as input 'arr_1'.
    c_maj_2 : array
        Boolean mask of filtered entries of same class as input 'arr_2'.

    Raises
    ------
    AssertionError:
        Raised if neither 'knn' or 'rad' arguments are passed with valid
        values (int or float).

    """

    # Asserting input arguments are valid.
    assert ('knn' in kwargs) or ('rad' in kwargs), \
        'Please input a value for either "knn" or "rad".'
    if 'knn' in kwargs:
        assert isinstance(kwargs['knn'], (int, float)), \
            '"knn" variable must be of type int or float.'
    elif 'rad' in kwargs:
        assert isinstance(kwargs['rad'], (int, float)), \
            '"rad" variable must be of type int or float.'

    # Stacking the arrays from both classes to generate a combined array.
    arr = np.vstack((arr_1, arr_2))

    # Generating the indices for the local subsets of points around all
    # points in the combined array. Function used is based upon the argument
    # passed.
    if 'knn' in kwargs:
        indices = set_nbrs_knn(arr, arr, kwargs['knn'], return_dist=False)
    else:
        indices = set_nbrs_rad(arr, arr, kwargs['rad'], return_dist=False)

    # Making sure indices has type int.
    indices = indices.astype(int)

    # Class labels: 1 for points from arr_1, 2 for points from arr_2.
    # BUGFIX: np.int was deprecated in NumPy 1.20 and later removed; the
    # builtin int is the documented replacement.
    classes = np.hstack((np.full(arr_1.shape[0], 1, dtype=int),
                         np.full(arr_2.shape[0], 2, dtype=int)))

    # Allocating output variable.
    c_maj = np.zeros(classes.shape)

    # Selecting subset of classes based on the neighborhood expressed by
    # indices.
    class_ = classes[indices]

    # Looping over all points in indices and assigning each point the
    # majority class of its neighborhood.
    for i in range(len(indices)):
        # Counting the number of occurrences of each value in the ith
        # instance of class_.
        unique, count = np.unique(class_[i, :], return_counts=True)
        c_maj[i] = unique[np.argmax(count)]

    return c_maj == 1, c_maj == 2
def class_filter(arr_1, arr_2, target, **kwargs):

    """
    Function to apply class filter on an array based on the combination of
    classes from both arrays (arr_1 and arr_2). Which array gets filtered
    is defined by 'target'.

    Parameters
    ----------
    arr_1 : array
        n-dimensional array of points to filter.
    arr_2 : array
        n-dimensional array of points to filter.
    target : int or float
        Number of the input array to filter. Valid values are 0 or 1.
    **knn : int or float
        Number neighbors to select around each point in arr in order to apply
        the majority criteria.
    **rad : int or float
        Search radius arount each point in arr to select neighbors in order
        to apply the majority criteria.

    Returns
    -------
    c_maj_1 : array
        Boolean mask of filtered entries of same class as input 'arr_1'.
    c_maj_2 : array
        Boolean mask of filtered entries of same class as input 'arr_2'.

    Raises
    ------
    AssertionError:
        Raised if neither 'knn' or 'rad' arguments are passed with valid
        values (int or float).
    AssertionError:
        Raised if 'target' variable is not an int or float with value 0 or 1.

    """

    # Asserting input arguments are valid. Messages are single-line strings
    # (the original backslash-continued literals embedded long whitespace
    # runs in the error text).
    assert ('knn' in kwargs) or ('rad' in kwargs), \
        'Please input a value for either "knn" or "rad".'
    if 'knn' in kwargs:
        assert isinstance(kwargs['knn'], (int, float)), \
            '"knn" variable must be of type int or float.'
    elif 'rad' in kwargs:
        assert isinstance(kwargs['rad'], (int, float)), \
            '"rad" variable must be of type int or float.'
    assert isinstance(target, (int, float)), \
        '"target" variable must be of type int or float.'
    assert (target == 0) or (target == 1), \
        '"target" variable must be either 0 or 1.'

    # Stacking the arrays from both classes to generate a combined array.
    arr = np.vstack((arr_1, arr_2))

    # Class labels: 0 for points from arr_1, 1 for points from arr_2.
    # BUGFIX: np.int was deprecated in NumPy 1.20 and later removed; the
    # builtin int is the documented replacement.
    classes = np.hstack((np.full(arr_1.shape[0], 0, dtype=int),
                         np.full(arr_2.shape[0], 1, dtype=int)))

    # Generating the indices for the local subsets of points around all
    # points in the combined array. Function used is based upon the argument
    # passed.
    if 'knn' in kwargs:
        indices = set_nbrs_knn(arr, arr, kwargs['knn'], return_dist=False)
    else:
        indices = set_nbrs_rad(arr, arr, kwargs['rad'], return_dist=False)

    # Making sure indices has type int.
    indices = indices.astype(int)

    # Output starts as a copy of the original classes; only points of the
    # target class may be reassigned below.
    c_maj = classes.copy()

    # Selecting subset of classes based on the neighborhood expressed by
    # indices.
    class_ = classes[indices]

    # Looping over the target points and reassigning each one to the
    # majority class of its neighborhood.
    for i in np.where(classes == target)[0]:
        # Counting the number of occurrences of each value in the ith
        # instance of class_.
        count = np.bincount(class_[i, :])
        c_maj[i] = count.argmax()

    return c_maj == 0, c_maj == 1
def dist_majority(arr_1, arr_2, **kwargs):

    """
    Applies distance-weighted majority filter on two arrays.

    Parameters
    ----------
    arr_1 : array
        n-dimensional array of points to filter.
    arr_2 : array
        n-dimensional array of points to filter.
    **knn : int or float
        Number neighbors to select around each point in arr in order to apply
        the majority criteria.
    **rad : int or float
        Search radius arount each point in arr to select neighbors in order
        to apply the majority criteria.

    Returns
    -------
    c_maj_1 : array
        Boolean mask of filtered entries of same class as input 'arr_1'.
    c_maj_2 : array
        Boolean mask of filtered entries of same class as input 'arr_2'.

    Raises
    ------
    AssertionError:
        Raised if neither 'knn' or 'rad' arguments are passed with valid
        values (int or float).

    """

    # Asserting input arguments are valid.
    assert ('knn' in kwargs) or ('rad' in kwargs), \
        'Please input a value for either "knn" or "rad".'
    if 'knn' in kwargs:
        assert isinstance(kwargs['knn'], (int, float)), \
            '"knn" variable must be of type int or float.'
    elif 'rad' in kwargs:
        assert isinstance(kwargs['rad'], (int, float)), \
            '"rad" variable must be of type int or float.'

    # Stacking the arrays from both classes to generate a combined array.
    arr = np.vstack((arr_1, arr_2))

    # Generating distances and indices for the local subsets of points
    # around all points in the combined array. Function used is based upon
    # the argument passed.
    if 'knn' in kwargs:
        dist, indices = set_nbrs_knn(arr, arr, kwargs['knn'])
    else:
        dist, indices = set_nbrs_rad(arr, arr, kwargs['rad'])

    # Making sure indices has type int.
    indices = indices.astype(int)

    # Class labels: 1 for points from arr_1, 2 for points from arr_2.
    # BUGFIX: np.int was deprecated in NumPy 1.20 and later removed; the
    # builtin int is the documented replacement.
    classes = np.hstack((np.full(arr_1.shape[0], 1, dtype=int),
                         np.full(arr_2.shape[0], 2, dtype=int)))

    # Allocating output variable.
    c_maj = np.zeros(classes.shape)

    # Selecting subset of classes based on the neighborhood expressed by
    # indices.
    class_ = classes[indices]

    # Looping over all points in indices.
    for i in range(len(indices)):
        # Obtaining classes from indices i.
        c = class_[i, :]
        # Calculating accumulated distance for each class; the class with
        # the highest accumulated distance wins (ties go to class 1).
        d1 = np.sum(dist[i][c == 1])
        d2 = np.sum(dist[i][c == 2])
        c_maj[i] = 1 if d1 >= d2 else 2

    return c_maj == 1, c_maj == 2