Source code for tlseparation.utility.knnsearch

# Copyright (c) 2017-2019, Matheus Boni Vicari, TLSeparation Project
# All rights reserved.
#
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.


__author__ = "Matheus Boni Vicari"
__copyright__ = "Copyright 2017-2019, TLSeparation Project"
__credits__ = ["Matheus Boni Vicari"]
__license__ = "GPL3"
__version__ = "1.3.2"
__maintainer__ = "Matheus Boni Vicari"
__email__ = "matheus.boni.vicari@gmail.com"
__status__ = "Development"

import numpy as np
from sklearn.neighbors import NearestNeighbors


[docs]def set_nbrs_knn(arr, pts, knn, return_dist=True, block_size=100000): """ Function to create a set of nearest neighbors indices and their respective distances for a set of points. This function uses a knn search and sets a limit size for a block of points to query. This makes it less efficient in terms of processing time, but avoids running out of memory in cases of very dense/large arrays/queries. Parameters ---------- arr : array N-dimensional array to perform the knn search on. pts : array N-dimensional array to search for on the knn search. knn : int Number of nearest neighbors to search for. return_dist : boolean Option to return or not the distances of each neighbor. block_size : int Limit of points to query. The variable 'pts' will be subdivided in n blocks of size block_size to perform query. Returns ------- indices : array Set of neighbors indices from 'arr' for each entry in 'pts'. distance : array Distances from each neighbor to each central point in 'pts'. """ # Making sure knn is of type int. knn = int(knn) # Initiating the nearest neighbors search and fitting it to the input # array. nbrs = NearestNeighbors(n_neighbors=knn, metric='euclidean', algorithm='kd_tree', leaf_size=15, n_jobs=-1).fit(arr) # Making sure block_size is limited by at most the number of points in # arr. if block_size > pts.shape[0]: block_size = pts.shape[0] # Creating block of ids. ids = np.arange(pts.shape[0]) ids = np.array_split(ids, int(pts.shape[0] / block_size)) # Initializing variables to store distance and indices. if return_dist is True: distance = np.zeros([pts.shape[0], knn]) indices = np.zeros([pts.shape[0], knn]) # Checking if the function should return the distance as well or only the # neighborhood indices. if return_dist is True: # Obtaining the neighborhood indices and their respective distances # from the center point by looping over blocks of ids. for i in ids: nbrs_dist, nbrs_ids = nbrs.kneighbors(pts[i]) distance[i] = nbrs_dist indices[i] = nbrs_ids return distance, indices elif return_dist is False: # Obtaining the neighborhood indices only by looping over blocks of # ids. for i in ids: nbrs_ids = nbrs.kneighbors(pts[i], return_distance=False) indices[i] = nbrs_ids return indices
[docs]def set_nbrs_rad(arr, pts, rad, return_dist=True, block_size=100000): """ Function to create a set of nearest neighbors indices and their respective distances for a set of points. This function uses a radius search and sets a limit size for a block of points to query. This makes it less efficient in terms of processing time, but avoids running out of memory in cases of very dense/large arrays/queries. Parameters ---------- arr : array N-dimensional array to perform the radius search on. pts : array N-dimensional array to search for on the knn search. rad : float Radius of the NearestNeighbors search. return_dist : boolean Option to return or not the distances of each neighbor. block_size : int Limit of points to query. The variable 'pts' will be subdivided in n blocks of size block_size to perform query. Returns ------- indices : array Set of neighbors indices from 'arr' for each entry in 'pts'. distance : array Distances from each neighbor to each central point in 'pts'. """ # Making sure block_size is limited by at most the number of points in # arr. if block_size > pts.shape[0]: block_size = pts.shape[0] # Initiating the nearest neighbors search and fitting it to the input # array. nbrs = NearestNeighbors(radius=rad, metric='euclidean', algorithm='kd_tree', leaf_size=15, n_jobs=-1).fit(arr) # Creating block of ids. ids = np.arange(pts.shape[0]) ids = np.array_split(ids, int(pts.shape[0] / block_size)) # Initializing variables to store distance and indices. if return_dist is True: distance = [] indices = [] # Checking if the function should return the distance as well or only the # neighborhood indices. if return_dist is True: # Obtaining the neighborhood indices and their respective distances # from the center point by looping over blocks of ids. for i in ids: nbrs_dist, nbrs_ids = nbrs.radius_neighbors(pts[i]) for j, k in enumerate(i): distance.append(nbrs_dist[j]) indices.append(nbrs_ids[j]) return distance, indices elif return_dist is False: # Obtaining the neighborhood indices only by looping over blocks of # ids. for i in ids: nbrs_ids = nbrs.radius_neighbors(pts[i], return_distance=False) for j, k in enumerate(i): indices.append(nbrs_ids[j]) return indices
[docs]def subset_nbrs(distance, indices, new_knn, block_size=100000): """ Performs a subseting of points from the results of a nearest neighbors search. This function assumes that the first index/distance in each row represents the center point of the neighborhood represented by said rows. Parameters ---------- distance : array Distances from each neighbor to each central point in 'pts'. indices : array Set of neighbors indices from 'arr' for each entry in 'pts'. new_knn : array Number of neighbors to select from the initial number of neighbors. block_size : int Limit of points to query. The variables 'distance' and 'indices' will be subdivided in n blocks of size block_size to perform query. Returns ------- distance : array Subset of distances from each neighbor 'indices'. indices : array Subset of neighbors indices from 'indices'. """ # Making sure block_size is limited by at most the number of points in # arr. if block_size > distance.shape[0]: block_size = distance.shape[0] # Creating block of ids. ids = np.arange(distance.shape[0]) ids = np.array_split(ids, int(distance.shape[0] / block_size)) # Initializing new_distance and new_indices variables. new_distance = [] new_indices = [] # Processing all blocks of indices in ids. for id_ in ids: # Looping over each sample in distance and indices. for d, i in zip(distance[id_], indices[id_]): # Checks if new knn values are smaller than current distance and # indices rows. This avoids errors of trying to select a number of # columns larger than the available columns. if distance.shape[1] >= new_knn: new_distance.append(d[:new_knn+1]) new_indices.append(i[:new_knn+1].astype(int)) else: new_distance.append(d) new_indices.append(int(i)) # Returning new_distance and new_indices as arrays. return np.asarray(new_distance), np.asarray(new_indices)