Source code for tlseparation.utility.knnsearch

# Copyright (c) 2017-2019, Matheus Boni Vicari, TLSeparation Project
# All rights reserved.
#
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.


__author__ = "Matheus Boni Vicari"
__copyright__ = "Copyright 2017-2019, TLSeparation Project"
__credits__ = ["Matheus Boni Vicari"]
__license__ = "GPL3"
__version__ = "1.3.2"
__maintainer__ = "Matheus Boni Vicari"
__email__ = "matheus.boni.vicari@gmail.com"
__status__ = "Development"

import numpy as np
from sklearn.neighbors import NearestNeighbors


[docs]def set_nbrs_knn(arr, pts, knn, return_dist=True, block_size=100000):

    """
    Function to create a set of nearest neighbors indices and their respective
    distances for a set of points. This function uses a knn search and sets a
    limit size for a block of points to query. This makes it less efficient in
    terms of processing time, but avoids running out of memory in cases of
    very dense/large arrays/queries.

    Parameters
    ----------
    arr : array
        N-dimensional array to perform the knn search on.
    pts : array
        N-dimensional array to search for on the knn search.
    knn : int
        Number of nearest neighbors to search for.
    return_dist : boolean
        Option to return or not the distances of each neighbor.
    block_size : int
        Limit of points to query. The variable 'pts' will be subdivided in n
        blocks of size block_size to perform query.

    Returns
    -------
    indices : array
        Set of neighbors indices from 'arr' for each entry in 'pts'.
    distance : array
        Distances from each neighbor to each central point in 'pts'.

    """

    # Making sure knn is of type int.
    knn = int(knn)

    # Initiating the nearest neighbors search and fitting it to the input
    # array.
    nbrs = NearestNeighbors(n_neighbors=knn, metric='euclidean',
                            algorithm='kd_tree', leaf_size=15,
                            n_jobs=-1).fit(arr)

    # Making sure block_size is limited by at most the number of points in
    # arr.
    if block_size > pts.shape[0]:
        block_size = pts.shape[0]

    # Creating block of ids.
    ids = np.arange(pts.shape[0])
    ids = np.array_split(ids, int(pts.shape[0] / block_size))

    # Initializing variables to store distance and indices.
    if return_dist is True:
        distance = np.zeros([pts.shape[0], knn])
    indices = np.zeros([pts.shape[0], knn])

    # Checking if the function should return the distance as well or only the
    # neighborhood indices.
    if return_dist is True:
        # Obtaining the neighborhood indices and their respective distances
        # from the center point by looping over blocks of ids.
        for i in ids:
            nbrs_dist, nbrs_ids = nbrs.kneighbors(pts[i])
            distance[i] = nbrs_dist
            indices[i] = nbrs_ids
        return distance, indices

    elif return_dist is False:
        # Obtaining the neighborhood indices only  by looping over blocks of
        # ids.
        for i in ids:
            nbrs_ids = nbrs.kneighbors(pts[i], return_distance=False)
            indices[i] = nbrs_ids
        return indices


[docs]def set_nbrs_rad(arr, pts, rad, return_dist=True, block_size=100000):

    """
    Function to create a set of nearest neighbors indices and their respective
    distances for a set of points. This function uses a radius search and sets
    a limit size for a block of points to query. This makes it less efficient
    in terms of processing time, but avoids running out of memory in cases of
    very dense/large arrays/queries.

    Parameters
    ----------
    arr : array
        N-dimensional array to perform the radius search on.
    pts : array
        N-dimensional array to search for on the knn search.
    rad : float
        Radius of the NearestNeighbors search.
    return_dist : boolean
        Option to return or not the distances of each neighbor.
    block_size : int
        Limit of points to query. The variable 'pts' will be subdivided in n
        blocks of size block_size to perform query.

    Returns
    -------
    indices : array
        Set of neighbors indices from 'arr' for each entry in 'pts'.
    distance : array
        Distances from each neighbor to each central point in 'pts'.

    """

    # Making sure block_size is limited by at most the number of points in
    # arr.
    if block_size > pts.shape[0]:
        block_size = pts.shape[0]

    # Initiating the nearest neighbors search and fitting it to the input
    # array.
    nbrs = NearestNeighbors(radius=rad, metric='euclidean',
                            algorithm='kd_tree', leaf_size=15,
                            n_jobs=-1).fit(arr)

    # Creating block of ids.
    ids = np.arange(pts.shape[0])
    ids = np.array_split(ids, int(pts.shape[0] / block_size))

    # Initializing variables to store distance and indices.
    if return_dist is True:
        distance = []
    indices = []

    # Checking if the function should return the distance as well or only the
    # neighborhood indices.
    if return_dist is True:
        # Obtaining the neighborhood indices and their respective distances
        # from the center point by looping over blocks of ids.
        for i in ids:
            nbrs_dist, nbrs_ids = nbrs.radius_neighbors(pts[i])
            for j, k in enumerate(i):
                distance.append(nbrs_dist[j])
                indices.append(nbrs_ids[j])
        return distance, indices

    elif return_dist is False:
        # Obtaining the neighborhood indices only  by looping over blocks of
        # ids.
        for i in ids:
            nbrs_ids = nbrs.radius_neighbors(pts[i], return_distance=False)
            for j, k in enumerate(i):
                indices.append(nbrs_ids[j])
        return indices


[docs]def subset_nbrs(distance, indices, new_knn, block_size=100000):

    """
    Performs a subseting of points from the results of a nearest neighbors
    search.
    This function assumes that the first index/distance in each row represents
    the center point of the neighborhood represented by said rows.

    Parameters
    ----------
    distance : array
        Distances from each neighbor to each central point in 'pts'.
    indices : array
        Set of neighbors indices from 'arr' for each entry in 'pts'.
    new_knn : array
        Number of neighbors to select from the initial number of neighbors.
    block_size : int
        Limit of points to query. The variables 'distance' and 'indices' will
        be subdivided in n blocks of size block_size to perform query.

    Returns
    -------
    distance : array
        Subset of distances from each neighbor 'indices'.
    indices : array
        Subset of neighbors indices from 'indices'.

    """

    # Making sure block_size is limited by at most the number of points in
    # arr.
    if block_size > distance.shape[0]:
        block_size = distance.shape[0]

    # Creating block of ids.
    ids = np.arange(distance.shape[0])
    ids = np.array_split(ids, int(distance.shape[0] / block_size))

    # Initializing new_distance and new_indices variables.
    new_distance = []
    new_indices = []

    # Processing all blocks of indices in ids.
    for id_ in ids:

        # Looping over each sample in distance and indices.
        for d, i in zip(distance[id_], indices[id_]):
            # Checks if new knn values are smaller than current distance and
            # indices rows. This avoids errors of trying to select a number of
            # columns larger than the available columns.
            if distance.shape[1] >= new_knn:
                new_distance.append(d[:new_knn+1])
                new_indices.append(i[:new_knn+1].astype(int))
            else:
                new_distance.append(d)
                new_indices.append(int(i))

    # Returning new_distance and new_indices as arrays.
    return np.asarray(new_distance), np.asarray(new_indices)