Source code for tlseparation.utility.data_utils

# Copyright (c) 2017-2019, Matheus Boni Vicari, TLSeparation Project
# All rights reserved.
#
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.


# Module-level metadata: authorship, licensing, version and contact details.
__author__ = "Matheus Boni Vicari"
__copyright__ = "Copyright 2017-2019, TLSeparation Project"
__credits__ = ["Matheus Boni Vicari"]
__license__ = "GPL3"
__version__ = "1.3.2"
__maintainer__ = "Matheus Boni Vicari"
__email__ = "matheus.boni.vicari@gmail.com"
__status__ = "Development"

import numpy as np
import pandas as pd
from knnsearch import set_nbrs_knn


def get_diff(arr1, arr2):
    """
    Returns the rows that are not shared between two arrays.

    Both arrays are stacked on top of each other and every row that occurs
    more than once in the stack is discarded. As a consequence, rows that
    are repeated inside a single input array are dropped as well.

    Parameters
    ----------
    arr1 : array
        Two-dimensional array (m1 x n) of points to compare.
    arr2 : array
        Two-dimensional array (m2 x n) of points to compare. Must have the
        same number of columns as 'arr1'.

    Returns
    -------
    arr : array
        Rows that occur exactly once in the stack of 'arr1' and 'arr2'.

    Raises
    ------
    AssertionError
        If 'arr1' and 'arr2' do not have the same number of columns.

    """

    # Stacking only makes sense when the column counts agree.
    n_cols_1, n_cols_2 = arr1.shape[1], arr2.shape[1]
    assert n_cols_1 == n_cols_2

    # One table holding the rows of arr1 followed by the rows of arr2.
    stacked = pd.DataFrame(np.concatenate((arr1, arr2), axis=0))

    # keep=False discards every copy of a repeated row, leaving only the
    # rows with a single occurrence in the stack.
    return np.asarray(stacked.drop_duplicates(keep=False))
def remove_duplicates(arr, return_ids=False):
    """
    Removes duplicated rows from an array of points.

    Only the first three columns of 'arr' (treated as x, y and z) are read;
    they are used both to detect duplicates and to build the output.

    Parameters
    ----------
    arr : array
        Two-dimensional array with at least three columns, one row per
        observation.
    return_ids : bool
        If True, return the indices of the duplicated rows instead of a new
        array with the unique rows.

    Returns
    -------
    unique : array
        If 'return_ids' is False, an (m* x 3) float array with the unique
        rows, keeping the first occurrence of each. If 'return_ids' is True,
        a 1D array with the row indices of the repeated occurrences.

    """

    axes = ['x', 'y', 'z']

    # One named column per coordinate, taken from the first three columns.
    points = pd.DataFrame({axis: arr[:, col] for col, axis in enumerate(axes)})

    if not return_ids:
        # Dropping the repeated rows and keeping the first occurrence.
        return np.asarray(points.drop_duplicates(axes)).astype(float)

    # Boolean mask flagging every row already seen earlier in the table.
    repeated = points.duplicated(axes)
    return np.where(repeated)[0]
def apply_nn_value(base, arr, attr):
    """
    Upscales a set of attributes from a base array to another denser array.

    Parameters
    ----------
    base : array
        Base array to which the attributes to upscale were originally
        matched.
    arr : array
        Target array to which the attributes will be upscaled.
    attr : array
        Attributes to upscale, one entry per sample in 'base'.

    Returns
    -------
    new_attr : array
        Upscaled attributes, reshaped to a 1D array.

    Raises
    ------
    AssertionError
        If the length (number of samples) of "base" and "attr" is not equal.

    """

    # The message is built with implicit string concatenation rather than a
    # backslash continuation inside the literal, which would embed the
    # source indentation in the error text.
    assert base.shape[0] == attr.shape[0], (
        '"base" and "attr" must have the same number of samples.')

    # Obtaining the closest in base for each point in arr.
    # NOTE(review): presumably set_nbrs_knn returns one neighbor index per
    # point in arr when called with k=1 - confirm against knnsearch.
    idx = set_nbrs_knn(base, arr, 1, return_dist=False)
    # Making sure idx has the right type, int, for indexing.
    idx = idx.astype(int)
    # Applying base's attribute (attr) to points in arr.
    newattr = attr[idx]

    # Collapsing the result to a 1D array with newattr.shape[0] entries.
    return np.reshape(newattr, newattr.shape[0])
def entries_to_remove(entries, d):
    """
    Removes the selected entries (keys and their respective values) from a
    given dict. The dict is modified in place and nothing is returned; keys
    that are not present in the dict are ignored.

    Based on a reply from the user mattbornski [#]_ at stackoverflow.

    Parameters
    ----------
    entries : array
        Set of entries (keys) to be removed.
    d : dict
        Dictionary from which the entries are removed.

    References
    ----------
    .. [#] mattbornski, 2012. http://stackoverflow.com/questions/8995611/removing-multiple-keys-from-a-dictionary-safely

    """

    for key in entries:
        # Missing keys are skipped silently instead of raising KeyError.
        if key in d:
            del d[key]