Source code for submodlib.functions.facilityLocationVariantMutualInformation

# facilityLocationVariantMutualInformation.py
# Author: Vishal Kaushal <vishal.kaushal@gmail.com>
import numpy as np
import scipy
from .setFunction import SetFunction
import submodlib_cpp as subcp
from submodlib_cpp import FacilityLocationVariantMutualInformation 
from submodlib.helper import create_kernel

class FacilityLocationVariantMutualInformationFunction(SetFunction):
    """Implementation of the Facility Location Variant Mutual Information (FL2MI) function.

    Given a :ref:`functions.submodular-mutual-information` function, Facility Location Variant Mutual Information function is its instantiation using a :class:`~submodlib.functions.facilityLocation.FacilityLocationFunction`. However it is slightly different from :class:`~submodlib.functions.facilityLocationMutualInformation.FacilityLocationMutualInformationFunction`. This variant considers only cross-similarities between data points and the target. Mathematically, it takes the following form:

    .. math::
            I_f(A; Q) = \\sum_{i \\in Q} \\max_{j \\in A} s_{ij} + \\eta \\sum_{i \\in A} \\max_{j \\in Q} s_{ij}

    This expression has interesting characteristics different from those of FL1MI. In particular, there is no saturation in FL2MI and it just models the pairwise similarities of target to data points and vice versa.

    .. note::
            CRAIG :cite:`mirzasoleiman2020coresets` when applied to the task of targeted subset selection can be seen as a special case of FL2MI (see :cite:`kaushal2021prism`).

    Parameters
    ----------
    n : int
        Number of elements in the ground set. Must be > 0.

    num_queries : int
        Number of query points in the target. Must be >= 0.

    query_sijs : numpy.ndarray, optional
        Similarity kernel between the ground set and the queries. Shape: n X num_queries. When not provided, it is computed using data, queryData and metric.

    data : numpy.ndarray, optional
        Matrix of shape n X num_features containing the ground set data elements. data[i] should contain the num-features dimensional features of element i. Mandatory, if query_sijs is not provided. Ignored if query_sijs is provided.

    queryData : numpy.ndarray, optional
        Matrix of shape num_queries X num_features containing the query elements. queryData[i] should contain the num-features dimensional features of query i. It is optional (and is ignored if provided) if query_sijs has been provided.

    metric : str, optional
        Similarity metric to be used for computing the similarity kernels. Can be "cosine" for cosine similarity or "euclidean" for similarity based on euclidean distance. Default is "cosine".

    queryDiversityEta : float, optional
        The value of the query-relevance vs diversity trade-off. Increasing :math:`\\eta` tends to increase query-relevance while reducing query-coverage and diversity. Default is 1.

    Raises
    ------
    Exception
        If n is not positive, if num_queries is negative, if query_sijs is
        given but is not an ndarray of shape exactly n X num_queries, or if
        query_sijs is omitted and data / queryData are missing or their row
        counts disagree with n / num_queries.

    """

    def __init__(self, n, num_queries, query_sijs=None, data=None, queryData=None, metric="cosine", queryDiversityEta=1):
        self.n = n
        self.num_queries = num_queries
        self.metric = metric
        self.query_sijs = query_sijs
        self.data = data
        self.queryData = queryData
        self.queryDiversityEta = queryDiversityEta
        self.cpp_obj = None
        self.cpp_query_sijs = None
        self.cpp_content = None
        self.effective_ground = None

        if self.n <= 0:
            raise Exception("ERROR: Number of elements in ground set must be positive")

        if self.num_queries < 0:
            raise Exception("ERROR: Number of queries must be >= 0")

        # NOTE: metric is not validated here; it is handed unchanged to
        # subcp.create_kernel_NS when the kernel has to be computed.

        if self.query_sijs is not None:
            # User has provided the query kernel.
            if not isinstance(self.query_sijs, np.ndarray):
                raise Exception("Invalid query kernel type provided, must be ndarray")
            # Compare the full shape tuple so that 1-D (or >2-D) arrays are
            # rejected with the same message instead of an IndexError or a
            # silent pass.
            if np.shape(self.query_sijs) != (self.n, self.num_queries):
                raise Exception("ERROR: Query Kernel should be n X num_queries")
            if self.data is not None or self.queryData is not None:
                print("WARNING: similarity query kernel found. Provided data and query matrices will be ignored.")
        else:
            # Similarity query kernel has not been provided: build it from data.
            if self.data is None or self.queryData is None:
                raise Exception("Since query kernel is not provided, data matrices are a must")
            if np.shape(self.data)[0] != self.n:
                raise Exception("ERROR: Inconsistentcy between n and no of examples in the given data matrix")
            if np.shape(self.queryData)[0] != self.num_queries:
                raise Exception("ERROR: Inconsistentcy between num_queries and no of examples in the given query data matrix")

            # Construct the query kernel from the raw feature matrices.
            self.query_sijs = np.array(
                subcp.create_kernel_NS(self.queryData.tolist(), self.data.tolist(), self.metric)
            )

        # Break the similarity matrix down to native list-of-lists so that
        # pybind11 can convert it implicitly.
        self.cpp_query_sijs = self.query_sijs.tolist()
        if isinstance(self.cpp_query_sijs[0], (int, float)):
            # Defensive guard: pybind11 must receive a list of lists, so a
            # flat list (1-D array, e.g. for a 1x1 similarity matrix) is
            # wrapped in an outer list.
            self.cpp_query_sijs = [self.cpp_query_sijs]

        self.cpp_obj = FacilityLocationVariantMutualInformation(
            self.n, self.num_queries, self.cpp_query_sijs, self.queryDiversityEta
        )
        self.effective_ground = set(range(n))