# Source code for submodlib.functions.clustered

# clustered.py
# Author: Vishal Kaushal <vishal.kaushal@gmail.com>
import numpy as np
from .setFunction import SetFunction
from submodlib.helper import create_cluster_kernels
import submodlib_cpp as subcp
from submodlib_cpp import Clustered 

class ClusteredFunction(SetFunction):
	"""Implementation of the Clustered function.

	Given a set-function :math:`f` and a clustering, clustered function internally creates a mixture of functions each defined over a cluster. It is thus defined as
	
	.. math::
			f(X) = \\sum_i f_{C_i}(X)

	where :math:`f_{C_i}` operates only on cluster :math:`C_i` as sub-groundset and interprets :math:`X` as :math:`X \\cap C_i`.
	
	.. note::
			When the clusters are labels, ClusteredFunction is useful to achieve supervised subset selection.
	
	.. note::
			Some functions in this toolkit provide a "clustered" mode operation, achieving the same effect as invoking ClusteredFunction on those functions.

	Parameters
	----------
	n : int
		Number of elements in the ground set. Must be > 0.
	f_name : str
		Name of particular set function whose clustered implementation is desired.
	data : numpy.ndarray
		Data matrix of shape n X num_features containing the ground set data elements. data[i] should contain the num-features dimensional features of element i. This is used for computing the similarity kernel (and for computing the clusters if clustering is not provided). Objects without a ``tolist`` method (e.g. nested Python lists) are converted through ``numpy.asarray``.
	mode : str
		Governs the internal implementation details. Can be "single" (to create a single dense large similarity kernel) or "multi" (to create one small dense kernel per cluster). If "single", internally the "partial" versions of the functions are used to get the functions for each cluster. If "multi", the functions for each cluster are instantiated separately with each cluster corresponding to a different groundset. 
	cluster_lab : list, optional
		List of size n, containing the cluster labels for each data point. If not provided, clustering is done internally using sklearn's `BIRCH <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html>`_ using provided data matrix.
	num_clusters : int, optional
		Number of clusters. Mandatory if cluster_lab is provided. If cluster_lab is not provided, clustering is done internally. In this case if num_clusters is not provided, an optimal number of clusters is created based on the supplied data. 
	metric : str, optional
		Similarity metric to be used while computing similarity kernel for each cluster (in "multi" mode) or a single dense kernel (in "single" mode). Can be "euclidean" or "cosine". Default value is "cosine".
	lambdaVal : float, optional
		Additional parameter that needs to be passed on to the set function if required. For example, the additional parameter of :class:`~submodlib.functions.graphCut.GraphCutFunction` and :class:`~submodlib.functions.logDeterminant.LogDeterminantFunction`. Default is 1.

	Raises
	------
	Exception
		If ``n`` is not positive; if ``mode`` is neither "single" nor "multi"; if ``cluster_lab`` is given without a positive ``num_clusters``, does not have ``n`` entries, or contains a label outside ``[0, num_clusters - 1]``; or if the first dimension of ``data`` differs from ``n``.
		
	"""

	def __init__(self, n, f_name, data, mode, cluster_lab=None, num_clusters=None, metric="cosine", lambdaVal=1):
		# Record the constructor arguments.
		self.n = n
		self.f_name = f_name
		self.num_clusters = num_clusters
		self.data = data
		self.mode = mode
		self.cluster_lab = cluster_lab
		self.metric = metric
		self.lambdaVal = lambdaVal

		# Derived state; populated below depending on the mode.
		self.clusters = None
		self.cluster_sijs = None
		self.cluster_map = None
		self.sijs = None
		self.cpp_content = None
		self.cpp_sijs = None
		self.effective_ground = None

		if self.n <= 0:
			raise Exception("ERROR: Number of elements in ground set must be positive")

		if self.mode not in ('single', 'multi'):
			raise Exception("ERROR: Incorrect mode. Must be one of 'single' or 'multi'")

		# NOTE: metric is not validated here; it is forwarded unchanged to
		# create_cluster_kernels / subcp.create_kernel.

		# "is not None" (rather than truthiness) keeps numpy label arrays usable.
		if self.cluster_lab is not None:
			if self.num_clusters is None or self.num_clusters <= 0:
				raise Exception("ERROR: Positive number of clusters must be provided when cluster_lab is provided")
			if len(self.cluster_lab) != self.n:
				raise Exception("ERROR: cluster_lab's size is NOT same as ground set size")
			if not all(0 <= ele <= self.num_clusters - 1 for ele in self.cluster_lab):
				raise Exception("Cluster IDs/labels contain invalid values")

		if np.shape(self.data)[0] != self.n:
			raise Exception("ERROR: Inconsistentcy between n and no of examples in the given ground data matrix")

		# Convert the data once; both helper calls below take native nested lists.
		data_list = self._data_as_list(self.data)

		if self.mode == "single":
			self.clusters, _, _ = create_cluster_kernels(data_list, self.metric, self.cluster_lab, self.num_clusters, onlyClusters=True)
			# Dense mode: every element is a neighbor of every other element.
			self.num_neighbors = np.shape(self.data)[0]
			self.cpp_content = np.array(subcp.create_kernel(data_list, self.metric, self.num_neighbors))
			# cpp_content is indexed as (values, row indices, column indices).
			val = self.cpp_content[0]
			row = [int(r) for r in self.cpp_content[1]]
			col = [int(c) for c in self.cpp_content[2]]
			self.sijs = np.zeros((self.n, self.n))
			self.sijs[row, col] = val
			# pybind11 needs a native list of lists. self.sijs is always a 2-D
			# (n, n) array with n >= 1, so tolist() already has that shape.
			self.cpp_sijs = self.sijs.tolist()
			self.cpp_obj = Clustered(self.n, self.f_name, self.clusters, self.cpp_sijs, self.lambdaVal)
		else:
			self.clusters, self.cluster_sijs, self.cluster_map = create_cluster_kernels(data_list, self.metric, self.cluster_lab, self.num_clusters)
			# One native list-of-lists kernel per cluster for pybind11.
			self.cluster_sijs = [self._as_list_of_lists(kernel) for kernel in self.cluster_sijs]
			self.cpp_obj = Clustered(self.n, self.f_name, self.clusters, self.cluster_sijs, self.cluster_map, self.lambdaVal)
		self.effective_ground = self.cpp_obj.getEffectiveGroundSet()

	@staticmethod
	def _data_as_list(data):
		"""Return ``data`` as native (nested) Python lists, using its own ``tolist`` when it has one."""
		if hasattr(data, "tolist"):
			return data.tolist()
		return np.asarray(data).tolist()

	@staticmethod
	def _as_list_of_lists(kernel):
		"""Convert one kernel to a list of lists, wrapping a flat (1-D) result in an outer list."""
		rows = kernel.tolist()
		# A 1-D kernel (e.g. for a 1x1 similarity matrix) flattens to a list
		# of scalars; pybind11 must still receive a list of lists.
		if isinstance(rows[0], (int, float)):
			rows = [rows]
		return rows