Source code for pysarpu.pysarpu

"""Main module."""
import numpy as np
import pickle
from tqdm.auto import tqdm

class PUClassifier:
    '''
    PU learning classification model under unknown propensity. The model specifies
    both a classification model and a propensity model, and estimates their
    parameters with an EM algorithm (SAR-EM, Bekker et al.).

    :param cmodel: an instance of class :class:`pysarpu.classification.Classifier`
        representing the classification model. This package includes two types of
        classification models: logistic regression (accessible through
        :class:`pysarpu.classification.LinearLogisticRegression`) and linear
        discriminant analysis (accessible through
        :class:`pysarpu.classification.LinearDiscriminantClassifier`).
    :type cmodel: :class:`pysarpu.classification.Classifier`
    :param emodel: an instance of class :class:`pysarpu.propensity.Propensity`
        representing the propensity model. This package includes several
        pre-implemented propensity models: logistic propensity
        (:class:`pysarpu.propensity.LogisticPropensity`), log-normal propensity
        (:class:`pysarpu.propensity.LogProbitPropensity`) and Gumbel propensity
        (:class:`pysarpu.propensity.GumbelPropensity`).
    :type emodel: :class:`pysarpu.propensity.Propensity`
    :param da: whether the classification model is a discriminant-analysis-type
        model (`True`) or not (`False`); the likelihood maximized differs between
        these two settings. Default: `False`.
    :type da: :class:`bool`, optional
    :return: an instance of the PU learning model (not yet initialized).
    :rtype: :class:`pysarpu.PUClassifier`
    '''

    def __init__(self, cmodel, emodel, da=False):
        '''
        Constructor for :class:`pysarpu.PUClassifier`.
        '''
        self.cmodel = cmodel
        self.emodel = emodel
        self.da = da
        self.history = []
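    # Example: building a PU classifier from the components documented above
    # (an illustrative sketch; the exact constructor arguments are assumptions):
    #
    #     from pysarpu.classification import LinearLogisticRegression
    #     from pysarpu.propensity import LogisticPropensity
    #     clf = PUClassifier(LinearLogisticRegression(), LogisticPropensity(), da=False)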
    def initialization(self, Xc, Xe, Y, w=1.):
        '''
        Initialization of the parameters of both the classification and the
        propensity models before running the EM algorithm. The parameters of each
        model are initialized following its own method: see the `initialization`
        methods of `cmodel` and `emodel`.

        :param Xc: covariate matrix for classification. The parameters of `cmodel`
            will be initialized in agreement with the dimension :math:`d_1` of the
            input data.
        :type Xc: `numpy.array` of shape :math:`(n,d_1)`
        :param Xe: covariate matrix for propensity. The parameters of `emodel`
            will be initialized in agreement with the dimension :math:`d_2` of the
            input data.
        :type Xe: `numpy.array` of shape :math:`(n,d_2)`
        :param Y: observed labels. Only used in the computation of the initial
            log-likelihood.
        :type Y: `numpy.array` vector of size :math:`n`
        :param w: individual weights (experimental, not tested).
        :type w: either `float` (`1.`, default) or `numpy.array` of size :math:`n`, optional
        :return: `None`
        '''
        self.cmodel.initialization(Xc, w)
        self.emodel.initialization(Xe, w)
        self.history = [self.loglikelihood(Xc, Xe, Y)]
    def __str__(self):
        s = 'PU classifier \n'
        s += self.cmodel.__str__()
        s += '\n'
        s += self.emodel.__str__()
        return s

    def __repr__(self):
        return self.__str__()

    def eta(self, Xc):
        return self.cmodel.eta(Xc)

    def logeta(self, Xc):
        return self.cmodel.logeta(Xc)
    def e(self, Xe):
        '''
        Propensity function using the current parameters of the propensity model
        `emodel`.

        :param Xe: covariate matrix for propensity.
        :type Xe: `numpy.array` of shape :math:`(n,d_2)`
        :return: vector of propensity scores.
        :rtype: `numpy.array` of size :math:`n`
        '''
        return self.emodel.e(Xe)
    def loge(self, Xe):
        '''
        Logarithm of the propensity function using the current parameters of the
        propensity model `emodel`.

        :param Xe: covariate matrix for propensity.
        :type Xe: `numpy.array` of shape :math:`(n,d_2)`
        :return: vector of log-propensity scores.
        :rtype: `numpy.array` of size :math:`n`
        '''
        return self.emodel.loge(Xe)
    def logp(self, Xe):
        return self.emodel.logp(Xe)

    def log1e(self, Xe):
        return self.emodel.log1e(Xe)

    def log1etae(self, Xc, Xe):
        # Computes log(1 - eta(x) * e(x)) from the log-scores. When
        # logeta + loge is within 1e-10 of 0, log1p(-exp(.)) loses precision,
        # so the first-order equivalent log(-(logeta + loge)) is used instead.
        leta, le = self.logeta(Xc), self.loge(Xe)
        approx = (leta + le <= -1e-10)
        return (approx * np.nan_to_num(np.log1p(-np.exp(leta + le)))
                + (1 - approx) * np.nan_to_num(np.log(-leta - le)))
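    # Why the fallback branch in log1etae, on a small numeric sketch: for
    # x -> 0^-, log(1 - exp(x)) ~ log(-x), and the direct formula breaks down
    # in float64 arithmetic (illustrative values):
    #
    #     x = -1e-17
    #     np.log1p(-np.exp(x))   # -inf: exp(x) rounds to exactly 1.0 in float64
    #     np.log(-x)             # ~ -39.14, the stable first-order equivalent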
    def predict_cproba(self, Xc):
        '''
        Class probability predictions using the parameters of the classification
        model.

        :param Xc: covariate matrix for classification.
        :type Xc: `numpy.array` with shape :math:`(n, d_1)`
        :return: posterior class probabilities.
        :rtype: `numpy.array` vector of size :math:`n`
        '''
        return self.cmodel.eta(Xc)
    def predict_clogproba(self, Xc):
        '''
        Class log-probability predictions using the parameters of the
        classification model.

        :param Xc: covariate matrix for classification.
        :type Xc: `numpy.array` with shape :math:`(n, d_1)`
        :return: posterior class log-probabilities.
        :rtype: `numpy.array` vector of size :math:`n`
        '''
        return self.cmodel.logeta(Xc)
    def predict_c(self, Xc, threshold=0.5):
        '''
        Binary class predictions using the parameters of the classification model.

        :param Xc: covariate matrix for classification.
        :type Xc: `numpy.array` with shape :math:`(n, d_1)`
        :param threshold: decision threshold defining the decision rule.
        :type threshold: `float` in :math:`[0,1]`, optional
        :return: class predictions.
        :rtype: `numpy.array` binary vector of size :math:`n`
        '''
        return (self.predict_cproba(Xc) >= threshold).astype(int)
    # Propensity-side analogues of the class prediction methods above.
    def predict_eproba(self, Xe):
        return self.emodel.e(Xe)

    def predict_elogproba(self, Xe):
        return self.emodel.loge(Xe)

    def predict_e(self, Xe, threshold=0.5):
        return (self.predict_eproba(Xe) >= threshold).astype(int)
    def predict_proba(self, Xc, Xe):
        '''
        Label probability predictions based on the classification model `cmodel`
        and the propensity model `emodel`. Note that this is different from method
        `predict_cproba`, which returns class probabilities instead.

        :param Xc: covariate matrix for classification.
        :type Xc: `numpy.array` with shape :math:`(n, d_1)`
        :param Xe: covariate matrix for propensity.
        :type Xe: `numpy.array` with shape :math:`(n, d_2)`
        :return: posterior label probabilities.
        :rtype: `numpy.array` vector of size :math:`n`
        '''
        return self.predict_cproba(Xc) * self.predict_eproba(Xe)
    def predict_logproba(self, Xc, Xe):
        '''
        Label log-probability predictions based on the classification model
        `cmodel` and the propensity model `emodel`. Note that this is different
        from method `predict_clogproba`, which returns class log-probabilities
        instead.

        :param Xc: covariate matrix for classification.
        :type Xc: `numpy.array` with shape :math:`(n, d_1)`
        :param Xe: covariate matrix for propensity.
        :type Xe: `numpy.array` with shape :math:`(n, d_2)`
        :return: posterior label log-probabilities.
        :rtype: `numpy.array` vector of size :math:`n`
        '''
        return self.predict_clogproba(Xc) + self.predict_elogproba(Xe)
    def predict(self, Xc, Xe, threshold=0.5):
        '''
        Binary label predictions based on the classification model `cmodel` and
        the propensity model `emodel`. Note that this is different from method
        `predict_c`, which returns class predictions instead.

        :param Xc: covariate matrix for classification.
        :type Xc: `numpy.array` with shape :math:`(n, d_1)`
        :param Xe: covariate matrix for propensity.
        :type Xe: `numpy.array` with shape :math:`(n, d_2)`
        :param threshold: decision threshold defining the decision rule.
        :type threshold: `float` in :math:`[0,1]`, optional
        :return: binary label predictions.
        :rtype: `numpy.array` binary vector of size :math:`n`
        '''
        return (self.predict_proba(Xc, Xe) >= threshold).astype(int)
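    # Example of the class vs. label distinction (a sketch; `clf`, `Xc` and `Xe`
    # as in the construction example above). `predict_c` estimates the true
    # class, while `predict` estimates whether an instance would also be labeled:
    #
    #     z_hat = clf.predict_c(Xc)      # 1 iff P(class = 1 | x) >= 0.5
    #     y_hat = clf.predict(Xc, Xe)    # 1 iff P(class = 1, labeled | x) >= 0.5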
    def loglikelihood(self, Xc, Xe, Y, w=1.):
        '''
        Log-likelihood function given the current parameters of the classification
        and propensity models. Note that the function returns the mean of the
        individual log-likelihoods (instead of the usual sum).

        :param Xc: covariate matrix for classification.
        :type Xc: `numpy.array` with shape :math:`(n, d_1)`
        :param Xe: covariate matrix for propensity.
        :type Xe: `numpy.array` with shape :math:`(n, d_2)`
        :param Y: observed labels.
        :type Y: `numpy.array` vector of size :math:`n`
        :param w: individual weights (experimental, not tested). Apply weights to
            observations in the computation of the likelihood.
        :type w: either `float` (`1.`, default) or `numpy.array` of size :math:`n`, optional
        :return: log-likelihood.
        :rtype: `float`
        '''
        if self.da:
            return self.loglikelihood_da(Xc, Xe, Y, w)
        else:
            return self.loglikelihood_lr(Xc, Xe, Y, w)
    def loglikelihood_lr(self, Xc, Xe, Y, w=1.):
        # Mean observed-data log-likelihood for regression-type classifiers.
        return np.mean(w * (Y * self.logeta(Xc) + Y * self.loge(Xe)
                            + (1 - Y) * self.log1etae(Xc, Xe)))

    def loglikelihood_da(self, Xc, Xe, Y, w=1.):
        # Variant for discriminant-analysis classifiers, written in terms of the
        # class-conditional densities pdf_pos and pdf_neg.
        return np.mean(Y * (np.log(self.cmodel.pdf_pos(Xc)) + self.emodel.loge(Xe))
                       + (1 - Y) * np.log(self.cmodel.pdf_pos(Xc) * (1 - self.emodel.e(Xe))
                                          + self.cmodel.pdf_neg(Xc)))
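    # The regression-type case written out: with eta(x) = P(z=1 | x) (class
    # posterior) and e(x) = P(y=1 | z=1, x) (propensity), a labeled point
    # contributes log(eta(x) * e(x)) and an unlabeled point log(1 - eta(x) * e(x)),
    # since an instance is unlabeled when it is either negative, or positive but
    # not selected for labeling. This interpretation of eta and e follows the
    # SAR-EM setting cited in the class docstring.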
    def expectation(self, Xc, Xe, Y):
        '''
        Expectation step of the EM algorithm: returns, for every instance, the
        probability of belonging to the positive class given the observed labels.

        :param Xc: covariate matrix for classification.
        :type Xc: `numpy.array` with shape :math:`(n, d_1)`
        :param Xe: covariate matrix for propensity.
        :type Xe: `numpy.array` with shape :math:`(n, d_2)`
        :param Y: observed labels.
        :type Y: `numpy.array` vector of size :math:`n`
        :return: posterior probabilities.
        :rtype: `numpy.array` vector of size :math:`n`
        '''
        l1 = self.logeta(Xc) + self.log1e(Xe)
        l2 = self.log1etae(Xc, Xe)
        p = np.min(np.concatenate([np.exp(l1 - l2)[:, np.newaxis],
                                   np.ones((len(Y), 1))], axis=1), axis=1)
        return Y + (1 - Y) * p
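    # The E-step in closed form: labeled points get gamma_i = 1, and unlabeled
    # points get
    #
    #     gamma_i = eta(x_i) * (1 - e(x_i)) / (1 - eta(x_i) * e(x_i)),
    #
    # i.e. the posterior probability of being a positive that the propensity
    # model failed to select. The clipping at 1 above only guards against
    # floating-point overshoot.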
    def maximisation(self, Xc, Xe, Y, gamma, w=1., warm_start=True, balance=False):
        '''
        Maximisation step of the EM algorithm: updates the parameters of both the
        classification and propensity models.

        :param Xc: covariate matrix for classification.
        :type Xc: `numpy.array` with shape :math:`(n, d_1)`
        :param Xe: covariate matrix for propensity.
        :type Xe: `numpy.array` with shape :math:`(n, d_2)`
        :param Y: observed labels.
        :type Y: `numpy.array` vector of size :math:`n`
        :param gamma: posterior probabilities obtained in the expectation step.
        :type gamma: `numpy.array` of size :math:`n`
        :param w: individual weights (experimental, not tested). Apply weights to
            observations in the computation of the likelihood.
        :type w: either `float` (`1.`, default) or `numpy.array` of size :math:`n`, optional
        :return: `None`
        '''
        self.prev_params_c, self.prev_params_e = self.cmodel.params.copy(), self.emodel.params.copy()
        self.cmodel.fit(Xc, gamma, w, warm_start)
        self.emodel.fit(Xe, gamma, Y, w, warm_start, balance)
    def _fit(self, Xc, Xe, Y, w=1., tol=1e-6, max_iter=1e4, warm_start=False, balance=False):
        if not warm_start:
            self.initialization(Xc, Xe, Y, w)
        it = 0
        delta = 1
        last_ll = self.loglikelihood(Xc, Xe, Y, w)
        warm_start = False
        while abs(delta) > tol and it < max_iter:
            if delta < 0:
                # The last M-step decreased the likelihood: roll back to the
                # previous parameters before iterating further.
                self.cmodel.params = self.prev_params_c.copy()
                self.emodel.params = self.prev_params_e.copy()
            gamma = self.expectation(Xc, Xe, Y)
            self.maximisation(Xc, Xe, Y, gamma, w, warm_start, balance)
            ll = self.loglikelihood(Xc, Xe, Y, w)
            delta = ll - last_ll
            last_ll = ll
            self.history.append(last_ll)
            it += 1
            warm_start = True
    def fit(self, Xc, Xe, Y, w=1., tol=1e-6, max_iter=1e4, warm_start=False, balance=False, n_init=20, iter_init=20):
        '''
        Estimation of the PU learning model parameters (classifier and propensity)
        through the EM algorithm. Multiple random initializations are each trained
        over a few iterations; the one achieving the best log-likelihood is then
        trained until convergence.

        :param Xc: covariate matrix for classification.
        :type Xc: `numpy.array` with shape :math:`(n, d_1)`
        :param Xe: covariate matrix for propensity.
        :type Xe: `numpy.array` with shape :math:`(n, d_2)`
        :param Y: observed labels.
        :type Y: `numpy.array` vector of size :math:`n`
        :param w: individual weights (experimental, not tested). Apply weights to
            observations in the computation of the likelihood.
        :type w: either `float` (`1.`, default) or `numpy.array` of size :math:`n`, optional
        :param tol: tolerance parameter. Once the increase in the log-likelihood
            is below `tol`, the algorithm stops (default: `1e-6`).
        :type tol: float, optional
        :param max_iter: maximum number of iterations (default: `1e4`).
        :type max_iter: int, optional
        :param warm_start: indicates whether the current parameters can be used
            for initialization (`True`) or should be re-initialized before
            estimation (default: `False`).
        :type warm_start: bool, optional
        :param balance: re-balance weights when fitting the propensity model in
            the maximisation step (experimental, potentially interesting in highly
            unbalanced situations). Default: `False`.
        :type balance: bool, optional
        :param n_init: number of initializations to consider in the small-EM
            initialization strategy (default: `20`).
        :type n_init: int, optional
        :param iter_init: maximum number of iterations for each initialization
            (default: `20`).
        :type iter_init: int, optional
        :return: `None`
        '''
        if not warm_start:
            self.initialization(Xc, Xe, Y, w)
        optimal_ll = self.loglikelihood(Xc, Xe, Y)
        optimal_params_c = self.cmodel.params.copy()
        optimal_params_e = self.emodel.params.copy()
        for k in tqdm(range(n_init)):
            self._fit(Xc, Xe, Y, w=w, tol=tol, max_iter=iter_init, warm_start=False, balance=balance)
            ll = self.loglikelihood(Xc, Xe, Y)
            if ll > optimal_ll:
                optimal_ll = ll
                optimal_params_c = self.cmodel.params.copy()
                optimal_params_e = self.emodel.params.copy()
        self.cmodel.params = optimal_params_c.copy()
        self.emodel.params = optimal_params_e.copy()
        self._fit(Xc, Xe, Y, w=w, tol=tol, max_iter=max_iter, warm_start=True, balance=balance)
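    # Example: fitting with the small-EM strategy described above (a sketch;
    # `clf`, `Xc`, `Xe` and `Y` as in the earlier examples):
    #
    #     clf.fit(Xc, Xe, Y, n_init=20, iter_init=20)  # 20 short runs, best refined
    #     clf.history[-1]                              # final mean log-likelihood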
    def save(self, path):
        '''
        Save the PU learning model with its current parameters as a binary file
        (relies on the `pickle` library).

        :param path: path at which the model should be saved.
        :type path: `str`
        :return: `None`
        '''
        with open(path, 'wb') as f:
            pickle.dump(self, f)
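# Reloading a saved model (a sketch): `save` relies on pickle, and the module
# defines no dedicated `load` method, so the standard pickle round-trip applies:
#
#     with open(path, 'rb') as f:
#         clf = pickle.load(f)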