"""
Multivariate Conditional and Unconditional Kernel Density Estimation
with Mixed Data Types.

References
----------
[1] Racine, J., Li, Q. Nonparametric econometrics: theory and practice.
    Princeton University Press. (2007)
[2] Racine, J. "Nonparametric Econometrics: A Primer," Foundations
    and Trends in Econometrics, Vol. 3, No. 1, pp. 1-88. (2008)
    http://dx.doi.org/10.1561/0800000009
[3] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
    with Categorical and Continuous Data." Working Paper. (2000)
[4] Racine, J., Li, Q. "Kernel Estimation of Multivariate Conditional
    Distributions." Annals of Economics and Finance 5, 211-235. (2004)
[5] Liu, R., Yang, L. "Kernel estimation of multivariate
    cumulative distribution function."
    Journal of Nonparametric Statistics (2008)
[6] Li, R., Ju, G. "Nonparametric Estimation of Multivariate CDF
    with Categorical and Continuous Data." Working Paper
[7] Li, Q., Racine, J. "Cross-validated local linear nonparametric
    regression" Statistica Sinica 14(2004), pp. 485-512
[8] Racine, J. "Consistent Significance Testing for Nonparametric
    Regression." Journal of Business & Economic Statistics
[9] Racine, J., Hart, J., Li, Q. "Testing the Significance of
    Categorical Predictor Variables in Nonparametric Regression
    Models." Econometric Reviews 25, 523-544. (2006)

"""
import numpy as np

from . import kernels
from ._kernel_base import GenericKDE, EstimatorSettings, gpke, \
    LeaveOneOut, _adjust_shape


__all__ = ['KDEMultivariate', 'KDEMultivariateConditional', 'EstimatorSettings']


class KDEMultivariate(GenericKDE):
    """
    Multivariate kernel density estimator.

    This density estimator can handle univariate as well as multivariate data,
    including mixed continuous / ordered discrete / unordered discrete data.
    It also provides cross-validated bandwidth selection methods (least
    squares, maximum likelihood).

    Parameters
    ----------
    data : list of ndarrays or 2-D ndarray
        The training data for the Kernel Density Estimation, used to determine
        the bandwidth(s).  If a 2-D array, should be of shape
        (num_observations, num_variables).  If a list, each list element is a
        separate observation.
    var_type : str
        The type of the variables:

            - c : continuous
            - u : unordered (discrete)
            - o : ordered (discrete)

        The string should contain a type specifier for each variable, so for
        example ``var_type='ccuo'``.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth.  If a string,
        should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    defaults : EstimatorSettings instance, optional
        The default values for (efficient) bandwidth estimation.

    Attributes
    ----------
    bw : array_like
        The bandwidth parameters.

    See Also
    --------
    KDEMultivariateConditional

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> nobs = 300
    >>> np.random.seed(1234)  # Seed random generator
    >>> c1 = np.random.normal(size=(nobs,1))
    >>> c2 = np.random.normal(2, 1, size=(nobs,1))

    Estimate a bivariate distribution and display the bandwidth found:

    >>> dens_u = sm.nonparametric.KDEMultivariate(data=[c1,c2],
    ...     var_type='cc', bw='normal_reference')
    >>> dens_u.bw
    array([ 0.39967419,  0.38423292])
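
    For illustration, the fitted object can then evaluate the estimated
    density and CDF, e.g. at the training points (output not shown here,
    since the values depend on the simulated data):

    >>> pdf_at_data = dens_u.pdf()   # density estimate at the training points
    >>> cdf_at_data = dens_u.cdf()   # CDF estimate at the training points

    A cross-validated bandwidth can be requested instead of the rule of
    thumb, e.g. ``bw='cv_ml'``:

    >>> dens_ml = sm.nonparametric.KDEMultivariate(data=[c1, c2],
    ...     var_type='cc', bw='cv_ml')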
    """

    def __init__(self, data, var_type, bw=None, defaults=None):
        self.var_type = var_type
        self.k_vars = len(self.var_type)
        self.data = _adjust_shape(data, self.k_vars)
        self.data_type = var_type
        self.nobs, self.k_vars = np.shape(self.data)
        if self.nobs <= self.k_vars:
            raise ValueError("The number of observations must be larger "
                             "than the number of variables.")
        defaults = EstimatorSettings() if defaults is None else defaults
        self._set_defaults(defaults)
        if not self.efficient:
            self.bw = self._compute_bw(bw)
        else:
            self.bw = self._compute_efficient(bw)

    def __repr__(self):
        """Provide something sane to print."""
        rpr = "KDE instance\n"
        rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n"
        rpr += "Number of samples:   nobs = " + str(self.nobs) + "\n"
        rpr += "Variable types:      " + self.var_type + "\n"
        rpr += "BW selection method: " + self._bw_method + "\n"
        return rpr

    def loo_likelihood(self, bw, func=lambda x: x):
        r"""
        Returns the leave-one-out likelihood function.

        The leave-one-out likelihood function for the unconditional KDE.

        Parameters
        ----------
        bw : array_like
            The value for the bandwidth parameter(s).
        func : callable, optional
            Function to transform the likelihood values (before summing); for
            the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

        Notes
        -----
        The leave-one-out kernel estimator of :math:`f_{-i}` is:

        .. math:: f_{-i}(X_{i})=\frac{1}{(n-1)h}
                    \sum_{j=1,j\neq i}K_{h}(X_{i},X_{j})

        where :math:`K_{h}` represents the generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i},X_{j}) =
            \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        """
        LOO = LeaveOneOut(self.data)
        L = 0
        for i, X_not_i in enumerate(LOO):
            f_i = gpke(bw, data=-X_not_i, data_predict=-self.data[i, :],
                       var_type=self.var_type)
            L += func(f_i)

        return -L

    def pdf(self, data_predict=None):
        r"""
        Evaluate the probability density function.

        Parameters
        ----------
        data_predict : array_like, optional
            Points to evaluate at.  If unspecified, the training data is used.

        Returns
        -------
        pdf_est : array_like
            Probability density function evaluated at `data_predict`.

        Notes
        -----
        The probability density is given by the generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i},X_{j}) =
            \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        """
        if data_predict is None:
            data_predict = self.data
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        pdf_est = []
        for i in range(np.shape(data_predict)[0]):
            pdf_est.append(gpke(self.bw, data=self.data,
                                data_predict=data_predict[i, :],
                                var_type=self.var_type) / self.nobs)

        pdf_est = np.squeeze(pdf_est)
        return pdf_est

    def cdf(self, data_predict=None):
        r"""
        Evaluate the cumulative distribution function.

        Parameters
        ----------
        data_predict : array_like, optional
            Points to evaluate at.  If unspecified, the training data is used.

        Returns
        -------
        cdf_est : array_like
            The estimate of the cdf.

        Notes
        -----
        See https://en.wikipedia.org/wiki/Cumulative_distribution_function
        For more details on the estimation see Ref. [5] in module docstring.

        The multivariate CDF for mixed data (continuous and ordered/unordered
        discrete) is estimated by:

        .. math::

            F(x^{c},x^{d})=n^{-1}\sum_{i=1}^{n}\left[G(\frac{x^{c}-X_{i}}{h})\sum_{u\leq x^{d}}L(X_{i}^{d},x_{i}^{d}, \lambda)\right]

        where G() is the product kernel CDF estimator for the continuous
        and L() for the discrete variables.

        Used bandwidth is ``self.bw``.
        """
        if data_predict is None:
            data_predict = self.data
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        cdf_est = []
        for i in range(np.shape(data_predict)[0]):
            cdf_est.append(gpke(self.bw, data=self.data,
                                data_predict=data_predict[i, :],
                                var_type=self.var_type,
                                ckertype="gaussian_cdf",
                                ukertype="aitchisonaitken_cdf",
                                okertype="wangryzin_cdf") / self.nobs)

        cdf_est = np.squeeze(cdf_est)
        return cdf_est

    def imse(self, bw):
        r"""
        Returns the Integrated Mean Square Error for the unconditional KDE.

        Parameters
        ----------
        bw : array_like
            The bandwidth parameter(s).

        Returns
        -------
        CV : float
            The cross-validation objective function.

        Notes
        -----
        See p. 27 in [1]_ for details on the multivariate estimation; for how
        to handle mixed data types see p. 6 in [2]_.

        The formula for the cross-validation objective function is:

        .. math:: CV=\frac{1}{n^{2}}\sum_{i=1}^{n}\sum_{j=1}^{N}
            \bar{K}_{h}(X_{i},X_{j})-\frac{2}{n(n-1)}\sum_{i=1}^{n}
            \sum_{j=1,j\neq i}^{N}K_{h}(X_{i},X_{j})

        Where :math:`\bar{K}_{h}` is the multivariate product convolution
        kernel (consult [2]_ for mixed data types).

        References
        ----------
        .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
                practice. Princeton University Press. (2007)
        .. [2] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
                with Categorical and Continuous Data." Working Paper. (2000)
        """
        F = 0
        kertypes = dict(c=kernels.gaussian_convolution,
                        o=kernels.wang_ryzin_convolution,
                        u=kernels.aitchison_aitken_convolution)
        nobs = self.nobs
        data = -self.data
        var_type = self.var_type
        ix_cont = np.array([c == 'c' for c in var_type])
        _bw_cont_product = bw[ix_cont].prod()
        Kval = np.empty(data.shape)
        for i in range(nobs):
            for ii, vtype in enumerate(var_type):
                Kval[:, ii] = kertypes[vtype](bw[ii], data[:, ii], data[i, ii])

            dens = Kval.prod(axis=1) / _bw_cont_product
            # first term: double sum of the convolution kernel
            k_bar_sum = dens.sum(axis=0)
            F += k_bar_sum

        kertypes = dict(c=kernels.gaussian,
                        o=kernels.wang_ryzin,
                        u=kernels.aitchison_aitken)
        LOO = LeaveOneOut(self.data)
        L = 0   # leave-one-out term
        Kval = np.empty((data.shape[0] - 1, data.shape[1]))
        for i, X_not_i in enumerate(LOO):
            for ii, vtype in enumerate(var_type):
                Kval[:, ii] = kertypes[vtype](bw[ii], -X_not_i[:, ii],
                                              data[i, ii])

            dens = Kval.prod(axis=1) / _bw_cont_product
            L += dens.sum(axis=0)

        # least-squares cross-validation objective function
        return F / nobs**2 - 2 * L / (nobs * (nobs - 1))

    def _get_class_vars_type(self):
        """Helper method to be able to pass needed vars to _compute_subset."""
        class_type = 'KDEMultivariate'
        class_vars = (self.var_type, )
        return class_type, class_vars


class KDEMultivariateConditional(GenericKDE):
    """
    Conditional multivariate kernel density estimator.

    Calculates ``P(Y_1,Y_2,...Y_n | X_1,X_2...X_m) =
    P(X_1, X_2,..., X_m, Y_1, Y_2,..., Y_n)/P(X_1, X_2,..., X_m)``.
    The conditional density is by definition the ratio of the two densities,
    see [1]_.

    Parameters
    ----------
    endog : list of ndarrays or 2-D ndarray
        The training data for the dependent variables, used to determine
        the bandwidth(s).  If a 2-D array, should be of shape
        (num_observations, num_variables).  If a list, each list element is a
        separate observation.
    exog : list of ndarrays or 2-D ndarray
        The training data for the independent variable; same shape as `endog`.
    dep_type : str
        The type of the dependent variables:

            c : Continuous
            u : Unordered (Discrete)
            o : Ordered (Discrete)

        The string should contain a type specifier for each variable, so for
        example ``dep_type='ccuo'``.
    indep_type : str
        The type of the independent variables; specified like `dep_type`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth.  If a string,
        should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    defaults : Instance of class EstimatorSettings
        The default values for the efficient bandwidth estimation

    Attributes
    ----------
    bw : array_like
        The bandwidth parameters

    See Also
    --------
    KDEMultivariate

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Conditional_probability_distribution

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> nobs = 300
    >>> c1 = np.random.normal(size=(nobs,1))
    >>> c2 = np.random.normal(2,1,size=(nobs,1))

    >>> dens_c = sm.nonparametric.KDEMultivariateConditional(endog=[c1],
    ...     exog=[c2], dep_type='c', indep_type='c', bw='normal_reference')
    >>> dens_c.bw   # show computed bandwidth
    array([ 0.41223484,  0.40976931])
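
    For illustration, the conditional density ``f(y|x)`` can then be
    evaluated, e.g. at the training points (output not shown, since the
    values depend on the simulated data):

    >>> pdf_at_data = dens_c.pdf()   # defaults to the training endog and exog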
    """

    def __init__(self, endog, exog, dep_type, indep_type, bw,
                 defaults=None):
        self.dep_type = dep_type
        self.indep_type = indep_type
        self.data_type = dep_type + indep_type
        self.k_dep = len(self.dep_type)
        self.k_indep = len(self.indep_type)
        self.endog = _adjust_shape(endog, self.k_dep)
        self.exog = _adjust_shape(exog, self.k_indep)
        self.nobs, self.k_dep = np.shape(self.endog)
        self.data = np.column_stack((self.endog, self.exog))
        self.k_vars = np.shape(self.data)[1]
        defaults = EstimatorSettings() if defaults is None else defaults
        self._set_defaults(defaults)
        if not self.efficient:
            self.bw = self._compute_bw(bw)
        else:
            self.bw = self._compute_efficient(bw)

    def __repr__(self):
        """Provide something sane to print."""
        rpr = "KDEMultivariateConditional instance\n"
        rpr += "Number of independent variables: k_indep = " + \
               str(self.k_indep) + "\n"
        rpr += "Number of dependent variables: k_dep = " + \
               str(self.k_dep) + "\n"
        rpr += "Number of observations: nobs = " + str(self.nobs) + "\n"
        rpr += "Independent variable types:      " + self.indep_type + "\n"
        rpr += "Dependent variable types:      " + self.dep_type + "\n"
        rpr += "BW selection method: " + self._bw_method + "\n"
        return rpr

    def loo_likelihood(self, bw, func=lambda x: x):
        r"""
        Returns the leave-one-out conditional likelihood of the data.

        If `func` is not equal to the default, what's calculated is a function
        of the leave-one-out conditional likelihood.

        Parameters
        ----------
        bw : array_like
            The bandwidth parameter(s).
        func : callable, optional
            Function to transform the likelihood values (before summing); for
            the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

        Returns
        -------
        L : float
            The value of the leave-one-out function for the data.

        Notes
        -----
        Similar to ``KDEMultivariate.loo_likelihood``, but substitutes
        ``f(y|x) = f(x, y)/f(x)`` for ``f(x)``.
        """
        yLOO = LeaveOneOut(self.data)
        xLOO = LeaveOneOut(self.exog).__iter__()
        L = 0
        for i, Y_j in enumerate(yLOO):
            X_not_i = next(xLOO)
            f_yx = gpke(bw, data=-Y_j, data_predict=-self.data[i, :],
                        var_type=(self.dep_type + self.indep_type))
            f_x = gpke(bw[self.k_dep:], data=-X_not_i,
                       data_predict=-self.exog[i, :],
                       var_type=self.indep_type)
            f_i = f_yx / f_x
            L += func(f_i)

        return -L

    def pdf(self, endog_predict=None, exog_predict=None):
        r"""
        Evaluate the probability density function.

        Parameters
        ----------
        endog_predict : array_like, optional
            Evaluation data for the dependent variables.  If unspecified, the
            training data is used.
        exog_predict : array_like, optional
            Evaluation data for the independent variables.

        Returns
        -------
        pdf : array_like
            The value of the probability density at `endog_predict` and `exog_predict`.

        Notes
        -----
        The formula for the conditional probability density is:

        .. math:: f(y|x)=\frac{f(x,y)}{f(x)}

        with

        .. math:: f(x)=\prod_{s=1}^{q}h_{s}^{-1}k
                            \left(\frac{x_{is}-x_{js}}{h_{s}}\right)

        where :math:`k` is the appropriate kernel for each variable.
        """
        if endog_predict is None:
            endog_predict = self.endog
        else:
            endog_predict = _adjust_shape(endog_predict, self.k_dep)
        if exog_predict is None:
            exog_predict = self.exog
        else:
            exog_predict = _adjust_shape(exog_predict, self.k_indep)

        pdf_est = []
        data_predict = np.column_stack((endog_predict, exog_predict))
        for i in range(np.shape(data_predict)[0]):
            f_yx = gpke(self.bw, data=self.data,
                        data_predict=data_predict[i, :],
                        var_type=(self.dep_type + self.indep_type))
            f_x = gpke(self.bw[self.k_dep:], data=self.exog,
                       data_predict=exog_predict[i, :],
                       var_type=self.indep_type)
            pdf_est.append(f_yx / f_x)

        return np.squeeze(pdf_est)

    def cdf(self, endog_predict=None, exog_predict=None):
        r"""
        Cumulative distribution function for the conditional density.

        Parameters
        ----------
        endog_predict : array_like, optional
            The evaluation dependent variables at which the cdf is estimated.
            If not specified the training dependent variables are used.
        exog_predict : array_like, optional
            The evaluation independent variables at which the cdf is estimated.
            If not specified the training independent variables are used.

        Returns
        -------
        cdf_est : array_like
            The estimate of the cdf.

        Notes
        -----
        For more details on the estimation see [2]_, and p.181 in [1]_.

        The multivariate conditional CDF for mixed data (continuous and
        ordered/unordered discrete) is estimated by:

        .. math::

            F(y|x)=\frac{n^{-1}\sum_{i=1}^{n}G(\frac{y-Y_{i}}{h_{0}}) W_{h}(X_{i},x)}{\widehat{\mu}(x)}

        where G() is the product kernel CDF estimator for the dependent (y)
        variable(s) and W() is the product kernel CDF estimator for the
        independent variable(s).

        References
        ----------
        .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
                practice. Princeton University Press. (2007)
        .. [2] Liu, R., Yang, L. "Kernel estimation of multivariate cumulative
                    distribution function." Journal of Nonparametric
                    Statistics (2008)
        """
        if endog_predict is None:
            endog_predict = self.endog
        else:
            endog_predict = _adjust_shape(endog_predict, self.k_dep)
        if exog_predict is None:
            exog_predict = self.exog
        else:
            exog_predict = _adjust_shape(exog_predict, self.k_indep)

        N_data_predict = np.shape(exog_predict)[0]
        cdf_est = np.empty(N_data_predict)
        for i in range(N_data_predict):
            # leave-in estimate of the marginal density mu(x)
            mu_x = gpke(self.bw[self.k_dep:], data=self.exog,
                        data_predict=exog_predict[i, :],
                        var_type=self.indep_type) / self.nobs
            mu_x = np.squeeze(mu_x)
            cdf_endog = gpke(self.bw[0:self.k_dep], data=self.endog,
                             data_predict=endog_predict[i, :],
                             var_type=self.dep_type,
                             ckertype="gaussian_cdf",
                             ukertype="aitchisonaitken_cdf",
                             okertype="wangryzin_cdf", tosum=False)
            cdf_exog = gpke(self.bw[self.k_dep:], data=self.exog,
                            data_predict=exog_predict[i, :],
                            var_type=self.indep_type, tosum=False)
            S = (cdf_endog * cdf_exog).sum(axis=0)
            cdf_est[i] = S / (self.nobs * mu_x)

        return cdf_est

    def imse(self, bw):
        r"""
        The integrated mean square error for the conditional KDE.

        Parameters
        ----------
        bw : array_like
            The bandwidth parameter(s).

        Returns
        -------
        CV : float
            The cross-validation objective function.

        Notes
        -----
        For more details see pp. 156-166 in [1]_. For details on how to
        handle the mixed variable types see [2]_.

        The formula for the cross-validation objective function for mixed
        variable types is:

        .. math:: CV(h,\lambda)=\frac{1}{n}\sum_{l=1}^{n}
            \frac{G_{-l}(X_{l})}{\left[\mu_{-l}(X_{l})\right]^{2}}-
            \frac{2}{n}\sum_{l=1}^{n}\frac{f_{-l}(X_{l},Y_{l})}{\mu_{-l}(X_{l})}

        where

        .. math:: G_{-l}(X_{l}) = n^{-2}\sum_{i\neq l}\sum_{j\neq l}
                        K_{X_{i},X_{l}} K_{X_{j},X_{l}}K_{Y_{i},Y_{j}}^{(2)}

        where :math:`K_{X_{i},X_{l}}` is the multivariate product kernel and
        :math:`\mu_{-l}(X_{l})` is the leave-one-out estimator of the pdf.

        :math:`K_{Y_{i},Y_{j}}^{(2)}` is the convolution kernel.

        The value of the function is minimized by the ``_cv_ls`` method of the
        `GenericKDE` class to return the bw estimates that minimize the
        distance between the estimated and "true" probability density.

        References
        ----------
        .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
                practice. Princeton University Press. (2007)
        .. [2] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
                with Categorical and Continuous Data." Working Paper. (2000)
        """
        zLOO = LeaveOneOut(self.data)
        CV = 0
        nobs = float(self.nobs)
        expander = np.ones((self.nobs - 1, 1))
        for ii, Z in enumerate(zLOO):
            X = Z[:, self.k_dep:]
            Y = Z[:, :self.k_dep]
            Ye_L = np.kron(Y, expander)
            Ye_R = np.kron(expander, Y)
            Xe_L = np.kron(X, expander)
            Xe_R = np.kron(expander, X)
            K_Xi_Xl = gpke(bw[self.k_dep:], data=Xe_L,
                           data_predict=self.exog[ii, :],
                           var_type=self.indep_type, tosum=False)
            K_Xj_Xl = gpke(bw[self.k_dep:], data=Xe_R,
                           data_predict=self.exog[ii, :],
                           var_type=self.indep_type, tosum=False)
            K2_Yi_Yj = gpke(bw[0:self.k_dep], data=Ye_L,
                            data_predict=Ye_R, var_type=self.dep_type,
                            ckertype='gauss_convolution',
                            okertype='wangryzin_convolution',
                            ukertype='aitchisonaitken_convolution',
                            tosum=False)
            G = (K_Xi_Xl * K_Xj_Xl * K2_Yi_Yj).sum() / nobs**2
            f_X_Y = gpke(bw, data=-Z, data_predict=-self.data[ii, :],
                         var_type=(self.dep_type + self.indep_type)) / nobs
            # leave-one-out estimate of the marginal density mu(x)
            m_x = gpke(bw[self.k_dep:], data=-X,
                       data_predict=-self.exog[ii, :],
                       var_type=self.indep_type) / nobs
            CV += (G / m_x ** 2) - 2 * (f_X_Y / m_x)

        return CV / nobs

    def _get_class_vars_type(self):
        """Helper method to be able to pass needed vars to _compute_subset."""
        class_type = 'KDEMultivariateConditional'
        class_vars = (self.k_dep, self.dep_type, self.indep_type)
        return class_type, class_vars