
    !gqS                        d Z ddlmZ ddlZddlmZ ddlmZ ddl	m
Z d Zd:dZd	 ZeZd
 Zd Zd Zd Zd Zi Zeeeeeeeded<   d Zd d d ded<   deied<    G d d      Zd;dZd<dZd=dZd>d Z G d! d"      Zed#k(  rdd$lmZ ej@                  jC                  d%d&      Z! e"d'        e" ee!d(              ee!d(      Z# e"e#jI                                g d)Z%e%D ]  Z& e"e&e#jI                  e&d               e"d*       dd+l'm(Z(  e(e)      Z*dZ+ e,d      D ]W  Z-ej\                  j_                  e+      Z! ee!d(      Z#e%D ],  Z&e*e&   ja                  e#jI                  e&d      d   d,          . Y  ejb                  e%D  cg c]  } e*|    	 c}       Z2 e"d-d.jg                  e%              e"d/e2d0k  ji                  d,              e"d1e2d2k  ji                  d,              e"d3e2d4k  ji                  d,              ed5 d(d6       dZ+dZ5 e e       d7e+e5d8      Z6 ejn                  e5 ejb                  g d9      z        jq                  e9      Z: e"e6e:          yyc c} w )?at  More Goodness of fit tests

contains

GOF : 1 sample gof tests based on Stephens 1970, plus AD A^2
bootstrap : vectorized bootstrap p-values for gof test with fitted parameters


Created : 2011-05-21
Author : Josef Perktold

parts based on ks_2samp and kstest from scipy.stats
(license: Scipy BSD, but were completely rewritten by Josef Perktold)


References
----------

    )lmapN)distributions)cache_readonly)
kolmogorovc                    t        t        j                  | |f      \  } }| j                  d   }|j                  d   }t	        |       }t	        |      }t        j
                  |       } t        j
                  |      }t        j                  | |g      }t        j                  | |d      d|z  z  }t        j                  ||d      d|z  z  }t        j                  t        j                  ||z
              }t        j                  ||z  t        ||z         z        }	 t        |dz   d|z  z   |z        }	||	fS #  d}	Y ||	fS xY w)aA  
    Computes the Kolmogorov-Smirnof statistic on 2 samples.

    This is a two-sided test for the null hypothesis that 2 independent samples
    are drawn from the same continuous distribution.

    Parameters
    ----------
    a, b : sequence of 1-D ndarrays
        two arrays of sample observations assumed to be drawn from a continuous
        distribution, sample sizes can be different


    Returns
    -------
    D : float
        KS statistic
    p-value : float
        two-tailed p-value


    Notes
    -----

    This tests whether 2 samples are drawn from the same distribution. Note
    that, like in the case of the one-sample K-S test, the distribution is
    assumed to be continuous.

    This is the two-sided test, one-sided tests are not implemented.
    The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution.

    If the K-S statistic is small or the p-value is high, then we cannot
    reject the hypothesis that the distributions of the two samples
    are the same.

    Examples
    --------

    >>> from scipy import stats
    >>> import numpy as np
    >>> from scipy.stats import ks_2samp

    >>> #fix random seed to get the same result
    >>> np.random.seed(12345678)

    >>> n1 = 200  # size of first sample
    >>> n2 = 300  # size of second sample

    different distribution
    we can reject the null hypothesis since the pvalue is below 1%

    >>> rvs1 = stats.norm.rvs(size=n1,loc=0.,scale=1)
    >>> rvs2 = stats.norm.rvs(size=n2,loc=0.5,scale=1.5)
    >>> ks_2samp(rvs1,rvs2)
    (0.20833333333333337, 4.6674975515806989e-005)

    slightly different distribution
    we cannot reject the null hypothesis at a 10% or lower alpha since
    the pvalue at 0.144 is higher than 10%

    >>> rvs3 = stats.norm.rvs(size=n2,loc=0.01,scale=1.0)
    >>> ks_2samp(rvs1,rvs3)
    (0.10333333333333333, 0.14498781825751686)

    identical distribution
    we cannot reject the null hypothesis since the pvalue is high, 41%

    >>> rvs4 = stats.norm.rvs(size=n2,loc=0.0,scale=1.0)
    >>> ks_2samp(rvs1,rvs4)
    (0.07999999999999996, 0.41126949729859719)
    r   right)side      ?Q?)\(?)r   npasarrayshapelensortconcatenatesearchsortedmaxabsolutesqrtfloatksprob)
data1data2n1n2data_allcdf1cdf2denprobs
             f/var/www/dash_apps/app1/venv/lib/python3.12/site-packages/statsmodels/sandbox/distributions/gof_new.pyks_2sampr$      s%   P 

UEN3LE5	QB	QB	UB	UBGGENEGGENE~~uUm,H??5w7R@DOOE(83r6BD
r{{49%&A	BuRU|#	$Br$wtBw)* d7Nd7Ns   $D? ?Ec                    t        | t              rG|r|| k(  r5t        t        |       j                  }t        t        |       j
                  } nt        d      t        |t              rt        t        |      j                  }t        |       r d|i}t        j                   | |i |      }n t        j                  |       }t        |      } ||g| }|dv rTt        j                  d|dz         |z  |z
  j                         }	|dk(  r"|	t        j                  j                  |	|      fS |dv rQ|t        j                  d|      |z  z
  j                         }
|d	k(  r"|
t        j                  j                  |
|      fS |d
k(  rt        j                  	
g      }|dk(  r7|t        j                  j                  |t        j                   |      z        fS |dk(  rt        j                  j                  |t        j                   |      z        }|dkD  s|d|dz  dz  z
  kD  r7|t        j                  j                  |t        j                   |      z        fS |t        j                  j                  ||      dz  fS yy)a  
    Perform the Kolmogorov-Smirnov test for goodness of fit

    This performs a test of the distribution G(x) of an observed
    random variable against a given distribution F(x). Under the null
    hypothesis the two distributions are identical, G(x)=F(x). The
    alternative hypothesis can be either 'two_sided' (default), 'less'
    or 'greater'. The KS test is only valid for continuous distributions.

    Parameters
    ----------
    rvs : str or array or callable
        string: name of a distribution in scipy.stats

        array: 1-D observations of random variables

        callable: function to generate random variables, requires keyword
        argument `size`

    cdf : str or callable
        string: name of a distribution in scipy.stats, if rvs is a string then
        cdf can evaluate to `False` or be the same as rvs
        callable: function to evaluate cdf

    args : tuple, sequence
        distribution parameters, used if rvs or cdf are strings
    N : int
        sample size if rvs is string or callable
    alternative : 'two_sided' (default), 'less' or 'greater'
        defines the alternative hypothesis (see explanation)

    mode : 'approx' (default) or 'asymp'
        defines the distribution used for calculating p-value

        'approx' : use approximation to exact distribution of test statistic

        'asymp' : use asymptotic distribution of test statistic


    Returns
    -------
    D : float
        KS test statistic, either D, D+ or D-
    p-value :  float
        one-tailed or two-tailed p-value

    Notes
    -----

    In the one-sided test, the alternative is that the empirical
    cumulative distribution function of the random variable is "less"
    or "greater" than the cumulative distribution function F(x) of the
    hypothesis, G(x)<=F(x), resp. G(x)>=F(x).

    Examples
    --------

    >>> from scipy import stats
    >>> import numpy as np
    >>> from scipy.stats import kstest

    >>> x = np.linspace(-15,15,9)
    >>> kstest(x,'norm')
    (0.44435602715924361, 0.038850142705171065)

    >>> np.random.seed(987654321) # set random seed to get the same result
    >>> kstest('norm','',N=100)
    (0.058352892479417884, 0.88531190944151261)

    is equivalent to this

    >>> np.random.seed(987654321)
    >>> kstest(stats.norm.rvs(size=100),'norm')
    (0.058352892479417884, 0.88531190944151261)

    Test against one-sided alternative hypothesis:

    >>> np.random.seed(987654321)

    Shift distribution to larger values, so that cdf_dgp(x)< norm.cdf(x):

    >>> x = stats.norm.rvs(loc=0.2, size=100)
    >>> kstest(x,'norm', alternative = 'less')
    (0.12464329735846891, 0.040989164077641749)

    Reject equal distribution against alternative hypothesis: less

    >>> kstest(x,'norm', alternative = 'greater')
    (0.0072115233216311081, 0.98531158590396395)

    Do not reject equal distribution against alternative hypothesis: greater

    >>> kstest(x,'norm', mode='asymp')
    (0.12464329735846891, 0.08944488871182088)


    Testing t distributed random variables against normal distribution:

    With 100 degrees of freedom the t distribution looks close to the normal
    distribution, and the kstest does not reject the hypothesis that the sample
    came from the normal distribution

    >>> np.random.seed(987654321)
    >>> stats.kstest(stats.t.rvs(100,size=100),'norm')
    (0.072018929165471257, 0.67630062862479168)

    With 3 degrees of freedom the t distribution looks sufficiently different
    from the normal distribution, that we can reject the hypothesis that the
    sample came from the normal distribution at a alpha=10% level

    >>> np.random.seed(987654321)
    >>> stats.kstest(stats.t.rvs(3,size=100),'norm')
    (0.131016895759829, 0.058826222555312224)
    5if rvs is string, cdf has to be the same distributionsize)	two_sidedgreaterr
      r)   )r(   less        r+   r(   asympapproxj
  皙?333333?     @@   N)
isinstancestrgetattrr   cdfrvsAttributeErrorcallabler   r   r   aranger   ksonesf	kstwobignr   )r8   r7   argsNalternativemodekwdsvalscdfvalsDplusDminDpval_twos                r#   kstestrJ   }   s>   f #s--11C--11C !XYY #smS)--}qzwwsD(4()wws|I$G..3!$Q&0557)#---00q999++"))C+A--224& ,,//Q777k!FFE$< 7?m--002771:>>>8$..11!BGGAJ,?H4x8dQsU6\&99-1144Qrwwqz\BBB---0015a777 	 "    c                    t        j                  |      dz   dt        j                  |      z  z   }| |z  }t        j                  d|dz  z        }t        j                  | t        j                  g d      kD        }|||fS )Nr   r   r3   )=
ףp=?rN   r
   r   r   expsumarraystatnobs
mod_factorstat_modifiedpvaldigitss         r#   dplus_st70_upprZ     sr    %rwwt}(<<J:%M66"}a''(DVVD288$6778F$&&rK   c                    t        j                  |      dz   dt        j                  |      z  z   }| |z  }dt        j                  d|dz  z        z  }t        j                  | t        j                  g d      kD        }|||fS )Nr   r   r3   rM   )Q?r\   gHzG?rO   rS   s         r#   
d_st70_uppr]   &  sw    %rwwt}(<<J:%Mrvvb=!++,,DVVD288$6778F$&&rK   c                    t        j                  |      dz   dt        j                  |      z  z   }| |z  }|dz  }d|z  dz
  t        j                  d|z        z  }t        j                  | t        j                  g d      kD        }|||fS )Ngףp=
?gQ?r3      rM   )(\?r`   g)\(?rO   )rT   rU   rV   rW   zsqurX   rY   s          r#   
v_st70_upprb   .  s    &)==J:%M!DHqLBFF29--DVVD288$6778F$&&rK   c                     d|z  }| d|z  z
  d|dz  z  z   d|z   z  }dt        j                  dd|z  z
        z  }t         j                  }|||fS )	Nr
   g?g333333?r3   r*   皙?gRQ@   )r   rP   nanrT   rU   nobsinvrW   rX   rY   s         r#   wsqu_st70_uppri   7  sc    4iGC'M)C'1*,<<WMM"&&M 1122DVVF$&&rK   c                    d|z  }| d|z  z
  d|dz  z  z   }|dd|z  z   z  }dt        j                  d|z  t         j                  dz  z        z  }t        j                  | t        j                  g d      kD        }|||fS )Nr
   皙?r3   r*   r0   rM   )(\?rl   g(\?r   rP   pirQ   rR   rg   s         r#   usqu_st70_uppro   ?  s    4iGC'M)C'1*,<<Ma#-'(MrvvcM)BEE1H455DVVD288$6778F$&&rK   c                    d|z  }| d|z  z
  d|dz  z  z   }|dd|z  z   z  }dt        j                  d|z  d	z  t         j                  dz  z        z  }t        j                  | t        j                  g d
      kD        }|||fS )Nr
   gffffff??r3   r*   gGz?g|?5^?rM          @)r   r   g!rh?rm   rg   s         r#   
a_st70_upprs   H  s    4iGC'M)C'1*,<<Ma$.()M266#-2RUUAX=>>DVVD288$7889F$&&rK   )d_plusd_minusr    vwsquusquastephens70uppc                    t         j                  j                  | t        j                  |      z        }|dkD  s|d|dz  dz  z
  kD  rF| t         j                  j                  | t        j                  |      z        t        j
                  fS | t         j                  j                  | |      dz  t        j
                  fS )Nr/   r0   r1   r2   r3   )r   r>   r=   r   r   rf   r<   )rH   r@   rI   s      r#   pval_kstest_approxr|   ^  s    &&))!BGGAJ,7H4x8dQsU6\11-)),,Qrwwqz\:BFFBB-%%((1-a/77rK   c                 d    | t         j                  j                  | |      t        j                  fS Nr   r<   r=   r   rf   )rF   r@   s     r#   <lambda>r   f  s$    (;(;(>(>ua(H"&& Q rK   c                 d    | t         j                  j                  | |      t        j                  fS r~   r   )rG   r@   s     r#   r   r   g  s$    }':':'='=d1'Ervv N rK   c                     | t         j                  j                  | t        j                  |      z        t        j
                  fS r~   )r   r>   r=   r   r   rf   )rH   r@   s     r#   r   r   h  s.    =2255a
lCRVVL rK   )rt   ru   r    scipyr    scipy_approxc                       e Zd ZdZddZed        Zed        Zed        Zed        Z	ed        Z
ed        Zed	        Zed
        ZddZy)GOFaP  One Sample Goodness of Fit tests

    includes Kolmogorov-Smirnov D, D+, D-, Kuiper V, Cramer-von Mises W^2, U^2 and
    Anderson-Darling A, A^2. The p-values for all tests except for A^2 are based on
    the approximatiom given in Stephens 1970. A^2 has currently no p-values. For
    the Kolmogorov-Smirnov test the tests as given in scipy.stats are also available
    as options.




    design: I might want to retest with different distributions, to calculate
    data summary statistics only once, or add separate class that holds
    summary statistics and data (sounds good).




    c                    t        |t              rG|r||k(  r5t        t        |      j                  }t        t        |      j
                  }nt        d      t        |t              rt        t        |      j                  }t        |      r d|i}t        j                   ||i |      }n t        j                  |      }t        |      } ||g| }|| _        || _        || _        y )Nr&   r'   )r4   r5   r6   r   r7   r8   r9   r:   r   r   r   rU   vals_sortedrE   )selfr8   r7   r?   r@   rC   rD   rE   s           r#   __init__zGOF.__init__  s    c3SCZmS155mS155$%\]] c3--11CC=1:D773,t,-D773<DD	Ad"T"	rK   c                     | j                   }| j                  }t        j                  d|dz         |z  |z
  j	                         S )Nr
   r*   rU   rE   r   r;   r   r   rU   rE   s      r#   rt   z
GOF.d_plus  s<    yy,,		#tAv&t+g5::<<rK   c                     | j                   }| j                  }|t        j                  d|      |z  z
  j	                         S )Nr,   r   r   s      r#   ru   zGOF.d_minus  s8    yy,,"))C.t3388::rK   c                 X    t        j                  | j                  | j                  g      S r~   )r   r   rt   ru   r   s    r#   r    zGOF.d  s    vvt{{DLL122rK   c                 4    | j                   | j                  z   S )Kuiper)rt   ru   r   s    r#   rv   zGOF.v  s     {{T\\))rK   c                     | j                   }| j                  }|dt        j                  d|dz         z  dz
  |z  dz  z
  dz  j	                         d|z  dz  z   }|S )zCramer von Misesrr   r
   r*   r3   g      (@)rU   rE   r   r;   rQ   )r   rU   rE   rw   s       r#   rw   zGOF.wsqu  sf     yy,,B2tAv!66:D@CCaGLLNDrK   c                     | j                   }| j                  }| j                  ||j                         dz
  dz  z  z
  }|S )N      ?r3   )rU   rE   rw   mean)r   rU   rE   rx   s       r#   rx   zGOF.usqu  s<    yy,,yy47<<>C#7!";;;rK   c                     | j                   }| j                  }d}t        d|      D ]0  }||   |d | z
  }|dkD  }d||   z
  ||<   ||j                         z  }2 |dz  d|z  |z  z
  }|S )Nr   r*   r   g      @rr   )rU   rE   rangerQ   )r   rU   rE   msumjmjmaskry   s           r#   ry   zGOF.a  s    yy,, q 	Agbqk)BHD2d8|BtHBFFHD		 2IT	D((rK   c           	         | j                   }| j                  }dt        j                  d|dz         z  dz
  t        j                  |      t        j                  d|ddd   z
        z   z  j                          |z  |z
  }|S )z4Stephens 1974, does not have p-value formula for A^2rr   r
   r*   N)rU   rE   r   r;   logrQ   )r   rU   rE   asqus       r#   r   zGOF.asqu  s     yy,,ryyT!V,,q0266!GDbDM/#::=>AceDDHIKOP rK   c                     t        | |      }|dk(  rt        |   |   || j                        |fS t        |   |   || j                        S )z


        rz   )r6   	gof_pvalsrU   )r   testidpvalsrT   s       r#   get_testzGOF.get_test  sS    
 tV$O#U#F+D$))<dBBU#F+D$))<<rK   N)    )r    rz   )__name__
__module____qualname____doc__r   r   rt   ru   r    rv   rw   rx   ry   r   r   r   rK   r#   r   r   n  s    .4 = =
 ; ;
 3 3 * *        	=rK   r   d   c                 *   ddl m}  |t              }t        d      D ]K  } | |      }t	        ||      }t
        D ],  }||   j                  |j                  |d      d   d          . M t        j                  t
        D cg c]  }||   	 c}      }	t        ddj                  t
                     t        d|	d	k  j                  d             t        d
|	dk  j                  d             t        d|	dk  j                  d             y c c}w )Nr   defaultdicti  rz   r*   	               at 0.01:{Gz?at 0.05:rd   at 0.10:rk   )collectionsr   listr   r   all_gofsappendr   r   rR   printjoinr   )
randfndistrrU   r   resultsir8   gofttiresarrs
             r#   gof_mcr     s    '$G4[ ITl3 	IBBKt}}RA!DQGH	II XXX6rwr{67F	+x}}X./	*v}**1-.	*v}**1-.	*v|))!,-	 7s   Dc           	         t        | j                        }| j                  |   }t        d      g|z  }dg|z  }t        d      ||<   t        ddd      ||<   dt        j                  d|dz         t        |         z  dz
  t        j                  |       t        j                  d| t        |         z
        z   z  |z  j                  |       |z
  }|S )z.vectorized Anderson Darling A^2, Stephens 1974Nr   rr   r
   r*   )r   r   slicer   r;   tupler   rQ   )rE   axisndimrU   slice_reverseislicer   s          r#   asquarer     s    w}}D==D4[MD(MVd]F;F4LdB/M$299Ra(v77!;VVG_rvvam0D(E&EFFHHLMNQcRViXD KrK      c                    ||t        d      t        t        j                  |t	        |      z              }d}t        |      D ]  } | j                  |fi d||fi}	| j                  |	d      }
t        d |
      }
t        j                  | j                  |	|
      d      }t        |d      }|||k\  j                         z  } |t	        ||z        z  S  | j                  |fi d||fi}	| j                  |	d      }
t        d |
      }
t        j                  | j                  |	|
      d      }t        |d      }|t        j                  |      }|S ||k\  j                         S )a  Monte Carlo (or parametric bootstrap) p-values for gof

    currently hardcoded for A^2 only

    assumes vectorized fit_vec method,
    builds and analyses (nobs, nrep) sample in one step

    rename function to less generic

    this works also with nrep=1

    zusing batching requires a valuer   r'   r*   r   c                 .    t        j                  | d      S Nr*   r   expand_dimsxs    r#   r   zbootstrap.<locals>.<lambda>;  s    BNN1a$8 rK   c                 .    t        j                  | d      S r   r   r   s    r#   r   zbootstrap.<locals>.<lambda>D  s    q! 4 rK   )
ValueErrorintr   ceilr   r   r8   fit_vecr   r   r7   r   rQ   r   )r   r?   rU   nrepvalue
batch_sizen_batchcountirepr8   paramsrE   rT   stat_sorteds                 r#   	bootstrapr     st   , =>??bggd5#4456'N 	+D%))D@VZ,>$?@C]]3Q]/F8&AFggeiiV41=G7+Ddem((**E	+ uWz1222 eii6t 56s+4f=''%))C0q9wQ'=''$-KEM''))rK   c                     d}t        |      D ]b  } |j                  |fi d|i}|j                  |      }t        j                  |j                  ||            }	t        |	d      }
||
| k\  z  }d |dz  |z  S )zMonte Carlo (or parametric bootstrap) p-values for gof

    currently hardcoded for A^2 only

    non vectorized, loops over all parametric bootstrap replications and calculates
    and returns specific p-value,

    rename function to less generic

    r   r'   r   r
   )r   r8   r   r   r   r7   r   )r   r   r?   rU   r   r   r   r8   r   rE   rT   s              r#   
bootstrap2r   O  s    $ Ed !eii..s#''%))C01wQ'$%- ! 2:rK   c                   $    e Zd ZdZddZd Zd Zy)NewNormz-just a holder for modified distributions
    c                 F    |j                  |      |j                  |      fS r~   )r   std)r   r   r   s      r#   r   zNewNorm.fit_vecp  s    vvd|QUU4[((rK   c                 R    t         j                  j                  ||d   |d         S )Nr   r*   )locscale)r   normr7   )r   r   r?   s      r#   r7   zNewNorm.cdfs  s(    !!%%aT!WDG%DDrK   c                 b    |d   }|d   }||t         j                  j                  |      z  z   S )Nr   r*   r'   )r   r   r8   )r   r?   r'   r   r   s        r#   r8   zNewNorm.rvsv  s8    G1gU]//333>>>>rK   Nr   )r   r   r   r   r   r7   r8   r   rK   r#   r   r   l  s    )E?rK   r   __main__)stats   r   zscipy kstestr   )r    rt   ru   rv   rw   rx   ry   z
Is it correctly sized?r   r*   r   r   r   r   r   rd   r   rk   c                 D    t         j                  j                  d|       S )Nr   r   )r   tr8   rU   s    r#   r   r     s    AD1 rK   r   )r   r*   )r?   rU   r   r   )gGz?gffffff?rq   )r   r   r(   r.   )r   r   )r   r   r   NN)r   r   r   );r   statsmodels.compat.pythonr   numpyr   scipy.statsr   statsmodels.tools.decoratorsr   scipy.specialr   r   r$   rJ   rZ   dminus_st70_uppr]   rb   ri   ro   rs   r   r|   r   r   r   r   r   r   r   r   r   r   r8   r   r   r   r   r   r   r   r   r   rU   r   r   randomrandnr   rR   r   r   r   r   btfloorastyper   
quantindex)r   s   0r#   <module>r     s  & +  % 7 .Z~Y8|' !''''' 	 


	/ 8 RN
L	'  
	. ~= ~=N."..*d:? ?& z
''++ac+
"C	.	&f
sFD	$--/CH 6b$--O456 

$%'$GD3Z Iiiood#3 	IBBKt}}RA!DQGH	II RXXX6rwr{67F	+x}}X./	*v}**1-.	*v}**1-.	*v|))!,-
16DDD	795t$d	KB$*;!<<=DDSIJ	"Z.Q 0 7s   I1