Source code for environmentaltools.temporal.regimes

from loguru import logger

import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.interpolate import RegularGridInterpolator
from scipy.io import loadmat
from scipy.optimize import fsolve
from scipy.special import gamma

from environmentaltools.common import utils


def _fit_lmom_for_bca(data, tr, func, nyears):
    """Internal helper function for BCa jackknife fitting.
    
    Fits distribution using L-moments for BCa acceleration factor computation.
    Used internally by confidence_intervals with resample='bca'.
    
    Parameters
    ----------
    data : array-like
        Jackknife resampled data (with one observation removed)
    tr : array-like
        Return periods to evaluate
    func : str or scipy.stats distribution
        Distribution type:
        - For L-MOM: 'genpareto', 'expon', 'genextreme', 'gumbel_r'
        - For MLE: scipy.stats distribution object
    nyears : int
        Number of years in dataset
        
    Returns
    -------
    np.ndarray
        Parameter estimates and return period values
        
    Notes
    -----
    This function attempts to match the fitting method and distribution type
    from the original analysis to ensure consistent jackknife estimates.
    """
    # Handle L-moments distributions
    if isinstance(func, str):
        if (func == "genpareto") or (func == "expon"):
            return lmom_genpareto(data, tr, nyears)
        elif (func == "genextreme") or (func == "gumbel_r"):
            params = l_mom(data, func)
            pri = 1 - 1.0 / tr
            qtr = st.genextreme.ppf(pri, *params)
            return np.hstack((*params, 1, qtr))
        else:
            # Try to use probability_model_fit for unknown string distributions
            try:
                return probability_model_fit(data, tr, "L-MOM", func, nyears)
            except:
                # Fallback: assume GPD
                return lmom_genpareto(data, tr, nyears)
    
    # Handle scipy.stats distribution objects (MLE case)
    else:
        try:
            return probability_model_fit(data, tr, "MLE", func)
        except:
            # If MLE fails, try fitting as GPD
            return lmom_genpareto(data, tr, nyears)



[docs]
def confidence_intervals(boot, alpha, resample, *args):
    """Calculate confidence intervals using percentile or BCa bootstrap methods.
    
    Computes confidence intervals for bootstrap resampling results using either
    standard percentile method or bias-corrected and accelerated (BCa) method.

    Parameters
    ----------
    boot : np.ndarray
        Bootstrap resampling matrix with shape (n_simulations, n_parameters)
    alpha : float
        Significance level (e.g., 0.05 for 95% confidence intervals)
    resample : {'standard', 'bca'}
        Method for estimating confidence intervals:
        
        - 'standard': Simple percentile method
        - 'bca': Bias-corrected and accelerated bootstrap
        
    *args : tuple, optional
        Additional arguments required for BCa method:
        (orig, nosim, peaks, tri, tipo, nyears) where:
        
        - orig : array-like
            Original parameter estimates
        - nosim : int
            Number of bootstrap simulations
        - peaks : array-like
            Original peak time series
        - tri : array-like
            Return periods
        - tipo : str or scipy.stats distribution
            Distribution type
        - nyears : int
            Number of years in the dataset

    Returns
    -------
    np.ndarray
        Confidence interval bounds with shape (2, n_parameters):
        
        - ci[0, :] : lower bounds
        - ci[1, :] : upper bounds

    Notes
    -----
    The BCa method adjusts for bias and skewness in the bootstrap distribution
    using acceleration (a) and bias-correction (z0) factors. It provides more
    accurate intervals than standard percentiles, especially for small samples
    or skewed distributions.
    
    The acceleration factor is computed using jackknife influence values:
    a = sum(U^3) / [6 * sum(U^2)^(3/2)]
    
    where U are the jackknife deviations.

    References
    ----------
    Efron, B. (1987). "Better Bootstrap Confidence Intervals". 
    Journal of the American Statistical Association, 82(397), 171-185.

    Examples
    --------
    >>> boot = np.random.randn(1000, 3)  # Bootstrap samples
    >>> ci = confidence_intervals(boot, alpha=0.05, resample='standard')
    >>> print(f"95% CI: [{ci[0, 0]:.3f}, {ci[1, 0]:.3f}]")
    """

    if resample == "standard":
        ci = np.percentile(boot, np.array((alpha / 2, 1 - alpha / 2)) * 100, axis=0)
    elif resample == "bca":
        orig, nosim, peaks, tri, tipo, nyears = args[0]
        npeaks, norig = np.size(peaks), np.size(orig)
        pz0 = np.zeros(norig)
        for i in range(0, norig):
            mask = boot[:, i] < orig[i]
            pz0[i] = np.sum(mask) / float(nosim)
        z0 = st.norm.ppf(pz0)
        u = np.zeros([npeaks, norig])
        id_ = np.arange(npeaks)
        for i in range(0, npeaks):
            # Jackknife: fit distribution with i-th observation removed
            # Compute influence values for BCa acceleration factor
            u[i, :] = _fit_lmom_for_bca(peaks[id_ != i], tri, tipo, nyears) - orig
        a = np.sum(u ** 3) / np.sum(u ** 2) ** (3 / 2) / 6
        zalpha = st.norm.ppf((alpha / 2, 1 - alpha / 2))
        prob = np.zeros([2, norig])
        prob[0, :] = st.norm.cdf(z0 + (z0 + zalpha[0]) / (1 - a * (z0 + zalpha[0])))
        prob[1, :] = st.norm.cdf(z0 + (z0 + zalpha[1]) / (1 - a * (z0 + zalpha[1])))
        ci = np.zeros([2, norig])
        for i in range(0, norig):
            if not (np.isnan(prob[0, i]) | np.isnan(prob[1, i])):
                ci[:, i] = np.percentile(boot[:, i], prob[:, i] * 100, axis=0)

    return ci




[docs]
def bootstrapping(peaks, tri, method, func, nyears, nosim, resample):
    """Perform bootstrap resampling for return period estimation.
    
    Computes distribution parameters and return period values using bootstrap
    resampling with parametric or non-parametric methods.

    Parameters
    ----------
    peaks : array-like
        Annual maxima or peak over threshold time series
    tri : array-like
        Return periods to evaluate (in years)
    method : {'MLE', 'L-MOM'}
        Fitting method for the probability distribution:
        
        - 'MLE': Maximum Likelihood Estimation
        - 'L-MOM': L-moments method
        
    func : str or scipy.stats distribution
        Probability distribution name or object
    nyears : int
        Number of years in the dataset (for POT analysis)
    nosim : int
        Number of bootstrap simulations
    resample : {'parametric', 'non-parametric'}
        Resampling method:
        
        - 'parametric': Resample from fitted distribution
        - 'non-parametric': Resample with replacement from original data

    Returns
    -------
    boot : np.ndarray
        Bootstrap results with shape (nosim, n_params + 1 + len(tri))
        Contains parameters, nu, and return period values for each simulation
    orig : np.ndarray
        Original parameter estimates from the data

    Notes
    -----
    Parametric bootstrap:
    1. Fit distribution to original data
    2. Generate synthetic data from fitted distribution
    3. Refit distribution to synthetic data
    4. Repeat nosim times
    
    Non-parametric bootstrap:
    1. Resample with replacement from original data
    2. Fit distribution to resampled data
    3. Repeat nosim times
    
    The function handles special cases for GPD and GEV distributions when
    using L-moments.

    Examples
    --------
    >>> import numpy as np
    >>> peaks = np.random.weibull(1.5, 50)
    >>> tri = np.array([10, 50, 100])
    >>> boot, orig = bootstrapping(
    ...     peaks, tri, 'L-MOM', 'genextreme', 50, 1000, 'parametric'
    ... )
    """

    npeaks = np.size(peaks)
    orig = probability_model_fit(peaks, tri, method, func, nyears)
    boot = np.zeros([nosim, np.size(orig)])
    if resample == "parametric":
        if method == "MLE":
            params = orig[: func.numargs + 2]
            logger.info(
                "Computing bootstrapping of {} simulations using MLE and parametric methods for {} probability model. It will take a while.".format(
                    str(nosim), func.name
                )
            )
            for j in range(0, nosim):
                boot[j, :] = probability_model_fit(
                    func.rvs(*params, npeaks), tri, method, func
                )
        elif method == "L-MOM":
            logger.info(
                "Computing bootstrapping of {} simulations using L-MOM and parametric methods for {} probability model.".format(
                    str(nosim), func
                )
            )
            if (func == "expon") | (func == "genpareto"):
                for j in range(0, nosim):
                    boot[j, :] = probability_model_fit(
                        st.genpareto.rvs(orig[0], orig[1], orig[2], npeaks),
                        tri,
                        method,
                        func,
                        nyears,
                    )
            elif (func == "genextreme") | (func == "gumbel_r"):
                for j in range(0, nosim):
                    boot[j, :] = probability_model_fit(
                        st.genextreme.rvs(orig[0], orig[1], orig[2], npeaks),
                        tri,
                        method,
                        func,
                    )
    elif resample == "non-parametric":
        for j in range(0, nosim):
            logger.info(
                "Computing bootstrapping of {} simulations using {} and non-parametric methods for {} probability model. It will take a while.".format(
                    str(nosim), method, func.name
                )
            )
            boot[j, :] = probability_model_fit(
                np.random.choice(peaks, npeaks), tri, method, func, nyears
            )

    return boot, orig




[docs]
def probability_model_fit(data, tr, method, func, *nyears):
    """Estimate probability distribution parameters and return period values.
    
    Fits a probability distribution to data using specified method and
    computes return period values for given return periods.

    Parameters
    ----------
    data : array-like
        Time series data (annual maxima or peaks over threshold)
    tr : array-like
        Return periods in years (e.g., [10, 50, 100, 1000])
    method : {'MLE', 'L-MOM'}
        Parameter estimation method:
        
        - 'MLE': Maximum Likelihood Estimation (uses scipy.stats.fit)
        - 'L-MOM': L-moments method (custom implementation)
        
    func : str or scipy.stats distribution
        Probability distribution:
        
        - For MLE: scipy.stats distribution object
        - For L-MOM: string in ['expon', 'genpareto', 'genextreme', 'gumbel_r']
        
    *nyears : int, optional
        Number of years in dataset (required for POT analysis with GPD)

    Returns
    -------
    np.ndarray
        Array containing [shape, loc, scale, nu, qtr_1, qtr_2, ...] where:
        
        - shape, loc, scale : distribution parameters
        - nu : average number of events per year
        - qtr_i : return level for return period tr[i]

    Notes
    -----
    For annual maxima (GEV, Gumbel):
    - nu = 1 (one event per year)
    - Return levels: x_T = F^(-1)(1 - 1/T)
    
    For peaks over threshold (GPD):
    - nu = n_peaks / n_years
    - Return levels: x_T = F^(-1)(1 - 1/(T*nu))
    
    L-moments method provides more robust parameter estimates for small
    samples compared to MLE.

    See Also
    --------
    l_mom : L-moments parameter estimation
    bootstrapping : Bootstrap resampling for uncertainty estimation

    Examples
    --------
    >>> import scipy.stats as st
    >>> data = st.genextreme.rvs(0.1, loc=2, scale=0.5, size=50)
    >>> tr = np.array([10, 50, 100])
    >>> params = probability_model_fit(data, tr, 'MLE', st.genextreme)
    >>> print(f"Shape: {params[0]:.3f}, Loc: {params[1]:.3f}, Scale: {params[2]:.3f}")
    """

    if method == "MLE":
        params = func.fit(data)
        pri = 1 - 1.0 / tr
        qtr = func.ppf(pri, *params)
        nu = 1

    elif method == "L-MOM":
        params = l_mom(data, func)
        if (func == "expon") | (func == "genpareto"):
            nu = np.size(data) / nyears[0]
            pri = 1 - 1 / (tr * nu)
            qtr = st.genpareto.ppf(pri, *params)
        elif (func == "genextreme") | (func == "gumbel_r"):
            pri = 1 - 1.0 / tr
            qtr = st.genextreme.ppf(pri, *params)
            nu = 1

    return np.hstack((*params, nu, qtr))




[docs]
def l_mom(data, func):
    """Estimate distribution parameters using L-moments method.
    
    Computes distribution parameters from sample L-moments (linear combinations
    of order statistics). Provides robust estimates especially for small samples.

    Parameters
    ----------
    data : array-like
        Sample data to fit
    func : {'genpareto', 'expon', 'genextreme', 'gumbel_r'}
        Distribution type to fit:
        
        - 'genpareto': Generalized Pareto Distribution (3 parameters)
        - 'expon': Exponential distribution (2 parameters, k=0)
        - 'genextreme': Generalized Extreme Value distribution (3 parameters)
        - 'gumbel_r': Gumbel distribution (2 parameters, k=0)

    Returns
    -------
    list
        Distribution parameters [k, mu, sig] where:
        
        - k : float
            Shape parameter (0 for Exponential and Gumbel)
        - mu : float
            Location parameter
        - sig : float
            Scale parameter

    Notes
    -----
    L-moments are linear combinations of order statistics:
    
    - λ₁ = E[X] (mean)
    - λ₂ = E[X_{2:2} - X_{1:2}]/2 (scale)
    - λ₃ related to skewness
    
    L-moment ratios (τ₃ = λ₃/λ₂) are used to estimate shape parameters.
    
    For GEV, the shape parameter k is solved numerically from:
    2(1-3^(-k))/(1-2^(-k)) - 3 = τ₃
    
    Sample L-moments use plotting positions: p_i = (i-0.35)/n

    References
    ----------
    Hosking, J.R.M. (1990). "L-moments: Analysis and Estimation of 
    Distributions Using Linear Combinations of Order Statistics". 
    Journal of the Royal Statistical Society, Series B, 52(1), 105-124.

    Examples
    --------
    >>> import numpy as np
    >>> data = np.random.exponential(2.0, 100)
    >>> k, mu, sig = l_mom(data, 'expon')
    >>> print(f"Exponential: k={k:.3f}, mu={mu:.3f}, sig={sig:.3f}")
    """

    data = np.sort(data)
    n = np.size(data)
    p = (np.arange(1, n + 1) - 0.35) / n
    lam1, lam2 = np.mean(data), np.mean((2 * p - 1) * data)
    if func == "genpareto":
        lam3 = np.mean((1 - 6 * p + 6 * p ** 2) * data)
        tau3 = lam3 / lam2
        k = (3 * tau3 - 1) / (1 + tau3)
        sig = lam2 * (1 - k) * (2 - k)
        mu = lam1 - sig / (1 - k)
    elif func == "expon":
        sig = 2 * lam2
        mu = lam1 - sig
        k = 0
    elif func == "genextreme":
        lam3 = np.mean((1 - 6 * p + 6 * p ** 2) * data)
        tau3 = lam3 / lam2

        def fun(x, tau3):
            return 2.0 * (1 - 3.0 ** -x) / (1 - 2.0 ** -x) - 3 - tau3

        c = 2 / (3 + tau3) - np.log(2) / np.log(3)
        k = 7.859 * c + 2.9554 * c ** 2
        k = fsolve(fun, k, args=(tau3))
        sig = lam2 * k / (1 - 2 ** (-k)) / gamma(1 + k)
        mu = lam1 - sig * (1 - gamma(1 + k)) / k
    elif func == "gumbel_r":
        sig = lam2 / np.log(2)
        mu = lam1 - 0.5772 * sig
        k = 0

    return [k, mu, sig]




[docs]
def lmom_genpareto(data, tr, nyears):
    """Fit Generalized Pareto Distribution using L-moments with diagnostic metrics.
    
    Estimates GPD parameters via L-moments and computes return period values
    along with additional diagnostic statistics for POT analysis.

    Parameters
    ----------
    data : array-like
        Peak over threshold data (exceedances)
    tr : array-like
        Return periods in years to evaluate
    nyears : int
        Number of years in the dataset

    Returns
    -------
    np.ndarray
        Array containing [k, mu, sig, mrlp, sigmodif, nu, qtr_1, qtr_2, ...]
        
        - k : float
            Shape parameter
        - mu : float
            Location parameter (threshold)
        - sig : float
            Scale parameter
        - mrlp : float
            Mean residual life plot value: E[X - mu | X > mu]
        - sigmodif : float
            Modified scale: sig - k*mu
        - nu : float
            Average number of exceedances per year
        - qtr_i : float
            Return level for return period tr[i]

    Notes
    -----
    For peaks over threshold analysis with GPD:
    
    - Return levels: x_T = μ + (σ/ξ)[(Tλ)^ξ - 1]
    - Mean excess: E[X-u|X>u] = (σ + ξu)/(1-ξ)
    
    The modified scale parameter is used in some POT formulations.

    Examples
    --------
    >>> data = np.random.exponential(2.0, 100)
    >>> tr = np.array([10, 50, 100])
    >>> results = lmom_genpareto(data, tr, nyears=10)
    >>> k, mu, sig = results[0], results[1], results[2]
    """

    k, mu, sig = l_mom(data, "genpareto")
    mrlp = np.mean(data - mu)
    sigmodif = sig - k * mu
    nu = np.size(data) / nyears
    pri = 1 - 1 / (tr * nu)
    qtr = st.genpareto.ppf(pri, k, mu, sig)
    return np.hstack((k, mu, sig, mrlp, sigmodif, nu, qtr))




[docs]
def pot_method(
    df: pd.DataFrame,
    var_: str,
    window_size: int,
    alpha: float = 0.05,
    sim_no: int = 10000,
    method: str ="nearest"
):
    """Perform Peaks Over Threshold (POT) analysis with multiple thresholds.
    
    Analyzes extreme values using Generalized Pareto Distribution fitted with
    L-moments. Tests multiple threshold values and provides bootstrap confidence
    intervals for return period estimates.

    Parameters
    ----------
    df : pd.DataFrame
        Time series data with datetime index
    var_ : str
        Name of the variable column to analyze
    window_size : int
        Window size for extracting independent maxima events
    alpha : float, default=0.05
        Significance level for confidence intervals (0.05 gives 95% CI)
    sim_no : int, default=10000
        Number of bootstrap simulations
    method : str, default='nearest'
        Interpolation method for p-value table (currently commented out)

    Returns
    -------
    dict
        Results dictionary with keys:
        
        - 'mean_value_lmom' : np.ndarray
            Shape (n_thresholds, n_params+6+len(tr_eval))
            Mean parameter estimates and return levels for each threshold
        - 'upper_lim' : np.ndarray
            Upper confidence interval bounds
        - 'lower_lim' : np.ndarray
            Lower confidence interval bounds
        - 'au2_lmom' : np.ndarray
            Anderson-Darling test statistics (currently zeros)
        - 'au2pv_lmom' : np.ndarray
            Anderson-Darling p-values (currently zeros)
        - 'nyears' : int
            Number of years in the dataset
        - 'thresholds' : np.ndarray
            Threshold values tested (90th to 99.9th percentiles)
        - 'tr_eval' : np.ndarray
            Return periods evaluated [1, 50, 100, 1000] years

    Notes
    -----
    The POT method:
    
    1. Extracts independent events using moving window maxima
    2. Tests thresholds from 90th to 99.9th percentiles
    3. For each threshold:
       - Fits GPD using L-moments
       - Computes bootstrap confidence intervals
       - (Optionally) performs Anderson-Darling goodness-of-fit test
    
    Higher thresholds provide better asymptotic approximation but fewer data
    points. The optimal threshold balances bias-variance tradeoff.

    References
    ----------
    Coles, S. (2001). "An Introduction to Statistical Modeling of 
    Extreme Values". Springer.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'Hs': np.random.weibull(1.5, 1000)
    ... }, index=pd.date_range('2000', periods=1000, freq='D'))
    >>> results = pot_method(df, 'Hs', window_size=48, alpha=0.05)
    >>> print(f"Number of thresholds tested: {len(results['thresholds'])}")
    """
    # Initialize evaluation return periods
    tr_eval = np.array([1,50, 100, 1000])

    thresholds = np.percentile(
        df[var_], np.hstack([np.linspace(90, 99, 10), np.linspace(99.1, 99.9, 9)])
    )

    no_thres = np.size(thresholds)

    results = {}
    results["mean_value_lmom"], results["upper_lim"], results["lower_lim"], results["au2_lmom"], results["au2pv_lmom"] = (
        np.zeros([no_thres, np.size(tr_eval) + 6]),
        np.zeros([no_thres, np.size(tr_eval) + 6]),
        np.zeros([no_thres, np.size(tr_eval) + 6]),
        np.zeros(no_thres),
        np.zeros(no_thres),
    )

    # Load p-value table for Anderson-Darling test and create the interpolator
    # data = loadmat(
    #     os.path.join(os.path.dirname(__file__), "../", "utils/misc/PVAL_AU2_LMOM.mat")
    # )

    # pvallmom, au2val, ko, ndata = (
    #     data["PVAL"],
    #     data["AU2VAL"][0],
    #     data["Ko"][0],
    #     data["NDATA"][0],
    # )

    # interpolator = RegularGridInterpolator(
    #     (ndata, ko, au2val),
    #     pvallmom,
    #     method=method,
    #     bounds_error=False,
    #     fill_value=None,
    # )

    # Compute the POT analysis
    logger.info(
        "Computing POT analysis for a window size of "
        + str(df.index[window_size] - df.index[0])
    )
    df_max_events = utils.max_moving(df[var_], window_size)
    event = df_max_events.loc[
        df_max_events[var_] > np.percentile(df[var_], 90), var_
    ].values[:]
    nyears = df.index.year.max() - df.index.year.min()

    # Perform analysis for each threshold
    for i in range(no_thres):
        logger.info(
            "Threshold "
            + str(i + 1)
            + " of "
            + str(no_thres)
            + " ("
            + str(np.round(thresholds[i], decimals=3))
            + ")."
        )
        events_for_threshold_i = event[event > thresholds[i]]
        n_evi = np.size(events_for_threshold_i)
        results["mean_value_lmom"][i, :] = lmom_genpareto(events_for_threshold_i, tr_eval, nyears)

        # Bootstrap resampling using L-moments for Generalized Pareto Distribution
        boot = np.zeros([sim_no, np.size(tr_eval) + 6])

        for j in range(0, sim_no):
            boot[j, :] = lmom_genpareto(
                np.random.choice(events_for_threshold_i, n_evi), tr_eval, nyears
            )

        results["upper_lim"][i, :] = np.percentile(boot, (1 - alpha / 2) * 100, axis=0)
        results["lower_lim"][i, :] = np.percentile(boot, (alpha / 2) * 100, axis=0)

        # Anderson-Darling Upper test without confidence intervals for L-moments
        # results["au2_lmom"][i] = au2(results["mean_value_lmom"][i, 0:3], events_for_threshold_i)
        # results["au2pv_lmom"][i] = interpolator(
        #     [
        #         np.max((10, np.min((500, n_evi)))),
        #         np.sign(results["mean_value_lmom"][i, 0])
        #         * np.min((np.abs(results["mean_value_lmom"][i, 0]), 0.5)),
        #         results["au2_lmom"][i],
        #     ]
        # )
    
    results["nyears"] = nyears
    results["thresholds"] = thresholds
    results["tr_eval"] = tr_eval

    return results




[docs]
def au2(par, data):
    """Calculate Anderson-Darling test statistic for GPD goodness-of-fit.
    
    Computes the Anderson-Darling A² statistic to assess how well a 
    Generalized Pareto Distribution fits the data. Lower values indicate
    better fit.

    Parameters
    ----------
    par : array-like
        GPD parameters [shape, loc, scale]
    data : array-like
        Peak over threshold data to test

    Returns
    -------
    float
        Anderson-Darling A² test statistic

    Notes
    -----
    The Anderson-Darling statistic is:
    
    A² = n/2 - 2Σ[F(x_i)] - Σ[(2i-1)/n * log(1-F(x_i))]
    
    where F is the fitted CDF and x_i are the sorted data.
    
    This statistic gives more weight to tail deviations than KS test,
    making it particularly suitable for extreme value analysis.
    
    The CDF values are adjusted by 1e-6 to avoid log(0) issues.

    References
    ----------
    Anderson, T.W. and Darling, D.A. (1952). "Asymptotic theory of 
    certain goodness of fit criteria based on stochastic processes". 
    Annals of Mathematical Statistics, 23, 193-212.

    Examples
    --------
    >>> data = np.random.exponential(2.0, 100)
    >>> par = [0, 0, 2.0]  # Exponential is GPD with shape=0
    >>> ad_stat = au2(par, data)
    >>> print(f"Anderson-Darling statistic: {ad_stat:.3f}")
    """

    ndat = np.size(data)
    cdf = np.sort(st.genpareto.cdf(data, *par))
    cdf -= 1e-6  # Avoid log(0) when CDF=1.0
    estadist = (
        ndat / 2
        - 2 * np.sum(cdf)
        - np.sum((2 - (2 * np.arange(1, ndat + 1) - 1.0) / ndat) * np.log(1 - cdf))
    )

    return estadist




[docs]
def fit_gpd_bootstrap(df_eventos, alpha, umb, param, bca, nyears):
    """Compute GPD parameters with bootstrap resampling and confidence intervals.
    
    Automatically fits Generalized Pareto Distribution using L-moments with
    bootstrap resampling for specified thresholds. Also tests if Exponential
    distribution is appropriate when shape parameter CI includes zero.

    Parameters
    ----------
    df_eventos : pd.DataFrame or np.ndarray
        Event time series data
    alpha : float
        Significance level for confidence intervals
    umb : array-like
        Threshold values to test
    param : {'parametric', 'non-parametric'}
        Bootstrap resampling method
    bca : {'standard', 'bca'}
        Confidence interval computation method
    nyears : int
        Number of years in the dataset

    Returns
    -------
    boot : list of lists
        Bootstrap results [GPD_results, Exponential_results (if applicable)]
        Each contains bootstrap parameter matrices for each threshold
    orig : list of lists
        Original parameter estimates [GPD_estimates, Exponential_estimates]
    ci : list of lists
        Confidence intervals [GPD_CI, Exponential_CI]
    tr : np.ndarray
        Return periods evaluated
    peaks : np.ndarray
        Peak values used (for last threshold processed)
    npeaks : int
        Number of peaks (for last threshold)
    eventanu : float
        Average events per year (for last threshold)

    Notes
    -----
    For each threshold:
    
    1. Extracts peaks exceeding threshold
    2. Fits GPD using L-moments with bootstrap
    3. Computes confidence intervals
    4. If CI for shape includes 0, also fits Exponential distribution
    
    Exponential is a special case of GPD with shape=0.

    Examples
    --------
    >>> umb = np.array([2.0, 2.5])
    >>> boot, orig, ci, tr, peaks, npeaks, eventanu = fit_gpd_bootstrap(
    ...     df['Hs'], alpha=0.05, umb=umb, param='parametric',
    ...     bca='standard', nyears=10
    ... )
    """
    tr = np.hstack((np.arange(1, 11), np.arange(2, 11) * 10, np.arange(2, 11) * 100))
    nosim = 10000

    # Extract peaks over threshold with PWM and bootstrap confidence intervals
    # Identify events
    numb = np.size(umb)
    event = df_eventos.values[:]

    boot, orig, ci = (
        [list() for i in range(2)],
        [list() for i in range(2)],
        [list() for i in range(2)],
    )
    for i in range(numb):
        peaks = event[event > umb[i]]
        npeaks = np.size(peaks)
        eventanu = npeaks / nyears

        # Central value and bootstrap with predefined threshold
        boot_s, orig_s = bootstrapping(peaks, tr, "L-MOM", "genpareto", nyears, nosim, param)
        ci_s = confidence_intervals(
            boot_s, alpha, bca, (orig_s, nosim, peaks, tr, "genpareto", nyears)
        )
        boot[0].append(boot_s), orig[0].append(orig_s), ci[0].append(ci_s)

        # If Exponential is appropriate (shape CI includes 0), compute it
        expo = (ci_s[0, 0] < 0) & (ci_s[1, 0] > 0)
        if expo:
            boot_e, orig_e = bootstrapping(peaks, tr, "L-MOM", "expon", nyears, nosim, param)
            ci_e = confidence_intervals(
                boot_e, alpha, bca, (orig_e, nosim, peaks, tr, "expon", nyears)
            )
            boot[1].append(boot_e), orig[1].append(orig_e), ci[1].append(ci_e)

    return boot, orig, ci, tr, peaks, npeaks, eventanu




[docs]
def annual_maxima_method(df, alpha, method, func, resample, ci_method, tr = None):
    """Fit GEV or Gumbel distribution to annual maxima with bootstrap CI.
    
    Performs extreme value analysis using annual maxima approach. Fits
    Generalized Extreme Value (GEV) or Gumbel distribution with bootstrap
    resampling to estimate confidence intervals.

    Parameters
    ----------
    df : pd.DataFrame
        Time series data with datetime index
    alpha : float
        Significance level for confidence intervals (e.g., 0.05 for 95% CI)
    method : {'MLE', 'L-MOM'}
        Parameter estimation method:
        
        - 'MLE': Maximum Likelihood Estimation
        - 'L-MOM': L-moments method
        
    func : str or scipy.stats distribution
        Distribution to fit:
        
        - For L-MOM: 'genextreme', 'gumbel_r'
        - For MLE: scipy.stats distribution object (e.g., st.genextreme)
        
    resample : {'parametric', 'non-parametric'}
        Bootstrap resampling method
    ci_method : {'standard', 'bca'}
        Confidence interval computation method:
        
        - 'standard': Percentile method
        - 'bca': Bias-corrected and accelerated
        
    tr : array-like, optional
        Return periods to evaluate (in years). If None, uses default range
        from 1.1 to 1000 years

    Returns
    -------
    tr : np.ndarray
        Return periods evaluated
    pannumax : np.ndarray
        Plotting positions for annual maxima
    annumax : pd.Series
        Annual maxima values
    boot : list of lists
        Bootstrap results [GEV_results, Gumbel_results (if applicable)]
    orig : list of lists
        Original parameter estimates
    ci : list of lists
        Confidence intervals

    Notes
    -----
    The annual maxima method:
    
    1. Extracts one maximum value per year
    2. Fits GEV distribution: F(x) = exp{-[1+ξ(x-μ)/σ]^(-1/ξ)}
    3. If shape parameter CI includes 0, also fits Gumbel (ξ=0)
    
    Return levels are computed as: x_T = F^(-1)(1 - 1/T)
    
    Gumbel is a special case of GEV with shape parameter ξ=0.

    Raises
    ------
    ValueError
        If invalid method, function, resample, or ci_method specified

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'Hs': np.random.weibull(1.5, 3650)
    ... }, index=pd.date_range('2010', periods=3650, freq='D'))
    >>> tr, p, maxima, boot, orig, ci = annual_maxima_method(
    ...     df, alpha=0.05, method='L-MOM', func='genextreme',
    ...     resample='parametric', ci_method='standard'
    ... )
    """

    if not ((method == "MLE") | (method == "L-MOM")):
        raise ValueError(
            'Fitting methods for probability models are "MLE" or "L-MOM". Given {}.'.format(
                method
            )
        )

    if method == "L-MOM":
        if not func in ["genpareto", "genextreme", "gumbel_r", "expon"]:
            raise ValueError(
                'Function {} is not included in L-MOM methods for fitting. Use "genpareto", "genextreme", "gumbel_r" or "expon".'.format(
                    method
                )
            )
    else:
        try:
            func = getattr(st, func)
        except:
            raise ValueError("Function {} is not included scipy.stats.".format(func))

    if not ((resample == "parametric") | (resample == "non-parametric")):
        raise ValueError(
            'Resampling methods are "parametric" or "non-parametric". Given {}'.format(
                resample
            )
        )

    if not ((ci_method == "standard") | (ci_method == "bca")):
        raise ValueError(
            'Confidence interval methods are "standard" or "bca". Given {}'.format(
                ci_method
            )
        )

    if tr is None:
        tr = np.hstack((np.arange(1, 11), np.arange(2, 11) * 10, np.arange(2, 11) * 100))
        tr = np.hstack((np.arange(1.1, 2 + 1e-6, 0.1), tr[tr > 2]))
    annumax = df.groupby(df.index.year).max()
    nyears = len(annumax)
    pannumax = np.arange(1.0, nyears + 1.0) / (nyears + 1.0)

    nosim = 10
    boot, orig, ci = (
        [list() for i in range(2)],
        [list() for i in range(2)],
        [list() for i in range(2)],
    )
    boot_a, orig_a = bootstrapping(annumax, tr, method, func, nyears, nosim, resample)
    ci_a = confidence_intervals(
        boot_a, alpha, ci_method, (orig_a, nosim, annumax, tr, func, nyears)
    )
    boot[0].append(boot_a), orig[0].append(orig_a), ci[0].append(ci_a)

    # Whether the parameters indicate that gumbel_r can be estimated, repeat the method with gumbel_r probability model
    if (ci_a[0, 0] < 0) & (ci_a[1, 0] > 0):
        if method == "MLE":
            func = st.gumbel_r
        else:
            func = "gumbel_r"
        boot_g, orig_g = bootstrapping(
            annumax, tr, method, func, nyears, nosim, resample
        )
        ci_g = confidence_intervals(
            boot_g, alpha, ci_method, (orig_g, nosim, annumax, tr, func, nyears)
        )
        boot[1].append(boot_g), orig[1].append(orig_g), ci[1].append(ci_g)

    return tr, pannumax, annumax, boot, orig, ci