Source code for environmentaltools.temporal.regimes

from loguru import logger

import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.interpolate import RegularGridInterpolator
from scipy.io import loadmat
from scipy.optimize import fsolve
from scipy.special import gamma

from environmentaltools.common import utils


def _fit_lmom_for_bca(data, tr, func, nyears):
    """Internal helper function for BCa jackknife fitting.
    
    Fits distribution using L-moments for BCa acceleration factor computation.
    Used internally by confidence_intervals with resample='bca'.
    
    Parameters
    ----------
    data : array-like
        Jackknife resampled data (with one observation removed)
    tr : array-like
        Return periods to evaluate
    func : str or scipy.stats distribution
        Distribution type:
        - For L-MOM: 'genpareto', 'expon', 'genextreme', 'gumbel_r'
        - For MLE: scipy.stats distribution object
    nyears : int
        Number of years in dataset
        
    Returns
    -------
    np.ndarray
        Parameter estimates and return period values
        
    Notes
    -----
    This function attempts to match the fitting method and distribution type
    from the original analysis to ensure consistent jackknife estimates.
    """
    # Handle L-moments distributions
    if isinstance(func, str):
        if (func == "genpareto") or (func == "expon"):
            return lmom_genpareto(data, tr, nyears)
        elif (func == "genextreme") or (func == "gumbel_r"):
            params = l_mom(data, func)
            pri = 1 - 1.0 / tr
            qtr = st.genextreme.ppf(pri, *params)
            return np.hstack((*params, 1, qtr))
        else:
            # Try to use probability_model_fit for unknown string distributions
            try:
                return probability_model_fit(data, tr, "L-MOM", func, nyears)
            except:
                # Fallback: assume GPD
                return lmom_genpareto(data, tr, nyears)
    
    # Handle scipy.stats distribution objects (MLE case)
    else:
        try:
            return probability_model_fit(data, tr, "MLE", func)
        except:
            # If MLE fails, try fitting as GPD
            return lmom_genpareto(data, tr, nyears)


[docs] def confidence_intervals(boot, alpha, resample, *args): """Calculate confidence intervals using percentile or BCa bootstrap methods. Computes confidence intervals for bootstrap resampling results using either standard percentile method or bias-corrected and accelerated (BCa) method. Parameters ---------- boot : np.ndarray Bootstrap resampling matrix with shape (n_simulations, n_parameters) alpha : float Significance level (e.g., 0.05 for 95% confidence intervals) resample : {'standard', 'bca'} Method for estimating confidence intervals: - 'standard': Simple percentile method - 'bca': Bias-corrected and accelerated bootstrap *args : tuple, optional Additional arguments required for BCa method: (orig, nosim, peaks, tri, tipo, nyears) where: - orig : array-like Original parameter estimates - nosim : int Number of bootstrap simulations - peaks : array-like Original peak time series - tri : array-like Return periods - tipo : str or scipy.stats distribution Distribution type - nyears : int Number of years in the dataset Returns ------- np.ndarray Confidence interval bounds with shape (2, n_parameters): - ci[0, :] : lower bounds - ci[1, :] : upper bounds Notes ----- The BCa method adjusts for bias and skewness in the bootstrap distribution using acceleration (a) and bias-correction (z0) factors. It provides more accurate intervals than standard percentiles, especially for small samples or skewed distributions. The acceleration factor is computed using jackknife influence values: a = sum(U^3) / [6 * sum(U^2)^(3/2)] where U are the jackknife deviations. References ---------- Efron, B. (1987). "Better Bootstrap Confidence Intervals". Journal of the American Statistical Association, 82(397), 171-185. Examples -------- >>> boot = np.random.randn(1000, 3) # Bootstrap samples >>> ci = confidence_intervals(boot, alpha=0.05, resample='standard') >>> print(f"95% CI: [{ci[0, 0]:.3f}, {ci[1, 0]:.3f}]") """ if resample == "standard": ci = np.percentile(boot, np.array((alpha / 2, 1 - alpha / 2)) * 100, axis=0) elif resample == "bca": orig, nosim, peaks, tri, tipo, nyears = args[0] npeaks, norig = np.size(peaks), np.size(orig) pz0 = np.zeros(norig) for i in range(0, norig): mask = boot[:, i] < orig[i] pz0[i] = np.sum(mask) / float(nosim) z0 = st.norm.ppf(pz0) u = np.zeros([npeaks, norig]) id_ = np.arange(npeaks) for i in range(0, npeaks): # Jackknife: fit distribution with i-th observation removed # Compute influence values for BCa acceleration factor u[i, :] = _fit_lmom_for_bca(peaks[id_ != i], tri, tipo, nyears) - orig a = np.sum(u ** 3) / np.sum(u ** 2) ** (3 / 2) / 6 zalpha = st.norm.ppf((alpha / 2, 1 - alpha / 2)) prob = np.zeros([2, norig]) prob[0, :] = st.norm.cdf(z0 + (z0 + zalpha[0]) / (1 - a * (z0 + zalpha[0]))) prob[1, :] = st.norm.cdf(z0 + (z0 + zalpha[1]) / (1 - a * (z0 + zalpha[1]))) ci = np.zeros([2, norig]) for i in range(0, norig): if not (np.isnan(prob[0, i]) | np.isnan(prob[1, i])): ci[:, i] = np.percentile(boot[:, i], prob[:, i] * 100, axis=0) return ci
[docs] def bootstrapping(peaks, tri, method, func, nyears, nosim, resample): """Perform bootstrap resampling for return period estimation. Computes distribution parameters and return period values using bootstrap resampling with parametric or non-parametric methods. Parameters ---------- peaks : array-like Annual maxima or peak over threshold time series tri : array-like Return periods to evaluate (in years) method : {'MLE', 'L-MOM'} Fitting method for the probability distribution: - 'MLE': Maximum Likelihood Estimation - 'L-MOM': L-moments method func : str or scipy.stats distribution Probability distribution name or object nyears : int Number of years in the dataset (for POT analysis) nosim : int Number of bootstrap simulations resample : {'parametric', 'non-parametric'} Resampling method: - 'parametric': Resample from fitted distribution - 'non-parametric': Resample with replacement from original data Returns ------- boot : np.ndarray Bootstrap results with shape (nosim, n_params + 1 + len(tri)) Contains parameters, nu, and return period values for each simulation orig : np.ndarray Original parameter estimates from the data Notes ----- Parametric bootstrap: 1. Fit distribution to original data 2. Generate synthetic data from fitted distribution 3. Refit distribution to synthetic data 4. Repeat nosim times Non-parametric bootstrap: 1. Resample with replacement from original data 2. Fit distribution to resampled data 3. Repeat nosim times The function handles special cases for GPD and GEV distributions when using L-moments. Examples -------- >>> import numpy as np >>> peaks = np.random.weibull(1.5, 50) >>> tri = np.array([10, 50, 100]) >>> boot, orig = bootstrapping( ... peaks, tri, 'L-MOM', 'genextreme', 50, 1000, 'parametric' ... ) """ npeaks = np.size(peaks) orig = probability_model_fit(peaks, tri, method, func, nyears) boot = np.zeros([nosim, np.size(orig)]) if resample == "parametric": if method == "MLE": params = orig[: func.numargs + 2] logger.info( "Computing bootstrapping of {} simulations using MLE and parametric methods for {} probability model. It will take a while.".format( str(nosim), func.name ) ) for j in range(0, nosim): boot[j, :] = probability_model_fit( func.rvs(*params, npeaks), tri, method, func ) elif method == "L-MOM": logger.info( "Computing bootstrapping of {} simulations using L-MOM and parametric methods for {} probability model.".format( str(nosim), func ) ) if (func == "expon") | (func == "genpareto"): for j in range(0, nosim): boot[j, :] = probability_model_fit( st.genpareto.rvs(orig[0], orig[1], orig[2], npeaks), tri, method, func, nyears, ) elif (func == "genextreme") | (func == "gumbel_r"): for j in range(0, nosim): boot[j, :] = probability_model_fit( st.genextreme.rvs(orig[0], orig[1], orig[2], npeaks), tri, method, func, ) elif resample == "non-parametric": for j in range(0, nosim): logger.info( "Computing bootstrapping of {} simulations using {} and non-parametric methods for {} probability model. It will take a while.".format( str(nosim), method, func.name ) ) boot[j, :] = probability_model_fit( np.random.choice(peaks, npeaks), tri, method, func, nyears ) return boot, orig
[docs] def probability_model_fit(data, tr, method, func, *nyears): """Estimate probability distribution parameters and return period values. Fits a probability distribution to data using specified method and computes return period values for given return periods. Parameters ---------- data : array-like Time series data (annual maxima or peaks over threshold) tr : array-like Return periods in years (e.g., [10, 50, 100, 1000]) method : {'MLE', 'L-MOM'} Parameter estimation method: - 'MLE': Maximum Likelihood Estimation (uses scipy.stats.fit) - 'L-MOM': L-moments method (custom implementation) func : str or scipy.stats distribution Probability distribution: - For MLE: scipy.stats distribution object - For L-MOM: string in ['expon', 'genpareto', 'genextreme', 'gumbel_r'] *nyears : int, optional Number of years in dataset (required for POT analysis with GPD) Returns ------- np.ndarray Array containing [shape, loc, scale, nu, qtr_1, qtr_2, ...] where: - shape, loc, scale : distribution parameters - nu : average number of events per year - qtr_i : return level for return period tr[i] Notes ----- For annual maxima (GEV, Gumbel): - nu = 1 (one event per year) - Return levels: x_T = F^(-1)(1 - 1/T) For peaks over threshold (GPD): - nu = n_peaks / n_years - Return levels: x_T = F^(-1)(1 - 1/(T*nu)) L-moments method provides more robust parameter estimates for small samples compared to MLE. See Also -------- l_mom : L-moments parameter estimation bootstrapping : Bootstrap resampling for uncertainty estimation Examples -------- >>> import scipy.stats as st >>> data = st.genextreme.rvs(0.1, loc=2, scale=0.5, size=50) >>> tr = np.array([10, 50, 100]) >>> params = probability_model_fit(data, tr, 'MLE', st.genextreme) >>> print(f"Shape: {params[0]:.3f}, Loc: {params[1]:.3f}, Scale: {params[2]:.3f}") """ if method == "MLE": params = func.fit(data) pri = 1 - 1.0 / tr qtr = func.ppf(pri, *params) nu = 1 elif method == "L-MOM": params = l_mom(data, func) if (func == "expon") | (func == "genpareto"): nu = np.size(data) / nyears[0] pri = 1 - 1 / (tr * nu) qtr = st.genpareto.ppf(pri, *params) elif (func == "genextreme") | (func == "gumbel_r"): pri = 1 - 1.0 / tr qtr = st.genextreme.ppf(pri, *params) nu = 1 return np.hstack((*params, nu, qtr))
[docs] def l_mom(data, func): """Estimate distribution parameters using L-moments method. Computes distribution parameters from sample L-moments (linear combinations of order statistics). Provides robust estimates especially for small samples. Parameters ---------- data : array-like Sample data to fit func : {'genpareto', 'expon', 'genextreme', 'gumbel_r'} Distribution type to fit: - 'genpareto': Generalized Pareto Distribution (3 parameters) - 'expon': Exponential distribution (2 parameters, k=0) - 'genextreme': Generalized Extreme Value distribution (3 parameters) - 'gumbel_r': Gumbel distribution (2 parameters, k=0) Returns ------- list Distribution parameters [k, mu, sig] where: - k : float Shape parameter (0 for Exponential and Gumbel) - mu : float Location parameter - sig : float Scale parameter Notes ----- L-moments are linear combinations of order statistics: - λ₁ = E[X] (mean) - λ₂ = E[X_{2:2} - X_{1:2}]/2 (scale) - λ₃ related to skewness L-moment ratios (τ₃ = λ₃/λ₂) are used to estimate shape parameters. For GEV, the shape parameter k is solved numerically from: 2(1-3^(-k))/(1-2^(-k)) - 3 = τ₃ Sample L-moments use plotting positions: p_i = (i-0.35)/n References ---------- Hosking, J.R.M. (1990). "L-moments: Analysis and Estimation of Distributions Using Linear Combinations of Order Statistics". Journal of the Royal Statistical Society, Series B, 52(1), 105-124. Examples -------- >>> import numpy as np >>> data = np.random.exponential(2.0, 100) >>> k, mu, sig = l_mom(data, 'expon') >>> print(f"Exponential: k={k:.3f}, mu={mu:.3f}, sig={sig:.3f}") """ data = np.sort(data) n = np.size(data) p = (np.arange(1, n + 1) - 0.35) / n lam1, lam2 = np.mean(data), np.mean((2 * p - 1) * data) if func == "genpareto": lam3 = np.mean((1 - 6 * p + 6 * p ** 2) * data) tau3 = lam3 / lam2 k = (3 * tau3 - 1) / (1 + tau3) sig = lam2 * (1 - k) * (2 - k) mu = lam1 - sig / (1 - k) elif func == "expon": sig = 2 * lam2 mu = lam1 - sig k = 0 elif func == "genextreme": lam3 = np.mean((1 - 6 * p + 6 * p ** 2) * data) tau3 = lam3 / lam2 def fun(x, tau3): return 2.0 * (1 - 3.0 ** -x) / (1 - 2.0 ** -x) - 3 - tau3 c = 2 / (3 + tau3) - np.log(2) / np.log(3) k = 7.859 * c + 2.9554 * c ** 2 k = fsolve(fun, k, args=(tau3)) sig = lam2 * k / (1 - 2 ** (-k)) / gamma(1 + k) mu = lam1 - sig * (1 - gamma(1 + k)) / k elif func == "gumbel_r": sig = lam2 / np.log(2) mu = lam1 - 0.5772 * sig k = 0 return [k, mu, sig]
[docs] def lmom_genpareto(data, tr, nyears): """Fit Generalized Pareto Distribution using L-moments with diagnostic metrics. Estimates GPD parameters via L-moments and computes return period values along with additional diagnostic statistics for POT analysis. Parameters ---------- data : array-like Peak over threshold data (exceedances) tr : array-like Return periods in years to evaluate nyears : int Number of years in the dataset Returns ------- np.ndarray Array containing [k, mu, sig, mrlp, sigmodif, nu, qtr_1, qtr_2, ...] - k : float Shape parameter - mu : float Location parameter (threshold) - sig : float Scale parameter - mrlp : float Mean residual life plot value: E[X - mu | X > mu] - sigmodif : float Modified scale: sig - k*mu - nu : float Average number of exceedances per year - qtr_i : float Return level for return period tr[i] Notes ----- For peaks over threshold analysis with GPD: - Return levels: x_T = μ + (σ/ξ)[(Tλ)^ξ - 1] - Mean excess: E[X-u|X>u] = (σ + ξu)/(1-ξ) The modified scale parameter is used in some POT formulations. Examples -------- >>> data = np.random.exponential(2.0, 100) >>> tr = np.array([10, 50, 100]) >>> results = lmom_genpareto(data, tr, nyears=10) >>> k, mu, sig = results[0], results[1], results[2] """ k, mu, sig = l_mom(data, "genpareto") mrlp = np.mean(data - mu) sigmodif = sig - k * mu nu = np.size(data) / nyears pri = 1 - 1 / (tr * nu) qtr = st.genpareto.ppf(pri, k, mu, sig) return np.hstack((k, mu, sig, mrlp, sigmodif, nu, qtr))
[docs] def pot_method( df: pd.DataFrame, var_: str, window_size: int, alpha: float = 0.05, sim_no: int = 10000, method: str ="nearest" ): """Perform Peaks Over Threshold (POT) analysis with multiple thresholds. Analyzes extreme values using Generalized Pareto Distribution fitted with L-moments. Tests multiple threshold values and provides bootstrap confidence intervals for return period estimates. Parameters ---------- df : pd.DataFrame Time series data with datetime index var_ : str Name of the variable column to analyze window_size : int Window size for extracting independent maxima events alpha : float, default=0.05 Significance level for confidence intervals (0.05 gives 95% CI) sim_no : int, default=10000 Number of bootstrap simulations method : str, default='nearest' Interpolation method for p-value table (currently commented out) Returns ------- dict Results dictionary with keys: - 'mean_value_lmom' : np.ndarray Shape (n_thresholds, n_params+6+len(tr_eval)) Mean parameter estimates and return levels for each threshold - 'upper_lim' : np.ndarray Upper confidence interval bounds - 'lower_lim' : np.ndarray Lower confidence interval bounds - 'au2_lmom' : np.ndarray Anderson-Darling test statistics (currently zeros) - 'au2pv_lmom' : np.ndarray Anderson-Darling p-values (currently zeros) - 'nyears' : int Number of years in the dataset - 'thresholds' : np.ndarray Threshold values tested (90th to 99.9th percentiles) - 'tr_eval' : np.ndarray Return periods evaluated [1, 50, 100, 1000] years Notes ----- The POT method: 1. Extracts independent events using moving window maxima 2. Tests thresholds from 90th to 99.9th percentiles 3. For each threshold: - Fits GPD using L-moments - Computes bootstrap confidence intervals - (Optionally) performs Anderson-Darling goodness-of-fit test Higher thresholds provide better asymptotic approximation but fewer data points. The optimal threshold balances bias-variance tradeoff. References ---------- Coles, S. (2001). "An Introduction to Statistical Modeling of Extreme Values". Springer. Examples -------- >>> import pandas as pd >>> df = pd.DataFrame({ ... 'Hs': np.random.weibull(1.5, 1000) ... }, index=pd.date_range('2000', periods=1000, freq='D')) >>> results = pot_method(df, 'Hs', window_size=48, alpha=0.05) >>> print(f"Number of thresholds tested: {len(results['thresholds'])}") """ # Initialize evaluation return periods tr_eval = np.array([1,50, 100, 1000]) thresholds = np.percentile( df[var_], np.hstack([np.linspace(90, 99, 10), np.linspace(99.1, 99.9, 9)]) ) no_thres = np.size(thresholds) results = {} results["mean_value_lmom"], results["upper_lim"], results["lower_lim"], results["au2_lmom"], results["au2pv_lmom"] = ( np.zeros([no_thres, np.size(tr_eval) + 6]), np.zeros([no_thres, np.size(tr_eval) + 6]), np.zeros([no_thres, np.size(tr_eval) + 6]), np.zeros(no_thres), np.zeros(no_thres), ) # Load p-value table for Anderson-Darling test and create the interpolator # data = loadmat( # os.path.join(os.path.dirname(__file__), "../", "utils/misc/PVAL_AU2_LMOM.mat") # ) # pvallmom, au2val, ko, ndata = ( # data["PVAL"], # data["AU2VAL"][0], # data["Ko"][0], # data["NDATA"][0], # ) # interpolator = RegularGridInterpolator( # (ndata, ko, au2val), # pvallmom, # method=method, # bounds_error=False, # fill_value=None, # ) # Compute the POT analysis logger.info( "Computing POT analysis for a window size of " + str(df.index[window_size] - df.index[0]) ) df_max_events = utils.max_moving(df[var_], window_size) event = df_max_events.loc[ df_max_events[var_] > np.percentile(df[var_], 90), var_ ].values[:] nyears = df.index.year.max() - df.index.year.min() # Perform analysis for each threshold for i in range(no_thres): logger.info( "Threshold " + str(i + 1) + " of " + str(no_thres) + " (" + str(np.round(thresholds[i], decimals=3)) + ")." ) events_for_threshold_i = event[event > thresholds[i]] n_evi = np.size(events_for_threshold_i) results["mean_value_lmom"][i, :] = lmom_genpareto(events_for_threshold_i, tr_eval, nyears) # Bootstrap resampling using L-moments for Generalized Pareto Distribution boot = np.zeros([sim_no, np.size(tr_eval) + 6]) for j in range(0, sim_no): boot[j, :] = lmom_genpareto( np.random.choice(events_for_threshold_i, n_evi), tr_eval, nyears ) results["upper_lim"][i, :] = np.percentile(boot, (1 - alpha / 2) * 100, axis=0) results["lower_lim"][i, :] = np.percentile(boot, (alpha / 2) * 100, axis=0) # Anderson-Darling Upper test without confidence intervals for L-moments # results["au2_lmom"][i] = au2(results["mean_value_lmom"][i, 0:3], events_for_threshold_i) # results["au2pv_lmom"][i] = interpolator( # [ # np.max((10, np.min((500, n_evi)))), # np.sign(results["mean_value_lmom"][i, 0]) # * np.min((np.abs(results["mean_value_lmom"][i, 0]), 0.5)), # results["au2_lmom"][i], # ] # ) results["nyears"] = nyears results["thresholds"] = thresholds results["tr_eval"] = tr_eval return results
[docs] def au2(par, data): """Calculate Anderson-Darling test statistic for GPD goodness-of-fit. Computes the Anderson-Darling A² statistic to assess how well a Generalized Pareto Distribution fits the data. Lower values indicate better fit. Parameters ---------- par : array-like GPD parameters [shape, loc, scale] data : array-like Peak over threshold data to test Returns ------- float Anderson-Darling A² test statistic Notes ----- The Anderson-Darling statistic is: A² = n/2 - 2Σ[F(x_i)] - Σ[(2i-1)/n * log(1-F(x_i))] where F is the fitted CDF and x_i are the sorted data. This statistic gives more weight to tail deviations than KS test, making it particularly suitable for extreme value analysis. The CDF values are adjusted by 1e-6 to avoid log(0) issues. References ---------- Anderson, T.W. and Darling, D.A. (1952). "Asymptotic theory of certain goodness of fit criteria based on stochastic processes". Annals of Mathematical Statistics, 23, 193-212. Examples -------- >>> data = np.random.exponential(2.0, 100) >>> par = [0, 0, 2.0] # Exponential is GPD with shape=0 >>> ad_stat = au2(par, data) >>> print(f"Anderson-Darling statistic: {ad_stat:.3f}") """ ndat = np.size(data) cdf = np.sort(st.genpareto.cdf(data, *par)) cdf -= 1e-6 # Avoid log(0) when CDF=1.0 estadist = ( ndat / 2 - 2 * np.sum(cdf) - np.sum((2 - (2 * np.arange(1, ndat + 1) - 1.0) / ndat) * np.log(1 - cdf)) ) return estadist
[docs] def fit_gpd_bootstrap(df_eventos, alpha, umb, param, bca, nyears): """Compute GPD parameters with bootstrap resampling and confidence intervals. Automatically fits Generalized Pareto Distribution using L-moments with bootstrap resampling for specified thresholds. Also tests if Exponential distribution is appropriate when shape parameter CI includes zero. Parameters ---------- df_eventos : pd.DataFrame or np.ndarray Event time series data alpha : float Significance level for confidence intervals umb : array-like Threshold values to test param : {'parametric', 'non-parametric'} Bootstrap resampling method bca : {'standard', 'bca'} Confidence interval computation method nyears : int Number of years in the dataset Returns ------- boot : list of lists Bootstrap results [GPD_results, Exponential_results (if applicable)] Each contains bootstrap parameter matrices for each threshold orig : list of lists Original parameter estimates [GPD_estimates, Exponential_estimates] ci : list of lists Confidence intervals [GPD_CI, Exponential_CI] tr : np.ndarray Return periods evaluated peaks : np.ndarray Peak values used (for last threshold processed) npeaks : int Number of peaks (for last threshold) eventanu : float Average events per year (for last threshold) Notes ----- For each threshold: 1. Extracts peaks exceeding threshold 2. Fits GPD using L-moments with bootstrap 3. Computes confidence intervals 4. If CI for shape includes 0, also fits Exponential distribution Exponential is a special case of GPD with shape=0. Examples -------- >>> umb = np.array([2.0, 2.5]) >>> boot, orig, ci, tr, peaks, npeaks, eventanu = fit_gpd_bootstrap( ... df['Hs'], alpha=0.05, umb=umb, param='parametric', ... bca='standard', nyears=10 ... ) """ tr = np.hstack((np.arange(1, 11), np.arange(2, 11) * 10, np.arange(2, 11) * 100)) nosim = 10000 # Extract peaks over threshold with PWM and bootstrap confidence intervals # Identify events numb = np.size(umb) event = df_eventos.values[:] boot, orig, ci = ( [list() for i in range(2)], [list() for i in range(2)], [list() for i in range(2)], ) for i in range(numb): peaks = event[event > umb[i]] npeaks = np.size(peaks) eventanu = npeaks / nyears # Central value and bootstrap with predefined threshold boot_s, orig_s = bootstrapping(peaks, tr, "L-MOM", "genpareto", nyears, nosim, param) ci_s = confidence_intervals( boot_s, alpha, bca, (orig_s, nosim, peaks, tr, "genpareto", nyears) ) boot[0].append(boot_s), orig[0].append(orig_s), ci[0].append(ci_s) # If Exponential is appropriate (shape CI includes 0), compute it expo = (ci_s[0, 0] < 0) & (ci_s[1, 0] > 0) if expo: boot_e, orig_e = bootstrapping(peaks, tr, "L-MOM", "expon", nyears, nosim, param) ci_e = confidence_intervals( boot_e, alpha, bca, (orig_e, nosim, peaks, tr, "expon", nyears) ) boot[1].append(boot_e), orig[1].append(orig_e), ci[1].append(ci_e) return boot, orig, ci, tr, peaks, npeaks, eventanu
[docs] def annual_maxima_method(df, alpha, method, func, resample, ci_method, tr = None): """Fit GEV or Gumbel distribution to annual maxima with bootstrap CI. Performs extreme value analysis using annual maxima approach. Fits Generalized Extreme Value (GEV) or Gumbel distribution with bootstrap resampling to estimate confidence intervals. Parameters ---------- df : pd.DataFrame Time series data with datetime index alpha : float Significance level for confidence intervals (e.g., 0.05 for 95% CI) method : {'MLE', 'L-MOM'} Parameter estimation method: - 'MLE': Maximum Likelihood Estimation - 'L-MOM': L-moments method func : str or scipy.stats distribution Distribution to fit: - For L-MOM: 'genextreme', 'gumbel_r' - For MLE: scipy.stats distribution object (e.g., st.genextreme) resample : {'parametric', 'non-parametric'} Bootstrap resampling method ci_method : {'standard', 'bca'} Confidence interval computation method: - 'standard': Percentile method - 'bca': Bias-corrected and accelerated tr : array-like, optional Return periods to evaluate (in years). If None, uses default range from 1.1 to 1000 years Returns ------- tr : np.ndarray Return periods evaluated pannumax : np.ndarray Plotting positions for annual maxima annumax : pd.Series Annual maxima values boot : list of lists Bootstrap results [GEV_results, Gumbel_results (if applicable)] orig : list of lists Original parameter estimates ci : list of lists Confidence intervals Notes ----- The annual maxima method: 1. Extracts one maximum value per year 2. Fits GEV distribution: F(x) = exp{-[1+ξ(x-μ)/σ]^(-1/ξ)} 3. If shape parameter CI includes 0, also fits Gumbel (ξ=0) Return levels are computed as: x_T = F^(-1)(1 - 1/T) Gumbel is a special case of GEV with shape parameter ξ=0. Raises ------ ValueError If invalid method, function, resample, or ci_method specified Examples -------- >>> df = pd.DataFrame({ ... 'Hs': np.random.weibull(1.5, 3650) ... }, index=pd.date_range('2010', periods=3650, freq='D')) >>> tr, p, maxima, boot, orig, ci = annual_maxima_method( ... df, alpha=0.05, method='L-MOM', func='genextreme', ... resample='parametric', ci_method='standard' ... ) """ if not ((method == "MLE") | (method == "L-MOM")): raise ValueError( 'Fitting methods for probability models are "MLE" or "L-MOM". Given {}.'.format( method ) ) if method == "L-MOM": if not func in ["genpareto", "genextreme", "gumbel_r", "expon"]: raise ValueError( 'Function {} is not included in L-MOM methods for fitting. Use "genpareto", "genextreme", "gumbel_r" or "expon".'.format( method ) ) else: try: func = getattr(st, func) except: raise ValueError("Function {} is not included scipy.stats.".format(func)) if not ((resample == "parametric") | (resample == "non-parametric")): raise ValueError( 'Resampling methods are "parametric" or "non-parametric". Given {}'.format( resample ) ) if not ((ci_method == "standard") | (ci_method == "bca")): raise ValueError( 'Confidence interval methods are "standard" or "bca". Given {}'.format( ci_method ) ) if tr is None: tr = np.hstack((np.arange(1, 11), np.arange(2, 11) * 10, np.arange(2, 11) * 100)) tr = np.hstack((np.arange(1.1, 2 + 1e-6, 0.1), tr[tr > 2])) annumax = df.groupby(df.index.year).max() nyears = len(annumax) pannumax = np.arange(1.0, nyears + 1.0) / (nyears + 1.0) nosim = 10 boot, orig, ci = ( [list() for i in range(2)], [list() for i in range(2)], [list() for i in range(2)], ) boot_a, orig_a = bootstrapping(annumax, tr, method, func, nyears, nosim, resample) ci_a = confidence_intervals( boot_a, alpha, ci_method, (orig_a, nosim, annumax, tr, func, nyears) ) boot[0].append(boot_a), orig[0].append(orig_a), ci[0].append(ci_a) # Whether the parameters indicate that gumbel_r can be estimated, repeat the method with gumbel_r probability model if (ci_a[0, 0] < 0) & (ci_a[1, 0] > 0): if method == "MLE": func = st.gumbel_r else: func = "gumbel_r" boot_g, orig_g = bootstrapping( annumax, tr, method, func, nyears, nosim, resample ) ci_g = confidence_intervals( boot_g, alpha, ci_method, (orig_g, nosim, annumax, tr, func, nyears) ) boot[1].append(boot_g), orig[1].append(orig_g), ci[1].append(ci_g) return tr, pannumax, annumax, boot, orig, ci