# Source code for fips.filters
"""
Data filtering and selection utilities.
This module provides functions for filtering observations and state vectors
based on various criteria, such as data density, time intervals, and
quality control thresholds.
"""
from typing import overload
import pandas as pd
# ==============================================================================
# INTERVAL FILTERING
# ==============================================================================
[docs]
def enough_obs_per_interval(
index: pd.Index,
intervals: pd.IntervalIndex,
threshold: int,
level: str | None = None,
) -> list[bool]:
"""
Determine which observations have enough data points per time interval.
Parameters
----------
index : pd.Index
Index containing observations.
intervals : pd.IntervalIndex
Intervals to group observations into.
threshold : int
Minimum number of observations required per interval.
level : str, optional
Level name to use if index is a MultiIndex. If None, uses the entire index.
Returns
-------
list[bool]
Boolean mask indicating which observations meet the threshold.
"""
obs = index if level is None else index.get_level_values(level)
groups = pd.Index(pd.cut(obs, bins=intervals))
counts = obs.to_series().groupby(groups, observed=True).transform("count")
return (counts >= threshold).tolist()
@overload
def select_intervals_with_min_obs(
data: pd.Series,
intervals: pd.IntervalIndex,
threshold: int,
level: str | None = None,
) -> pd.Series: ...
@overload
def select_intervals_with_min_obs(
data: pd.DataFrame,
intervals: pd.IntervalIndex,
threshold: int,
level: str | None = None,
) -> pd.DataFrame: ...
[docs]
def select_intervals_with_min_obs(
    data: pd.Series | pd.DataFrame,
    intervals: pd.IntervalIndex,
    threshold: int,
    level: str | None = None,
) -> pd.Series | pd.DataFrame:
    """
    Keep only the rows of ``data`` whose interval holds enough observations.

    The row mask is computed by ``enough_obs_per_interval`` from ``data.index``.

    Parameters
    ----------
    data : pd.Series | pd.DataFrame
        Data to filter; its index supplies the observations that are binned.
    intervals : pd.IntervalIndex
        Intervals to group observations into.
    threshold : int
        Minimum number of observations required per interval.
    level : str, optional
        Level name to use if the index is a MultiIndex. If None, uses the
        entire index.

    Returns
    -------
    pd.Series | pd.DataFrame
        The rows of ``data`` selected by the mask, in the same container type
        as the input.
    """
    mask = enough_obs_per_interval(
        index=data.index, intervals=intervals, threshold=threshold, level=level
    )
    # Select rows explicitly with .loc: ``data[mask]`` treats an empty list as
    # a column selection on a DataFrame, which would drop every column when
    # ``data`` has no rows. For non-empty boolean masks the result is the same.
    return data.loc[mask]