Source code for openpois.models.setup

"""
Data-preparation helpers for OSM turnover models.
"""

import pandas as pd


def prepare_data_for_model(
    data: pd.DataFrame,
    group_key: str | None = None,
    group_values: list[str] | None = None,
    min_value_count: int | None = None,
    t1_col: str = 'last_obs_timestamp',
    t2_col: str = 'obs_timestamp',
) -> pd.DataFrame:
    """
    Prepare an observations DataFrame for model fitting.

    Per turnover-model-methodology.md §1.2, the per-row Bernoulli-on-Poisson
    likelihood requires Δ = t_k − t_{k−1} (the inter-observation interval),
    so the default ``t1_col`` is ``last_obs_timestamp``. Multiplying the
    per-row Bernoullis then telescopes to the correct individual likelihood.
    The previous default of ``last_tag_timestamp`` would have made Δ the
    duration since the individual's start, which is correct for
    one-row-per-individual data but biased downward on multi-version POIs.

    Also emits ``is_first_interval``, which is True exactly when
    ``last_obs_timestamp == last_tag_timestamp``, i.e. when this row is the
    first surviving observation of its ``(POI, name-iteration)`` individual.
    Used by the ZIE δ extension (methodology §1.7).

    Args:
        data: Observations DataFrame as returned by format_observations.
        group_key: Column name of the grouping variable. If None, no group
            filtering is applied.
        group_values: If provided, only rows with ``group_key`` in this list
            are kept.
        min_value_count: If provided, groups with fewer than this many
            observations are dropped.
        t1_col: Name of the start-time timestamp column. The default,
            ``last_obs_timestamp``, gives the inter-observation interval.
        t2_col: Name of the end-time timestamp column.

    Returns:
        Filtered DataFrame with additional ``tag_days``, ``tag_years``, and
        ``is_first_interval`` columns.

    Raises:
        ValueError: If any required timestamp column (``t1_col``, ``t2_col``,
            ``last_obs_timestamp``, or ``last_tag_timestamp``) is missing
            from ``data``.
    """
    # Keep only the ids that carry a non-null value for the grouping column
    if group_key is not None:
        keep_ids = data.dropna(subset = [group_key]).id.unique().tolist()  # noqa: F841
        data = data.query('id in @keep_ids')

    # If group values were set, subset to those observations
    if (group_key is not None) and (group_values is not None):
        data = (
            data
            .dropna(subset = [group_key])
            .query(f'{group_key} in @group_values')
        )

    # Drop groups with too few observations to fit reliably
    if (group_key is not None) and (min_value_count is not None):
        value_counts = data.value_counts(group_key)
        groups_over_threshold = (  # noqa: F841
            value_counts[value_counts >= min_value_count].index.tolist()
        )
        data = data.query(f'{group_key} in @groups_over_threshold')

    # Prepare timestamps
    required_cols = [t1_col, t2_col, 'last_obs_timestamp', 'last_tag_timestamp']
    missing_cols = [col for col in required_cols if col not in data.columns]
    if missing_cols:
        raise ValueError(
            f"Required timestamp columns missing: {missing_cols}"
        )

    data = data.copy()
    # set() deduplicates in case t1_col/t2_col name the default columns
    for timestamp_col in set(required_cols):
        data[timestamp_col] = pd.to_datetime(data[timestamp_col])

    tag_days = (data[t2_col] - data[t1_col]).dt.days
    data = data.assign(
        tag_days = tag_days,
        tag_years = tag_days / 365,
        is_first_interval = (
            data['last_obs_timestamp'] == data['last_tag_timestamp']
        ),
    )

    # Drop rows that cannot enter the likelihood: missing outcome or
    # interval, or a zero-length interval
    data = (
        data
        .dropna(subset = ['tag_years', 'changed'])
        .query('tag_years > 1e-6')
    )

    return data
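

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the documented API): the toy
# observations and the 'cuisine' grouping column below are hypothetical; only
# the timestamp and outcome column names are the ones the function requires.
if __name__ == '__main__':
    obs = pd.DataFrame({
        'id': [1, 1, 2],
        'cuisine': ['pizza', 'pizza', None],
        # First row of id 1 starts its individual: last_obs == last_tag
        'last_tag_timestamp': ['2020-01-01', '2020-01-01', '2021-06-01'],
        'last_obs_timestamp': ['2020-01-01', '2020-07-01', '2021-06-01'],
        'obs_timestamp': ['2020-07-01', '2021-01-01', '2022-06-01'],
        'changed': [0, 1, 0],
    })

    prepared = prepare_data_for_model(
        obs,
        group_key = 'cuisine',
        group_values = ['pizza'],
    )
    # id 2 is dropped (null 'cuisine'). id 1's first row gets Δ = 182 days
    # (2020-01-01 → 2020-07-01) with is_first_interval == True; its second
    # row gets Δ = 184 days with is_first_interval == False. The two Δs
    # telescope to the full 366-day 2020-01-01 → 2021-01-01 span.
    print(prepared[['id', 'tag_days', 'tag_years', 'is_first_interval']])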