
API reference

get_traces.py

Interpolate a single FRET trace onto a common time grid using cubic splines.

Handles edge cases such as missing data (NaNs), single-point traces, and short traces where only linear interpolation is feasible.

Args:
    time_grid (np.ndarray): The uniform time vector to interpolate onto.
    t_trace (np.ndarray): Original time points of the trace.
    E_trace (np.ndarray): Original FRET efficiency values.
    interpolate (bool): If True (default), interpolate onto the grid; if False, snap each observation to its nearest grid point.

Returns:
    np.ndarray: The FRET trace interpolated onto time_grid. Returns NaNs for time points outside the original observation range or if insufficient data exists.

Source code in get_traces.py
def interpolate_trace(
    time_grid: np.ndarray,
    t_trace: np.ndarray,
    E_trace: np.ndarray,
    interpolate: bool = True,
) -> np.ndarray:
    """
    Interpolate a single FRET trace onto a common time grid using cubic splines.

    Handles edge cases such as missing data (NaNs), single-point traces,
    and short traces where only linear interpolation is feasible.

    Args:
        time_grid (np.ndarray): The uniform time vector to interpolate onto.
        t_trace (np.ndarray): Original time points of the trace.
        E_trace (np.ndarray): Original FRET efficiency values.
        interpolate (bool): If True (default), interpolate onto the grid;
            if False, snap each observation to its nearest grid point.

    Returns:
        np.ndarray: The FRET trace interpolated onto `time_grid`. Returns NaNs
        for time points outside the original observation range or if
        insufficient data exists.
    """
    # Ensure arrays
    t_trace = np.asarray(t_trace, float)
    E_trace = np.asarray(E_trace, float)

    # Mask out non-finite values
    mask = np.isfinite(t_trace) & np.isfinite(E_trace)
    t_clean = t_trace[mask]
    E_clean = E_trace[mask]

    # Not enough points to interpolate
    if t_clean.size == 0:
        return np.full_like(time_grid, np.nan, dtype=float)
    if t_clean.size == 1:
        # Single point: constant over its range, NaN elsewhere
        y = np.full_like(time_grid, np.nan, dtype=float)
        idx = np.argmin(np.abs(time_grid - t_clean[0]))
        y[idx] = E_clean[0]
        return y

    # Remove duplicate time stamps, if any
    t_unique, idx_unique = np.unique(t_clean, return_index=True)
    E_unique = E_clean[idx_unique]

    if not interpolate:
        y = np.full_like(time_grid, np.nan, dtype=float)
        if len(time_grid) > 1:
            dt = time_grid[1] - time_grid[0]
            # Find nearest grid index for each observation
            idx = np.rint(t_unique / dt).astype(np.int64)
            # Ensure indices are within bounds
            valid = (idx >= 0) & (idx < len(time_grid))
            y[idx[valid]] = E_unique[valid]
        elif len(time_grid) == 1 and t_unique.size > 0:
            # Edge case: 1-point grid, just take the first point if close enough.
            # NOTE: frame_interval is a module-level global, set from the CLI in main().
            if np.abs(t_unique[0] - time_grid[0]) < frame_interval / 2:
                y[0] = E_unique[0]
        return y

    if t_unique.size < 2:
        return np.full_like(time_grid, np.nan, dtype=float)

    # For 2 points, a spline is pointless; use linear interpolation
    if t_unique.size == 2:
        y = np.interp(time_grid, t_unique, E_unique, left=np.nan, right=np.nan)
        return y

    # For >=3 points, try cubic spline
    try:
        cs = CubicSpline(t_unique, E_unique, extrapolate=False)
        y = cs(time_grid)
    except Exception:
        y = np.interp(time_grid, t_unique, E_unique, left=np.nan, right=np.nan)

    outside = (time_grid < t_unique.min()) | (time_grid > t_unique.max())
    y[outside] = np.nan
    # fret_min / fret_max are module-level globals, set from the CLI in main()
    y[(y < fret_min) | (y > fret_max)] = np.nan
    return y
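
A minimal usage sketch (values are hypothetical; assumes numpy is imported and the module-level fret_min / fret_max globals are set, e.g. to 0.0 and 1.0):

time_grid = np.arange(0.0, 1.0, 0.07)                 # uniform 70 ms grid
t_trace = np.array([0.00, 0.07, 0.21, 0.35, 0.56])    # irregular sample times
E_trace = np.array([0.42, 0.45, np.nan, 0.50, 0.48])  # one missing value (NaN)

# Four finite points remain, so a cubic spline is used; grid points
# outside [0.00, 0.56] come back as NaN.
E_on_grid = interpolate_trace(time_grid, t_trace, E_trace)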

Inspect all *.tracks* files, log basic info, and plot a representative trajectory per file. Optionally save plots to disk.

Source code in get_traces.py
def inspect_and_plot_data(
    data_dir: Path,
    frame_interval: float,
    fret_min: float,
    fret_max: float,
    key: str = "/tracks/Data",
    save_plots: bool = True,
    plot_dir: Path | None = None,
    show_plots: bool = False,
) -> None:
    """
    Inspect all *.tracks* files, log basic info and
    plot a representative trajectory per file.
    Optionally save plots to disk.
    """
    if save_plots:
        if plot_dir is None:
            plot_dir = data_dir / "plots"
        plot_dir.mkdir(parents=True, exist_ok=True)

    for path in sorted(data_dir.glob("*.tracks*")):
        logger.info("=" * 80)

        # Extract metadata from filename
        fname = path.stem  # e.g. filtered-241107-Hsp90_409_601-v014.tracks
        parts = fname.split("-")
        if len(parts) >= 3:
            exp_id = parts[1]
            construct = parts[2].split(".")[0]
        else:
            exp_id = "unknown"
            construct = "unknown"

        logger.info(f"File: {path.name}")
        logger.info(f"Experiment: {construct}, Date/ID: {exp_id}")

        # Step 1 — list keys
        try:
            store = pd.HDFStore(path, mode="r")
            keys = store.keys()
            store.close()
            logger.info(f"Keys in file: {keys}")
        except Exception as e:
            logger.error(f"Could not open file: {e}")
            continue

        # Step 2 — read dataset
        k = key if key in keys else keys[0]
        try:
            df = pd.read_hdf(path, key=k)
        except Exception as e:
            logger.error(f"Error reading {k}: {e}")
            continue

        # Step 3 — flatten multiindex columns
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = ["_".join(filter(None, col)).strip() for col in df.columns]

        # Step 4 — basic info
        logger.info(f"Loaded with shape: {df.shape}")

        # Step 6 — numeric summary
        num_cols = df.select_dtypes(include=[np.number]).columns
        summary = df[num_cols].describe().T[["mean", "std", "min", "max"]]
        logger.debug(summary.head(10))

        # Step 7 — add time column
        if "donor_frame" in df.columns:
            df["time_s"] = df["donor_frame"] * frame_interval

        # Step 8 — detect FRET column
        fret_candidates = ["fret_eff", "fret_eff_app", "fret_efficiency"]
        fret_col = next((c for c in fret_candidates if c in df.columns), None)

        if fret_col is None:
            logger.warning(
                f"No FRET column found (checked: {', '.join(fret_candidates)}). Skipping plot.\n"
            )
            continue

        # Ensure we have a time axis
        if "time_s" not in df.columns:
            if "donor_frame" in df.columns:
                df["time_s"] = df["donor_frame"] * frame_interval
            else:
                logger.warning("No donor_frame/time info found. Skipping plot.\n")
                continue

        # Choose a representative particle: longest trajectory
        part_col = "fret_particle" if "fret_particle" in df.columns else None

        if part_col is not None:
            counts = df.groupby(part_col).size()
            longest_pid = counts.sort_values(ascending=False).index[0]
            traj = df[df[part_col] == longest_pid].sort_values("time_s")
            label = f"{construct} {exp_id} – particle {int(longest_pid)}"
        else:
            traj = df.sort_values("time_s")
            label = f"{construct} {exp_id} – all data (no particle id)"

        # Plot representative trace
        fig, ax = plt.subplots()
        ax.plot(traj["time_s"], traj[fret_col], marker="o", linestyle="-", markersize=3)
        ax.set_xlabel("time (s)")
        ax.set_ylabel(f"{fret_col}")
        ax.set_title(label)
        fig.tight_layout()

        if save_plots and plot_dir is not None:
            # create a file-name–safe stem
            safe_label = f"{construct}_{exp_id}"
            if part_col is not None:
                safe_label += f"_p{int(longest_pid)}"
            out_png = plot_dir / f"{safe_label}.png"
            fig.savefig(out_png, dpi=300)
            logger.info(f"Saved plot → {out_png}")

        if show_plots:
            plt.show()
        else:
            plt.close(fig)
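
A hypothetical invocation, using the default directory layout from parse_args as example values:

from pathlib import Path

inspect_and_plot_data(
    data_dir=Path("data/Hugel_2025"),
    frame_interval=0.07,   # 70 ms per frame
    fret_min=0.0,
    fret_max=1.0,
    save_plots=True,       # PNGs land in data/Hugel_2025/plots by default
    show_plots=False,      # pipeline-friendly: close figures after saving
)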

Export per-particle time series as CSV files for downstream combination.

Source code in get_traces.py
def export_per_particle_time_series(
    data_dir: Path,
    export_dir: Path,
    frame_interval: float,
    fret_min: float,
    fret_max: float,
    key: str = "/tracks/Data",
    min_traj_length: int = 20,
) -> None:
    """
    Export per-particle time series as CSV files for downstream combination.
    """
    export_dir.mkdir(parents=True, exist_ok=True)

    for path in sorted(data_dir.glob("*.tracks*.h5")):
        # Filename metadata (construct, exp_id) is parsed downstream in
        # build_combined_fret_matrix, so nothing needs to be extracted here.

        try:
            df = pd.read_hdf(path, key=key)
        except Exception as e:
            logger.error(f"Could not read {path.name}: {e}")
            continue

        # flatten columns
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = ["_".join(filter(None, c)).strip() for c in df.columns]

        # --- NEW: restrict to donor excitation frames if available ---
        if "fret_exc_type" in df.columns:
            df = df[df["fret_exc_type"] == "d"]

        # ensure required columns
        needed_cols = {"donor_frame", "fret_particle"}
        if not needed_cols.issubset(df.columns):
            logger.warning(f"Missing required columns in {path.name}, skipping.")
            continue

        df["time_s"] = df["donor_frame"] * frame_interval

        fret_candidates = ["fret_eff", "fret_eff_app"]
        fret_col = next((c for c in fret_candidates if c in df.columns), None)
        if fret_col is None:
            logger.warning(
                f"No FRET efficiency column found in {path.name}, skipping file."
            )
            continue

        # Manual filter semantics: filter_manual == 1 means "rejected" in the
        # original GUI, so keep only trajectories that were NOT rejected.
        # (An earlier check on a misspelled "fret_exec_type" column could never
        # fire; donor-excitation frames are already selected via fret_exc_type.)
        if "filter_manual" in df.columns:
            df = df[df["filter_manual"] != 1]

        # keep only finite, physically reasonable FRET values
        df = df[np.isfinite(df[fret_col])]
        df = df[(df[fret_col] > fret_min) & (df[fret_col] < fret_max)]

        # drop short trajectories AFTER filtering
        lengths = df.groupby("fret_particle")["donor_frame"].nunique()
        keep = lengths[lengths >= min_traj_length].index
        df = df[df["fret_particle"].isin(keep)]

        count = 0
        for pid, traj in df.groupby("fret_particle"):
            traj = traj.sort_values("donor_frame")
            out = traj[["time_s", fret_col]].rename(columns={fret_col: "FRET"})
            out_name = f"{path.stem}_particle_{int(pid):05d}.csv"
            out.to_csv(export_dir / out_name, index=False)
            count += 1

        logger.info(
            f"Exported {count} per-particle traces from {path.name} to {export_dir}/"
        )
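
A hypothetical call, using the CLI defaults as example values:

from pathlib import Path

export_per_particle_time_series(
    data_dir=Path("data/Hugel_2025"),
    export_dir=Path("data/timeseries"),
    frame_interval=0.07,
    fret_min=0.0,
    fret_max=1.0,
    min_traj_length=20,   # keep particles observed in >= 20 distinct frames
)
# Writes one CSV per particle with columns time_s and FRET, e.g.
# data/timeseries/<file stem>_particle_00001.csv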

Combine all per-particle CSV files into a single time x trajectory matrix, clean it, and save to CSV.

Source code in get_traces.py
def build_combined_fret_matrix(
    export_dir: Path,
    frame_interval: float,
    fret_min: float,
    fret_max: float,
    use_interpolation: bool,
    combined_out: Path | None = None,
) -> Path | None:
    """
    Combine all per-particle CSV files into a single time x trajectory matrix,
    clean it, and save to CSV.
    """
    if combined_out is None:
        combined_out = export_dir / "fret_matrix.csv"

    logger.info("\nBuilding combined FRET matrix (uniform 0–max_t grid)...")

    # Exclude the already combined matrix itself (important on reruns)
    csv_files = sorted(
        f for f in export_dir.glob("*.csv") if f.name != combined_out.name
    )

    if not csv_files:
        logger.warning("No per-particle CSV files found, skipping matrix creation.")
        return None

    max_t = 0.0

    # Find max time across all traces
    for f in csv_files:
        df_tmp = pd.read_csv(f)
        if "time_s" in df_tmp.columns and len(df_tmp) > 0:
            max_t = max(max_t, df_tmp["time_s"].max())

    time_grid = np.arange(0.0, max_t + frame_interval / 2, frame_interval)

    # Collect all interpolated traces first
    columns: dict[str, np.ndarray] = {"time_s": time_grid}

    for _i, f in enumerate(csv_files):
        df = pd.read_csv(f)
        if len(df) == 0:
            continue

        t_trace = df["time_s"].values
        E_trace = df["FRET"].values
        interp = interpolate_trace(
            time_grid, t_trace, E_trace, interpolate=use_interpolation
        )

        stem = f.stem
        parts = stem.split("-")
        if len(parts) >= 3:
            exp_id = parts[1]
            construct = parts[2].split(".")[0]
        else:
            exp_id = "unknown"
            construct = "unknown"

        pid = stem.split("particle_")[-1]
        col_name = f"{construct}_{exp_id}_p{pid}"

        columns[col_name] = interp

        # if (i + 1) % 100 == 0:
        #     logger.info(f"Processed {i + 1} traces ...")

    # Build one DataFrame - time x trajectories
    combined = pd.DataFrame(columns)

    # --- CLEANUP MATRIX ---

    # 1) Make sure time_s exists
    if "time_s" not in combined.columns:
        raise ValueError("Expected 'time_s' column in combined matrix.")

    # All trajectory columns (everything except time_s)
    traj_cols = [c for c in combined.columns if c != "time_s"]

    # 2) Drop trajectory columns that are entirely NaN
    traj_cols = [c for c in traj_cols if combined[c].notna().any()]
    combined = combined[["time_s"] + traj_cols]

    # 3) Enforce valid FRET range and set out-of-range values to NaN
    for c in traj_cols:
        mask_invalid = (combined[c] < fret_min) | (combined[c] > fret_max)
        combined.loc[mask_invalid, c] = np.nan

    # 4) Drop rows where all trajectories are NaN
    if traj_cols:
        mask_any = combined[traj_cols].notna().any(axis=1)
        combined = combined[mask_any].reset_index(drop=True)

    # 5) Drop very short traces (e.g. < 10 valid points after filtering)
    min_points_per_trace = 10
    if traj_cols:
        valid_counts = combined[traj_cols].notna().sum(axis=0)
        keep_traces = valid_counts[valid_counts >= min_points_per_trace].index.tolist()
        combined = combined[["time_s"] + keep_traces]
        traj_cols = keep_traces  # update
    else:
        traj_cols = []

    # --- SAVE CLEANED MATRIX ---
    combined.to_csv(combined_out, index=False)
    logger.info(
        f"FRET matrix saved → {combined_out}\n"
        f"Time points: {combined.shape[0]}, trajectories: {combined.shape[1] - 1}"
    )

    return combined_out
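
Continuing the sketch above, the combined matrix can then be built from the exported CSVs (paths are the hypothetical defaults):

from pathlib import Path

matrix_path = build_combined_fret_matrix(
    export_dir=Path("data/timeseries"),
    frame_interval=0.07,
    fret_min=0.0,
    fret_max=1.0,
    use_interpolation=True,   # cubic/linear resampling onto the common grid
)
# matrix_path -> data/timeseries/fret_matrix.csv, with a time_s column plus
# one column per surviving trajectory.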

Remove all per-particle CSVs except the combined matrix.

Source code in get_traces.py
def cleanup_intermediate_csv(
    export_dir: Path, combined_name: str = "fret_matrix.csv"
) -> None:
    """
    Remove all per-particle CSVs except the combined matrix.
    """
    for f in export_dir.glob("*.csv"):
        if f.name != combined_name:
            try:
                f.unlink()
            except Exception as e:
                logger.error(f"Could not delete {f.name}: {e}")

Parse command-line arguments for the FRET HDF5 processing script.

Source code in get_traces.py
def parse_args() -> argparse.Namespace:
    """
    Parse command-line arguments for the FRET HDF5 processing script.
    """
    parser = argparse.ArgumentParser(
        description=(
            "HDF5 FRET Tracking Data Processor\n"
            "Inspect raw tracks, export per-particle traces and build a combined matrix."
        )
    )

    parser.add_argument(
        "--data-dir",
        type=Path,
        default=Path("data/Hugel_2025"),
        help="Directory containing .tracks / .tracks.h5 files (default: data/Hugel_2025)",
    )
    parser.add_argument(
        "--export-dir",
        type=Path,
        default=Path("data/timeseries"),
        help="Directory to write per-particle CSVs and combined matrix (default: data/timeseries)",
    )
    parser.add_argument(
        "--frame-interval",
        type=float,
        default=0.07,
        help="Frame interval in seconds (default: 0.07 = 70 ms)",
    )
    parser.add_argument(
        "--fret-min",
        type=float,
        default=0.0,
        help="Minimum FRET value to keep (default: 0.0)",
    )
    parser.add_argument(
        "--fret-max",
        type=float,
        default=1.0,
        help="Maximum FRET value to keep (default: 1.0)",
    )
    parser.add_argument(
        "--min-traj-length",
        type=int,
        default=10,
        help="Minimum trajectory length (in frames) to keep (default: 10)",
    )
    parser.add_argument(
        "--use-interpolation",
        action="store_true",
        help="Enable interpolation (cubic/linear) onto the common time grid.",
    )
    parser.add_argument(
        "--no-inspect-plots",
        action="store_true",
        help="Disable representative trajectory plotting.",
    )
    parser.add_argument(
        "--save-plots",
        action="store_true",
        help="Save representative trajectory plots as PNG files.",
    )
    parser.add_argument(
        "--plots-dir",
        type=Path,
        default=None,
        help="Directory to save plots (default: <data-dir>/plots).",
    )
    parser.add_argument(
        "--keep-intermediate",
        action="store_true",
        help="Do not delete per-particle CSVs after building the combined matrix.",
    )

    return parser.parse_args()

Main entry point: run inspection, export per-particle traces, build combined matrix, and clean up.

Source code in get_traces.py
def main() -> None:
    """
    Main entry point: run inspection, export per-particle traces,
    build combined matrix, and clean up.
    """
    global data_dir, export_dir, frame_interval, fret_min, fret_max, USE_INTERPOLATION

    args = parse_args()

    # Override globals with CLI
    data_dir = args.data_dir
    export_dir = args.export_dir
    frame_interval = args.frame_interval
    fret_min = args.fret_min
    fret_max = args.fret_max
    USE_INTERPOLATION = args.use_interpolation

    logger.info("=== HDF5 FRET Tracking Data Processor ===")
    logger.info(f"Data directory     : {data_dir}")
    logger.info(f"Export directory   : {export_dir}")
    logger.info(f"Frame interval     : {frame_interval} s")
    logger.info(f"FRET range         : [{fret_min}, {fret_max}]")
    logger.info(f"Use interpolation  : {USE_INTERPOLATION}")
    logger.info(f"Min traj length    : {args.min_traj_length}")
    logger.info(f"Save plots         : {args.save_plots}")
    logger.info(f"Keep intermediates : {args.keep_intermediate}")

    # 1) Inspect and plot data (optional)
    if not args.no_inspect_plots:
        inspect_and_plot_data(
            data_dir=data_dir,
            frame_interval=frame_interval,
            fret_min=fret_min,
            fret_max=fret_max,
            key=key,
            save_plots=args.save_plots,
            plot_dir=args.plots_dir,
            show_plots=False,  # pipeline-friendly; change to True if you want interactive
        )

    # 2) Export per-particle time series
    export_per_particle_time_series(
        data_dir=data_dir,
        export_dir=export_dir,
        frame_interval=frame_interval,
        fret_min=fret_min,
        fret_max=fret_max,
        key=key,
        min_traj_length=args.min_traj_length,
    )

    # 3) Build combined FRET matrix
    combined_path = build_combined_fret_matrix(
        export_dir=export_dir,
        frame_interval=frame_interval,
        fret_min=fret_min,
        fret_max=fret_max,
        use_interpolation=USE_INTERPOLATION,
        combined_out=export_dir / "fret_matrix.csv",
    )

    # 4) Cleanup intermediate CSVs
    if combined_path is not None and not args.keep_intermediate:
        cleanup_intermediate_csv(export_dir, combined_name=combined_path.name)

pipeline.py

Log a DataFrame to the rich logger with optional title and row limit.

Parameters:

    df (DataFrame): DataFrame to log. [required]
    title (str | None): Optional title to display above the DataFrame. Default: None
    level (int): Logging level. Default: logging.INFO
    max_rows (int): Maximum number of rows to display. Default: 60

Returns:

    None
Source code in pipeline.py
def log_df(
    df: pd.DataFrame,
    title: str | None = None,
    level: int = logging.INFO,
    max_rows: int = 60,
) -> None:
    """
    Log a DataFrame to the rich logger with optional title and row limit.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to log.
    title : str | None
        Optional title to display above the DataFrame.
    level : int
        Logging level (default: logging.INFO).
    max_rows : int
        Maximum number of rows to display (default: 60).
    Returns
    -------
    None
    """

    if df is None or df.empty:
        logger.log(level, "[bold yellow]DataFrame is empty[/bold yellow]")
        return

    if len(df) > max_rows:
        df = df.head(max_rows)

    with pd.option_context(
        "display.width",
        console.width,
        "display.max_columns",
        None,
        "display.max_colwidth",
        console.width,
    ):
        if title:
            logger.log(level, f"[bold cyan]{title}[/bold cyan]")
        logger.log(level, "\n" + df.to_string(index=True))

Three-state conformational model with bleaching: O <-> I <-> C, where each state can irreversibly bleach to B with its own rate (k_BO, k_BI, k_BC).

We explicitly track P_O, P_I, P_C; P_B = 1 - P_O - P_I - P_C. FRET is from O, I, C; bleached B is assumed dark (E_bleach = 0).

Parameters:

    k_OI (float): Open -> Intermediate rate (1/s) [required]
    k_IO (float): Intermediate -> Open rate (1/s) [required]
    k_IC (float): Intermediate -> Closed rate (1/s) [required]
    k_CI (float): Closed -> Intermediate rate (1/s) [required]
    k_BO (float): Open -> Bleached rate (1/s) [required]
    k_BI (float): Intermediate -> Bleached rate (1/s) [required]
    k_BC (float): Closed -> Bleached rate (1/s) [required]
    E_open (float): FRET efficiency in Open state [required]
    E_inter (float): FRET efficiency in Intermediate state [required]
    E_closed (float): FRET efficiency in Closed state [required]
    P_O0 (float): Initial probability of being in Open state [required]
    P_C0 (float): Initial probability of being in Closed state [required]

Returns:

    None
Source code in pipeline.py
@dataclass
class Hsp90Params3State:
    """
    Three-state conformational model with bleaching:
      O <-> I <-> C, each can irreversibly bleach to B with rate k_B.

    We explicitly track P_O, P_I, P_C; P_B = 1 - P_O - P_I - P_C.
    FRET is from O, I, C; bleached B is assumed dark (E_bleach = 0).

    Parameters
    ----------
    k_OI : float
        Open -> Intermediate rate (1/s)
    k_IO : float
        Intermediate -> Open rate (1/s)
    k_IC : float
        Intermediate -> Closed rate (1/s)
    k_CI : float
        Closed -> Intermediate rate (1/s)
    k_BO : float
        Open -> Bleached rate (1/s)
    k_BI : float
        Intermediate -> Bleached rate (1/s)
    k_BC : float
        Closed -> Bleached rate (1/s)
    E_open : float
        FRET efficiency in Open state
    E_inter : float
        FRET efficiency in Intermediate state
    E_closed : float
        FRET efficiency in Closed state
    P_O0 : float
        Initial probability of being in Open state
    P_C0 : float
        Initial probability of being in Closed state

    Returns
    -------
    None

    """

    # Conformational rates
    k_OI: float  # Open -> Intermediate rate (1/s)
    k_IO: float  # Intermediate -> Open rate (1/s)
    k_IC: float  # Intermediate -> Closed rate (1/s)
    k_CI: float  # Closed -> Intermediate rate (1/s)

    # State-dependent bleaching
    k_BO: float  # O -> B
    k_BI: float  # I -> B
    k_BC: float  # C -> B

    # FRET levels
    E_open: float  # FRET in Open state
    E_inter: float  # FRET in Intermediate state
    E_closed: float  # FRET in Closed state

    # Initial probabilities (P_I0 = 1 - P_O0 - P_C0, P_B0 = 0)
    P_O0: float  # Initial Open probability
    P_C0: float  # Initial Closed probability

Container for a full fit: kinetics + static subpopulation.

Parameters:

    params (Hsp90Params3State): Fitted kinetic parameters. [required]
    f_dyn (float): Fraction of molecules following the kinetic model. [required]
    E_static (float): FRET level of static subpopulation. [required]

Returns:

    None
Source code in pipeline.py
@dataclass
class Hsp90Fit3State:
    """
    Container for a full fit: kinetics + static subpopulation.

    Parameters
    ----------
    params : Hsp90Params3State
        Fitted kinetic parameters.
    f_dyn : float
        Fraction of molecules following the kinetic model.
    E_static : float
        FRET level of static subpopulation.
    Returns
    -------
    None
    """

    params: Hsp90Params3State
    f_dyn: float  # fraction of molecules following the kinetic model
    E_static: float  # FRET level of static subpopulation

Flatten a single Hsp90Fit3State into a 1-row DataFrame.

Parameters:

    fit (Hsp90Fit3State): Fitted model. [required]

Returns:

    pd.DataFrame: One-row DataFrame with all fit parameters.

Source code in pipeline.py
def fit_to_df(fit: Hsp90Fit3State) -> pd.DataFrame:
    """
    Flatten a single Hsp90Fit3State into a 1-row DataFrame.

    Parameters
    ----------
    fit : Hsp90Fit3State
        Fitted model.
    Returns
    -------
    pd.DataFrame
        One-row DataFrame with all fit parameters.
    """
    p = fit.params
    row = {
        "k_OI": p.k_OI,
        "k_IO": p.k_IO,
        "k_IC": p.k_IC,
        "k_CI": p.k_CI,
        "k_BO": p.k_BO,
        "k_BI": p.k_BI,
        "k_BC": p.k_BC,
        "E_open": p.E_open,
        "E_inter": p.E_inter,
        "E_closed": p.E_closed,
        "P_O0": p.P_O0,
        "P_C0": p.P_C0,
        "f_dyn": fit.f_dyn,
        "E_static": fit.E_static,
    }
    return pd.DataFrame([row])

Map unconstrained parameters to ordered FRET levels:

    Eo = sigmoid(e0)
    Ei = Eo + sigmoid(d1) * (1 - Eo)
    Ec = Ei + sigmoid(d2) * (1 - Ei)

Parameters:

    e0 (float): Unconstrained parameter for Eo. [required]
    d1 (float): Unconstrained increment parameter for Ei. [required]
    d2 (float): Unconstrained increment parameter for Ec. [required]

Returns:

    Eo (float): FRET efficiency in Open state (0 < Eo < 1).
    Ei (float): FRET efficiency in Intermediate state (Eo < Ei < 1).
    Ec (float): FRET efficiency in Closed state (Ei < Ec < 1).

Source code in pipeline.py
def ordered_levels(e0, d1, d2):
    """
    Map unconstrained parameters to ordered FRET levels:
    Eo = sigmoid(e0)
    Ei = Eo + sigmoid(d1) * (1 - Eo)
    Ec = Ei + sigmoid(d2) * (1 - Ei)

    Parameters
    ----------
    e0 : float
        Unconstrained parameter for Eo.
    d1 : float
        Unconstrained increment parameter for Ei.
    d2 : float
        Unconstrained increment parameter for Ec.
    Returns
    -------
    Eo : float
        FRET efficiency in Open state (0 < Eo < 1).
    Ei : float
        FRET efficiency in Intermediate state (Eo < Ei < 1).
    Ec : float
        FRET efficiency in Closed state (Ei < Ec < 1).
    """
    Eo = 1 / (1 + np.exp(-e0))  # (0,1)
    inc1 = (1 / (1 + np.exp(-d1))) * (1 - Eo)  # (0, 1-Eo)
    Ei = Eo + inc1
    inc2 = (1 / (1 + np.exp(-d2))) * (1 - Ei)  # (0, 1-Ei)
    Ec = Ei + inc2
    return Eo, Ei, Ec
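
A quick numeric check of the ordering guarantee (sigmoid(0) = 0.5):

Eo, Ei, Ec = ordered_levels(0.0, 0.0, 0.0)
# Eo = 0.5
# Ei = 0.5  + 0.5 * (1 - 0.5)  = 0.75
# Ec = 0.75 + 0.5 * (1 - 0.75) = 0.875
assert 0.0 < Eo < Ei < Ec < 1.0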

Numba-optimized ODE for P_O, P_I, P_C with bleaching.

Parameters:

    t (float): Current time (not used; the ODE is time-invariant). [required]
    y (ndarray): Current state vector [P_O, P_I, P_C]. [required]
    params (ndarray): Kinetic parameters [k_OI, k_IO, k_IC, k_CI, k_BO, k_BI, k_BC]. [required]

Returns:

    dPdt (ndarray): Time derivatives [dP_O/dt, dP_I/dt, dP_C/dt].

Source code in pipeline.py
@njit(
    "float64[:](float64, float64[:], float64[:])",
    cache=True,
    fastmath=True,
    nogil=False,
)
def rhs_hsp90_numba(t: float, y: np.ndarray, params: np.ndarray) -> np.ndarray:
    """
    Numba-optimized ODE for P_O, P_I, P_C with bleaching.

    Parameters
    ----------
    t : float
        Current time (not used in this ODE as it's time-invariant).
    y : ndarray
        Current state vector [P_O, P_I, P_C].
    params : ndarray
        Kinetic parameters: [k_OI, k_IO, k_IC, k_CI, k_BO, k_BI, k_BC].
    Returns
    -------
    dPdt : ndarray
        Time derivatives [dP_O/dt, dP_I/dt, dP_C/dt].
    """
    k_OI, k_IO, k_IC, k_CI, k_BO, k_BI, k_BC = params  # 7 kinetic params now
    P_O, P_I, P_C = y[0], y[1], y[2]
    dP_O = -k_OI * P_O + k_IO * P_I - k_BO * P_O
    dP_I = k_OI * P_O - (k_IO + k_IC + k_BI) * P_I + k_CI * P_C
    dP_C = k_IC * P_I - k_CI * P_C - k_BC * P_C
    return np.array([dP_O, dP_I, dP_C], dtype=np.float64)

Dynamic part only: E_dyn(t) = E_O*P_O + E_I*P_I + E_C*P_C.

This version enforces:

    - 0 <= P_O0, P_I0, P_C0 <= 1
    - P_O0 + P_I0 + P_C0 = 1 (all non-bleached states)

and clamps small numerical negatives in the solution.

Source code in pipeline.py
def model_fret_3state(t_eval: np.ndarray, p: Hsp90Params3State) -> np.ndarray:
    """
    Dynamic part only: E_dyn(t) = E_O*P_O + E_I*P_I + E_C*P_C.

    This version enforces:
      - 0 <= P_O0, P_I0, P_C0 <= 1
      - P_O0 + P_I0 + P_C0 = 1   (all non-bleached states)
    and clamps small numerical negatives in the solution.
    """
    # --- initial probabilities with normalization / clipping ---
    P_O0 = float(p.P_O0)
    P_C0 = float(p.P_C0)

    # clip raw guesses to [0,1] first (hard box)
    P_O0 = np.clip(P_O0, 0.0, 1.0)
    P_C0 = np.clip(P_C0, 0.0, 1.0)

    # provisional P_I0 from "whatever is left"
    P_I0 = 1.0 - P_O0 - P_C0

    # if that went negative (P_O0 + P_C0 > 1), renormalize
    if P_I0 < 0.0:
        total = max(P_O0 + P_C0, 1e-12)
        P_O0 /= total
        P_C0 /= total
        P_I0 = 0.0
    else:
        # all three non-negative; now renormalize to sum exactly 1
        total = P_O0 + P_I0 + P_C0
        if total <= 0.0:
            # completely pathological guess -> bail with NaNs
            return np.full_like(t_eval, np.nan, dtype=float)
        P_O0 /= total
        P_I0 /= total
        P_C0 /= total

    y0 = np.array([P_O0, P_I0, P_C0], dtype=float)

    # kinetic parameters (already box-constrained in the fit)
    k_params = np.array(
        [p.k_OI, p.k_IO, p.k_IC, p.k_CI, p.k_BO, p.k_BI, p.k_BC], dtype=float
    )

    sol = cast(
        "OdeResult",
        cast(
            "object",
            solve_ivp(
                fun=rhs_hsp90_numba,
                t_span=(t_eval.min(), t_eval.max()),
                y0=y0,
                t_eval=t_eval,
                vectorized=False,
                args=(k_params,),
                # method='RK45',
            ),
        ),
    )

    if not sol.success:
        return np.full_like(t_eval, np.nan, dtype=float)

    # clamp tiny negative / >1 probabilities from numerical error
    P = np.clip(sol.y, 0.0, 1.0)

    # optional: renormalize so P_O + P_I + P_C <= 1 (rest is bleached)
    S = P.sum(axis=0)
    mask = S > 1.0
    if np.any(mask):
        P[:, mask] /= S[mask]

    P_O_t = P[0]
    P_I_t = P[1]
    P_C_t = P[2]

    # also make sure FRET levels live in [0,1]
    E_open = np.clip(p.E_open, 0.0, 1.0)
    E_inter = np.clip(p.E_inter, 0.0, 1.0)
    E_closed = np.clip(p.E_closed, 0.0, 1.0)

    E_t = E_open * P_O_t + E_inter * P_I_t + E_closed * P_C_t
    return E_t
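
An illustrative evaluation (parameter values mirror the rough initial guesses used in fit_global_3state below, not fitted results):

import numpy as np

params = Hsp90Params3State(
    k_OI=0.01, k_IO=0.01, k_IC=0.01, k_CI=0.01,  # conformational rates (1/s)
    k_BO=0.003, k_BI=0.006, k_BC=0.012,          # bleaching rates (1/s)
    E_open=0.4, E_inter=0.5, E_closed=0.7,       # FRET levels
    P_O0=0.35, P_C0=0.55,                        # P_I0 = 1 - 0.35 - 0.55 = 0.10
)

t_eval = np.linspace(0.0, 60.0, 200)
E_dyn = model_fret_3state(t_eval, params)  # population-weighted FRET vs time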

Total FRET: E_total(t) = f_dyn * E_dyn(t) + (1 - f_dyn) * E_static

Parameters:

    t_eval (ndarray): Time points to evaluate. [required]
    fit (Hsp90Fit3State): Fitted model. [required]

Returns:

    ndarray: Total FRET efficiency at each time point.

Source code in pipeline.py
def model_total_fret(t_eval: np.ndarray, fit: Hsp90Fit3State) -> np.ndarray:
    """
    Total FRET: E_total(t) = f_dyn * E_dyn(t) + (1 - f_dyn) * E_static

    Parameters
    ----------
    t_eval : ndarray
        Time points to evaluate.
    fit : Hsp90Fit3State
        Fitted model.
    Returns
    -------
    ndarray
        Total FRET efficiency at each time point.
    """
    E_dyn = model_fret_3state(t_eval, fit.params)
    return fit.f_dyn * E_dyn + (1.0 - fit.f_dyn) * fit.E_static

Load combined_fret_matrix CSV file into time grid and FRET matrix.

Parameters:

    path (Path): Path to combined_fret_matrix CSV file. [required]

Returns:

    t ((T,) array): Time grid.
    E_mat ((T, N) array): FRET trajectories matrix.
    traj_cols (list of str): Names of trajectory columns (same order as E_mat columns).

Source code in pipeline.py
def load_combined_matrix(path: Path) -> tuple[np.ndarray, np.ndarray, list[str]]:
    """
    Load combined_fret_matrix CSV file into time grid and FRET matrix.

    Parameters
    ----------
    path : Path
        Path to combined_fret_matrix CSV file.
    Returns
    -------
    t : (T,) array
        Time grid.
    E_mat : (T, N) array
        FRET trajectories matrix.
    traj_cols : list of str
        Names of trajectory columns (same order as E_mat columns).
    """
    df = pd.read_csv(path)
    if "time_s" not in df.columns:
        raise ValueError("Expected a 'time_s' column in the combined matrix.")

    t = df["time_s"].values
    traj_cols = [c for c in df.columns if c != "time_s"]
    E_mat = df[traj_cols].to_numpy()

    row_valid = np.isfinite(E_mat).any(axis=1)
    t = t[row_valid]
    E_mat = E_mat[row_valid, :]

    return t, E_mat, traj_cols

Parse combined_fret_matrix trajectory column names into a metadata DataFrame.

Expected format (from your export code): <construct>_<exp_id>_p<particle>, e.g. "Hsp90_409_601_241107_p00001"

Returns:

    meta (DataFrame): DataFrame with columns:
        - col: original column name
        - construct: e.g. "Hsp90_409_601" or "esDNA"
        - exp_id: e.g. "241107"
        - particle: string/ID after 'p'
        - condition: default grouping key "<construct>_<exp_id>"
Source code in pipeline.py
def parse_column_metadata(col_names: list[str]) -> pd.DataFrame:
    """
    Parse combined_fret_matrix trajectory column names into a metadata DataFrame.

    Expected format (from your export code):
        <construct>_<exp_id>_p<particle>
    e.g. "Hsp90_409_601_241107_p00001"

    Returns
    -------
    meta : DataFrame with columns:
        - col: original column name
        - construct: e.g. "Hsp90_409_601" or "esDNA"
        - exp_id: e.g. "241107"
        - particle: string/ID after 'p'
        - condition: default grouping key "<construct>_<exp_id>"
    """
    records = []
    for c in col_names:
        if c == "time_s":
            continue

        # Split from the right: [..., construct, exp_id, pXXXXX]
        parts = c.split("_")
        if len(parts) < 3:
            # Fallback: treat whole name as "construct"
            construct = c
            exp_id = "unknown"
            particle = "unknown"
        else:
            particle = parts[-1]  # e.g. "p00001"
            exp_id = parts[-2]  # e.g. "241107"
            construct = "_".join(parts[:-2])  # e.g. "Hsp90_409_601" or "esDNA"

        condition = f"{construct}_{exp_id}"
        # condition = f"{construct}"
        records.append((c, construct, exp_id, particle, condition))

    meta = pd.DataFrame(
        records, columns=["col", "construct", "exp_id", "particle", "condition"]
    )
    return meta
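
Example (the esDNA column name is hypothetical but follows the same pattern):

meta = parse_column_metadata(
    ["Hsp90_409_601_241107_p00001", "esDNA_241108_p00002"]
)
# col                          construct      exp_id  particle  condition
# Hsp90_409_601_241107_p00001  Hsp90_409_601  241107  p00001    Hsp90_409_601_241107
# esDNA_241108_p00002          esDNA          241108  p00002    esDNA_241108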

Given the full time grid and matrix, extract a sub-matrix restricted to a subset of trajectory columns, and remove rows where all subset entries are NaN.

Parameters:

    t ((T,) array): Full time grid. [required]
    E_mat ((T, N) array): FRET trajectories for all columns in all_cols. [required]
    all_cols (list of str): Names of all trajectory columns (corresponding to E_mat columns). [required]
    cols_subset (list of str): Names of columns to keep for this subset. [required]

Returns:

    t_sub ((T_sub,) array): Time grid for which at least one trajectory in the subset is finite.
    E_sub ((T_sub, N_sub) array): Subset FRET matrix.

Source code in pipeline.py
def subset_matrix_by_columns(
    t: np.ndarray, E_mat: np.ndarray, all_cols: list[str], cols_subset: list[str]
) -> tuple[np.ndarray, np.ndarray]:
    """
    Given the full time grid and matrix, extract a sub-matrix restricted to
    a subset of trajectory columns, and remove rows where all subset entries are NaN.

    Parameters
    ----------
    t : (T,) array
        Full time grid.
    E_mat : (T, N) array
        FRET trajectories for all columns in all_cols.
    all_cols : list of str
        Names of all trajectory columns (corresponding to E_mat columns).
    cols_subset : list of str
        Names of columns to keep for this subset.

    Returns
    -------
    t_sub : (T_sub,) array
        Time grid for which at least one trajectory in the subset is finite.
    E_sub : (T_sub, N_sub) array
        Subset FRET matrix.
    """
    name_to_idx = {c: i for i, c in enumerate(all_cols)}
    idx = [name_to_idx[c] for c in cols_subset if c in name_to_idx]

    if not idx:
        raise ValueError("No matching columns found for subset.")

    E_sub_full = E_mat[:, idx]
    row_valid = np.isfinite(E_sub_full).any(axis=1)
    t_sub = t[row_valid]
    E_sub = E_sub_full[row_valid, :]
    return t_sub, E_sub

Compute ensemble RMSE and R^2 for a given condition and fitted model.

Uses ensemble mean vs model prediction.

Parameters:

    t ((T,) array): Time grid. [required]
    E_mat ((T, N) array): FRET trajectories matrix. [required]
    fit (Hsp90Fit3State): Fitted model. [required]

Returns:

    dict: Dictionary with keys:
        - rmse (float): Root Mean Square Error between mean observed and model.
        - r2 (float): Coefficient of determination.
        - n_time (int): Number of time points used in the comparison.
        - n_traj (int): Number of trajectories in E_mat.

Source code in pipeline.py
def compute_ensemble_metrics(
    t: np.ndarray, E_mat: np.ndarray, fit: Hsp90Fit3State
) -> dict:
    """
    Compute ensemble RMSE and R^2 for a given condition and fitted model.

    Uses ensemble mean vs model prediction.

    Parameters
    ----------
    t : (T,) array
        Time grid.
    E_mat : (T, N) array
        FRET trajectories matrix.
    fit : Hsp90Fit3State
        Fitted model.
    Returns
    -------
    dict
        Dictionary with keys:
            - rmse : float
                Root Mean Square Error between mean observed and model.
            - r2 : float
                Coefficient of determination.
            - n_time : int
                Number of time points used in the comparison.
            - n_traj : int
                Number of trajectories in E_mat.
    """
    row_valid = np.isfinite(E_mat).any(axis=1)
    t_plot = t[row_valid]
    E_plot = E_mat[row_valid, :]

    if t_plot.size == 0:
        return {"rmse": np.nan, "r2": np.nan, "n_time": 0, "n_traj": 0}

    E_mean = np.nanmean(E_plot, axis=1)
    E_model = model_total_fret(t_plot, fit)

    mask = np.isfinite(E_mean) & np.isfinite(E_model)
    E_obs = E_mean[mask]
    E_mod = E_model[mask]

    if E_obs.size == 0:
        return {"rmse": np.nan, "r2": np.nan, "n_time": 0, "n_traj": E_plot.shape[1]}

    residuals = E_obs - E_mod
    rmse = np.sqrt(np.mean(residuals**2))
    ss_res = np.sum(residuals**2)
    ss_tot = np.sum((E_obs - E_obs.mean()) ** 2)
    r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else np.nan

    return {
        "rmse": float(rmse),
        "r2": float(r2),
        "n_time": int(len(E_obs)),
        "n_traj": int(E_mat.shape[1]),
    }
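
The metric definitions, restated on toy numbers:

import numpy as np

E_obs = np.array([0.40, 0.45, 0.50, 0.48])   # ensemble mean (observed)
E_mod = np.array([0.41, 0.44, 0.49, 0.50])   # model prediction
residuals = E_obs - E_mod
rmse = np.sqrt(np.mean(residuals**2))        # ~0.0132
ss_res = np.sum(residuals**2)
ss_tot = np.sum((E_obs - E_obs.mean()) ** 2)
r2 = 1.0 - ss_res / ss_tot                   # ~0.877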

Fit the 3-state+bleaching+static model separately for each condition.

Parameters:

    t ((T,) array): Global time grid. [required]
    E_mat ((T, N) array): Full combined FRET matrix. [required]
    col_names (list of str): Names of trajectory columns (same order as E_mat columns). [required]
    group_by ({"condition", "construct", "exp_id"}): How to group trajectories:
        - "condition": construct+exp_id (default, per coverslip/day)
        - "construct": pool all days for each construct
        - "exp_id": per date across constructs (usually less useful)
        Default: "condition"
    do_plots (bool): If True, make per-condition time plots using plot_hsp90_fit_time. Default: False
    max_overlay_traces (int): Maximum number of individual trajectories to overlay per condition. Default: 100
    n_starts (int): Number of multi-start fits per condition. Default: 5
    n_jobs (int): Number of parallel worker processes. Default: 4

Returns:

    summary_df (DataFrame): One row per condition with fitted parameters and metrics.
    fits (dict): Mapping: condition_key -> Hsp90Fit3State
Source code in pipeline.py
def fit_all_conditions(
    t: np.ndarray,
    E_mat: np.ndarray,
    col_names: list[str],
    group_by: str = "condition",
    do_plots: bool = False,
    max_overlay_traces: int = 100,
    n_starts: int = 5,
    n_jobs: int = 4,
) -> tuple[pd.DataFrame, dict]:
    """
    Fit the 3-state+bleaching+static model separately for each condition.

    Parameters
    ----------
    t : (T,) array
        Global time grid.
    E_mat : (T, N) array
        Full combined FRET matrix.
    col_names : list of str
        Names of trajectory columns (same order as E_mat columns).
    group_by : {"condition", "construct", "exp_id"}
        How to group trajectories:
        - "condition": construct+exp_id (default, per coverslip/day)
        - "construct": pool all days for each construct
        - "exp_id": per date across constructs (usually less useful)
    do_plots : bool
        If True, make per-condition time plots using plot_hsp90_fit_time.
    max_overlay_traces : int
        Maximum number of individual trajectories to overlay per condition.
    n_starts : int
        Number of multi-start fits per condition (forwarded to fit_global_3state).
    n_jobs : int
        Number of parallel worker processes for the per-condition fits.

    Returns
    -------
    summary_df : DataFrame
        One row per condition with fitted parameters and metrics.
    fits : dict
        Mapping: condition_key -> Hsp90Fit3State
    """
    meta = parse_column_metadata(col_names)
    if group_by not in meta.columns:
        raise ValueError(
            f"group_by must be one of {', '.join(['condition', 'construct', 'exp_id'])}"
        )

    group_keys = sorted(meta[group_by].unique())
    logger.info(f"Starting parallel fit for {len(group_keys)} groups...")

    # --- PARALLEL FITTING ---
    # n_jobs controls the number of worker processes (-1 would use all cores).
    # verbose=0 keeps joblib quiet; progress is logged per group instead.
    results = Parallel(n_jobs=n_jobs, verbose=0)(
        delayed(_fit_single_condition_worker)(
            key, t, E_mat, col_names, meta, group_by, n_starts, n_jobs
        )
        for key in group_keys
    )
    # ------------------------

    # Unpack results from parallel workers
    fits_list = []
    fit_dict = {}
    for res in results:
        if res is not None:
            key, fit, rec = res
            fit_dict[key] = fit
            fits_list.append(rec)

    # Create summary DataFrame
    if not fits_list:
        summary_df = pd.DataFrame(
            columns=[
                "group_by",
                "group_key",
                "n_traj",
                "n_time",
                "rmse",
                "r2",
                "k_OI",
                "k_IO",
                "k_IC",
                "k_CI",
                "k_BO",
                "k_BI",
                "k_BC",
                "E_open",
                "E_inter",
                "E_closed",
                "P_O0",
                "P_C0",
                "f_dyn",
                "E_static",
            ]
        )
    else:
        summary_df = pd.DataFrame(fits_list)

    # --- SEQUENTIAL PLOTTING --
    # Must be done serially because Matplotlib is not thread-safe.
    if do_plots and fit_dict:
        for key in sorted(fit_dict.keys()):
            logger.info(f"  Plotting {key}...")
            # Re-subset data just for plotting
            cols = meta.loc[meta[group_by] == key, "col"].tolist()
            t_sub, E_sub = subset_matrix_by_columns(t, E_mat, col_names, cols)

            plot_hsp90_fit_time(
                t_sub,
                E_sub,
                fit_dict[key],
                n_traces_overlay=max_overlay_traces,
                random_seed=0,
                condition_key=key,
            )

    return summary_df, fit_dict

Fit the 3-state + bleaching model plus static fraction to the ensemble mean. Uses a small multi-start strategy around theta0 to avoid bad local minima.

Parameters:

    t ((T,) array): Time grid. [required]
    E_mat ((T, N) array): FRET trajectories matrix. [required]
    theta0 ((14,) array, optional): Initial guess for parameters:
        [k_OI, k_IO, k_IC, k_CI, k_BO, k_BI, k_BC,
         E_open, E_inter, E_closed, P_O0, P_C0, f_dyn, E_static]
        Default: None
    n_starts (int): Number of multi-starts for fitting. Default: 5
    n_jobs (int): Number of parallel workers for the multi-starts. Default: 1

Returns:

    fit (Hsp90Fit3State): Fitted model.

Raises:

    RuntimeError: If no valid data is available for fitting.
Source code in pipeline.py
def fit_global_3state(
    t: np.ndarray,
    E_mat: np.ndarray,
    theta0: np.ndarray | None = None,
    n_starts: int = 5,
    n_jobs: int = 1,
) -> Hsp90Fit3State:
    """
    Fit the 3-state + bleaching model plus static fraction to the ensemble mean.
    Uses a small multi-start strategy around theta0 to avoid bad local minima.

    Parameters
    ----------
    t : (T,) array
        Time grid.
    E_mat : (T, N) array
        FRET trajectories matrix.
    theta0 : (14,) array, optional
        Initial guess for parameters:
        [k_OI, k_IO, k_IC, k_CI, k_BO,
            k_BI, k_BC,
            E_open, E_inter, E_closed,
            P_O0, P_C0,
            f_dyn, E_static]
    n_starts : int
        Number of multi-starts for fitting (default: 5).
    n_jobs : int
        Number of parallel workers for the multi-starts (default: 1).
    Returns
    -------
    fit : Hsp90Fit3State
        Fitted model.
    Raises
    ------
    RuntimeError
        If no valid data is available for fitting.
    """
    row_valid = np.isfinite(E_mat).any(axis=1)
    t_fit = t[row_valid]
    E_mean = np.nanmean(E_mat[row_valid, :], axis=1)

    mask = np.isfinite(E_mean)
    t_fit = t_fit[mask]
    E_fit = E_mean[mask]

    # compute timewise SD to use as weights (sigma)
    E_std_all = np.nanstd(E_mat[row_valid, :], axis=1)
    sigma = E_std_all[mask]
    # avoid zeros
    sigma = np.where(np.isfinite(sigma) & (sigma > 1e-6), sigma, 1e-6)

    if t_fit.size == 0:
        raise RuntimeError("No valid data for fitting.")

    # 14 parameters: 7 rates + 3 FRET + 2 initials + 2 static
    if theta0 is None:
        theta0 = np.array(
            [
                0.01,
                0.01,
                0.01,
                0.01,  # k_OI, k_IO, k_IC, k_CI
                0.003,
                0.006,
                0.012,  # k_BO, k_BI, k_BC
                0.4,
                0.5,
                0.7,  # E_open, E_inter, E_closed (rough guesses)
                0.35,
                0.55,  # P_O0, P_C0
                0.7,
                0.18,  # f_dyn, E_static
            ],
            dtype=float,
        )

    lower = np.array(
        [
            0.0,
            0.0,
            0.0,
            0.0,  # rates
            0.0,
            0.0,
            0.0,  # bleaching
            0.0,
            0.0,
            0.0,  # FRET in [0,1]
            0.0,
            0.0,  # P_O0, P_C0
            0.0,
            0.0,  # f_dyn, E_static
        ],
        dtype=float,
    )

    upper = np.array(
        [
            10.0,
            10.0,
            10.0,
            10.0,
            2.0,
            2.0,
            2.0,  # bleaching slower-ish
            1.0,
            1.0,
            1.0,  # FRET ≤ 1
            1.0,
            1.0,
            1.0,
            1.0,
        ],
        dtype=float,
    )

    def fret_wrapper_3s(
        t_in: NDArray[np.float64],
        k_oi: float,
        k_io: float,
        k_ic: float,
        k_ci: float,
        k_bo: float,
        k_bi: float,
        k_bc: float,
        e_o: float,
        e_i: float,
        e_c: float,
        p_o0: float,
        p_c0: float,
        f_dyn: float,
        e_static: float,
    ) -> NDArray[np.float64]:
        params = Hsp90Params3State(
            k_OI=k_oi,
            k_IO=k_io,
            k_IC=k_ic,
            k_CI=k_ci,
            k_BO=k_bo,
            k_BI=k_bi,
            k_BC=k_bc,
            E_open=e_o,
            E_inter=e_i,
            E_closed=e_c,
            P_O0=p_o0,
            P_C0=p_c0,
        )
        E_dyn = model_fret_3state(t_in, params)
        return f_dyn * E_dyn + (1.0 - f_dyn) * e_static

    # ------------------------------------------------------------------
    # Multi-start around theta0 to avoid local minima
    # ------------------------------------------------------------------
    rng = np.random.default_rng(0)

    # Precompute all starting points deterministically
    start_configs: list[tuple[int, np.ndarray, str]] = []
    for s in range(n_starts):
        if s == 0:
            theta_start = theta0.copy()
            kind = "base"
        else:
            jitter = 1.0 + 0.3 * rng.normal(size=theta0.size)
            theta_start = theta0 * jitter
            theta_start = np.clip(theta_start, lower + 1e-8, upper - 1e-8)
            kind = "jitter"
        start_configs.append((s, theta_start, kind))

    def _single_start_worker(cfg: tuple[int, np.ndarray, str]) -> dict:
        """
        Run one curve_fit multi-start and return diagnostics.

        Parameters
        ----------
        cfg : tuple[int, ndarray, str]
            (start_index, theta_start, kind)
        Returns
        -------
        dict
            Dictionary with fit result and diagnostics.
        """
        s, theta_start, kind = cfg
        try:
            res_obj = cast(
                "object",
                curve_fit(
                    fret_wrapper_3s,
                    t_fit,
                    E_fit,
                    p0=theta_start,
                    bounds=(lower, upper),
                    sigma=sigma,
                    absolute_sigma=False,
                    maxfev=20000,
                ),
            )
            popt, pcov = cast(
                "tuple[NDArray[np.float64], NDArray[np.float64]]", res_obj
            )
        except Exception as e:
            return {
                "s": s,
                "kind": kind,
                "ok": False,
                "msg": str(e),
                "popt": None,
                "cost": np.inf,
                "rmse": np.inf,
            }

        # Evaluate fit quality
        E_model = fret_wrapper_3s(t_fit, *popt)
        mask_obj = np.isfinite(E_fit) & np.isfinite(E_model)
        if not np.any(mask_obj):
            return {
                "s": s,
                "kind": kind,
                "ok": False,
                "msg": "no valid points",
                "popt": None,
                "cost": np.inf,
                "rmse": np.inf,
            }

        resid = E_fit[mask_obj] - E_model[mask_obj]
        cost = float(np.mean(resid**2))
        rmse = float(np.sqrt(cost))

        return {
            "s": s,
            "kind": kind,
            "ok": True,
            "msg": "",
            "popt": popt,
            "cost": cost,
            "rmse": rmse,
        }

    results = Parallel(n_jobs=n_jobs, verbose=0)(
        delayed(_single_start_worker)(cfg) for cfg in start_configs
    )

    # Select best result
    best_popt: NDArray[np.float64] | None = None
    best_cost = np.inf

    for res in results:
        s = res["s"]
        kind = res["kind"]
        if not res["ok"]:
            msg = res["msg"]
            logger.info(
                f"[fit_3state] multi-start {s + 1}/{n_starts} ({kind}) failed: {msg}"
            )
            continue

        cost = res["cost"]
        rmse = res["rmse"]
        logger.info(
            f"[fit_3state] multi-start {s + 1}/{n_starts} ({kind}): RMSE = {rmse:.6f}"
        )

        if cost < best_cost:
            best_cost = cost
            best_popt = res["popt"]

    if best_popt is None:
        raise RuntimeError("fit_global_3state: all multi-start attempts failed.")

    logger.info(f"[fit_3state] selected solution with RMSE = {np.sqrt(best_cost):.6f}")
    popt = best_popt

    (
        k_oi,
        k_io,
        k_ic,
        k_ci,
        k_bo,
        k_bi,
        k_bc,
        e_o,
        e_i,
        e_c,
        p_o0,
        p_c0,
        f_dyn,
        e_static,
    ) = popt

    params = Hsp90Params3State(
        k_OI=float(k_oi),
        k_IO=float(k_io),
        k_IC=float(k_ic),
        k_CI=float(k_ci),
        k_BO=float(k_bo),
        k_BI=float(k_bi),
        k_BC=float(k_bc),
        E_open=float(e_o),
        E_inter=float(e_i),
        E_closed=float(e_c),
        P_O0=float(p_o0),
        P_C0=float(p_c0),
    )

    return Hsp90Fit3State(params=params, f_dyn=float(f_dyn), E_static=float(e_static))
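
A minimal usage sketch (mirroring the call in main() further below; the path and settings are illustrative, not required values):

# Hypothetical driver code for fit_global_3state.
from pathlib import Path

t, E_mat, col_names = load_combined_matrix(Path("data/timeseries/fret_matrix.csv"))
fit_hat = fit_global_3state(t, E_mat, n_starts=8, n_jobs=4)

# Hsp90Fit3State bundles the kinetic parameters with the static-fraction terms.
print(fit_hat.params.k_OI, fit_hat.f_dyn, fit_hat.E_static)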

Goodness-of-fit plot based on ensemble-averaged FRET. Plots mean observed FRET vs model FRET at each time point.

Parameters:

Name    Type            Description                         Default
t       (T,) array      Time grid.                          required
E_mat   (T, N) array    FRET trajectories matrix.           required
fit     Hsp90Fit3State  Fitted model.                       required
outdir  Path            Output directory to save the plot.  required

Returns:

None

Notes:

Compute the ensemble mean at each time point.
Evaluate the model at those time points.
Plot the observed mean vs the model prediction with a y = x reference line.
Compute and log RMSE and R^2.
Source code in pipeline.py
def plot_ensemble_fit(
    t: np.ndarray, E_mat: np.ndarray, fit: Hsp90Fit3State, outdir: Path
) -> None:
    """
    Goodness-of-fit plot based on ensemble-averaged FRET.
    Plots mean observed FRET vs model FRET at each time point.

    Parameters
    ----------
    t : (T,) array
        Time grid.
    E_mat : (T, N) array
        FRET trajectories matrix.
    fit : Hsp90Fit3State
        Fitted model.
    outdir : Path
        Output directory to save the plot.

    Returns
    -------
    None

    Notes
    -----
    Compute the ensemble mean at each time point.
    Evaluate the model at those time points.
    Plot the observed mean vs the model prediction with a y = x reference line.
    Compute and log RMSE and R^2.
    """
    row_valid = np.isfinite(E_mat).any(axis=1)
    t_plot = t[row_valid]
    E_plot = E_mat[row_valid, :]

    if t_plot.size == 0:
        raise RuntimeError("No valid time points for ensemble fit plot.")

    E_mean = np.nanmean(E_plot, axis=1)

    E_model = model_total_fret(t_plot, fit)

    mask = np.isfinite(E_mean) & np.isfinite(E_model)
    E_obs = E_mean[mask]
    E_mod = E_model[mask]

    if E_obs.size == 0:
        raise RuntimeError("No valid (mean, model) pairs for goodness-of-fit plot.")

    residuals = E_obs - E_mod
    rmse = np.sqrt(np.mean(residuals**2))
    ss_res = np.sum(residuals**2)
    ss_tot = np.sum((E_obs - E_obs.mean()) ** 2)
    r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else np.nan

    logger.info(f"Ensemble RMSE (mean data - model): {rmse:.6f}")
    logger.info(f"Ensemble R^2: {r2:.6f}")

    xmin = np.min(E_mod)
    xmax = np.max(E_mod)
    ymin = np.min(E_obs)
    ymax = np.max(E_obs)
    lo = min(xmin, ymin)
    hi = max(xmax, ymax)

    plt.figure(figsize=(8, 8))
    plt.scatter(E_mod, E_obs, s=20, alpha=0.8, label="Time points (ensemble mean)")
    plt.plot([lo, hi], [lo, hi], "k--", lw=1.5, label="y = x")

    plt.xlabel("Model FRET (ensemble)")
    plt.ylabel("Observed FRET (ensemble mean)")
    plt.title(
        f"Goodness of fit (ensemble mean vs model)\nRMSE = {rmse:.4f}, R^2 = {r2:.4f}"
    )
    plt.xlim(lo, hi)
    plt.ylim(lo, hi)
    plt.gca().set_aspect("equal", adjustable="box")
    plt.legend(loc="best")
    plt.tight_layout()

    plt.savefig(outdir / "ensemble_fit.png", dpi=300)
    plt.close()
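
Example call, as used for the global diagnostics in main() (t, E_mat, and fit_hat are assumed to come from a prior load and fit):

from pathlib import Path

outdir = Path("results")  # illustrative output directory
outdir.mkdir(parents=True, exist_ok=True)
plot_ensemble_fit(t, E_mat, fit_hat, outdir)  # writes ensemble_fit.png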

Plot Hsp90 3-state + bleaching + static fraction vs data across time.

Parameters:

Name              Type            Description                                         Default
t                 (T,) array      Time grid.                                          required
E_mat             (T, N) array    FRET trajectories matrix.                           required
fit               Hsp90Fit3State  Fitted model.                                       required
n_traces_overlay  int             Number of individual trajectories to overlay.      200
random_seed       int             Random seed for trajectory selection.              0
condition_key     str or None     Label used in the plot title and output filename.  None
outdir            Path            Output directory to save the plot.                  outdir

Returns:

None

Notes:

Compute the ensemble mean and std dev at each time point.
Evaluate the model at those time points.
Plot individual trajectories (subset), mean ± std dev, and the model.
Source code in pipeline.py
def plot_hsp90_fit_time(
    t: np.ndarray,
    E_mat: np.ndarray,
    fit: Hsp90Fit3State,
    n_traces_overlay: int = 200,
    random_seed: int = 0,
    condition_key: str | None = None,
    outdir: Path = outdir,
) -> None:
    """
    Plot Hsp90 3-state + bleaching + static fraction vs data across time.

    Parameters
    ----------
    t : (T,) array
        Time grid.
    E_mat : (T, N) array
        FRET trajectories matrix.
    fit : Hsp90Fit3State
        Fitted model.
    n_traces_overlay : int
        Number of individual trajectories to overlay (default: 200).
    random_seed : int
        Random seed for trajectory selection.
    condition_key : str or None
        Label used in the plot title and output filename.
    outdir : Path
        Output directory to save the plot.

    Returns
    -------
    None

    Notes
    -----
    Compute the ensemble mean and std dev at each time point.
    Evaluate the model at those time points.
    Plot individual trajectories (subset), mean ± std dev, and the model.
    """
    row_valid = np.isfinite(E_mat).any(axis=1)
    t_plot = t[row_valid]
    E_plot = E_mat[row_valid, :]

    E_mean = np.nanmean(E_plot, axis=1)
    E_std = np.nanstd(E_plot, axis=1)

    E_model = model_total_fret(t_plot, fit)

    n_traj_total = E_plot.shape[1]
    if n_traces_overlay > 0 and n_traj_total > 0:
        n_traces_overlay = min(n_traces_overlay, n_traj_total)
        rng = np.random.default_rng(random_seed)
        idx = rng.choice(n_traj_total, size=n_traces_overlay, replace=False)
        E_subset = E_plot[:, idx]
    else:
        E_subset = None

    fig, ax = plt.subplots(figsize=(8, 8))

    if E_subset is not None:
        for j in range(E_subset.shape[1]):
            ax.plot(t_plot, E_subset[:, j], color="gray", alpha=0.05, lw=0.5)

    ax.plot(t_plot, E_mean, color="tab:blue", lw=2, label="Data Mean")
    ax.fill_between(
        t_plot,
        E_mean - E_std,
        E_mean + E_std,
        color="tab:blue",
        alpha=0.2,
        label="Data ±1 SD",
    )

    ax.plot(t_plot, E_model, color="tab:red", lw=2, label="Model")

    ax.set_xlabel("time (s)")
    ax.set_ylabel("FRET")
    ax.set_title(
        f"Model Fit - {condition_key}\n"
        f"{E_plot.shape[1]} trajectories, {len(t_plot)} time points"
    )
    ax.legend(loc="best")
    plt.tight_layout()

    plt.savefig(outdir / f"{condition_key}_fit.png", dpi=300)
    plt.close()
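
Usage sketch matching the global-fit call in main(); note that condition_key is used verbatim in both the title and the output filename:

plot_hsp90_fit_time(
    t,
    E_mat,
    fit_hat,
    n_traces_overlay=200,
    condition_key="Global Fit",
    outdir=outdir,
)  # writes "Global Fit_fit.png"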

Perform bootstrap resampling and fitting for a single condition.

Parameters:

Name         Type                                  Description                                                  Default
t            (T,) array                            Global time grid.                                            required
E_mat        (T, N) array                          Full combined FRET matrix.                                   required
col_names    list of str                           Names of trajectory columns (same order as E_mat columns).   required
meta         DataFrame                             Metadata DataFrame parsed from column names.                 required
group_key    str                                   Condition key to bootstrap.                                  required
group_by     {"condition", "construct", "exp_id"}  How to group trajectories.                                   "condition"
n_boot       int                                   Number of bootstrap replicates.                              100
random_seed  int                                   Base random seed for resampling.                             0
n_jobs       int                                   Number of parallel jobs.                                     4

Returns:

DataFrame with one row per bootstrap replicate and fitted parameters.

Notes:

1. Identify columns for the specified condition.
2. Subset the data matrix for this condition.
3. Perform parallel bootstrap fitting.
4. Collect and return results.
Source code in pipeline.py
def bootstrap_condition_params(
    t: np.ndarray,
    E_mat: np.ndarray,
    col_names: list[str],
    meta: pd.DataFrame,
    group_key: str,
    group_by: str = "condition",
    n_boot: int = 100,
    random_seed: int = 0,
    n_jobs: int = 4,
) -> pd.DataFrame:
    """
    Perform bootstrap resampling and fitting for a single condition.

    Parameters
    ----------
    t : (T,) array
        Global time grid.
    E_mat : (T, N) array
        Full combined FRET matrix.
    col_names : list of str
        Names of trajectory columns (same order as E_mat columns).
    meta : DataFrame
        Metadata DataFrame parsed from column names.
    group_key : str
        Condition key to bootstrap.
    group_by : {"condition", "construct", "exp_id"}
        How to group trajectories.
    n_boot : int
        Number of bootstrap replicates.
    random_seed : int
        Base random seed for resampling.
    n_jobs : int
        Number of parallel jobs.

    Returns
    -------
    pd.DataFrame
        DataFrame with one row per bootstrap replicate and fitted parameters.

    Notes
    -----
    1. Identify columns for the specified condition.
    2. Subset the data matrix for this condition.
    3. Perform parallel bootstrap fitting.
    4. Collect and return results.
    """
    cols_subset = meta.loc[meta[group_by] == group_key, "col"].tolist()
    if not cols_subset:
        raise ValueError(f"No columns for {group_by}={group_key}")

    # Build submatrix for this condition (once)
    name_to_idx = {c: i for i, c in enumerate(col_names)}
    idx_all = [name_to_idx[c] for c in cols_subset if c in name_to_idx]
    E_full = E_mat[:, idx_all]

    row_valid = np.isfinite(E_full).any(axis=1)
    t_sub = t[row_valid]
    E_sub = E_full[row_valid, :]

    logger.info(f"Starting {n_boot} parallel bootstrap fits for {group_key}...")

    # --- PARALLEL BOOTSTRAP FITTING ---
    # We pass t_sub and E_sub (large arrays) once,
    # and iterate over the unique seeds.
    # raise verbose (e.g. verbose=5) to show a progress bar
    results = Parallel(n_jobs=n_jobs, verbose=0)(
        delayed(_bootstrap_worker)(t_sub, E_sub, random_seed + b) for b in range(n_boot)
    )
    # ------------------------

    # Unpack results, filtering out any failed (None) runs
    records = [res for res in results if res is not None]

    if not records:
        logger.info(f"Warning: All {n_boot} bootstrap fits failed for {group_key}.")
        return pd.DataFrame()

    return pd.DataFrame(records)
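
The _bootstrap_worker helper is defined elsewhere in pipeline.py. Based on how it is called here, a minimal sketch would resample trajectory columns with replacement, refit, and return the fitted parameters as a dict (or None on failure, which the caller filters out). The function below is an assumption about its shape, not the actual implementation:

import numpy as np

def _bootstrap_worker_sketch(
    t_sub: np.ndarray, E_sub: np.ndarray, seed: int
) -> dict | None:
    """Hypothetical worker: resample columns, refit, return a parameter record."""
    rng = np.random.default_rng(seed)
    n = E_sub.shape[1]
    idx = rng.choice(n, size=n, replace=True)  # resample trajectories
    try:
        fit = fit_global_3state(t_sub, E_sub[:, idx], n_starts=1, n_jobs=1)
    except Exception:
        return None  # failed replicates are dropped by the caller
    p = fit.params
    return {
        "k_OI": p.k_OI, "k_IO": p.k_IO, "k_IC": p.k_IC, "k_CI": p.k_CI,
        "k_BO": p.k_BO, "k_BI": p.k_BI, "k_BC": p.k_BC,
        "E_open": p.E_open, "E_inter": p.E_inter, "E_closed": p.E_closed,
        "P_O0": p.P_O0, "P_C0": p.P_C0,
        "f_dyn": fit.f_dyn, "E_static": fit.E_static, "seed": seed,
    }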

Plot bootstrap 95% confidence intervals for a given parameter across all conditions.

Parameters:

Name          Type       Description                                             Default
boot_summary  DataFrame  DataFrame with bootstrap summary statistics.            required
param         str        Parameter name to plot (e.g., "k_OI", "E_open", etc.).  required
title_suffix  str        Suffix to add to the plot title.                        ''
outdir        Path       Output directory to save the plot.                      outdir

Returns:

None

Notes:

1. Filter the bootstrap summary for the specified parameter.
2. Create an error bar plot of mean ± 95% CI across conditions.
Source code in pipeline.py
def plot_bootstrap_errorbars_all_conditions(
    boot_summary: pd.DataFrame,
    param: str,
    title_suffix: str = "",
    outdir: Path = outdir,
) -> None:
    """
    Plot bootstrap 95% confidence intervals for a given parameter across all conditions.

    Parameters
    ----------
    boot_summary : DataFrame
        DataFrame with bootstrap summary statistics.
    param : str
        Parameter name to plot (e.g., "k_OI", "E_open", etc.).
    title_suffix : str
        Suffix to add to the plot title.
    outdir : Path
        Output directory to save the plot.

    Returns
    -------
    None

    Notes
    -----
    1. Filter the bootstrap summary for the specified parameter.
    2. Create an error bar plot of mean ± 95% CI across conditions.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    df = boot_summary[boot_summary["param"] == param].copy()
    if df.empty:
        logger.info(f"No bootstrap summary for param={param}")
        return

    df = df.sort_values("group_key")
    x = np.arange(len(df))
    y = df["mean"].values
    yerr_lower = y - df["lo"].values
    yerr_upper = df["hi"].values - y
    yerr = np.vstack([yerr_lower, yerr_upper])

    plt.figure(figsize=(8, 4))
    plt.errorbar(
        x,
        y,
        yerr=yerr,
        fmt="o",
        capsize=4,
        linewidth=1.5,
    )
    plt.xticks(x, df["group_key"], rotation=45, ha="right")
    plt.ylabel(param)
    plt.title(f"Bootstrap 95% CI across conditions{title_suffix}")
    plt.tight_layout()

    plt.savefig(outdir / f"bootstrap_ci_{param}.png", dpi=300)
    plt.close()
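
Usage sketch (as in main(), one call per parameter of interest; boot_summary must carry the "param", "group_key", "mean", "lo", and "hi" columns built there):

for p in ["k_OI", "k_IC", "f_dyn", "E_closed"]:
    plot_bootstrap_errorbars_all_conditions(
        boot_summary, param=p, title_suffix=" (condition-level)"
    )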

Perform Sobol sensitivity analysis on the 3-state + bleaching + static model.

Parameters:

Name            Type          Description                                             Default
t               (T,) array    Time grid.                                              required
E_mat           (T, N) array  FRET trajectories matrix.                               required
param_bounds    dict          Maps parameter names to (min, max) sampling bounds,    required
                              e.g. {"k_OI": (0.001, 1.0), "k_IO": (0.001, 1.0), ...}.
n_base_samples  int           Number of base samples for Saltelli sampling.           512
n_jobs          int           Number of parallel jobs for model evaluations.          1

Returns:

Sobol sensitivity indices dictionary from SALib.

Notes:

1. Prepare data: compute the ensemble mean FRET for the objective.
2. Define the SALib problem with parameter names and bounds.
3. Generate Saltelli samples.
4. Evaluate the model for each sample and compute the RMSE objective.
5. Perform Sobol analysis (failed evaluations are imputed with the mean).
Source code in pipeline.py
def sobol_sensitivity_3state(
    t: np.ndarray,
    E_mat: np.ndarray,
    param_bounds: dict[str, tuple[float, float]],
    n_base_samples: int = 512,
    n_jobs: int = 1,
) -> dict:
    """
    Perform Sobol sensitivity analysis on the 3-state + bleaching + static model.

    Parameters
    ----------
    t : (T,) array
        Time grid.
    E_mat : (T, N) array
        FRET trajectories matrix.
    param_bounds : dict
        Dictionary mapping parameter names to (min, max) bounds for sampling.
        Example:
        {
            "k_OI": (0.001, 1.0),
            "k_IO": (0.001, 1.0),
            ...
        }
    n_base_samples : int
        Number of base samples for Saltelli sampling (default: 512).
    n_jobs : int
        Number of parallel jobs for model evaluations (default: 1).

    Returns
    -------
    dict
        Sobol sensitivity indices dictionary from SALib.

    Notes
    -----
    1. Prepare data: compute the ensemble mean FRET for the objective.
    2. Define the SALib problem with parameter names and bounds.
    3. Generate Saltelli samples.
    4. Evaluate the model for each sample and compute the RMSE objective.
    5. Perform Sobol analysis (failed evaluations are imputed with the mean).
    """

    # ---- 1. Prepare data: ensemble mean for objective -----------------
    row_valid = np.isfinite(E_mat).any(axis=1)
    t_fit = t[row_valid]
    E_mean = np.nanmean(E_mat[row_valid, :], axis=1)

    mask = np.isfinite(E_mean)
    t_fit = t_fit[mask]
    E_fit = E_mean[mask]

    if t_fit.size == 0:
        raise RuntimeError("No valid data for sensitivity analysis.")

    # ---- 2. Define SALib problem --------------------------------------
    # Order of parameters
    names = list(param_bounds.keys())
    bounds = [param_bounds[n] for n in names]

    problem = {
        "num_vars": len(names),
        "names": names,
        "bounds": bounds,
    }

    # ---- 3. Generate samples --------------------------------
    param_values = sobol_sample.sample(
        problem, N=n_base_samples, calc_second_order=True
    )

    # ---- 4. Evaluate model for each sample -------

    def _eval_one(theta: np.ndarray) -> float:
        # Map sample vector -> parameter dict
        p_dict = dict(zip(names, theta, strict=False))

        # Defaults if not varied
        k_OI = p_dict.get("k_OI", 0.08)
        k_IO = p_dict.get("k_IO", 0.01)
        k_IC = p_dict.get("k_IC", 0.08)
        k_CI = p_dict.get("k_CI", 0.01)

        k_BO = p_dict.get("k_BO", 0.003)
        k_BI = p_dict.get("k_BI", 0.006)
        k_BC = p_dict.get("k_BC", 0.012)

        E_open = p_dict.get("E_open", 0.35)
        E_inter = p_dict.get("E_inter", 0.55)
        E_closed = p_dict.get("E_closed", 0.75)

        P_O0 = p_dict.get("P_O0", 0.4)
        P_C0 = p_dict.get("P_C0", 0.5)

        f_dyn = p_dict.get("f_dyn", 0.7)
        E_static = p_dict.get("E_static", 0.2)

        params = Hsp90Params3State(
            k_OI=k_OI,
            k_IO=k_IO,
            k_IC=k_IC,
            k_CI=k_CI,
            k_BO=k_BO,
            k_BI=k_BI,
            k_BC=k_BC,
            E_open=E_open,
            E_inter=E_inter,
            E_closed=E_closed,
            P_O0=P_O0,
            P_C0=P_C0,
        )

        fit = Hsp90Fit3State(params=params, f_dyn=f_dyn, E_static=E_static)

        try:
            E_model = model_total_fret(t_fit, fit)
            mask_obj = np.isfinite(E_model) & np.isfinite(E_fit)
            if not np.any(mask_obj):
                return np.nan

            r = E_fit[mask_obj] - E_model[mask_obj]
            rmse = np.sqrt(np.mean(r**2))
            return float(rmse)
        except Exception:
            return np.nan

    Y = np.array(
        Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_eval_one)(theta) for theta in param_values
        ),
        dtype=float,
    )

    # ---- 5. Sobol analysis --------------------------------------------
    # sobol.analyze expects Y to preserve the full Saltelli sample
    # structure, so failed evaluations are imputed with the mean RMSE
    # rather than dropped (dropping rows would break the sample pairing).
    valid_mask = np.isfinite(Y)
    if not np.any(valid_mask):
        raise RuntimeError("All Sobol evaluations failed or returned NaN.")

    n_valid = int(valid_mask.sum())
    logger.info(f"Sensitivity analysis: {n_valid}/{len(Y)} evaluations valid.")
    Y[~valid_mask] = Y[valid_mask].mean()

    Si = sobol.analyze(
        problem,
        Y,
        calc_second_order=True,
        print_to_console=False,
        parallel=True,
        n_processors=n_jobs,
    )

    # Attach parameter names for convenience
    Si["names"] = names
    return Si
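
First-order indices (S1) measure the output variance attributable to each parameter alone; total-order indices (ST) also include interaction effects, so ST >= S1 up to estimation noise. A short sketch ranking parameters by ST, with Si as returned by sobol_sensitivity_3state:

import pandas as pd

ranking = (
    pd.DataFrame({"param": Si["names"], "S1": Si["S1"], "ST": Si["ST"]})
    .sort_values("ST", ascending=False)
    .reset_index(drop=True)
)
print(ranking.head())  # most influential parameters first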

Perform Sobol sensitivity analysis for a specific group/condition.

Parameters:

Name            Type                                  Description                                                  Default
t               (T,) array                            Global time grid.                                            required
E_mat           (T, N) array                          Full combined FRET matrix.                                   required
col_names       list of str                           Names of trajectory columns (same order as E_mat columns).   required
meta            DataFrame                             Metadata DataFrame parsed from column names.                 required
group_key       str                                   Condition key to analyze.                                    required
group_by        {"condition", "construct", "exp_id"}  How to group trajectories.                                   required
param_bounds    dict                                  Maps parameter names to (min, max) sampling bounds.          required
n_base_samples  int                                   Number of base samples for Saltelli sampling.                512
n_jobs          int                                   Number of parallel jobs for model evaluations.               1

Returns:

DataFrame with Sobol sensitivity indices for the group.

Notes:

1. Identify columns for the specified group.
2. Subset the data matrix for this group.
3. Perform Sobol sensitivity analysis.
4. Pack results into a DataFrame.
Source code in pipeline.py
def sobol_sensitivity_for_group(
    t: np.ndarray,
    E_mat: np.ndarray,
    col_names: list[str],
    meta: pd.DataFrame,
    group_key: str,
    group_by: str,
    param_bounds: dict[str, tuple[float, float]],
    n_base_samples: int = 512,
    n_jobs: int = 1,
) -> pd.DataFrame:
    """
    Perform Sobol sensitivity analysis for a specific group/condition.

    Parameters
    ----------
    t : (T,) array
        Global time grid.
    E_mat : (T, N) array
        Full combined FRET matrix.
    col_names : list of str
        Names of trajectory columns (same order as E_mat columns).
    meta : DataFrame
        Metadata DataFrame parsed from column names.
    group_key : str
        Condition key to analyze.
    group_by : {"condition", "construct", "exp_id"}
        How to group trajectories.
    param_bounds : dict
        Dictionary mapping parameter names to (min, max) bounds for sampling.
    n_base_samples : int
        Number of base samples for Saltelli sampling (default: 512).
    n_jobs : int
        Number of parallel jobs for model evaluations (default: 1).

    Returns
    -------
    pd.DataFrame
        DataFrame with Sobol sensitivity indices for the group.

    Notes
    -----
    1. Identify columns for the specified group.
    2. Subset the data matrix for this group.
    3. Perform Sobol sensitivity analysis.
    4. Pack results into a DataFrame.
    """
    cols_subset = meta.loc[meta[group_by] == group_key, "col"].tolist()
    if not cols_subset:
        raise ValueError(f"No columns found for {group_by}={group_key!r}")

    # subset matrix
    t_sub, E_sub = subset_matrix_by_columns(t, E_mat, col_names, cols_subset)

    logger.info(
        f"Running Sobol SA for {group_by}={group_key}, "
        f"{E_sub.shape[1]} traj, {E_sub.shape[0]} time points"
    )

    Si = sobol_sensitivity_3state(
        t_sub,
        E_sub,
        param_bounds=param_bounds,
        n_base_samples=n_base_samples,
        n_jobs=n_jobs,
    )

    # pack into a nice DataFrame
    df_sa = pd.DataFrame(
        {
            "param": Si["names"],
            "S1": Si["S1"],
            "S1_conf": Si["S1_conf"],
            "ST": Si["ST"],
            "ST_conf": Si["ST_conf"],
        }
    )
    df_sa.insert(0, "group_key", group_key)
    df_sa.insert(0, "group_by", group_by)

    return df_sa

Plot a fitted parameter vs condition.

Parameters:

Name        Type       Description                                      Default
summary_df  DataFrame  DataFrame with fitted parameters per condition.  required
param       str        Parameter name to plot.                          required
outdir      Path       Output directory to save the plot.               required

Returns:

None
Source code in pipeline.py
def plot_param_vs_condition(
    summary_df: pd.DataFrame, param: str, outdir: Path
) -> None:
    """
    Plot a fitted parameter vs condition.

    Parameters
    ----------
    summary_df : DataFrame
        DataFrame with fitted parameters per condition.
    param : str
        Parameter name to plot.
    outdir : Path
        Output directory to save the plot.

    Returns
    -------
    None
    """
    plt.figure(figsize=(8, 8))
    plt.plot(summary_df["group_key"], summary_df[param], "o-")
    plt.xticks(rotation=90)
    plt.ylabel(param)
    plt.tight_layout()

    plt.savefig(outdir / f"{param}_vs_condition.png", dpi=300)
    plt.close()

Summarize bootstrap results for a given parameter.

Parameters:

Name     Type       Description                        Default
boot_df  DataFrame  DataFrame with bootstrap results.  required
name     str        Parameter name to summarize.       required

Returns:

tuple of (mean, 2.5th percentile, 97.5th percentile)
Source code in pipeline.py
def summarize_bootstrap(
    boot_df: pd.DataFrame, name: str
) -> tuple[float, float, float]:
    """
    Summarize bootstrap results for a given parameter.

    Parameters
    ----------
    boot_df : DataFrame
        DataFrame with bootstrap results.
    name : str
        Parameter name to summarize.

    Returns
    -------
    tuple
        (mean, 2.5th percentile, 97.5th percentile)
    """
    vals = boot_df[name].values
    mean = np.nanmean(vals)
    lo, hi = np.nanpercentile(vals, [2.5, 97.5])
    return mean, lo, hi
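
A self-contained example with synthetic bootstrap draws:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
boot_df = pd.DataFrame({"k_OI": rng.normal(0.08, 0.01, size=500)})
mean, lo, hi = summarize_bootstrap(boot_df, "k_OI")
print(f"k_OI = {mean:.3f} (95% CI {lo:.3f}-{hi:.3f})")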

Plot comparison of bootstrap distributions for a given parameter.

Parameters:

Name     Type       Description                         Default
boot_A   DataFrame  Bootstrap results for condition A.  required
boot_B   DataFrame  Bootstrap results for condition B.  required
param    str        Parameter name to plot.             required
label_A  str        Label for condition A.              required
label_B  str        Label for condition B.              required
outdir   Path       Output directory to save the plot.  required

Returns:

None
Source code in pipeline.py
def plot_bootstrap_compare(
    boot_A: pd.DataFrame,
    boot_B: pd.DataFrame,
    param: str,
    label_A: str,
    label_B: str,
    outdir: Path,
) -> None:
    """
    Plot comparison of bootstrap distributions for a given parameter.

    Parameters
    ----------
    boot_A : DataFrame
        Bootstrap results for condition A.
    boot_B : DataFrame
        Bootstrap results for condition B.
    param : str
        Parameter name to plot.
    label_A : str
        Label for condition A.
    label_B : str
        Label for condition B.
    outdir : Path
        Output directory to save the plot.

    Returns
    -------
    None
    """
    A_vals = boot_A[param].values
    B_vals = boot_B[param].values

    plt.figure(figsize=(8, 8))
    plt.hist(A_vals, bins=30, alpha=0.5, density=True, label=label_A)
    plt.hist(B_vals, bins=30, alpha=0.5, density=True, label=label_B)
    plt.xlabel(param)
    plt.ylabel("density")
    plt.legend()
    plt.tight_layout()

    plt.savefig(outdir / f"bootstrap_compare_{param}.png", dpi=300)
    plt.close()
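
Usage sketch, assuming boot_raw is the per-condition dict of bootstrap DataFrames built in main(); the condition keys shown are placeholders:

plot_bootstrap_compare(
    boot_raw["wt_apo"],  # hypothetical condition keys
    boot_raw["wt_atp"],
    param="k_OI",
    label_A="wt_apo",
    label_B="wt_atp",
    outdir=outdir,
)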

Plot residuals (mean data - model) over time for ensemble fit.

Parameters:

Name    Type            Description                                    Default
t       (T,) array      Global time grid.                              required
E_mat   (T, N) array    Full combined FRET matrix.                     required
fit     Hsp90Fit3State  Fitted model to use for residual calculation.  required
outdir  Path            Output directory to save the plot.             required

Returns:

None
Source code in pipeline.py
def plot_residuals_over_time(
    t: np.ndarray, E_mat: np.ndarray, fit: Hsp90Fit3State, outdir: Path
) -> None:
    """
    Plot residuals (mean data - model) over time for ensemble fit.

    Parameters
    ----------
    t : (T,) array
        Global time grid.
    E_mat : (T, N) array
        Full combined FRET matrix.
    fit : Hsp90Fit3State
        Fitted model to use for residual calculation.
    outdir : Path
        Output directory to save the plot.

    Returns
    -------
    None
    """
    row_valid = np.isfinite(E_mat).any(axis=1)
    t_plot = t[row_valid]
    E_mean = np.nanmean(E_mat[row_valid, :], axis=1)
    E_model = model_total_fret(t_plot, fit)
    r = E_mean - E_model
    plt.figure(figsize=(8, 8))
    plt.plot(t_plot, r, lw=1)
    plt.axhline(0, ls="--", c="k", lw=1)
    plt.xlabel("time (s)")
    plt.ylabel("residual (mean - model)")
    plt.title("Ensemble residuals over time")
    plt.tight_layout()

    plt.savefig(outdir / "ensemble_residuals_time.png", dpi=300)
    plt.close()

Run the full smFRET kinetic analysis workflow.

The routine loads the combined trajectory matrix, performs global and grouped model fitting, generates diagnostic plots, computes Sobol sensitivity indices, and runs bootstrap summaries for conditions and constructs.

Returns:

None
Source code in pipeline.py
def main() -> None:
    """Run the full smFRET kinetic analysis workflow.

    The routine loads the combined trajectory matrix, performs global and grouped
    model fitting, generates diagnostic plots, computes Sobol sensitivity indices,
    and runs bootstrap summaries for conditions and constructs.

    Returns
    -------
    None
    """
    logger.info(f"[bold green]Using outdir:[/bold green] {outdir.resolve()}")

    combined_path = Path("data/timeseries/fret_matrix.csv")
    t, E_mat, col_names = load_combined_matrix(combined_path)

    logger.info(
        f"Loaded combined matrix: {E_mat.shape[0]} time points, {E_mat.shape[1]} trajectories"
    )

    # Global fit
    logger.info("[bold magenta]\n=== Global fit ===[/bold magenta]")
    fit_hat = fit_global_3state(t, E_mat, n_starts=args.multistarts, n_jobs=args.cores)
    df_fit_hat = fit_to_df(fit_hat)
    log_df(df_fit_hat, title="Best-fit parameters (global)")
    df_fit_hat.to_csv(outdir / "best_params.csv", index=False)

    # Global diagnostics
    plot_ensemble_fit(t, E_mat, fit_hat, outdir)
    plot_hsp90_fit_time(
        t,
        E_mat,
        fit_hat,
        n_traces_overlay=200,
        condition_key="Global Fit",
        outdir=outdir,
    )
    plot_residuals_over_time(t, E_mat, fit_hat, outdir)

    # per-condition fits
    # 1) Per coverslip/day (construct + exp_id)
    logger.info(
        "[bold magenta]\n=== Per-condition fits (construct + exp_id) ===[/bold magenta]"
    )
    summary_cond, fits_cond = fit_all_conditions(
        t,
        E_mat,
        col_names,
        group_by="condition",  # "<construct>_<exp_id>"
        do_plots=True,
        max_overlay_traces=100,
        n_starts=args.multistarts,
        n_jobs=args.cores,
    )
    if not summary_cond.empty:
        log_df(summary_cond.sort_values("group_key"), title="Condition-level summary")
        (outdir / "summary_conditions.csv").write_text(summary_cond.to_csv(index=False))
    else:
        logger.info("[bold yellow]No condition-level fits were produced.[/bold yellow]")

    # 2) Per construct (pooling all days), if you want
    logger.info(
        "[bold magenta]\n=== Per-construct fits (pool all exp_id for each construct) ===[/bold magenta]"
    )
    summary_constr, fits_constr = fit_all_conditions(
        t,
        E_mat,
        col_names,
        group_by="construct",
        do_plots=True,
        max_overlay_traces=100,
        n_starts=args.multistarts,
        n_jobs=args.cores,
    )
    if not summary_constr.empty:
        log_df(summary_constr.sort_values("group_key"), title="Construct-level summary")
        (outdir / "summary_constructs.csv").write_text(
            summary_constr.to_csv(index=False)
        )
    else:
        logger.info("[bold yellow]No construct-level fits were produced.[/bold yellow]")

    # parameter-vs-condition plots (skipped when no condition-level fits exist)
    if not summary_cond.empty:
        plot_param_vs_condition(summary_cond, "k_OI", outdir)
        plot_param_vs_condition(summary_cond, "f_dyn", outdir)
        plot_param_vs_condition(summary_cond, "E_closed", outdir)

    meta = parse_column_metadata(col_names)

    param_bounds = {
        "k_OI": (0.0, 10.0),
        "k_IO": (0.0, 10.0),
        "k_IC": (0.0, 10.0),
        "k_CI": (0.0, 10.0),
        "k_BO": (0.0, 2.0),
        "k_BI": (0.0, 2.0),
        "k_BC": (0.0, 2.0),
        "E_open": (0.0, 1.0),
        "E_inter": (0.0, 1.0),
        "E_closed": (0.0, 1.0),
        "P_O0": (0.0, 1.0),
        "P_C0": (0.0, 1.0),
        "f_dyn": (0.0, 1.0),
        "E_static": (0.0, 1.0),
    }

    Si = sobol_sensitivity_3state(
        t, E_mat, param_bounds, n_base_samples=512, n_jobs=args.cores
    )

    # Inspect first-order and total-order indices as a table
    df_sobol = pd.DataFrame(
        {
            "param": Si["names"],
            "S1": Si["S1"],
            "S1_conf": Si["S1_conf"],
            "ST": Si["ST"],
            "ST_conf": Si["ST_conf"],
        }
    )
    log_df(df_sobol, title="Global Sobol sensitivity (all trajectories)")
    df_sobol.to_csv(outdir / "sobol_indices.csv", index=False)

    # === Sobol sensitivity per condition ==============================
    sa_cond_list: list[pd.DataFrame] = []

    for key in summary_cond["group_key"]:
        try:
            df_sa = sobol_sensitivity_for_group(
                t=t,
                E_mat=E_mat,
                col_names=col_names,
                meta=meta,
                group_key=key,
                group_by="condition",
                param_bounds=param_bounds,
                n_base_samples=256,
                n_jobs=args.cores,
            )
            sa_cond_list.append(df_sa)
        except Exception as e:
            logger.info(f"[condition SA] Skipped {key}: {e}")

    if sa_cond_list:
        sa_cond = pd.concat(sa_cond_list, ignore_index=True)
        log_df(sa_cond, title="Sobol sensitivity – per condition")
        # Optionally save:
        sa_cond.to_csv(outdir / "sobol_condition.csv", index=False)
    else:
        logger.info("No per-condition Sobol results.")

    # === Sobol sensitivity per construct ==============================
    sa_constr_list: list[pd.DataFrame] = []

    for key in summary_constr["group_key"]:
        try:
            df_sa = sobol_sensitivity_for_group(
                t=t,
                E_mat=E_mat,
                col_names=col_names,
                meta=meta,
                group_key=key,
                group_by="construct",
                param_bounds=param_bounds,
                n_base_samples=256,
                n_jobs=args.cores,
            )
            sa_constr_list.append(df_sa)
        except Exception as e:
            logger.info(f"[construct SA] Skipped {key}: {e}")

    if sa_constr_list:
        sa_constr = pd.concat(sa_constr_list, ignore_index=True)
        log_df(sa_constr, title="Sobol sensitivity – per construct")
        # Optionally save:
        sa_constr.to_csv(outdir / "sobol_construct.csv", index=False)
    else:
        logger.info("No per-construct Sobol results.")

    # === Bootstrap for ALL conditions ================================
    logger.info("\n=== Bootstrap parameter uncertainty per condition ===")

    boot_records: list[dict] = []
    boot_raw: dict[str, pd.DataFrame] = {}

    # choose which parameters you care about
    params_of_interest = ["k_OI", "k_IC", "f_dyn", "E_closed"]

    for key in summary_cond["group_key"]:
        logger.info(f"\n[bootstrap] condition={key}")
        try:
            boot_df = bootstrap_condition_params(
                t=t,
                E_mat=E_mat,
                col_names=col_names,
                meta=meta,
                group_key=key,
                group_by="condition",
                n_boot=10,  # fixed small replicate count; the construct-level loop below uses args.bootstraps
                random_seed=0,
                n_jobs=args.cores,
            )
        except Exception as e:
            logger.info(f"  bootstrap failed for {key}: {e}")
            continue

        if boot_df.empty:
            logger.info(f"  no successful bootstrap fits for {key}")
            continue

        boot_raw[key] = boot_df

        # summarize mean + 95% CI per parameter
        for p in params_of_interest:
            if p not in boot_df.columns:
                continue
            m, lo, hi = summarize_bootstrap(boot_df, p)
            boot_records.append(
                {
                    "group_key": key,
                    "param": p,
                    "mean": m,
                    "lo": lo,
                    "hi": hi,
                    "n_boot": len(boot_df),
                }
            )

    if not boot_records:
        logger.info("No bootstrap summaries computed.")
    else:
        boot_summary = pd.DataFrame(boot_records)
        log_df(
            boot_summary, title="Bootstrap summary (mean ± 95% CI per condition/param)"
        )

        if not boot_summary.empty:
            (outdir / "bootstrap_summary_conditions.csv").write_text(
                boot_summary.to_csv(index=False)
            )

        # Plot all conditions together for each parameter of interest
        for p in params_of_interest:
            plot_bootstrap_errorbars_all_conditions(
                boot_summary,
                param=p,
                title_suffix=" (condition-level)",
            )

    if boot_raw:
        boot_all = pd.concat(
            [df.assign(group_key=key) for key, df in boot_raw.items()],
            ignore_index=True,
        )
        boot_all.to_csv(outdir / "bootstrap_distributions.csv", index=False)

    # === Bootstrap for ALL constructs ================================
    logger.info("\n=== Bootstrap parameter uncertainty per construct ===")

    boot_records_constr: list[dict] = []
    boot_raw_constr: dict[str, pd.DataFrame] = {}

    for key in summary_constr["group_key"]:
        logger.info(f"\n[bootstrap] construct={key}")
        try:
            boot_df = bootstrap_condition_params(
                t=t,
                E_mat=E_mat,
                col_names=col_names,
                meta=meta,
                group_key=key,
                group_by="construct",
                n_boot=args.bootstraps,
                random_seed=0,
                n_jobs=args.cores,
            )
        except Exception as e:
            logger.info(f"  bootstrap failed for construct {key}: {e}")
            continue

        if boot_df.empty:
            logger.info(f"  no successful bootstrap fits for construct {key}")
            continue

        boot_raw_constr[key] = boot_df

        for p in params_of_interest:
            if p not in boot_df.columns:
                continue
            m, lo, hi = summarize_bootstrap(boot_df, p)
            boot_records_constr.append(
                {
                    "group_type": "construct",
                    "group_key": key,
                    "param": p,
                    "mean": m,
                    "lo": lo,
                    "hi": hi,
                    "n_boot": len(boot_df),
                }
            )

    if not boot_records_constr:
        logger.info("No bootstrap summaries computed for constructs.")
    else:
        boot_summary_constr = pd.DataFrame(boot_records_constr)
        log_df(
            boot_summary_constr,
            title="Bootstrap summary (mean ± 95% CI per construct/param)",
        )

        if not boot_summary_constr.empty:
            (outdir / "bootstrap_summary_constructs.csv").write_text(
                boot_summary_constr.to_csv(index=False)
            )

        for p in params_of_interest:
            plot_bootstrap_errorbars_all_conditions(
                boot_summary_constr,
                param=p,
                title_suffix=" (construct-level)",
            )