Models¶

ForecastModel produces out-of-sample predictions used by detectors. The validator combinators aggregate several models into a single decision.

Forecast model¶

signalflow.ForecastModel `dataclass` ¶

ForecastModel(backend: object = 'lightgbm', target: Target | None = None, features: FeaturePipe | None = None, encode: WoE | None | str = _DEFAULT, select: object = _DEFAULT, sampler: Sampler | None = None, backend_params: dict = dict(), output: str = 'p_rise', n_folds: int = 5, min_train_rows: int = 50)

Trainable continuous predictor; outputs one probability column.

`encode` accepts a `WoE` instance, `None` to disable encoding, or the `"default"` sentinel, which `__post_init__` resolves to a default `WoE`. Fitted encoders and selectors are stored on `encode_` and `select_`; `fit` never overwrites the declarative `encode` and `select` fields.

dump_woe_history ¶

dump_woe_history(path: str) -> str

Serialize the refit timeline (binning + statistics) to portable JSON.

Source code in src/signalflow/model/forecast.py

def dump_woe_history(self, path: str) -> str:
    """Serialize the refit timeline (binning + statistics) to portable JSON.

    The history is fetched and encoded *before* the destination is opened, so a
    failure in ``woe_history()`` (which checks that the model is fitted) or in
    JSON encoding never truncates an existing file or leaves partial JSON behind.

    Args:
        path: Destination file. Written as UTF-8; existing content is replaced
            only once the full payload has been encoded successfully.

    Returns:
        ``path``, unchanged, so the call can be chained.

    Raises:
        TypeError: If the history contains a structure JSON cannot represent even
            with ``default=str`` (for example non-scalar dictionary keys).
        ValueError: If the history contains circular references.
    """
    import json

    # ``default=str`` stringifies values json has no native encoding for.
    payload = json.dumps(self.woe_history(), default=str)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(payload)
    return path

fit ¶

fit(data: Dataset, sampler: Sampler | None = None, cache=None) -> ForecastModel

Train the model and compute leak-free out-of-fold predictions.

Fits embargoed walk-forward folds (embargo width = the target horizon), stores the stitched out-of-fold predictions on oos_, then fits the final production stack on all data. When encode is a rolling WoE the folds follow its refit/window; otherwise n_folds equal folds are used. Passing an ArtifactCache reuses unchanged folds and recomputes only new ones.

Returns:

Type	Description
`ForecastModel`	The fitted model (`self`).

Source code in src/signalflow/model/forecast.py

def fit(self, data: Dataset, sampler: Sampler | None = None, cache=None) -> "ForecastModel":
    """Train the model and compute leak-free out-of-fold predictions.

    Fits embargoed walk-forward folds (embargo width = the target horizon), stores
    the stitched out-of-fold predictions on ``oos_``, then fits the final production
    stack on all data. When ``encode`` is a rolling ``WoE`` the folds follow its
    ``refit``/``window``; otherwise ``n_folds`` equal folds are used. Passing an
    ``ArtifactCache`` reuses unchanged folds and recomputes only new ones.

    Args:
        data: Dataset whose ``frame`` feeds the feature pipeline and which is passed
            to the sampler and to the target for labelling.
        sampler: Sampler for this call. When falsy, ``self.sampler`` is used, and
            when that is falsy too a ``UniformSampler()`` is created.
        cache: Optional fold cache. With ``None`` no fingerprint is computed and
            every fold is fitted from scratch.

    Returns:
        The fitted model (``self``).

    Raises:
        ValueError: If no ``target`` is configured, or if fewer than
            ``min_train_rows`` rows remain after joining features and labels and
            dropping rows with a null label or null feature output.
    """
    if self.target is None:
        raise ValueError("ForecastModel requires a target to fit")
    # Precedence: explicit argument, then the model's own sampler, then a uniform default.
    sampler = sampler or self.sampler or UniformSampler()

    feat = self.features.compute(data.frame)
    ss = sampler.sample(data)
    idx = ss.index
    if ss.weights is not None:
        # Carry the sample weights alongside the sampled index as column "_w".
        idx = idx.with_columns(ss.weights.alias("_w"))
    # Labels are requested only at the sampled (unweighted) index.
    labels = self.target.labels(data, at=ss.index)

    # Training table: sampled rows that have features (inner join), with labels
    # attached (left join); rows lacking a label or any feature output are dropped,
    # then everything is ordered by timestamp.
    base = (
        idx.join(feat, on=["pair", "ts"], how="inner")
        .join(labels, on=["pair", "ts"], how="left")
        .drop_nulls(subset=[LABEL_COL, *self.features.outputs])
        .sort("ts")
    )
    if base.height < self.min_train_rows:
        raise ValueError(f"not enough labeled samples to fit ({base.height})")

    ts_unique = base.get_column("ts").unique().sort().to_list()
    horizon_bars = self.target.horizon_bars(data)
    # Embargo = horizon in bars times the median timestamp spacing.
    # NOTE(review): assumes median_dt returns seconds, since the product is passed as
    # ``seconds=`` - confirm against its definition.
    embargo = timedelta(seconds=horizon_bars * median_dt(ts_unique))
    folds = self._walk_forward_folds(ts_unique, embargo)
    oos_parts: list[pl.DataFrame] = []
    # One entry per fold that produced an encoder state; reset on every fit.
    self.refits_: list[dict] = []
    target_cfg = self.target.to_config()
    # The stack fingerprint is only needed to key cache entries.
    stack_fp = self._stack_fingerprint(data) if cache is not None else None
    for fold in folds:
        # Train strictly before (test start - embargo), optionally bounded below by
        # the fold's train start; test covers [test_start, test_end] inclusive.
        train = base.filter(pl.col("ts") < (fold.test_start_ts - embargo))
        if fold.train_start_ts is not None:
            train = train.filter(pl.col("ts") >= fold.train_start_ts)
        test = base.filter((pl.col("ts") >= fold.test_start_ts) & (pl.col("ts") <= fold.test_end_ts))
        # Skip folds with too little training data or nothing to predict.
        if train.height < self.min_train_rows or test.height == 0:
            continue
        cached = self._load_fold(cache, stack_fp, fold, embargo) if cache is not None else None
        if cached is not None:
            preds, state = cached
        else:
            preds, enc = self._fit_fold_predict(train, test, fold=fold)
            # A fold that yields no predictions is dropped and is not cached.
            if preds is None:
                continue
            state = enc.state_dict() if enc is not None else None
            if cache is not None:
                self._store_fold(cache, stack_fp, fold, embargo, preds, state)
        oos_parts.append(preds)
        if state is not None:
            # "train_end" is the exclusive upper bound used by the train filter above.
            self.refits_.append(
                {
                    "test_start": fold.test_start_ts,
                    "train_start": fold.train_start_ts,
                    "train_end": fold.test_start_ts - embargo,
                    "target": target_cfg,
                    "state": state,
                }
            )

    # Stitch fold predictions; if folds overlap on (pair, ts) the first occurrence
    # wins. With no usable fold, fall back to an empty frame with the output schema.
    self.oos_ = (
        pl.concat(oos_parts).unique(subset=["pair", "ts"], keep="first").sort(["pair", "ts"])
        if oos_parts
        else pl.DataFrame(schema={"pair": pl.Utf8, "ts": base.schema["ts"], self.output: pl.Float64})
    )

    # Final production stack is fitted on the full training table (no hold-out).
    self.encode_, self.select_, self.model_ = self._fit_stack(base)

    self._build_fingerprint(data, ts_unique)
    self._fitted = True
    # The logged fold count is the number of planned folds, including skipped ones.
    logger.debug(f"ForecastModel fitted: oos rows={self.oos_.height}, folds={len(folds)}")
    return self

operating_point ¶

operating_point(data: Dataset, quantile: float, column: str | None = None, oos: bool = False) -> float

Return the requested quantile of the model's scores, for use as a firing threshold. For leak safety, the caller must pass data from the training window only.

Source code in src/signalflow/model/forecast.py

def operating_point(self, data: Dataset, quantile: float, column: str | None = None, oos: bool = False) -> float:
    """Return a score quantile to use as a firing threshold.

    The caller is responsible for leak safety: pass train-window data only.

    Args:
        data: Dataset to score.
        quantile: Quantile of the non-null scores to return.
        column: Score column to use; resolved through ``_score_column``.
        oos: Use out-of-fold predictions (``predict_oos``) instead of the
            production predictions (``predict``).

    Returns:
        The quantile of the selected score column, as a ``float``.
    """
    self._check_fitted()
    if oos:
        scored = self.predict_oos(data)
    else:
        scored = self.predict(data)
    score_name = self._score_column(scored, column)
    # Null scores carry no information about the threshold, so drop them first.
    usable = scored.get_column(score_name).drop_nulls()
    return float(usable.quantile(quantile))

predict ¶

predict(data: Dataset) -> pl.DataFrame

Production prediction (in-sample on history - never feed to training).

Source code in src/signalflow/model/forecast.py

def predict(self, data: Dataset) -> pl.DataFrame:
    """Score ``data`` with the final production stack.

    Predictions over the training history are in-sample, so never feed them
    back into training.

    Args:
        data: Dataset whose ``frame`` is run through the feature pipeline.

    Returns:
        A frame with the ``pair`` and ``ts`` keys plus one column named
        ``self.output`` holding the predictions.
    """
    self._check_fitted()
    features = self.features.compute(data.frame)
    scores = self._predict_stack(self.encode_, self.select_, self.model_, features)
    keys = features.select(["pair", "ts"])
    score_column = pl.Series(self.output, scores)
    return keys.with_columns(score_column)

predict_oos ¶

predict_oos(data: Dataset, strict: bool = False) -> pl.DataFrame

Leak-free out-of-fold predictions over the training span.

With `strict=True`, requested rows that fall outside the cached OOS span raise `FingerprintMismatch`; otherwise a warning is logged and those rows are returned with null predictions.

Source code in src/signalflow/model/forecast.py

def predict_oos(self, data: Dataset, strict: bool = False) -> pl.DataFrame:
    """Look up leak-free out-of-fold predictions for the rows of ``data``.

    The requested index is left-joined against the predictions cached on
    ``oos_``, so rows without a cached prediction come back as null.

    Args:
        data: Dataset whose index selects the rows to return.
        strict: Raise instead of warning when any requested row has no cached
            out-of-fold prediction.

    Returns:
        The requested index with the ``self.output`` column attached.

    Raises:
        FingerprintMismatch: With ``strict``, when at least one requested row
            falls outside the cached OOS span.
    """
    self._check_fitted()
    out = data.index().join(self.oos_, on=["pair", "ts"], how="left")
    missing = out.get_column(self.output).null_count()
    # Fully covered request: nothing to report.
    if not missing:
        return out
    if strict:
        raise FingerprintMismatch(
            f"predict_oos: {missing} of {out.height} requested rows fall outside the cached OOS "
            f"span for ForecastModel(output={self.output!r}); refit or widen the span"
        )
    logger.warning(f"predict_oos: {missing} rows outside cached OOS span (null)")
    return out

woe_history ¶

woe_history() -> list[dict]

Per-refit WoE state (bin edges + WoE table + IV) across the walk-forward.

Source code in src/signalflow/model/forecast.py

def woe_history(self) -> list[dict]:
    """Return the per-refit WoE state recorded across the walk-forward.

    Each entry describes one refit (bin edges + WoE table + IV).

    Returns:
        The ``refits_`` list stored on the model, or an empty list when the
        attribute has not been set.
    """
    self._check_fitted()
    try:
        return self.refits_
    except AttributeError:
        # No refit timeline was recorded on this instance.
        return []

Validator combinators¶

signalflow.MeanValidator ¶

MeanValidator(children: list)

Bases: _Combinator

Average child probabilities.

Source code in src/signalflow/model/validators.py

def __init__(self, children: list):
    """Store the child models whose outputs this combinator aggregates.

    Args:
        children: Child models; the list is kept by reference, not copied.
    """
    self.children = children

signalflow.MaxValidator ¶

MaxValidator(children: list)

Bases: _Combinator

Take the maximum child probability.

Source code in src/signalflow/model/validators.py

def __init__(self, children: list):
    """Store the child models whose outputs this combinator aggregates.

    Args:
        children: Child models; the list is kept by reference, not copied.
    """
    self.children = children

signalflow.VoteValidator ¶

VoteValidator(children: list, threshold: float = 0.5)

Bases: _Combinator

Fraction of children whose probability exceeds threshold.

Source code in src/signalflow/model/validators.py

def __init__(self, children: list, threshold: float = 0.5):
    """Register the children through the base initializer and keep the vote threshold.

    Args:
        children: Child models, forwarded unchanged to the parent ``__init__``.
        threshold: Cut-off stored on the instance as ``threshold``; defaults to 0.5.
    """
    super().__init__(children)
    self.threshold = threshold

Models¶

Forecast model¶

signalflow.ForecastModel dataclass ¶

dump_woe_history ¶

fit ¶

operating_point ¶

predict ¶

predict_oos ¶

woe_history ¶

Validator combinators¶

signalflow.MeanValidator ¶

signalflow.MaxValidator ¶

signalflow.VoteValidator ¶

signalflow.ForecastModel `dataclass` ¶