rewards ¶

Reusable reward models for training methods.

Classes¶

fastvideo.train.methods.rl.rewards.ClipScoreScorer ¶

ClipScoreScorer(*, device: device | str = 'cuda')

Bases: Module

CLIPScore reward, matching DiffusionNFT normalization.

Ported from DiffusionNFT's flow_grpo/clip_scorer.py.

Source code in fastvideo/train/methods/rl/rewards/frame_rewards.py

def __init__(
    self,
    *,
    device: torch.device | str = "cuda",
) -> None:
    """Load the CLIP checkpoint and build a frame preprocessing pipeline.

    The preprocessing pipeline is derived from the image processor's own
    configuration (resize, center crop, normalize), with each disabled
    stage replaced by an identity module.

    Args:
        device: Device the CLIP model is moved to.

    Raises:
        ValueError: If an enabled resize/crop stage has a size entry that
            is neither an int nor a mapping with ``height``/``width`` or
            ``shortest_edge``.
    """
    super().__init__()
    import torch.nn as nn
    import torchvision.transforms as T
    from transformers import CLIPModel, CLIPProcessor

    def _to_transform_size(spec: Any) -> Any:
        # A bare int is expanded to an explicit square (height, width) pair.
        if isinstance(spec, int):
            return (spec, spec)
        if isinstance(spec, Mapping):
            if "height" in spec and "width" in spec:
                return (spec["height"], spec["width"])
            if "shortest_edge" in spec:
                # Passed through as a single value, not a pair.
                return spec["shortest_edge"]
        raise ValueError(f"Invalid processor size: {spec!r}")

    def _build_transform(image_processor: Any) -> torch.nn.Module:
        cfg = image_processor.to_dict()
        stages = []

        if cfg.get("do_resize"):
            stages.append(T.Resize(_to_transform_size(cfg.get("size"))))
        else:
            stages.append(nn.Identity())

        if cfg.get("do_center_crop"):
            stages.append(T.CenterCrop(_to_transform_size(cfg.get("crop_size"))))
        else:
            stages.append(nn.Identity())

        if cfg.get("do_normalize"):
            stages.append(T.Normalize(mean=image_processor.image_mean, std=image_processor.image_std))
        else:
            stages.append(nn.Identity())

        return T.Compose(stages)

    self.device = torch.device(device)
    clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
    self.model = clip.to(self.device).eval()
    self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
    self.transform = _build_transform(self.processor.image_processor)

fastvideo.train.methods.rl.rewards.MultiRewardScorer ¶

MultiRewardScorer(reward_weights: Mapping[str, float], *, scorers: Mapping[str, RewardScorer])

Weighted sum of reusable media reward scorers.

Mirrors DiffusionNFT's flow_grpo/rewards.py::multi_score behavior, while leaving frame selection to each concrete reward.

Source code in fastvideo/train/methods/rl/rewards/media.py

def __init__(
    self,
    reward_weights: Mapping[str, float],
    *,
    scorers: Mapping[str, RewardScorer],
) -> None:
    """Store the reward weights and scorers after validating them.

    Args:
        reward_weights: Weight per reward name. Keys are coerced with
            ``str`` and values with ``float``.
        scorers: Available scorers keyed by reward name; copied into a
            plain ``dict``.

    Raises:
        ValueError: If ``reward_weights`` is empty, or if it names a
            reward that has no entry in ``scorers``.
    """
    normalized: dict[str, float] = {}
    for reward_name, weight in reward_weights.items():
        normalized[str(reward_name)] = float(weight)
    self.reward_weights = normalized
    if len(normalized) == 0:
        raise ValueError("reward_weights must contain at least one reward")

    self.scorers = dict(scorers)
    # Every weighted reward needs a scorer; report all missing ones at once.
    unsupported = sorted(name for name in normalized if name not in self.scorers)
    if unsupported:
        available = sorted(self.scorers)
        raise ValueError(f"Unsupported reward(s): {unsupported}. "
                         f"Available rewards: {available}")

fastvideo.train.methods.rl.rewards.PickScoreScorer ¶

PickScoreScorer(*, device: device | str = 'cuda', dtype: dtype = float32)

Bases: Module

PickScore reward, matching DiffusionNFT normalization.

Ported from DiffusionNFT's flow_grpo/pickscore_scorer.py.

Source code in fastvideo/train/methods/rl/rewards/frame_rewards.py

def __init__(
    self,
    *,
    device: torch.device | str = "cuda",
    dtype: torch.dtype = torch.float32,
) -> None:
    """Load the PickScore model and its companion processor.

    The processor and the model come from two different checkpoints: the
    processor from the LAION CLIP ViT-H/14 repository and the model weights
    from the PickScore repository.

    Args:
        device: Device the model is moved to.
        dtype: Dtype the model is cast to after being moved to ``device``.
    """
    super().__init__()
    from transformers import AutoModel, AutoProcessor

    self.device = torch.device(device)
    self.dtype = dtype
    self.processor = AutoProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")

    # Eval mode first, then device placement, then the dtype cast.
    scorer = AutoModel.from_pretrained("yuvalkirstain/PickScore_v1")
    scorer = scorer.eval().to(self.device)
    self.model = scorer.to(dtype=dtype)

Functions¶

fastvideo.train.methods.rl.rewards.select_first_frame ¶

select_first_frame(media: Tensor) -> Tensor

Return first-frame media as [B, C, H, W].

This is a helper for reward models that are intrinsically frame-based (for example PickScore and CLIPScore). Video-aware rewards should inspect the full [B, C, T, H, W] tensor themselves.

Source code in fastvideo/train/methods/rl/rewards/media.py

def select_first_frame(media: torch.Tensor) -> torch.Tensor:
    """Reduce image or video media to a single-frame ``[B, C, H, W]`` batch.

    Intended for rewards that score individual frames (for example PickScore
    and CLIPScore). Video-aware rewards should consume the full
    ``[B, C, T, H, W]`` tensor directly instead of using this helper.

    Args:
        media: A 4-D ``[B, C, H, W]`` or 5-D ``[B, C, T, H, W]`` tensor.

    Returns:
        ``media`` itself when it is 4-D; otherwise index 0 along dim 2.

    Raises:
        TypeError: If ``media`` is not a tensor.
        ValueError: If ``media`` is neither 4-D nor 5-D.
    """
    if not isinstance(media, torch.Tensor):
        raise TypeError(f"media must be a torch.Tensor, got {type(media).__name__}")

    rank = media.dim()
    if rank not in (4, 5):
        raise ValueError("media must have shape [B, C, H, W] or [B, C, T, H, W], "
                         f"got {tuple(media.shape)}")
    # 4-D input is handed back as the same object; 5-D input is indexed.
    return media if rank == 4 else media[:, :, 0]

Modules¶

fastvideo.train.methods.rl.rewards.frame_rewards ¶

Frame-based reward scorers used by RL training methods.

Classes¶

fastvideo.train.methods.rl.rewards.frame_rewards.ClipScoreScorer ¶

ClipScoreScorer(*, device: device | str = 'cuda')

Bases: Module

CLIPScore reward, matching DiffusionNFT normalization.

Ported from DiffusionNFT's flow_grpo/clip_scorer.py.

Source code in fastvideo/train/methods/rl/rewards/frame_rewards.py

def __init__(
    self,
    *,
    device: torch.device | str = "cuda",
) -> None:
    """Load the CLIP checkpoint and build a frame preprocessing pipeline.

    The preprocessing pipeline is derived from the image processor's own
    configuration (resize, center crop, normalize), with each disabled
    stage replaced by an identity module.

    Args:
        device: Device the CLIP model is moved to.

    Raises:
        ValueError: If an enabled resize/crop stage has a size entry that
            is neither an int nor a mapping with ``height``/``width`` or
            ``shortest_edge``.
    """
    super().__init__()
    import torch.nn as nn
    import torchvision.transforms as T
    from transformers import CLIPModel, CLIPProcessor

    def _to_transform_size(spec: Any) -> Any:
        # A bare int is expanded to an explicit square (height, width) pair.
        if isinstance(spec, int):
            return (spec, spec)
        if isinstance(spec, Mapping):
            if "height" in spec and "width" in spec:
                return (spec["height"], spec["width"])
            if "shortest_edge" in spec:
                # Passed through as a single value, not a pair.
                return spec["shortest_edge"]
        raise ValueError(f"Invalid processor size: {spec!r}")

    def _build_transform(image_processor: Any) -> torch.nn.Module:
        cfg = image_processor.to_dict()
        stages = []

        if cfg.get("do_resize"):
            stages.append(T.Resize(_to_transform_size(cfg.get("size"))))
        else:
            stages.append(nn.Identity())

        if cfg.get("do_center_crop"):
            stages.append(T.CenterCrop(_to_transform_size(cfg.get("crop_size"))))
        else:
            stages.append(nn.Identity())

        if cfg.get("do_normalize"):
            stages.append(T.Normalize(mean=image_processor.image_mean, std=image_processor.image_std))
        else:
            stages.append(nn.Identity())

        return T.Compose(stages)

    self.device = torch.device(device)
    clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
    self.model = clip.to(self.device).eval()
    self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
    self.transform = _build_transform(self.processor.image_processor)

fastvideo.train.methods.rl.rewards.frame_rewards.PickScoreScorer ¶

PickScoreScorer(*, device: device | str = 'cuda', dtype: dtype = float32)

Bases: Module

PickScore reward, matching DiffusionNFT normalization.

Ported from DiffusionNFT's flow_grpo/pickscore_scorer.py.

Source code in fastvideo/train/methods/rl/rewards/frame_rewards.py

def __init__(
    self,
    *,
    device: torch.device | str = "cuda",
    dtype: torch.dtype = torch.float32,
) -> None:
    """Load the PickScore model and its companion processor.

    The processor and the model come from two different checkpoints: the
    processor from the LAION CLIP ViT-H/14 repository and the model weights
    from the PickScore repository.

    Args:
        device: Device the model is moved to.
        dtype: Dtype the model is cast to after being moved to ``device``.
    """
    super().__init__()
    from transformers import AutoModel, AutoProcessor

    self.device = torch.device(device)
    self.dtype = dtype
    self.processor = AutoProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")

    # Eval mode first, then device placement, then the dtype cast.
    scorer = AutoModel.from_pretrained("yuvalkirstain/PickScore_v1")
    scorer = scorer.eval().to(self.device)
    self.model = scorer.to(dtype=dtype)

Functions¶

fastvideo.train.methods.rl.rewards.media ¶

Generic media reward composition utilities.

Classes¶

fastvideo.train.methods.rl.rewards.media.MultiRewardScorer ¶

MultiRewardScorer(reward_weights: Mapping[str, float], *, scorers: Mapping[str, RewardScorer])

Weighted sum of reusable media reward scorers.

Mirrors DiffusionNFT's flow_grpo/rewards.py::multi_score behavior, while leaving frame selection to each concrete reward.

Source code in fastvideo/train/methods/rl/rewards/media.py

def __init__(
    self,
    reward_weights: Mapping[str, float],
    *,
    scorers: Mapping[str, RewardScorer],
) -> None:
    """Store the reward weights and scorers after validating them.

    Args:
        reward_weights: Weight per reward name. Keys are coerced with
            ``str`` and values with ``float``.
        scorers: Available scorers keyed by reward name; copied into a
            plain ``dict``.

    Raises:
        ValueError: If ``reward_weights`` is empty, or if it names a
            reward that has no entry in ``scorers``.
    """
    normalized: dict[str, float] = {}
    for reward_name, weight in reward_weights.items():
        normalized[str(reward_name)] = float(weight)
    self.reward_weights = normalized
    if len(normalized) == 0:
        raise ValueError("reward_weights must contain at least one reward")

    self.scorers = dict(scorers)
    # Every weighted reward needs a scorer; report all missing ones at once.
    unsupported = sorted(name for name in normalized if name not in self.scorers)
    if unsupported:
        available = sorted(self.scorers)
        raise ValueError(f"Unsupported reward(s): {unsupported}. "
                         f"Available rewards: {available}")

Functions¶

fastvideo.train.methods.rl.rewards.media.select_first_frame ¶

select_first_frame(media: Tensor) -> Tensor

Return first-frame media as [B, C, H, W].

This is a helper for reward models that are intrinsically frame-based (for example PickScore and CLIPScore). Video-aware rewards should inspect the full [B, C, T, H, W] tensor themselves.

Source code in fastvideo/train/methods/rl/rewards/media.py

def select_first_frame(media: torch.Tensor) -> torch.Tensor:
    """Reduce image or video media to a single-frame ``[B, C, H, W]`` batch.

    Intended for rewards that score individual frames (for example PickScore
    and CLIPScore). Video-aware rewards should consume the full
    ``[B, C, T, H, W]`` tensor directly instead of using this helper.

    Args:
        media: A 4-D ``[B, C, H, W]`` or 5-D ``[B, C, T, H, W]`` tensor.

    Returns:
        ``media`` itself when it is 4-D; otherwise index 0 along dim 2.

    Raises:
        TypeError: If ``media`` is not a tensor.
        ValueError: If ``media`` is neither 4-D nor 5-D.
    """
    if not isinstance(media, torch.Tensor):
        raise TypeError(f"media must be a torch.Tensor, got {type(media).__name__}")

    rank = media.dim()
    if rank not in (4, 5):
        raise ValueError("media must have shape [B, C, H, W] or [B, C, T, H, W], "
                         f"got {tuple(media.shape)}")
    # 4-D input is handed back as the same object; 5-D input is indexed.
    return media if rank == 4 else media[:, :, 0]