Skip to content

trajectory

Camera trajectory generation for HunyuanGameCraft.

Generates Plücker coordinate embeddings from simple action commands (forward, backward, left, right, rotations) for camera conditioning.

This is a self-contained implementation that does not depend on the official Hunyuan-GameCraft-1.0 repository.

Functions

fastvideo.models.camera.trajectory.create_camera_trajectory

create_camera_trajectory(action: str, height: int, width: int, num_frames: int, action_speed: float = 0.2, device: device | str = 'cpu', dtype: dtype = bfloat16) -> Tensor

Create Plücker coordinate embeddings from an action command.

Parameters:

Name Type Description Default
action str

One of 'forward', 'backward', 'left', 'right', 'left_rot', 'right_rot', 'up_rot', 'down_rot' (or shorthand 'w', 'a', 's', 'd').

required
height int

Video height in pixels.

required
width int

Video width in pixels.

required
num_frames int

Number of video frames.

required
action_speed float

Speed of motion (default 0.2).

0.2
device device | str

Torch device for output tensor.

'cpu'
dtype dtype

Torch dtype for output tensor.

bfloat16

Returns:

Name Type Description
camera_states Tensor

[1, num_frames, 6, height, width] Plücker embeddings.

Source code in fastvideo/models/camera/trajectory.py
def create_camera_trajectory(
    action: str,
    height: int,
    width: int,
    num_frames: int,
    action_speed: float = 0.2,
    device: torch.device | str = "cpu",
    dtype: torch.dtype = torch.bfloat16,
) -> torch.Tensor:
    """
    Create Plücker coordinate embeddings from an action command.

    Args:
        action: One of 'forward', 'backward', 'left', 'right',
                'left_rot', 'right_rot', 'up_rot', 'down_rot'
                (or shorthand 'w', 'a', 's', 'd').
        height: Video height in pixels.
        width: Video width in pixels.
        num_frames: Number of video frames.
        action_speed: Speed of motion (default 0.2).
        device: Torch device for output tensor.
        dtype: Torch dtype for output tensor.

    Returns:
        camera_states: [1, num_frames, 6, height, width] Plücker embeddings.

    Raises:
        ValueError: If the pose list generated for `action` does not
            contain exactly `num_frames` entries.
    """
    # Generate one whitespace-separated pose string per frame for the action.
    poses = _action_to_pose_list(action, value=action_speed, duration=num_frames)
    poses_parsed = [pose.split(" ") for pose in poses]

    # Explicit validation instead of `assert` (asserts vanish under `python -O`).
    if len(poses_parsed) != num_frames:
        raise ValueError(
            f"expected {num_frames} poses for action {action!r}, "
            f"got {len(poses_parsed)}"
        )

    # Parse per-frame camera parameters (intrinsics + extrinsics tokens).
    # NOTE(review): the original also built 4x4 w2c matrices and called
    # _get_c2w on them, but that result was never used — that dead
    # computation has been removed.
    cam_params = [_Camera([float(x) for x in pose]) for pose in poses_parsed]

    # Scale intrinsics from the pose file's reference resolution to the
    # requested output resolution. The reference size is inferred as
    # (2*cx, 2*cy) — i.e. the principal point is assumed to sit at the
    # image center of the reference frame (TODO confirm against the pose
    # generator).
    ref_w = cam_params[0].cx * 2
    ref_h = cam_params[0].cy * 2
    ratio_w, ratio_h = width / ref_w, height / ref_h
    intrinsics_np = np.asarray(
        [
            [
                cam_param.fx * ratio_w,
                cam_param.fy * ratio_h,
                cam_param.cx * ratio_w,
                cam_param.cy * ratio_h,
            ]
            for cam_param in cam_params
        ],
        dtype=np.float32,
    )
    intrinsics = torch.as_tensor(intrinsics_np)[None]  # [1, n_frame, 4]

    # Camera-to-world poses, expressed relative to the first frame.
    c2w_poses = _get_relative_pose(cam_params)
    c2w = torch.as_tensor(c2w_poses)[None]  # [1, n_frame, 4, 4]

    # Compute Plücker embeddings on CPU; move/cast once at the end.
    flip_flag = torch.zeros(num_frames, dtype=torch.bool, device="cpu")
    plucker_embedding = _ray_condition(
        intrinsics, c2w, height, width, device="cpu", flip_flag=flip_flag
    )
    # [1, n_frame, H, W, 6] -> [n_frame, 6, H, W]
    plucker_embedding = plucker_embedding[0].permute(0, 3, 1, 2).contiguous()

    # [n_frame, 6, H, W] -> [1, n_frame, 6, H, W], on the target device/dtype.
    camera_states = plucker_embedding.unsqueeze(0).to(device=device, dtype=dtype)

    return camera_states