Roverp CLI

Usage

The CLI tools use tyro:

  • Positional arguments (marked "required" in the parameter tables below) are passed as positional command line arguments
  • Named arguments are passed as flagged command line arguments, as in the example below
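
For example, the sensorpose command described below takes path positionally and sensor and smoothing as flags (the dataset path here is hypothetical):

roverp sensorpose data/example-recording --sensor radar --smoothing 250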

roverp anonymize

Anonymize video by blurring faces.

Warning

Requires the anonymize extra (retina-face, tf-keras).
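
Assuming the anonymize extra is declared alongside the semseg extra shown under roverp segment, installation would look something like:

pip install ./processing[anonymize]
# equivalent to
pip install retina-face tf-keras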

Expected Inputs and Outputs

Inputs: camera/video.avi

Outputs: _camera/video.avi

Parameters:

Name  Type        Description                                 Default
path  str         path to the dataset.                        required
out   str | None  output path; defaults to the same as path.  None
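
A typical invocation (the dataset and output paths here are hypothetical):

# blur faces, writing _camera/video.avi under the dataset itself
roverp anonymize data/example-recording
# or write the blurred video under a separate output directory
roverp anonymize data/example-recording --out /tmp/anonymized
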
Source code in processing/src/roverp/_cli/anonymize.py
def cli_anonymize(
    path: str, /, out: str | None = None
) -> None:
    """Anonymize video by blurring faces.

    !!! warning

        Requires the `anonymize` extra (`retina-face`, `tf-keras`).

    !!! io "Expected Inputs and Outputs"

        **Inputs**: `camera/video.avi`

        **Outputs**: `_camera/video.avi`

    Args:
        path: path to the dataset.
        out: output path; defaults to the same as `path`.
    """
    from retinaface import RetinaFace

    camera = sensors.Camera(os.path.join(path, "camera"))

    if out is None:
        out = path

    _camera = sensors.DynamicSensor(
        os.path.join(out, "_camera"), create=True, exist_ok=True)
    output = _camera.create("video.avi", camera.config["video.avi"])

    def _apply_image(image):
        """Apply face blurring to an image."""
        faces = RetinaFace.detect_faces(image)

        # If no faces detected, return original image
        if not isinstance(faces, dict) or len(faces) == 0:
            return image

        # Apply blur to each detected face
        for face_key, face_data in faces.items():
            if 'facial_area' in face_data:
                x1, y1, x2, y2 = face_data['facial_area']

                # Expand by 20%
                width = x2 - x1
                height = y2 - y1
                x1 -= int(0.2 * width)
                y1 -= int(0.2 * height)
                x2 += int(0.2 * width)
                y2 += int(0.2 * height)

                # Ensure coordinates are within image bounds
                x1 = max(0, int(x1))
                y1 = max(0, int(y1))
                x2 = min(image.shape[1], int(x2))
                y2 = min(image.shape[0], int(y2))

                # Extract face region
                face_region = image[y1:y2, x1:x2]

                # Apply Gaussian blur to the face region
                if face_region.size > 0:
                    blurred_face = cv2.GaussianBlur(face_region, (99, 99), 15)
                    image[y1:y2, x1:x2] = blurred_face

        return image

    stream = camera["video.avi"].stream_prefetch()
    frame_stream = Prefetch(
        _apply_image(frame) for frame in tqdm(stream, total=len(camera))
    ).queue
    cast(channels.VideoChannel, output).consume(frame_stream)

roverp sensorpose

Get interpolated poses for a specific sensor.

Expected Inputs and Outputs

Inputs: _slam/trajectory.csv

Outputs: _{sensor}/pose.npz depending on the specified --sensor, with keys:

  • mask: binary mask, applied to the raw sensor data along the time axis, which denotes valid samples for the available poses.
  • smoothing, start_threshold, filter_size: parameters used for pose interpolation.
  • t, pos, vel, acc, rot: pose parameters; see Poses
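
Once roverp sensorpose has been run (e.g. roverp sensorpose data/example-recording --sensor radar, with a hypothetical dataset path), the keys listed above can be inspected with numpy:

python - <<'EOF'
import numpy as np

pose = np.load("data/example-recording/_radar/pose.npz")
print(pose.files)  # mask, smoothing, start_threshold, filter_size, t, pos, vel, acc, rot
print(pose["t"].shape, pose["pos"].shape, int(pose["mask"].sum()))
EOF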

Parameters:

Name       Type   Description                                                    Default
path       str    Path to the dataset.                                           required
sensor     str    Sensor timestamps to interpolate for.                          'radar'
smoothing  float  Smoothing coefficient; higher = more smooth.                   500.0
threshold  float  Exclude data points close to the starting point (in meters).   1.0
Source code in processing/src/roverp/_cli/sensorpose.py
def cli_sensorpose(
    path: str, /, sensor: str = "radar",
    smoothing: float = 500.0, threshold: float = 1.0
) -> None:
    """Get interpolated poses for a specific sensor.

    !!! io "Expected Inputs and Outputs"

        **Inputs**: `_slam/trajectory.csv`

        **Outputs**: `_{sensor}/pose.npz` depending on the specified `--sensor`, with keys:

        - `mask`: binary mask, applied to the raw sensor data along the time
        axis, which denotes valid samples for the available poses.
        - `smoothing`, `start_threshold`, `filter_size`: parameters used for
        pose interpolation.
        - `t`, `pos`, `vel`, `acc`, `rot`: pose parameters; see
            [`Poses`][roverp.readers.]

    Args:
        path: Path to the dataset.
        sensor: Sensor timestamps to interpolate for.
        smoothing: Smoothing coefficient; higher = more smooth.
        threshold: Exclude data points close to the starting point (in meters).
    """
    from roverd import Trace

    from roverp.readers import Trajectory

    cfg = {
        "smoothing": smoothing, "start_threshold": threshold,
        "filter_size": 5}

    traj = Trajectory(
        path=os.path.join(path, "_slam", "trajectory.csv"), **cfg)
    t_sensor = Trace.from_config(path)[sensor].metadata.timestamps
    poses, mask = traj.interpolate(t_sensor)

    os.makedirs(os.path.join(path, "_" + sensor), exist_ok=True)
    np.savez(
        os.path.join(path, "_" + sensor, "pose.npz"),
        **poses.as_dict(), mask=mask, **cfg)

roverp report

Generate speed report.

Warning

The cartographer slam pipeline (make trajectory or make lidar) must be run beforehand.

Expected Inputs and Outputs

Inputs: _radar/pose.npz, _slam/trajectory.csv

Outputs: _report/speed.pdf

Parameters:

Name   Type        Description                                                  Default
path   str         path to the dataset.                                         required
out    str | None  output path; defaults to _report/speed.pdf in the dataset.   None
width  float       time series plot row width, in seconds.                      30.0
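
For example (hypothetical dataset path; 60-second plot rows instead of the default 30):

roverp report data/example-recording --width 60
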
Source code in processing/src/roverp/_cli/report.py
def cli_report(
    path: str, /, out: str | None = None, width: float = 30.0
) -> None:
    """Generate speed report.

    !!! warning

        The cartographer slam pipeline (`make trajectory` or `make lidar`) must
        be run beforehand.

    !!! io "Expected Inputs and Outputs"

        **Inputs**: `_radar/pose.npz`, `_slam/trajectory.csv`

        **Outputs**: `_report/speed.pdf`

    Args:
        path: path to the dataset.
        out: output path; defaults to `_report/speed.pdf` in the dataset.
        width: time series plot row width, in seconds.
    """
    from roverp.readers import RawTrajectory

    def _plot_speed(axs, tproc, traw, vproc, vraw, dmax):
        for ax, vp, vr in zip(axs, vproc, vraw):
            ax.plot(tproc, vp, linewidth=1.0, label="Processed")
            ax.plot(traw, vr, linewidth=1.0, label="Raw")
        axs[-1].plot(
            tproc, np.linalg.norm(vproc, axis=0), linewidth=1.0, label="Processed")
        axs[-1].plot(
            traw, np.linalg.norm(vraw, axis=0), linewidth=1.0, label="Raw")
        axs[-1].axhline(dmax, linestyle='--', color='black')

        axs[0].set_ylabel("$v_x$ (m/s)")
        axs[1].set_ylabel("$v_y$ (m/s)")
        axs[2].set_ylabel("$v_z$ (m/s)")
        axs[3].set_ylabel("$||v||_2$ (m/s)")

    if out is None:
        out = os.path.join(path, "_report", "speed.pdf")
    os.makedirs(os.path.dirname(out), exist_ok=True)

    trajfile = os.path.join(path, "_slam", "trajectory.csv")
    if not os.path.exists(trajfile):
        raise FileNotFoundError(
            f"Trajectory file not found: {trajfile}. "
            f"Run `make trajectory` or `make lidar` first.")

    traj = RawTrajectory.from_csv(trajfile)
    base_time = traj.t[0]
    duration = traj.t[-1] - base_time

    vraw = np.diff(traj.xyz, axis=1) / np.diff(traj.t)
    traw = (traj.t[1:] + traj.t[:-1]) / 2 - base_time  # type: ignore

    radarpose = np.load(os.path.join(path, "_radar", "pose.npz"))
    vproc = radarpose["vel"].T
    tproc = radarpose["t"] - base_time

    with open(os.path.join(path, "radar", "radar.json")) as f:
        cfg = json.load(f)
        nd = cfg["shape"][0]
        dmax = cfg["doppler_resolution"] * (nd // 2)

    with PdfPages(out) as pdf:
        for i in tqdm(range(math.ceil(duration / width / 2))):
            fig, page = plt.subplots(8, 1, figsize=(8.5, 11))
            page = cast(np.ndarray, page)

            for j, axs in enumerate([page[:4], page[4:]]):
                _plot_speed(axs, tproc, traw, vproc, vraw, dmax)
                for ax in axs:
                    ax.set_xlim(
                        (i * 2 + j) * width, (i * 2 + j + 1) * width)
                    ax.grid()

            page[-1].set_xlabel("Time (s)")
            page[0].legend(
                loc='upper left', ncols=2, bbox_to_anchor=(-0.02, 1.4),
                frameon=False)
            fig.align_ylabels(page)
            fig.tight_layout(pad=5, h_pad=0.2, w_pad=0.2)
            pdf.savefig(fig)

roverp segment

Run semantic segmentation on collected video data.

Warning

Requires the semseg extra (torch, transformers):

pip install ./processing[semseg]
# equivalent to
pip install torch torchvision transformers

By default, we use segformer-b5-finetuned-ade-640-640, which can be found at https://huggingface.co/nvidia/segformer-b5-finetuned-ade-640-640; the pytorch_model.bin weights should be downloaded and placed in processing/models/segformer-b5-finetuned-ade-640-640.
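
A minimal sketch of fetching the weights, assuming Hugging Face's standard resolve/main download URLs (any other files required by from_pretrained, such as config.json, are assumed to already be present in the model directory):

mkdir -p processing/models/segformer-b5-finetuned-ade-640-640
wget -P processing/models/segformer-b5-finetuned-ade-640-640 \
    https://huggingface.co/nvidia/segformer-b5-finetuned-ade-640-640/resolve/main/pytorch_model.bin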

Class Definitions

The original class definitions (https://github.com/CSAILVision/sceneparsing/blob/master/objectInfo150.csv) have been reduced to 8 classes (indexed in alphabetical order), as specified in models/segformer-b5-finetuned-ade-640-640/classes.yaml:

  • 0=ceiling: ceiling, roofs, etc viewed from underneath.
  • 1=flat: flat ground such as roads, paths, floors, etc.
  • 2=nature: plants of all kinds such as trees, grass, and bushes.
  • 3=object: any free-standing objects other than plants which are not building-scale such as furniture, signs, and poles.
  • 4=person: any person who is not inside a vehicle.
  • 5=sky: the sky or other background.
  • 6=structure: buildings, fences, and other structures.
  • 7=vehicle: cars, busses, vans, trucks, etc.

Expected Inputs and Outputs

Inputs: camera/video.avi

Outputs: _camera/segment

Parameters:

Name   Type  Description                                                       Default
path   str   path to the recording.                                            required
batch  int   batch size for processing; may need tuning, depending on your     16
             exact GPU.
model  str   path to model weights; if running in the red-rover/processing     './models/segformer-b5-finetuned-ade-640-640'
             directory, this should be
             ./models/segformer-b5-finetuned-ade-640-640.
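
A typical invocation from the processing directory (hypothetical dataset path; a smaller batch may be needed on GPUs with less memory):

roverp segment data/example-recording --batch 8 \
    --model ./models/segformer-b5-finetuned-ade-640-640
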
Source code in processing/src/roverp/_cli/segment.py
def cli_segment(
    path: str, /, batch: int = 16,
    model: str = "./models/segformer-b5-finetuned-ade-640-640",
) -> None:
    """Run semantic segmentation on collected video data.

    !!! warning

        Requires the `semseg` extra (`torch`, `transformers`):
        ```sh
        pip install ./processing[semseg]
        # equivalent to
        pip install torch torchvision transformers
        ```

    By default, we use `segformer-b5-finetuned-ade-640-640`, which can be found
    [here](https://huggingface.co/nvidia/segformer-b5-finetuned-ade-640-640);
    the `pytorch_model.bin` weights should be downloaded and placed in
    `processing/models/segformer-b5-finetuned-ade-640-640`.

    ??? info "Class Definitions"

        The original [class definitions](
        https://github.com/CSAILVision/sceneparsing/blob/master/objectInfo150.csv)
        have been reduced into 8 classes (indexed in alphabetical order), as
        specified in `models/segformer-b5-finetuned-ade-640-640/classes.yaml`:

        - `0=ceiling`: ceiling, roofs, etc viewed from underneath.
        - `1=flat`: flat ground such as roads, paths, floors, etc.
        - `2=nature`: plants of all kinds such as trees, grass, and bushes.
        - `3=object`: any free-standing objects other than plants which are not
            building-scale such as furniture, signs, and poles.
        - `4=person`: any person who is not inside a vehicle.
        - `5=sky`: the sky or other background.
        - `6=structure`: buildings, fences, and other structures.
        - `7=vehicle`: cars, busses, vans, trucks, etc.

    !!! io "Expected Inputs and Outputs"

        **Inputs**: `camera/video.avi`

        **Outputs**: `_camera/segment`

    Args:
        path: path to the recording.
        batch: batch size for processing; may need tuning, depending on your
            exact GPU.
        model: path to model weights; if running in the `red-rover/processing`
            directory, this should be
            `./models/segformer-b5-finetuned-ade-640-640`.
    """
    # Ignore for type checking since torch & transformers aren't shipped by
    # default.
    try:
        import torch  # type: ignore
        import transformers  # type: ignore
    except ImportError:
        raise Exception(
            "Must have `torch` and `transformers` installed (not included in "
            "default installation).")

    def _get_classmap(path: str) -> UInt8[np.ndarray, "8"]:
        with open(os.path.join(path, "classes.yaml")) as f:
            meta = yaml.load(f, Loader=yaml.SafeLoader)

        orig = {k: i for i, k in enumerate(meta["original"])}
        classmap = np.zeros(len(orig), dtype=np.uint8)
        for i, classname in enumerate(sorted(meta["reduced"].keys())):
            for oldclass in meta["reduced"][classname]:
                classmap[orig[oldclass]] = i
        return classmap

    feature_extractor = (
        transformers.SegformerImageProcessor.from_pretrained(model))
    torch_model = transformers.SegformerForSemanticSegmentation.from_pretrained(
        model).to('cuda')  # type: ignore
    classmap = torch.from_numpy(_get_classmap(model)).to('cuda')

    def _apply_image(imgs):
        with torch.no_grad(), torch.amp.autocast('cuda'):  # type: ignore
            inputs = feature_extractor(  # type: ignore
                images=imgs, return_tensors='pt').to('cuda')
            outputs = torch_model(**inputs)
            classes_raw = torch.argmax(outputs.logits, dim=1)
        classes_reduced = classmap[classes_raw]
        return classes_reduced.cpu().numpy()

    camera = sensors.DynamicSensor(os.path.join(path, "camera"))

    _camera = sensors.DynamicSensor(
        os.path.join(path, "_camera"), create=True, exist_ok=True)
    output = _camera.create("segment", {
        "format": "lzmaf", "type": "u1", "shape": [160, 160],
        "desc": "Image segmentation with 640x640 resize and 4x downsample."})
    stream = camera["video.avi"].stream_prefetch(batch=batch)

    frame_stream = (
        _apply_image(frame) for frame in
        tqdm(stream, total=int(np.ceil(len(camera) / batch))))

    # frame_stream is already batched
    cast(channels.LzmaFrameChannel, output).consume(frame_stream, batch=0)