fetch_media and stream.apply

I am wondering why stream.apply(fetch_media, stream) (or fetch_task_media) leads to a front-end fetch exception while loading audio. That is,
stream = stream.apply(fetch_media, stream) will lead to the following error:
Uncaught (in promise) TypeError: Window.fetch: data:audio/mp3;base64, ...

In short, it seems to fetch the audio data even when it has been converted to a base64 URI.

To give you more background, I have two different audio sources — AWS S3 and non-S3 (http) endpoints. Here is how I am loading the audio files as base64 data URIs. However, this leads to Uncaught (in promise) TypeError: Window.fetch: data:audio/mp3;base64 for the http endpoints. Any suggestions for restructuring?

# Task key that holds the audio reference: the fetch helpers read the URL/path
# stored under it and overwrite it with the encoded base64 URI.
_AUDIO_DATA_KEY = "audio"

def fetch_s3_audio(eg: TaskType) -> TaskType:
    """
    Swap the task's S3 audio reference for its base64 URI.

    The value under the audio key is opened as an ``S3Path``; the MIME type
    is guessed from the object's basename and the object's bytes are encoded
    with ``bytes_to_b64``. The task is updated in place and returned.
    """
    s3_object = S3Path(eg[_AUDIO_DATA_KEY])
    # guess_type returns (type, encoding); only the type is needed here.
    guessed_type = mimetypes.guess_type(s3_object.basename)[0]
    payload = s3_object.read_bytes()
    eg[_AUDIO_DATA_KEY] = bytes_to_b64(payload, guessed_type)
    return eg


def fetch_audio(eg: TaskType) -> TaskType:
    """
    Return the task with its audio reference replaced by a base64 data URI.

    S3 resources (as decided by ``is_s3_resource``) are handled by
    ``fetch_s3_audio``; everything else is delegated to ``fetch_task_media``.
    """
    if is_s3_resource(eg[_AUDIO_DATA_KEY]):
        return fetch_s3_audio(eg)
    return fetch_task_media(eg, _AUDIO_DATA_KEY)

def fetch_resources(stream):
    """
    Lazily walk ``stream`` and yield every task after its resources have
    been inlined as base64 data URIs (currently: audio via ``fetch_audio``).
    """
    for task in stream:
        # fetch other resources ...
        yield fetch_audio(task)

@prodigy.recipe(
    "test",
    dataset=Arg(help="Dataset to use"),
    source=Arg("--source", "-s", help="Source"),
)
def annotate_audio(dataset: str, source: str):
    """Recipe sketch: build the stream for ``source`` with ``get_stream``,
    run ``fetch_resources`` over it through ``stream.apply`` and return the
    recipe components (``dataset`` and ``stream``; the ``...`` lines stand
    for parts left out of the snippet).
    """

    stream = get_stream(source)
    # NOTE(review): this rebinds `stream` to whatever `stream.apply` returns
    # and also passes `stream` positionally - confirm against the Stream.apply
    # documentation that it returns the stream and expects this argument.
    stream = stream.apply(fetch_resources, stream)
		...
		
    return {
			"dataset": dataset,
			"stream": stream,
			...
		}

I have done more testing to identify the potential cause. The following code (based on a previous question) causes Uncaught (in promise) TypeError: Window.fetch: data:audio/mp3;base64 in Firefox. However, there is no error in Google Chrome. Is there a known issue with fetch_media in Firefox?

# the recipe audio_multiple_options.py
from pathlib import Path
from typing import Union

import jinja2
import prodigy
from prodigy import set_hashes
from prodigy.components.preprocess import \
    fetch_media as fetch_media_preprocessor
from prodigy.components.stream import get_stream
from prodigy.util import msg



@prodigy.recipe(
    "multiple.choice",
    dataset=("The dataset to use", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
)
def multiple_choice(
    dataset: str,
    source: str,
):
    """Annotate audio tasks from a JSONL file in a "blocks" interface.

    The stream is loaded with ``get_stream`` (JSONL loader, ``audio`` input
    key), passed through the ``fetch_media`` preprocessor for the ``audio``
    and ``video`` keys, and every example is re-hashed with ``set_hashes``.
    The UI combines an ``audio`` block and an ``html`` block.

    dataset (str): Name of the dataset to save annotations to.
    source (str): Path to the JSONL source file.
    RETURNS (dict): The recipe components.
    """
    stream = get_stream(source, loader="jsonl", input_key="audio")
    stream.apply(fetch_media_preprocessor, input_keys=["audio", "video"])
    labels = ["Label1", "Label2", "Label3"]

    def add_template(stream, labels):
        # `labels` is accepted but not used by this reduced version; the
        # examples are only (re)hashed before being sent out.
        for ex in stream:
            yield set_hashes(ex)

    # custom_js = Path("custom.js").read_text()

    def before_db(examples):
        """Strip UI-only data from the answers before they are stored."""
        for ex in examples:
            # Nothing in this recipe adds an "html" key, so a plain `del`
            # would raise KeyError; drop the key only when it is present.
            ex.pop("html", None)
            # Store the original location instead of the (large) data URI
            # whenever the example carries a "path" entry.
            if "audio" in ex and ex["audio"].startswith("data:") and "path" in ex:
                ex["audio"] = ex["path"]
        return examples

    return {
        "view_id": "blocks",
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": add_template(stream, labels),  # Incoming stream of examples
        "config": {
            "blocks": [
                {"view_id": "audio"},
                {"view_id": "html"},
            ],
        },
        "before_db": before_db,
    }

I used the following .jsonl file for testing:

{"audio": "/tmp/test.mp3"}

Welcome to the forum @ft8c3 :wave:

I was going to suggest that this fetch error might happen if the base64 string is invalid or the browser doesn't support the specific audio format or encoding method being used.
That said, I just tested with an mp3 from a local path as input in Firefox (132.0.2 (aarch64)) and it worked fine. I'm also not aware of any Firefox-specific issue related to the audio encoding used in Prodigy.

One difference I'm seeing when testing with mp3 is the MIME type: I'm getting mpeg while you're getting mp3 when running the same encoding function. Not sure what the reason is - it might be server configuration, OS filesystem, file signatures etc.
Maybe you could try normalizing the MIME type to mpeg and see if that helps?

def normalize_audio_mimetype(mimetype: "str | None") -> "str | None":
    """Normalize audio MIME types to standard formats.

    The non-standard MP3 aliases ``audio/mp3`` and ``audio/x-mp3`` are mapped
    to ``audio/mpeg``. Matching ignores case and surrounding whitespace, and
    any parameters after ``;`` (as found in Content-Type headers) are kept.

    mimetype (str or None): The MIME type to normalize.
    RETURNS (str or None): The normalized type. Other types, the empty
        string and ``None`` are returned unchanged.
    """
    # mimetypes.guess_type returns None for unknown extensions; pass that
    # through instead of failing on `.lower()`.
    if not mimetype:
        return mimetype
    # Compare only the "type/subtype" part so parameters don't hide an alias.
    essence, separator, parameters = mimetype.partition(";")
    if essence.strip().lower() in ("audio/mp3", "audio/x-mp3"):
        return "audio/mpeg" + separator + parameters
    return mimetype

Here's the implementation of fetch_task_media with this normalization added for URL and file inputs:

import mimetypes
def fetch_task_media(eg: TaskType, input_key: str, skip: bool = False) -> TaskType:
    """Replace all paths and URLs in a stream with base64 data URIs. The
    `skip` keyword argument lets you specify whether to skip invalid files
    that can't be converted (e.g. because the path doesn't exist, the URL
    can't be fetched, or the media type can't be determined). If set to
    `False`, Prodigy will raise a `ValueError` if it encounters invalid files.
    The input task is never modified; a deep copy is returned.

    eg (dict): The annotation task.
    input_key (str): Task key containing the media, e.g. to 'image'.
    skip (bool): Skip examples with files that can't be fetched.
    RETURNS (dict): The example with fetched data URIs. For converted media
        the original URL or path is kept under the "path" key.
    """
    eg = copy.deepcopy(eg)
    media = eg[input_key]
    if media.startswith("data:"):  # valid base64-encoded data URI
        return eg
    is_url = media.startswith(("http://", "https://"))
    data = None
    if is_url:
        # A streamed response holds its connection until it is closed, so
        # use it as a context manager to release it on every exit path.
        with requests.get(media, stream=True) as r:
            if r.status_code != 200:
                if not skip:
                    raise ValueError(f"Can't download media: {input_key}")
                log(f"PREPROCESS: Skipping media (status {r.status_code}): {media}", eg)
                return eg
            # The Content-Type header is optional; fall back to guessing the
            # type from the URL instead of failing with a KeyError.
            raw_type = r.headers.get("content-type") or mimetypes.guess_type(media)[0]
            data = r.content
    elif Path(media).exists():  # file is local path
        raw_type, _ = mimetypes.guess_type(media)
    elif not skip:
        err = f"Invalid '{input_key}' - doesn't seem to be a data URI, URL or local path: {media}"
        raise ValueError(err)
    else:
        log(f"PREPROCESS: Skipping '{input_key}' (no data URI, URL or path)", media)
        return eg
    if not raw_type:
        # Without a media type no usable data URI can be built, so treat the
        # file like any other invalid input and honour `skip`.
        if not skip:
            raise ValueError(f"Can't determine media type for '{input_key}': {media}")
        log(f"PREPROCESS: Skipping '{input_key}' (unknown media type)", media)
        return eg
    mimetype = normalize_audio_mimetype(raw_type)
    if is_url:
        eg[input_key] = bytes_to_b64(data, mimetype)
    else:
        eg[input_key] = file_to_b64(Path(media), mimetype)
    eg["path"] = media
    return eg

Alternatively, as a workaround, if your audio files aren't too large, you could serve them directly from your server rather than using base64 encoding.