Multi-stage speaker audio classification with `pyannote.sad.manual` and `audio_manual`

So I took your advice and went ahead and did the chunking myself: 30-second chunks in pyannote.sad.manual, where 30 seconds is the full length of each input audio file. I'm still running into issues, though. I've uploaded the verbose log from this run; the annotated examples and the recipe I'm using to classify them are below.
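
For reference, the examples saved in that SAD dataset look roughly like this - simplified, with placeholder values, and showing the span label as "SPEECH" just to stand in for whatever the SAD recipe writes. The base64 "audio" key appears to have been dropped on save, leaving only "path":

    # Approximate shape of one saved example from the pyannote.sad.manual dataset
    # (values are placeholders, not my actual data)
    {
        "path": "../data/audio_files/wav/recording_001.wav",
        "text": "recording_001",
        "audio_spans": [
            {"start": 2.4, "end": 5.9, "label": "SPEECH"},
            {"start": 9.1, "end": 14.3, "label": "SPEECH"},
        ],
        "answer": "accept",
    }

And the recipe itself: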

import copy
import os
from pathlib import Path
from typing import List

import prodigy
from prodigy.components.db import connect
from prodigy.components.loaders import Audio
from prodigy.components.preprocess import fetch_media
from prodigy.util import get_labels, log, msg

# HERE = os.getcwd()
# prodigy.log(f'Here: {HERE}')
# AUDIO_FOLDER = '../data/audio_files/wav/'
# prodigy.log(f'Audio folder: {AUDIO_FOLDER}')

def remove_base64(examples):
    """Remove base64-encoded string if "path" is preserved in example."""
    for eg in examples:
        if "audio" in eg and eg["audio"].startswith("data:") and "path" in eg:
            eg["audio"] = eg["path"]
        if "video" in eg and eg["video"].startswith("data:") and "path" in eg:
            eg["video"] = eg["path"]
    return examples


options = [
    {'id': 'FEM', 'text': 'Female'},
    {'id': 'MAL', 'text': 'Male'},
    {'id': 'CHI(1)', 'text': 'Child (single)'},
    {'id': 'CHI(2p)', 'text': 'Children (plural)'},
]

@prodigy.recipe(
    "post-sad-multiclass",
    dataset=("The dataset to read from", "option", "d", str),
    target=("The dataset to save result in", "option", "t", str),
    source=("Source dir containing audio files", "option", "s", str),
    label=("Comma-separated label(s)", "option", "l", get_labels),
    # quiet=("Don't output anything", "flag", "q", bool)
    )
def multiclass_audio(
        dataset: str,
        target: str,
        label: List[str],
        source: str):
    """
    """

    def get_stream():
        # Load the directory of audio files and add options to each task
        prodigy.log('Instantiating DB connection')
        db = connect()
        # Load your already annotated data
        prodigy.log(f'Connecting to DB {dataset}')
        examples = db.get_dataset(dataset)
        # prodigy.log(f'Labels to apply: {[o for o in options]}')
        for eg in examples:
            audio_spans = eg.get("audio_spans", [])
            for span in audio_spans:
                # Create a new example for each annotated span, so you
                # can select one category per span - make sure to deepcopy!
                new_eg = copy.deepcopy(eg)
                new_eg["audio_spans"] = [span]
                new_eg["options"] = options
                yield new_eg

    with open('C:/Users/tslade/projects/teacherprints/prodigy/multiclass-audio-template.html', 'r') as f:
        html_template = f.read()

    with open('C:/Users/tslade/projects/teacherprints/prodigy/timestretcher.js', 'r') as f:
        javascript = f.read()


    prodigy.log('Defining blocks')
    blocks = [
        {'view_id': 'html', 'html_template': html_template},
        {'view_id': 'audio_manual'},
    ]

    prodigy.log('Instantiating stream')
    stream = get_stream()  # your custom stream, see above
    stream = fetch_media(stream, ["audio"])  # replace all "audio" keys with base64

    return {
        'dataset': target,
        'stream': stream,
        'view_id': 'blocks',
        'config': {
            'blocks': blocks,
            'javascript': javascript,
            'audio_autoplay': False,
            'audio_bar_gap': 0,
            'audio_bar_height': 2,
            'audio_bar_radius': 1,
            'audio_bar_width': 1,
            'audio_loop': False,
            'audio_max_zoom': 5000,
            'audio_rate': 1.0,
            'show_audio_cursor': True,
            'show_audio_cursor_time': True,
            'show_audio_minimap': True,
            'show_audio_timeline': True,
            'force_stream_order': True,
            'labels': ['FEM', 'MAL', 'CHI(1)', 'CHI(2p)'],
            'custom_theme': {
                'labels': {
                    'FEM': '#84E9F3',
                    'MAL': '#4E6BF3',
                    'CHI(1)': '#F2B11A',
                    'CHI(2p)': '#852215',
                }
            }
        }
    }
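
For completeness, I'm invoking the recipe along these lines (the dataset names and the recipe filename here are placeholders):

    prodigy post-sad-multiclass -d sad_annotations -t speaker_classes -s ../data/audio_files/wav/ -l "FEM,MAL,CHI(1),CHI(2p)" -F post_sad_multiclass.py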

No waveform appears, which suggests to me that the media isn't being loaded properly. Checking the console, I see two errors; the latter is related to the timestretcher function, which looks for a wavesurfer instance as we discussed in the "variable audio_rate for audio annotation support" thread.


post-sad-multiclass-verbose-log.html (43.4 KB)

It appears from where the code is breaking that perhaps the window.wavesurfer object isn't getting created:

[console screenshot]

...and indeed, that object is not available in the console.

But if I forgo the custom JS, the upstream problem remains:

[console screenshot]

...and it appears to be related to the fetch_media(stream, ["audio"]) call not working properly:

    stream = get_stream()  # your custom stream, see above
    stream = fetch_media(stream, ["audio"])  # replace all "audio" keys with base64

That code was your suggestion, @ines, but I wasn't able to understand it - when I troubleshot by iterating through the stream and printing each task to the console, I didn't see an "audio" key in the dict. I do see a "path" key, and it contains the path to the audio file referenced by the annotation... but if I change the code to instead be

    stream = get_stream()  # your custom stream, see above
    stream = fetch_media(stream, ["path"])  # replace all "audio" keys with base64

I don't have any luck either. And I've been unable to inspect the source code for the Audio(source) loader or the fetch_media() function to understand how else the input they receive could be structured.
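
For reference, this is roughly how I troubleshot - iterating over the stream inside the recipe and printing each task to the console (simplified):

    # Rough version of the debugging loop I used inside the recipe:
    # print the keys of the first few tasks to see whether "audio" is present
    for i, eg in enumerate(get_stream()):
        print(list(eg.keys()))  # "path", "audio_spans", "options", ... but no "audio"
        print(eg.get("path"))   # the path to the .wav file is there and looks correct
        if i >= 2:
            break

Is the intended fix simply to copy that path into an "audio" key inside get_stream, before calling fetch_media? Something like this (just a guess on my part, untested):

    # Guess: point an "audio" key at the saved file path, so that
    # fetch_media(stream, ["audio"]) has something it can convert to base64
    def get_stream():
        db = connect()
        examples = db.get_dataset(dataset)
        for eg in examples:
            for span in eg.get("audio_spans", []):
                new_eg = copy.deepcopy(eg)
                new_eg["audio_spans"] = [span]
                new_eg["options"] = options
                new_eg["audio"] = new_eg["path"]  # is this the right approach?
                yield new_eg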