"No tasks available" error and S3 loader for custom recipe

Hello Ines,

I was trying to implement the S3 loader function in our custom recipe but didn't quite succeed; maybe I am doing something wrong.
We are also running into another problem: "No tasks available" in our streams.

Here's a segment of our custom recipe; I would appreciate it if you could give me a hand with it:

import base64
import json
import os
import random
from datetime import datetime as dt

import prodigy
from prodigy.util import split_string
from prodigy.components.db import connect

global total_annotations_since_server_start
total_annotations_since_server_start = 0
global total_files


def get_priority_videos():
    curr_path = os.path.dirname(__file__)
    with open(os.path.join(curr_path, "priority_videos.txt"), "r") as pv:
        p_videos = pv.readlines()
    return p_videos


def split_csv(_in_string):
    return _in_string.split(",")


def get_deny_list(source):
    deny_list = []

    for answer in ["accept", "ignore"]:
        answer_file_name = os.path.join(source, "annotations",
                                        f"{answer}.jsonl")

        if not os.path.exists(answer_file_name):
            continue

        with open(answer_file_name) as fp:
            for line in fp:
                annotation = json.loads(line)
                file_path = os.path.normpath(annotation["meta"]["path"])
                deny_list.append(file_path)

    return deny_list


def get_all_files(source, pv_flag):
    all_files = []
    deny_list = get_deny_list(source)

    pv = None
    if pv_flag:
        try:
            pv = get_priority_videos()
            pv = [n.split("/")[-1].lstrip().rstrip() for n in pv]
            print(f"Found list of {len(pv)} priority videos")
        except Exception as e:
            print("Priority flag ON but no priority videos found...")

    filtered_videos = []
    for path, dir, files in os.walk(source):
        for _file in files:
            if _file.endswith(".mp4"):
                # If priority videos exist, only add those
                if pv is not None:
                    if "_".join(_file.split("_")[:-1]).lstrip().rstrip() not in pv:
                        filtered_videos.append(_file)
                        continue
                file_path = os.path.join(path, _file)
                if file_path not in deny_list:
                    all_files.append(file_path)

    random.shuffle(all_files)
    print(f"Filtered {len(filtered_videos)} videos and added {len(all_files)} videos to queue for labeling...")
    return all_files


def load_filenames(all_files, review_ds):
    """

    Args:
        all_files:  A list if all pathnames to the data files
        review_ds: None in normal mode, in review mode, [dict(file_name:spans)..]

    Returns:

    """
    for _file in all_files:
        if _file.endswith(".mp4"):
            with open(_file, "rb") as image_file:
                encoded_string = 'data:video/mp4;base64,' + base64.b64encode(
                    image_file.read()).decode()
            if review_ds is not None:
                spans = review_ds[_file]
                yield dict(video=encoded_string,
                           text=_file.split(".mp4")[1],
                           meta=dict(path=_file.split(".mp4")[0]), file=_file.split(".mp4")[1], audio_spans=spans)
            else:
                yield dict(video=encoded_string,
                           text=_file.split(".mp4")[1],
                           meta=dict(path=_file.split(".mp4")[0]), file=_file.split(".mp4")[1])


def get_stream(all_files, review_ds=None):
    while True:
        stream = load_filenames(all_files, review_ds)
        return stream


@prodigy.recipe(
    "audio_custom.manual",
    dataset=("The dataset to use", "positional", None, str),
    source=("Path to a directory of images", "positional", None, str),
    pv_flag=("0 if priority videos are off 1 if ON", "option", "pv_flag", int),
    label=("One or more comma-separated labels", "option", "l", split_string),
    review=("JSONL file to review", "option", "r", str),
)
def audio_recipe(dataset, source, label, pv_flag, review):
    if review is None:
        all_files = get_all_files(source, pv_flag)
        all_spans = None
    else:
        all_files = []
        all_spans = dict()
        try:
            with open(review, "r") as jsonl_file:
                jsonl_data = jsonl_file.readlines()
                for line in jsonl_data:
                    json_data = json.loads(line)
                    file_name = json_data["file"]
                    audio_spans = json_data["audio_spans"]
                    all_files.append(file_name)
                    all_spans[file_name] = audio_spans
        except Exception as e:
            print(f"Error opening jsonl file : {e}")

    global total_files
    total_files = len(all_files)

    def before_db(examples):
        for eg in examples:
            if "video" in eg.keys():
                del eg['video']
            # Timestamp to keep track of when annotations were done
            eg["ts"] = dt.now().strftime("%Y-%m-%d-%H")
        return examples

Thank you

Hi, could you share the dictionary returned by your recipe as well?

The return stream in your get_stream helper looks potentially suspicious – I think you might want yield from stream here?

In general, "no tasks available" is shown if there are no valid examples with hashes that are not yet present in the current dataset. If you run Prodigy with PRODIGY_LOGGING=basic, you'll see more details about what's going on under the hood and whether examples are skipped. It can also be helpful to print an example to see if the JSON format is right.

Hey Ines,

Yes, that's one entry from my jsonl file:

224:{"text":"","meta":{"path":"/home/ubuntu/SCH_P1/SCH_2019.04.20/2019.04.20T20.50.12/videos/input_0_video_synced/input_0_video_synced_1"},"file":"","_input_hash":616230852,"_task_hash":701119597,"_session_id":"audio_prod_HDD_data-PC","_view_id":"blocks","audio_spans":[{"start":-0.0100006104,"end":1.4399993896,"label":"LEAD_SINGER_M","id":"12218993-8f43-4661-bb5c-510c942411cb","color":"rgba(255,215,0,0.2)"},{"start":27.6799995422,"end":30.0299995422,"label":"LEAD_SINGER_M","id":"00c7f673-4475-452b-bfee-14d04ca54c44","color":"rgba(255,215,0,0.2)"},{"start":33.3899993896,"end":36.3899993896,"label":"LEAD_SINGER_M","id":"a8bbe1b1-44f5-4a96-be40-41df3da9d5bb","color":"rgba(255,215,0,0.2)"},{"start":39.1599990845,"end":41.8099990845,"label":"LEAD_SINGER_M","id":"6ac43737-025b-455f-a09f-f85dddb6eb34","color":"rgba(255,215,0,0.2)"},{"start":44.7600006104,"end":47.7600006104,"label":"LEAD_SINGER_M","id":"c87b68fd-959f-4b7b-b939-8280245cb735","color":"rgba(255,215,0,0.2)"},{"start":51.6699981689,"end":56.0699981689,"label":"LEAD_SINGER_M","id":"2da33ae9-6d33-482b-9180-2b549be65061","color":"rgba(255,215,0,0.2)"},{"start":57.4699981689,"end":62.0199981689,"label":"LEAD_SINGER_M","id":"f9751416-a2fa-444a-b659-5bcdadbb581c","color":"rgba(255,215,0,0.2)"},{"start":63.0799987793,"end":67.4799987793,"label":"LEAD_SINGER_M","id":"a9cbf266-ab0f-47c5-a0a4-ca028c9d7902","color":"rgba(255,215,0,0.2)"},{"start":68.8299987793,"end":71.3799987793,"label":"LEAD_SINGER_M","id":"e5481bc6-144f-46a9-ac13-e70e84da541d","color":"rgba(255,215,0,0.2)"}],"answer":"ignore","ts":"2020-11-23-18"}

Also the 224 number is kinda weird.

Thanks again.

Is this the file you're loading in? Because if that has the numbers on each line, that's likely the problem, because it means every line is invalid JSON and will be skipped. Maybe something went wrong when you converted or exported the data?
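Just to illustrate with a shortened line:

import json

line = '224:{"text": "", "answer": "ignore"}'
try:
    json.loads(line)                      # fails: the leading "224:" makes the line invalid JSON
except json.JSONDecodeError as err:
    print("skipped:", err)
print(json.loads(line.split(":", 1)[1]))  # parses fine once the prefix is stripped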

No, that's one entry from the jsonl file we get when we dump the database to a file.

Could you share the code or command you're using for that?

python3 -m prodigy audio_custom.manual -pv_flag 0 prodigy_audio_db /home/ubuntu/HDD_data/ --label LEAD_SINGER_M -F recipe/audio_recipe.py

Does replacing return stream with yield from stream in the recipe above do the job?

Yes, you should definitely use yield from if you're working with generators.
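Based on the helper in your snippet, that would look roughly like this:

def get_stream(all_files, review_ds=None):
    while True:
        stream = load_filenames(all_files, review_ds)
        # yield from passes the examples through one by one
        # instead of returning the generator object itself
        yield from stream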

My question above was referring to the command or workflow you use to export the data to JSON, since this is what seems to include the confusing results (the number etc.).
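For reference, exporting with db-out, or reading the dataset via the database API, should give you plain JSON records with no leading numbers. A rough sketch, assuming the dataset name from your command:

from prodigy.components.db import connect
import json

db = connect()
examples = db.get_dataset("prodigy_audio_db")  # list of task dicts as stored in the database
with open("annotations.jsonl", "w") as f:
    for eg in examples:
        f.write(json.dumps(eg) + "\n")         # one valid JSON object per line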
