Checking the progress of different annotators

Hi,

Is there a way to see the progress of different annotators for a specific dataset? Something like pgy progress that shows the number of accepted, rejected, and ignored annotations by each annotator when using PRODIGY_ALLOWED_SESSIONS?

Thanks

Hi @ale
We do have a progress command, but it reports the progress of one or more datasets over time.
There isn't currently a version that breaks it down by annotator. You can display stats for a particular dataset by running prodigy stats {dataset_id}, but in the case of session/annotator-specific datasets you'd have to run this for each of the annotators' datasets.

It's a very reasonable thing to have so for now I quickly wrapped up this feature as a custom command:

# progress.py
import os
from collections import Counter

from prodigy.components.db import Database, connect
from prodigy.core import Arg, recipe
from prodigy.errors import RecipeError
from wasabi import msg


def stats(set_id: str, DB: Database) -> None:
    """Print a summary table of annotation decisions for one dataset.

    Loads the examples and the meta of ``set_id`` from ``DB``, tallies the
    ``"answer"`` values, and prints a "Dataset Stats" divider followed by a
    table holding the dataset's meta fields, the number of examples, and the
    accept / reject / ignore counts.

    An example with a top-level ``"answer"`` contributes that answer once.
    An example without one, but with ``"spans"``, contributes the
    ``"answer"`` of every span that carries one.

    Args:
        set_id: Name of the dataset to summarise.
        DB: Database handle the examples and meta are read from.

    Raises:
        AssertionError: If the meta returned for ``set_id`` is not a dict.
    """
    # The return value is unused; presumably this call is here so that an
    # unknown dataset name fails early -- TODO confirm against the DB API.
    DB.get_dataset_by_name(set_id)
    records = DB.get_dataset_examples(set_id)
    info = DB.get_meta(set_id)
    total = len(records)

    tally = Counter()
    for record in records:
        if "answer" in record:
            tally[record["answer"]] += 1
            continue
        if "spans" not in record:
            continue
        # No example-level answer: fall back to per-span answers.
        tally.update(
            span["answer"] for span in record["spans"] if "answer" in span
        )

    # Narrow the type of the meta before reading fields from it.
    assert isinstance(info, dict)

    def pretty(label: str) -> str:
        # "dataset_stats" -> "Dataset Stats"
        return label.replace("_", " ").title()

    summary = {
        "dataset": set_id,
        "created": info.get("created"),
        "description": info.get("description"),
        "author": info.get("author"),
        "annotations": total,
        "accept": tally["accept"],
        "reject": tally["reject"],
        "ignore": tally["ignore"],
    }

    msg.divider(pretty("dataset_stats"), icon="emoji")
    msg.table(
        {
            pretty(field).replace("Spacy", "spaCy"): value
            for field, value in summary.items()
        }
    )


@recipe(
    "stats.progress",
    dataset=Arg(help="Name of the dataset to report progress on."),
)
def progress(
    dataset: str,
):
    """Print stats for each allowed annotator's session dataset.

    Reads the comma-separated annotator names from the
    ``PRODIGY_ALLOWED_SESSIONS`` environment variable, looks up the session
    datasets linked to ``dataset``, and calls ``stats`` on every session
    dataset that belongs to one of those annotators.

    Args:
        dataset: Name of the main dataset to report progress on.

    Raises:
        RecipeError: If ``PRODIGY_ALLOWED_SESSIONS`` is unset or contains no
            names, or if ``dataset`` is not found in the database.
    """
    # Read the variable *before* splitting: calling `.split` directly on the
    # result of `os.getenv` raises AttributeError when the variable is unset,
    # which made the intended RecipeError below unreachable.
    raw_sessions = os.getenv("PRODIGY_ALLOWED_SESSIONS")
    # Tolerate spaces around names ("bob, alex") and stray/trailing commas.
    allowed_sessions = {
        name.strip() for name in (raw_sessions or "").split(",") if name.strip()
    }
    if not allowed_sessions:
        raise RecipeError(
            "Environment variable `PRODIGY_ALLOWED_SESSIONS` should be set"
        )
    DB = connect()
    if dataset not in DB:
        raise RecipeError(f"Can't find '{dataset}' in database {DB.db_name}")

    prefix = f"{dataset}-"

    def belongs_to_allowed_session(session_dataset: str) -> bool:
        # Session datasets are named "<dataset>-<session>". Comparing the
        # whole remainder after the prefix also matches session names that
        # contain a hyphen themselves (e.g. "mary-ann").
        if (
            session_dataset.startswith(prefix)
            and session_dataset[len(prefix):] in allowed_sessions
        ):
            return True
        # Original rule, kept for backward compatibility: match on the part
        # after the last hyphen.
        return session_dataset.split("-")[-1] in allowed_sessions

    session_datasets = DB.get_dataset_sessions(dataset)
    filtered_session_datasets = [
        session_dataset
        for session_dataset in session_datasets
        if belongs_to_allowed_session(session_dataset)
    ]
    for set_id in filtered_session_datasets:
        stats(set_id, DB)

This piggybacks on the current stats command, but I think it does what you want. You can call it like this:

PRODIGY_ALLOWED_SESSIONS="bob,alex" python -m prodigy stats.progress {name_of_the_main_dataset} -F progress.py

That should display the stats for each annotator's dataset:

=============================== Dataset Stats ===============================

Dataset       sunglasses_brands-bob
Created       2024-05-03 11:39:54
Description   None
Author        None
Annotations   6
Accept        6
Reject        0
Ignore        0


=============================== Dataset Stats ===============================

Dataset       sunglasses_brands-alex
Created       2024-05-03 11:40:08
Description   None
Author        None
Annotations   6
Accept        6
Reject        0
Ignore        0