Hi @ale
We do have a `progress` command, but it reports the progress of one or more datasets over time.
There isn't currently a version that breaks it down by annotator. You can display stats for a particular dataset by running `prodigy stats {dataset_id}`,
but in the case of session/annotator-specific datasets you'd have to run this once for each annotator's dataset.
It's a very reasonable thing to want, so for now I've quickly wrapped this feature up as a custom command:
# progress.py
import os
from collections import Counter
from prodigy.components.db import Database, connect
from prodigy.core import Arg, recipe
from prodigy.errors import RecipeError
from wasabi import msg
def stats(set_id: str, DB: Database) -> None:
    """Print a summary table of the annotation decisions in one dataset.

    Counts the "accept" / "reject" / "ignore" answers found on the dataset's
    examples and prints them, together with the dataset's meta information,
    as a "Dataset Stats" table via ``wasabi``.

    Args:
        set_id: Name of the dataset to summarize.
        DB: Database handle used to load the examples and the meta.

    Raises:
        RecipeError: If the meta returned for ``set_id`` is not a dict.
    """
    # The return value is unused; presumably this call raises when the
    # dataset does not exist -- TODO confirm against the Database API.
    DB.get_dataset_by_name(set_id)
    examples = DB.get_dataset_examples(set_id)
    meta = DB.get_meta(set_id)
    if not isinstance(meta, dict):
        # Explicit error instead of `assert`, which is stripped under `-O`
        # and would otherwise surface as an unhelpful AssertionError.
        raise RecipeError(f"Can't load meta for dataset '{set_id}'")

    decisions = Counter()
    for eg in examples:
        if "answer" in eg:
            decisions[eg["answer"]] += 1
        elif "spans" in eg:
            # No example-level answer: fall back to answers stored on spans.
            for span in eg["spans"]:
                if "answer" in span:
                    decisions[span["answer"]] += 1

    msg.divider("Dataset Stats", icon="emoji")
    msg.table(
        {
            "Dataset": set_id,
            "Created": meta.get("created"),
            "Description": meta.get("description"),
            "Author": meta.get("author"),
            "Annotations": len(examples),
            # Counter returns 0 for answers that never occurred.
            "Accept": decisions["accept"],
            "Reject": decisions["reject"],
            "Ignore": decisions["ignore"],
        }
    )
@recipe(
    "stats.progress",
    dataset=Arg(help="Name of the dataset to report progress on."),
)
def progress(
    dataset: str,
):
    """Print per-session stats for the allowed sessions of ``dataset``.

    The allowed session names are read from the comma-separated environment
    variable ``PRODIGY_ALLOWED_SESSIONS`` (e.g. ``"bob,alex"``). For every
    session dataset of ``dataset`` whose last ``-``-separated segment is one
    of those names, a stats table is printed.

    Args:
        dataset: Name of the main dataset to report progress on.

    Raises:
        RecipeError: If ``PRODIGY_ALLOWED_SESSIONS`` is unset or contains no
            session names, or if ``dataset`` is not in the database.
    """
    # Validate the raw value *before* splitting: calling `.split` directly on
    # the result of `os.getenv` raises AttributeError when the variable is
    # unset, so the intended RecipeError was never reached.
    raw_sessions = os.getenv("PRODIGY_ALLOWED_SESSIONS", "")
    # Tolerate spaces around the commas ("bob, alex") and skip empty entries.
    allowed_sessions = {
        name.strip() for name in raw_sessions.split(",") if name.strip()
    }
    if not allowed_sessions:
        raise RecipeError(
            "Environment variable `PRODIGY_ALLOWED_SESSIONS` should be set"
        )
    DB = connect()
    if dataset not in DB:
        raise RecipeError(f"Can't find '{dataset}' in database {DB.db_name}")
    # NOTE(review): the session name is taken to be the last "-"-separated
    # segment of the session dataset name, so session names that themselves
    # contain "-" will not match -- confirm the naming scheme.
    filtered_session_datasets = [
        session_dataset
        for session_dataset in DB.get_dataset_sessions(dataset)
        if session_dataset.rsplit("-", 1)[-1] in allowed_sessions
    ]
    for set_id in filtered_session_datasets:
        stats(set_id, DB)
This piggybacks on the current `stats` command, but I think it does what you want. You can call it like this:
PRODIGY_ALLOWED_SESSIONS="bob,alex" python -m prodigy stats.progress {name_of_the_main_dataset} -F progress.py
That should display the stats for each annotator's dataset:
=============================== Dataset Stats ===============================
Dataset sunglasses_brands-bob
Created 2024-05-03 11:39:54
Description None
Author None
Annotations 6
Accept 6
Reject 0
Ignore 0
=============================== Dataset Stats ===============================
Dataset sunglasses_brands-alex
Created 2024-05-03 11:40:08
Description None
Author None
Annotations 6
Accept 6
Reject 0
Ignore 0