Hi! This isn't quite the direct loss-based evaluation you're asking about, but it's a recipe we're thinking about including in Prodigy at some point in the future. Specifically for reviewing NER data, this recipe sorts examples by the F1 score calculated per example on the provided evaluation set. It then uses the new Overlapping Spans UI to show the difference between the annotated entities and the predicted entities. Let us know if you find it useful!
If you specifically want to sort by loss, have a look at the spaCy NER model implementation. It's definitely doable, and feel free to respond to this thread if you figure it out.
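As a rough, untested sketch of that direction (assuming a spaCy v3 pipeline with a trained "ner" component; loss_per_example is a made-up helper name): calling the component's update method with sgd=None computes the loss for an example without applying a weight update.

import spacy
from spacy.training import Example

nlp = spacy.load("./trained_spacy_ner_model")
ner = nlp.get_pipe("ner")

def loss_per_example(annotated):
    # annotated: iterable of (text, {"entities": [(start, end, label)]})
    scored = []
    for text, annots in annotated:
        example = Example.from_dict(nlp.make_doc(text), annots)
        losses = {}
        # sgd=None means the loss is computed but no weights are changed
        ner.update([example], sgd=None, losses=losses)
        scored.append((losses["ner"], text))
    # Highest loss first, i.e. the "hardest" examples at the top
    return sorted(scored, key=lambda x: x[0], reverse=True)

Keep in mind that update still runs a backward pass, so this will be slower than plain inference.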
CLI Usage
To understand what your model has learned, run this on your evaluation set:
prodigy ner.review_hardest my_dataset ./trained_spacy_ner_model /path/to/evaluation_set.jsonl -F path/to/this/recipe_file.py
To help identify inconsistencies or errors in your training data, run this on your training set:
prodigy ner.review_hardest my_dataset ./trained_spacy_ner_model /path/to/training_set.jsonl -F path/to/this/recipe_file.py
Recipe Definition
from typing import List, Optional, Union, Iterable
import copy

import spacy
from spacy.language import Language
from spacy.training import Example
from spacy.tokens.doc import Doc
from spacy.util import get_words_and_spaces

from prodigy.components.preprocess import add_tokens
from prodigy.components.loaders import get_stream
from prodigy.core import recipe
from prodigy.util import log, split_string, get_labels, msg
from prodigy.types import StreamType, RecipeSettingsType
@recipe(
"ner.review_hardest",
# fmt: off
dataset=("Dataset to save annotations to", "positional", None, str),
    spacy_model=("Loadable spaCy pipeline with a trained NER component", "positional", None, str),
source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
loader=("Loader (guessed from file extension if not set)", "option", "lo", str),
label=("Comma-separated label(s) to annotate or text file with one label per line", "option", "l", get_labels),
exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string),
    score_key=("Score key to sort examples by (default: ents_f)", "option", "sk", str),
# fmt: on
)
def review_hardest(
dataset: str,
spacy_model: str,
source: Union[str, Iterable[dict]],
loader: Optional[str] = None,
label: Optional[List[str]] = None,
exclude: Optional[List[str]] = None,
score_key: str = "ents_f",
) -> RecipeSettingsType:
log("RECIPE: Starting recipe ner.review_hardest", locals())
nlp = spacy.load(spacy_model)
    labels = label  # get_labels has already parsed this into a list
if not labels:
labels = nlp.pipe_labels.get("ner", [])
if not labels:
msg.fail("No --label argument set and no labels found in model", exits=1)
msg.text(f"Using {len(labels)} labels from model: {', '.join(labels)}")
log(f"RECIPE: Annotating with {len(labels)} labels", labels)
stream = get_stream(source, loader=loader, rehash=True, input_key="text")
    # Score each example against its gold annotations and add the model's
    # predicted entities as extra spans
    def get_tasks(nlp: Language, stream: StreamType) -> StreamType:
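        # Pair each text with its original task dict so nlp.pipe yields the
        # model's prediction alongside the raw example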
tuples = ((eg["text"], eg) for eg in stream)
scored_tasks = []
for pred, eg in nlp.pipe(tuples, as_tuples=True):
combined = copy.deepcopy(eg)
            # Copy the gold spans so we don't mutate the incoming task
            spans = list(eg.get("spans", []))
if "tokens" in eg:
tokens = [token["text"] for token in eg["tokens"]]
words, spaces = get_words_and_spaces(tokens, eg["text"])
reference = Doc(nlp.vocab, words=words, spaces=spaces)
else:
reference = nlp.make_doc(eg["text"])
            ents = [reference.char_span(s["start"], s["end"], label=s["label"]) for s in spans]
            # char_span returns None when offsets don't align with token
            # boundaries; drop those instead of letting reference.ents crash
            reference.ents = [ent for ent in ents if ent is not None]
            for ent in pred.ents:
                # Use a distinct name so we don't shadow the label argument
                ent_label = ent.label_
                if not labels or ent_label in labels:
                    spans.append({
                        "start": ent.start_char,
                        "end": ent.end_char,
                        "label": f"{ent_label}:PREDICTED",
                    })
            combined["spans"] = sorted(spans, key=lambda s: s["start"])
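            # Score this single example: Example pairs the model's doc (pred)
            # with the gold-standard doc (reference)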
            if combined["spans"]:
                example_scores = nlp.get_pipe("ner").score([Example(pred, reference)])
                score = example_scores.get(score_key)
                if score is not None:
                    scored_tasks.append((score, combined))
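        # Ascending sort puts the lowest-scoring (hardest) examples first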
sorted_tasks = sorted(scored_tasks, key=lambda x: x[0])
for score, task in sorted_tasks:
if "meta" not in task:
task["meta"] = {}
task["meta"][score_key] = round(score, 3)
yield task
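    # Note: get_tasks has to score and sort every example, so it consumes
    # the entire source before yielding the first task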
stream = get_tasks(nlp, stream)
stream = add_tokens(nlp, stream)
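    # Drop the model's :PREDICTED spans before saving, so only the reviewed
    # gold annotations end up in the dataset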
def before_db(answers):
for eg in answers:
filtered_spans = [span for span in eg.get("spans", []) if not span["label"].endswith(":PREDICTED")]
eg["spans"] = filtered_spans
return answers
return {
"view_id": "spans_manual",
"dataset": dataset,
"stream": stream,
"exclude": exclude,
"before_db": before_db,
"config": {
"lang": nlp.lang,
"labels": labels,
"exclude_by": "input",
"auto_count_stream": True,
"overlapping_spans": True,
},
}
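For reference, a task emitted by the recipe ends up looking roughly like this (hypothetical text, offsets and score; the "tokens" added by add_tokens are omitted). The before_db callback then strips the :PREDICTED spans again, so only your reviewed annotations are saved:

{
    "text": "Apple hired Tim Cook",
    "spans": [
        {"start": 0, "end": 5, "label": "ORG"},            # gold annotation
        {"start": 0, "end": 5, "label": "ORG:PREDICTED"},   # model prediction
        {"start": 12, "end": 20, "label": "PERSON:PREDICTED"},
    ],
    "meta": {"ents_f": 0.667},  # precision 0.5, recall 1.0 on this example
}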