In case others like me come looking for a basic scoring recipe, here's what I cooked up.
It doesn't take a threshold into account, but it evaluates model accuracy on an existing dataset without re-training, and it can output either precision/recall/F-score or the standard Prodigy score scheme.
import spacy
import spacy.gold
import spacy.scorer
from prodigy.components.db import connect
from prodigy.core import recipe, recipe_args
from prodigy.models.ner import EntityRecognizer, merge_spans
from prodigy.util import log, prints
from prodigy.components.preprocess import split_sentences, add_tokens
def gold_to_spacy(dataset, spacy_model, biluo=False):
    # Adapted from the ner.gold-to-spacy recipe. The only change is that the
    # annotations are returned instead of printed or saved.
    DB = connect()
    examples = DB.get_dataset(dataset)
    examples = [eg for eg in examples if eg['answer'] == 'accept']
    if biluo:
        if not spacy_model:
            prints("Exporting annotations in BILUO format requires a spaCy "
                   "model for tokenization.", exits=1, error=True)
        nlp = spacy.load(spacy_model)
    annotations = []
    for eg in examples:
        entities = [(span['start'], span['end'], span['label'])
                    for span in eg.get('spans', [])]
        if biluo:
            doc = nlp(eg['text'])
            entities = spacy.gold.biluo_tags_from_offsets(doc, entities)
            annot_entry = [eg['text'], entities]
        else:
            annot_entry = [eg['text'], {'entities': entities}]
        annotations.append(annot_entry)
    return annotations
def evaluate_prf(ner_model, examples):
    # Source: https://stackoverflow.com/questions/44827930/evaluation-in-a-spacy-ner-model
    scorer = spacy.scorer.Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = spacy.gold.GoldParse(doc_gold_text, entities=annot['entities'])
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores
@recipe("ner.stats",
dataset=recipe_args["dataset"],
spacy_model=recipe_args["spacy_model"],
label=recipe_args["entity_label"],
isPrf=("Output Precsion, Recall, F-Score", "flag", "prf"))
def model_stats(dataset, spacy_model, label=None, isPrf=False):
"""
Evaluate model accuracy of model based on dataset with no training
inspired from https://support.prodi.gy/t/evaluating-precision-and-recall-of-ner/193/2
got basic model evaluation by looking at the batch-train recipe
"""
log("RECIPE: Starting recipe ner.stats", locals())
DB = connect()
nlp = spacy.load(spacy_model)
if(isPrf):
examples = gold_to_spacy(dataset, spacy_model)
score = evaluate_prf(nlp, examples)
print("Precision {:0.4f}\tRecall {:0.4f}\tF-score {:0.4f}".format(score['ents_p'], score['ents_r'], score['ents_f']))
else:
#ripped this from ner.batch-train recipe
model = EntityRecognizer(nlp, label=label)
evaldoc = merge_spans(DB.get_dataset(dataset))
evals = list(split_sentences(model.orig_nlp, evaldoc))
scores = model.evaluate(evals)
print("Accuracy {:0.4f}\tRight {:0.0f}\tWrong {:0.0f}\tUnknown {:0.0f}\tEntities {:0.0f}".format(scores['acc'], scores['right'],scores['wrong'],scores['unk'],scores['ents']))