Hi Ines,
thanks for your reply.
I checked with our collaborators, and they told me they are running a less recent version of prodigy (1.10.ish).
I had my colleague send over a copy of his ner.py file (I pasted it below).
They tried to include the function you suggsted but it doesn't appear to work. They tried pasting the function at the very end of the silver-to-gold recipe, but also before the 'return' as in the posted example, but neither of these seem to work.
I am sorry if this seems trivial, could you please point us towards the right way to implement your code?
thanks a lot
Valerio
@recipe(
"ner.silver-to-gold",
# fmt: off
dataset=("Dataset to save annotations to", "positional", None, str),
silver_sets=("Comma-separated datasets to convert", "positional", None, split_string),
spacy_model=("Loadable spaCy model with an entity recognizer", "positional", None, str),
label=("Comma-separated label(s) to annotate or text file with one label per line", "option", "l", get_labels),
# fmt: on
)
def silver_to_gold(
dataset: str,
silver_sets: List[str],
spacy_model: str,
label: Optional[List[str]] = None,
) -> Dict[str, Any]:
"""
Take existing "silver" datasets with binary accept/reject annotations,
merge the annotations to find the best possible analysis given the
constraints defined in the annotations, and manually edit it to create
a perfect and complete "gold" dataset.
"""
def filter_stream(stream: Iterable[dict]) -> Iterable[dict]:
# make_best uses all labels in the model, so we need to filter by label here
for eg in stream:
eg["spans"] = [s for s in eg.get("spans", []) if s["label"] in labels]
yield eg
log("RECIPE: Starting recipe ner.silver-to-gold", locals())
DB = connect()
data = []
for set_id in silver_sets:
if set_id not in DB:
msg.fail(f"Can't find input dataset '{set_id}' in database", exits=1)
examples = DB.get_dataset(set_id)
data += examples
log(f"RECIPE: Loaded {len(data)} examples from {len(silver_sets)} dataset(s)")
nlp = spacy.load(spacy_model)
labels = label
if not labels:
labels = get_labels_from_ner(nlp)
if not labels:
msg.fail("No --label argument set and no labels found in model", exits=1)
msg.text(f"Using {len(labels)} labels from model: {', '.join(labels)}")
# Initialize Prodigy's entity recognizer model, which uses beam search to
# find all possible analyses and outputs (score, example) tuples,
# then merge all annotations and find the best possible analyses
model = EntityRecognizer(nlp, label=labels)
stream = model.make_best(data)
stream = filter_stream(stream)
stream = add_tokens(nlp, stream) # add "tokens" for faster annotation
def exclude_examples(stream):
task_hashes = DB.get_task_hashes(dataset)
for eg in examples:
eg = set_hashes(eg)
if eg["_task_hash"] not in task_hashes:
yield eg
stream = exclude_examples(stream)
return {
"dataset": dataset,
"view_id": "ner_manual",
"stream": stream,
"config": {"lang": nlp.lang, "labels": labels},
}