Hi all,
I am trying to annotate a corpus for both NER and POS tagging using Prodigy, with a custom recipe so that both tasks can be done in a single Prodigy user interface. When I run Prodigy, the UI is displayed correctly. The problem is that the two sets of annotations affect each other when I annotate the text, as shown in the attached image. Can you help me make Prodigy treat them as separate, independent tasks? Here is my custom recipe script.
import spacy
import prodigy
from prodigy.components.loaders import JSONL
from prodigy.util import split_string
@prodigy.recipe(
"pos_ner_annotation",
dataset=("Dataset name", "positional", None, str),
source=("Input data (JSONL)", "positional", None, str),
spacy_model=("spaCy model", "option", "m", str, "en_core_web_sm"),
pos_labels=("Comma-separated POS labels", "option", "p", str, ""),
ner_labels=("Comma-separated NER labels", "option", "n", str, "")
)
def pos_ner_annotation(dataset, source, spacy_model="blank:id", pos_labels="", ner_labels=""):
"""
A Prodigy recipe that allows simultaneous POS tagging and NER annotation in the same UI,
ensuring that both are completely independent.
"""
nlp = spacy.load(spacy_model)
# Load the dataset
stream = JSONL(source)
# Process label inputs
pos_labels = split_string(pos_labels) if pos_labels else list(nlp.get_pipe("tagger").labels)
ner_labels = split_string(ner_labels) if ner_labels else list(nlp.get_pipe("ner").labels)
    def get_annotations():
        for eg in stream:
            doc = nlp(eg["text"])
            # Token-level POS annotation (must be a list)
            pos_tokens = [
                {
                    "text": tok.text,
                    "id": i,
                    "start": tok.idx,
                    "end": tok.idx + len(tok.text),
                    "tag": tok.pos_,
                }
                for i, tok in enumerate(doc)
            ]
            # Span-level NER annotation (must be a list)
            ner_spans = [
                {"start": ent.start_char, "end": ent.end_char, "label": ent.label_}
                for ent in doc.ents
                if ent.label_ in ner_labels
            ]
            yield {
                "text": eg["text"],
                "tokens": pos_tokens,  # Ensures POS uses a list
                "spans": ner_spans,    # Ensures NER uses a list
                "meta": {"task": "POS + NER", "model": spacy_model},
                "_input_hash": hash(eg["text"] + "POS_NER"),
                "_task_hash": hash(eg["text"] + "POS_NER"),
            }
    return {
        "dataset": dataset,
        "view_id": "blocks",
        "stream": get_annotations(),
        "config": {
            "blocks": [
                {"view_id": "pos_manual", "labels": pos_labels},  # POS tagging
                {"view_id": "ner_manual", "labels": ner_labels},  # NER annotation
            ],
            "exclude_by": "task",
            "show_tok_ranks": True,
            "tokens": True,
        },
    }
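For reference, I start the recipe with a command along these lines (the dataset name, file paths, and labels below are just placeholders, not my real values):

prodigy pos_ner_annotation my_dataset ./corpus.jsonl -F ./pos_ner_recipe.py -m blank:id -p NOUN,VERB,ADJ -n PERSON,ORG,GPE

Each line of the source JSONL is a plain record with a "text" field, e.g. {"text": "..."}.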
Thank you.