Long documents of 6k+ characters (I should add a token counter to my examples)
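Something like this is probably enough for that token counter; a minimal sketch, where the helper name and the "tokens" meta key are my own choices, not a Prodigy convention (Prodigy displays each task's "meta" dict in the corner of the annotation card):

import spacy

def add_token_counts(nlp, stream):
    # Tokenize with the pipeline's tokenizer only (no trained components)
    # and record the count in the task's "meta" dict so it shows up in the UI
    for eg in stream:
        eg.setdefault("meta", {})["tokens"] = len(nlp.make_doc(eg["text"]))
        yield eg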
import copy
from typing import List, Optional

import prodigy
from prodigy.components.loaders import JSONL
from prodigy.util import split_string
import spacy
from spacy.training import Example
from prodigy.models.textcat import TextClassifier
from prodigy.components.sorters import prefer_uncertain
@prodigy.recipe(
    "textcat.correct",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    update=("Whether to update the model during annotation", "flag", "UP", bool),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
    threshold=("Score threshold to pre-select label", "option", "t", float),
    component=("Name of text classifier component in the pipeline (will be guessed from the pipeline if not set)", "option", "c", str),
)
def textcat_correct(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    update: bool = False,
    exclude: Optional[List[str]] = None,
    threshold: float = 0.5,
    component: Optional[str] = None,
):
    stream = JSONL(source)
    nlp = spacy.load(spacy_model)
    # Guess the text classifier component if it wasn't set explicitly
    if not component:
        component = "textcat" if "textcat" in nlp.pipe_names else "textcat_multilabel"
    pipe_config = nlp.get_pipe_config(component)
    exclusive = pipe_config.get("model", {}).get("exclusive_classes", True)
    labels = label
    if not labels:
        labels = nlp.pipe_labels.get(component, [])
    # Model wrapper that scores incoming tasks so the stream can be sorted
    model = TextClassifier(nlp, labels, component)
    def add_suggestions(stream):
        texts = ((eg["text"], eg) for eg in stream)
        for doc, eg in nlp.pipe(texts, as_tuples=True, batch_size=10):
            task = copy.deepcopy(eg)
            options = []
            selected = []
            # Add one choice option per label and pre-select everything
            # that scores at or above the threshold
            for cat, score in doc.cats.items():
                if cat in labels:
                    options.append({"id": cat, "text": cat, "meta": f"{score:.2f}"})
                    if score >= threshold:
                        selected.append(cat)
            task["options"] = options
            task["accept"] = selected
            yield task
    def make_update(answers):
        examples = []
        for eg in answers:
            if eg["answer"] == "accept":
                # Turn the selected options into a cats dict with 0/1 values
                selected = eg.get("accept", [])
                cats = {
                    opt["id"]: 1.0 if opt["id"] in selected else 0.0
                    for opt in eg.get("options", [])
                }
                doc = nlp.make_doc(eg["text"])
                examples.append(Example.from_dict(doc, {"cats": cats}))
        nlp.update(examples)
    stream = add_suggestions(stream)
    # The model wrapper scores each task a second time and yields
    # (score, example) tuples, which the sorter uses to put the most
    # uncertain predictions first
    stream = prefer_uncertain(model(stream), algorithm="ema")
    return {
        "view_id": "choice",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "update": make_update if update else None,
        "exclude": exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings
            "labels": labels,
            "choice_style": "single" if exclusive else "multiple",
        },
    }
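For reference, assuming the recipe is saved as recipe.py and the source is a JSONL file with one {"text": "..."} object per line (all names and paths below are hypothetical), it can be started with something like:

prodigy textcat.correct my_dataset ./my_textcat_model ./examples.jsonl --label SPORTS,POLITICS --update -F recipe.py

The base model needs a trained textcat or textcat_multilabel component, since the recipe reads its pipe config and labels. With --update set, make_update runs on every answered batch, so the pre-selected options and the uncertainty sorting gradually reflect the corrections.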