Hi Prodigy team,
Background: Given some unexpected behaviour early in the labelling and training process, I am wondering how to tightly control the frequency with which the update callback is triggered in an active learning situation, e.g. one governed by the ner.correct recipe with update enabled.
Details: I am using Prodigy to design custom active learning workflows for NLP labelling and training tasks, specifically focussing on an NER task. In my current setup, I'm using a local model (e.g., the model underlying the en_core_web_md pipeline) to learn incrementally from human-corrected labels. My goal here is to enable quick, high-quality training through a direct interplay between human input and the model's suggestions.
I have begun by very slightly modifying the open source version of your ner.correct recipe, like so:
import copy
from typing import List, Optional
import spacy
from spacy.training import Example
import prodigy
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens, split_sentences
from prodigy.util import split_string, set_hashes
BATCH_SIZE = 10
def make_tasks(nlp, stream, labels):
    """Add a 'spans' key to each example, with predicted entities."""
    # Process the stream using spaCy's nlp.pipe, which yields doc objects.
    # If as_tuples=True is set, you can pass in (text, context) tuples.
    texts = ((eg["text"], eg) for eg in stream)
    print("function `make_tasks` is iterating through DOCS")
    for i, (doc, eg) in enumerate(nlp.pipe(texts, as_tuples=True)):
        print(f"{i}. Document is being processed in make_tasks")
        task = copy.deepcopy(eg)
        spans = []
        for j, ent in enumerate(doc.ents):
            # Ignore if the predicted entity is not in the selected labels.
            if labels and ent.label_ not in labels:
                continue
            # Create a span dict for the predicted entity.
            spans.append(
                {
                    "token_start": ent.start,
                    "token_end": ent.end - 1,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "text": ent.text,
                    "label": ent.label_,
                }
            )
        task["spans"] = spans
        # Rehash the newly created task so that hashes reflect added data.
        task = set_hashes(task)
        yield task

# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
@prodigy.recipe(
    "ner.correct-v2",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    update=("Whether to update the model during annotation", "flag", "UP", bool),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
    unsegmented=("Don't split sentences", "flag", "U", bool),
    component=("Name of NER component in the pipeline", "option", "c", str),
)
def ner_correct(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    update: bool = False,
    exclude: Optional[List[str]] = None,
    unsegmented: bool = False,
    component: Optional[str] = "ner",
):
    """
    Create gold-standard data by correcting a model's predictions manually.
    This recipe used to be called `ner.make-gold`.
    """
    # Load the spaCy model.
    nlp = spacy.load(spacy_model)
    labels = label
    # Get existing model labels, if available.
    if component not in nlp.pipe_names:
        raise ValueError(
            f"Can't find component '{component}' in the provided pipeline."
        )
    model_labels = nlp.pipe_labels.get(component, [])
    # Check if we're annotating all labels present in the model or a subset.
    use_all_model_labels = len(set(labels).intersection(set(model_labels))) == len(  # type: ignore
        model_labels
    )
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)
    if not unsegmented:
        # Use spaCy to split text into sentences.
        stream = split_sentences(nlp, stream)
    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)
    # Add the entities predicted by the model to the tasks in the stream.
    stream = make_tasks(nlp, stream, labels)

    def make_update(answers):
        """Update the model with the received answers to improve future suggestions."""
        examples = []
        print("make_update has been called")
        # Set the default label for the tokens outside the provided spans.
        default_label = "outside" if use_all_model_labels else "missing"
        print(f"make_update is iterating through {len(answers)} answers")
        for i, eg in enumerate(answers):
            print(f"{i}.")
            if eg["answer"] == "accept":
                # Create a "predicted" doc object and a "reference" doc object to be
                # used as a training example in the model update. If your examples
                # contain tokenization, make sure not to lose this information by
                # initializing the doc object from scratch.
                pred = nlp.make_doc(eg["text"])
                ref = nlp.make_doc(eg["text"])
                spans = [
                    pred.char_span(span["start"], span["end"], label=span["label"])
                    for span in eg.get("spans", [])
                ]
                # Use the information in spans to set named entities in the document,
                # specifying how to handle the tokens outside the provided spans.
                ref.set_ents(spans, default=default_label)
                examples.append(Example(pred, ref))
        nlp.update(examples)
        print("make_update has finished retraining")

    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "update": make_update,
        "exclude": exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": labels,  # Selectable label options
            "exclude_by": "input",  # Hash value to filter out seen examples
            "batch_size": BATCH_SIZE,
        },
    }
There are only three small modifications to the code in your repo:
- Addition of diagnostic print statements to trace the callback workflow.
- Introduction of BATCH_SIZE as a global variable, passed to the config as "batch_size".
- Modification of the "update" entry in the returned dict so that it always equals make_update (a rough before/after sketch of the last two changes follows this list).
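For concreteness, the last two changes amount to roughly the following. I am quoting the original lines from memory, so they may not match your repo exactly:

    # Original recipe (from memory, possibly inexact):
    "update": make_update if update else None,
    "config": {
        "lang": nlp.lang,
        "labels": labels,
        "exclude_by": "input",
    },

    # My version:
    "update": make_update,  # always register the callback
    "config": {
        "lang": nlp.lang,
        "labels": labels,
        "exclude_by": "input",
        "batch_size": BATCH_SIZE,  # BATCH_SIZE = 10, defined at module level
    },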
My confusion: I had assumed the "update" callback would be triggered once per submitted batch, i.e. after every batch_size = 10 labelled documents. However, the terminal output of the above code suggests a "burn-in" period: three batches' worth of documents pass through make_tasks before update is called for the first time. To make my expectation concrete, a toy simulation of the interleaving I expected follows, and then the actual terminal output.
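The simulation below is not Prodigy code; toy_stream, toy_update and run_expected_loop are names I made up purely to illustrate my prior mental model, in which the server pulls one batch of BATCH_SIZE tasks from the stream, waits for the answers, calls the update callback, and only then pulls the next batch:

    BATCH_SIZE = 10

    def toy_stream(n_docs):
        # Stand-in for the recipe's stream: prints as it yields, like make_tasks.
        for i in range(n_docs):
            print(f"{i}. Document is being processed in make_tasks")
            yield {"text": f"doc {i}"}

    def toy_update(answers):
        print(f"make_update called with {len(answers)} answers")

    def run_expected_loop(n_docs):
        stream = toy_stream(n_docs)
        while True:
            # Pull exactly one batch from the stream.
            batch = [eg for _, eg in zip(range(BATCH_SIZE), stream)]
            if not batch:
                break
            # In reality the annotator would correct these tasks in the UI first.
            answers = [{**eg, "answer": "accept"} for eg in batch]
            toy_update(answers)

    run_expected_loop(30)
    # Expected interleaving: documents 0-9 processed, one update call,
    # documents 10-19 processed, another update call, and so on.

The actual terminal output, by contrast, shows all 30 documents being processed before the first update call: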
(prodigy) stephenenrightward@Stephens-MacBook-Pro-3 prodigy-active-learning % prodigy -F ./ner_correct_recipe_copy.py ner.correct-v2 ner_terminal en_core_web_md ./terminal_docs.jsonl --label PERSON,ORG,GPE,LOC --unsegmented
function `make_tasks` is iterating through DOCS
0. Document is being processed in make_tasks
Added dataset ner_terminal to database SQLite.
✨ Starting the web server at http://localhost:8080 ...
Open the app in your browser and start annotating!
1. Document is being processed in make_tasks
2. Document is being processed in make_tasks
3. Document is being processed in make_tasks
4. Document is being processed in make_tasks
5. Document is being processed in make_tasks
6. Document is being processed in make_tasks
7. Document is being processed in make_tasks
8. Document is being processed in make_tasks
9. Document is being processed in make_tasks
10. Document is being processed in make_tasks
11. Document is being processed in make_tasks
12. Document is being processed in make_tasks
13. Document is being processed in make_tasks
14. Document is being processed in make_tasks
15. Document is being processed in make_tasks
16. Document is being processed in make_tasks
17. Document is being processed in make_tasks
18. Document is being processed in make_tasks
19. Document is being processed in make_tasks
20. Document is being processed in make_tasks
21. Document is being processed in make_tasks
22. Document is being processed in make_tasks
23. Document is being processed in make_tasks
24. Document is being processed in make_tasks
25. Document is being processed in make_tasks
26. Document is being processed in make_tasks
27. Document is being processed in make_tasks
28. Document is being processed in make_tasks
29. Document is being processed in make_tasks
make_update has been called
make_update is iterating through 10 answers
0.
1.
2.
3.
4.
5.
6.
7.
8.
9.
make_update has finished retraining
30. Document is being processed in make_tasks
31. Document is being processed in make_tasks
32. Document is being processed in make_tasks
33. Document is being processed in make_tasks
34. Document is being processed in make_tasks
35. Document is being processed in make_tasks
36. Document is being processed in make_tasks
37. Document is being processed in make_tasks
38. Document is being processed in make_tasks
39. Document is being processed in make_tasks
make_update has been called
make_update is iterating through 10 answers
0.
1.
2.
3.
4.
5.
6.
7.
8.
9.
make_update has finished retraining
Questions: Is this "burn-in period" expected behaviour, or am I using the system improperly? If it is intentional, how can I control this behaviour? Is there a command-line argument or config setting, similar to batch_size, that governs it?
I want to understand this behaviour well because I plan to test local model architectures of varying complexity, and I would like tight control over how often the model is updated. In the worst case I could buffer answers inside the callback myself, as sketched below, but I would prefer to rely on Prodigy's built-in batching if that is possible.
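The sketch below is only a fallback idea on my side, not something taken from your docs: UPDATE_EVERY and the buffer are my own additions, and the function would live inside the recipe as before so it can close over nlp (I hard-code the "outside" default here for brevity):

    UPDATE_EVERY = 20
    buffered = []

    def make_update(answers):
        # Accumulate accepted answers until there are enough to justify an update.
        buffered.extend(eg for eg in answers if eg["answer"] == "accept")
        if len(buffered) < UPDATE_EVERY:
            return
        examples = []
        for eg in buffered:
            pred = nlp.make_doc(eg["text"])
            ref = nlp.make_doc(eg["text"])
            spans = [
                pred.char_span(span["start"], span["end"], label=span["label"])
                for span in eg.get("spans", [])
            ]
            ref.set_ents(spans, default="outside")
            examples.append(Example(pred, ref))
        nlp.update(examples)
        buffered.clear()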