Hey @ines,

I've implemented a version of `ner.manual` that uses an `EntityRuler` to highlight entities. I'm trying to update the `EntityRuler` in the `update` function with `ruler.add_patterns`, so that patterns are added as examples are annotated. However, while the `EntityRuler` does seem to be updating (I can see the new entities in the patterns JSONL file that's written out on exit), the new patterns are not being applied to the incoming examples. So the pattern exists in the `EntityRuler`, but when I call `nlp(eg['text'])` in the function that adds spans to the incoming examples, the `nlp` object doesn't reflect the update.
I thought this might be because the stream is a generator function, but I've run a separate experiment where I add patterns to a `ruler` while consuming a generator, and there the generator *is* updated in place.
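Here's roughly what that experiment looked like (a minimal standalone sketch outside of Prodigy, with made-up texts):

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

def stream(texts):
    # The generator closes over nlp, so each call to nlp(...) uses
    # whatever patterns the ruler has at that moment
    for text in texts:
        doc = nlp(text)
        yield [(ent.text, ent.label_) for ent in doc.ents]

gen = stream(["Apple is hiring", "I work at Apple"])
print(next(gen))  # [] - no patterns yet
ruler.add_patterns([{"label": "ORG", "pattern": [{"LOWER": "apple"}]}])
print(next(gen))  # [('Apple', 'ORG')] - the updated ruler is applied
```

So the generator itself doesn't seem to be the problem.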
`batch_size` is set to 1 in my `prodigy.json` file, so I know I'm only pulling one example at a time. I've also made a test dataset where the same word appears in every example, and I've been annotating it example by example, hoping that the word would be pre-annotated in the next example, but to no avail.
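For reference, the relevant setting in my `prodigy.json`:

```json
{
  "batch_size": 1
}
```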
I've added my code below. I totally understand if you don't have time to look at this! I've just been stumped for the last few hours.
```python
from typing import List, Optional, Union, Iterable

import spacy

from prodigy.components.preprocess import add_tokens
from prodigy.components.loaders import get_stream
from prodigy.core import recipe
from prodigy.util import log, msg, split_string, get_labels
from prodigy.types import TaskType, RecipeSettingsType

def remove_tokens(answers: List[TaskType]) -> List[TaskType]:
"""Remove token information from example before they're placed in the
database. Used if character highlighting is enabled."""
for eg in answers:
del eg["tokens"]
if "spans" in eg:
for span in eg["spans"]:
del span["token_start"]
del span["token_end"]
return answers

def add_ent_spans(nlp, stream):
    """Pre-annotate incoming examples with the entities predicted by the
    pipeline (here: matches from the EntityRuler)."""
for eg in stream:
doc = nlp(eg['text'])
spans = [{"start": ent.start_char, "end": ent.end_char, "label": ent.label_} for ent in doc.ents]
eg["spans"] = spans
yield eg

@recipe(
"ner.manual.dynamic",
# fmt: off
dataset=("Dataset to save annotations to", "positional", None, str),
spacy_model=("Loadable spaCy pipeline for tokenization or blank:lang (e.g. blank:en)", "positional", None, str),
source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
loader=("Loader (guessed from file extension if not set)", "option", "lo", str),
label=("Comma-separated label(s) to annotate or text file with one label per line", "option", "l", get_labels),
patterns=("Path to match patterns file", "option", "pt", str),
writepatterns=("File to write patterns to at end of session", "option", "wp", str),
exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string),
highlight_chars=("Allow highlighting individual characters instead of tokens", "flag", "C", bool),
# fmt: on
)
def manual(
dataset: str,
spacy_model: str,
source: Union[str, Iterable[dict]],
loader: Optional[str] = None,
label: Optional[List[str]] = None,
patterns: Optional[str] = None,
writepatterns: Optional[str] = None,
exclude: Optional[List[str]] = None,
highlight_chars: bool = False,
) -> RecipeSettingsType:
"""
Mark spans by token. Requires only a tokenizer and no entity recognizer,
and doesn't do any active learning. If patterns are provided, their matches
are highlighted in the example, if available. The recipe will present
all examples in order, so even examples without matches are shown. If
character highlighting is enabled, no "tokens" are saved to the database.
Entities that are annotated will be added to the `EntityRuler` patterns
while you annotate, so recurring entities will be pre-annotated once they've
been annotated once. Patterns can be saved to a file with `writepatterns`.
"""
log("RECIPE: Starting recipe ner.manual", locals())
    if spacy_model.startswith("blank:"):
        # Support the blank:lang shorthand documented in the argument help
        nlp = spacy.blank(spacy_model.replace("blank:", ""))
    else:
        nlp = spacy.load(spacy_model)
labels = label # comma-separated list or path to text file
if not labels:
labels = nlp.pipe_labels.get("ner", [])
if not labels:
msg.fail("No --label argument set and no labels found in model", exits=1)
msg.text(f"Using {len(labels)} labels from model: {', '.join(labels)}")
log(f"RECIPE: Annotating with {len(labels)} labels", labels)
stream = get_stream(
source,
loader=loader,
rehash=True,
dedup=True,
input_key="text",
is_binary=False,
)
    # Always add the entity ruler so patterns collected during annotation can
    # be applied to later examples; optionally pre-load patterns from a file.
    # (Previously the ruler was only created when --patterns was set, which
    # left `ruler` undefined in update() and on_exit() otherwise.)
    ruler = nlp.add_pipe("entity_ruler")
    if patterns is not None:
        ruler.from_disk(patterns)
    stream = add_ent_spans(nlp, stream)
# Add "tokens" key to the tasks, either with words or characters
stream = add_tokens(nlp, stream, use_chars=highlight_chars)

    def update(answers):
        """Add any new patterns to the EntityRuler on update."""
        session_patterns = []  # renamed to avoid shadowing the `patterns` argument
        for eg in answers:
            for span in eg.get("spans", []):
                # Rebuild the annotated span's tokens from its token offsets
                tokens = [t["text"] for t in eg["tokens"][span["token_start"]:span["token_end"] + 1]]
                pattern = {"label": span["label"], "pattern": [{"LOWER": t.lower()} for t in tokens]}
                session_patterns.append(pattern)
        # Only add patterns the ruler doesn't already have
        new_patterns = [pat for pat in session_patterns if pat not in ruler.patterns]
        log("RECIPE: Adding new patterns to EntityRuler", new_patterns)
        ruler.add_patterns(new_patterns)

    def on_exit(controller):
        """Write the EntityRuler patterns to disk on exit."""
        if writepatterns:
            ruler.to_disk(writepatterns)

return {
"view_id": "ner_manual",
"dataset": dataset,
"stream": stream,
"exclude": exclude,
"before_db": remove_tokens if highlight_chars else None,
"update": update,
"on_exit": on_exit,
"config": {
"lang": nlp.lang,
"labels": labels,
"exclude_by": "input",
"ner_manual_highlight_chars": highlight_chars,
"auto_count_stream": True,
},
    }
```
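In case it's useful, this is how I've been running the recipe (the dataset name and file paths here are just placeholders):

```bash
prodigy ner.manual.dynamic my_dataset blank:en ./examples.jsonl --label ORG,PERSON -wp ./patterns.jsonl -F recipe.py
```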