Since this is probably not a super common use case, I'll just share it here.
This is a recipe that uses both patterns and a model to highlight spans. (Note: that's different from what this topic was originally about, but it suits my needs best.) There is no strategy for prioritizing matches over predictions; the recipe simply uses both (my use case has no overlap).
The model can be either spancat or ner.
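For reference, this is roughly how I'd call it (the dataset name, model path and file paths below are just placeholders, and the recipe is assumed to live in a local recipe.py loaded with -F):

prodigy spans.manual_model_x_patterns my_dataset ./my_spancat_model ./data.jsonl --patterns ./patterns.jsonl -F recipe.py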
import copy
from typing import Iterable, List, Optional, Union

import spacy
from spacy.language import Language

from prodigy.components.loaders import get_stream
from prodigy.components.preprocess import add_tokens
from prodigy.core import recipe
from prodigy.models.matcher import PatternMatcher
from prodigy.recipes.spans import remove_tokens, validate_with_suggester
from prodigy.types import RecipeSettingsType, StreamType
from prodigy.util import INPUT_HASH_ATTR, get_labels, log, msg, set_hashes, split_string

@recipe(
    "spans.manual_model_x_patterns",
    # fmt: off
    dataset=("Dataset to save annotations to", "positional", None, str),
    spacy_model=("Loadable spaCy pipeline for tokenization or blank:lang (e.g. blank:en)", "positional", None, str),
    source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
    loader=("Loader (guessed from file extension if not set)", "option", "lo", str),
    label=("Comma-separated label(s) to annotate or text file with one label per line", "option", "l", get_labels),
    patterns=("Path to match patterns file", "option", "pt", str),
    exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string),
    component=("Name of spancat or ner component in the pipeline", "option", "c", str),
    highlight_chars=("Allow highlighting individual characters instead of tokens", "flag", "C", bool),
    suggester=("Name of suggester function registered in spaCy's 'misc' registry. Will be used to validate annotations as they're submitted. Use the -F option to provide a custom Python file", "option", "sg", str),
    # fmt: on
)
def manual_model_x_patterns(
    dataset: str,
    spacy_model: str,
    source: Union[str, Iterable[dict]],
    loader: Optional[str] = None,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    component: str = "spancat",
    highlight_chars: bool = False,
    suggester: Optional[str] = None,
) -> RecipeSettingsType:
"""
Annotate potentially overlapping and nested spans in the data. If
patterns are provided, their matches are highlighted in the example, if
available. If a model is provided, it's predictions are highlighted as well.
The tokenizer is used to tokenize the incoming texts so the
selection can snap to token boundaries. You can also set --highlight-chars
for character-based highlighting.
"""
    log("RECIPE: Starting recipe spans.manual_model_x_patterns", locals())
    nlp = spacy.load(spacy_model)
    if component not in nlp.pipe_names:
        msg.fail(
            f"Can't find component '{component}' in pipeline. Make sure that "
            f"the pipeline you're using includes a trained span categorizer "
            f"or named entity recognizer whose predictions you can correct. "
            f"If your component has a different name, you can use the "
            f"--component option to specify it.",
            exits=1,
        )
    labels = label  # comma-separated list or path to text file
    model_labels = nlp.pipe_labels.get(component, [])
    if not labels:
        labels = model_labels
        if not labels:
            msg.fail("No --label argument set and no labels found in model", exits=1)
        msg.text(f"Using {len(labels)} labels from model: {', '.join(labels)}")
    log(f"RECIPE: Annotating with {len(labels)} labels", labels)
    if component == "spancat":
        key = nlp.get_pipe(component).key
        msg.text(f"""Reading spans from key '{key}': doc.spans["{key}"]""")
    elif component == "ner":
        msg.text("Reading entities from doc.ents")
    stream = get_stream(
        source, loader=loader, rehash=True, dedup=True, input_key="text"
    )
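    # combine_matches merges all pattern matches into a single task per
    # example, all_examples also yields examples without any matches, and
    # allow_overlap keeps overlapping matches instead of filtering them.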
    if patterns is not None:
        pattern_matcher = PatternMatcher(
            nlp, combine_matches=True, all_examples=True, allow_overlap=True
        )
        pattern_matcher = pattern_matcher.from_disk(patterns)
        stream = (eg for _, eg in pattern_matcher(stream))
    # Add "tokens" key to the tasks, either with words or characters
    stream = add_tokens(nlp, stream, use_chars=highlight_chars)
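
    # Model predictions are appended on top of any pattern matches below;
    # nothing reconciles the two, so overlapping suggestions are kept as-is.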
    def make_tasks(nlp: Language, stream: StreamType) -> StreamType:
        """Add a 'spans' key to each example, with predicted spans."""
        texts = ((eg["text"], eg) for eg in stream)
        for doc, eg in nlp.pipe(texts, as_tuples=True, batch_size=10):
            task = copy.deepcopy(eg)
            spans = task.get("spans", [])
            predicted_spans = []
            if component == "spancat":
                predicted_spans = doc.spans[key]
            elif component == "ner":
                predicted_spans = list(doc.ents)
            for span in predicted_spans:
                if labels and span.label_ not in labels:
                    continue
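                # Prodigy's token_end is inclusive (the index of the span's
                # last token), hence span.end - 1 below.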
                spans.append(
                    {
                        "token_start": span.start,
                        "token_end": span.end - 1,
                        "start": span.start_char,
                        "end": span.end_char,
                        "text": span.text,
                        "label": span.label_,
                        "source": spacy_model,
                        "input_hash": eg[INPUT_HASH_ATTR],
                    }
                )
            task["spans"] = spans
            task = set_hashes(task)
            yield task

    validate_func = validate_with_suggester(nlp, suggester) if suggester else None
    stream = make_tasks(nlp, stream)
    return {
        "view_id": "spans_manual",
        "dataset": dataset,
        "stream": stream,
        "exclude": exclude,
        "before_db": remove_tokens if highlight_chars else None,
        "validate_answer": validate_func,
        "config": {
            "lang": nlp.lang,
            "labels": labels,
            "exclude_by": "input",
            "ner_manual_highlight_chars": highlight_chars,
            "auto_count_stream": True,
        },
    }
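For completeness, the --patterns file is the standard Prodigy match patterns format, one JSON object per line. The labels and patterns here are just made-up placeholders:

{"label": "PERSON", "pattern": [{"lower": "ada"}, {"lower": "lovelace"}]}
{"label": "ORG", "pattern": "ACME Corp"}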