Hi,
Thank you very much for the detailed response!
I took your advice and modified the recipe to include the custom_input_hashes function, applying it to the stream right before the return statement. Since I didn't want to change the built-in recipe, I created a new Python file, my_recipe.py, copied the spans.manual recipe into it, and made the suggested change. However, I'm still getting the same behavior: the server ignores the metadata and sends every "unique" text only once. More specifically, I'm using the 5-line JSONL file from my first post as input, and the server only sends lines 1, 3, and 4 to the annotator.
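Just to spell out my expectation: as I understand set_hashes, including "meta" in the input keys should give two tasks with identical text but different meta two different input hashes, so the server shouldn't treat them as duplicates. Here's a minimal standalone sketch of what I mean (the texts and meta values are made up, not my actual data):

from prodigy.util import set_hashes

eg1 = {"text": "Some text", "meta": {"source": "doc1"}}
eg2 = {"text": "Some text", "meta": {"source": "doc2"}}

# Hash both "text" and "meta" into the input hash
eg1 = set_hashes(eg1, input_keys=("text", "meta"), overwrite=True)
eg2 = set_hashes(eg2, input_keys=("text", "meta"), overwrite=True)

# I'd expect two different input hashes here, since the meta differs
print(eg1["_input_hash"], eg2["_input_hash"])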
Below is my_recipe.py:
from typing import Callable, List, Optional, Tuple
from spacy.language import Language
from spacy.tokens import Doc
from spacy.util import registry as spacy_registry
from prodigy.components.preprocess import add_tokens
from prodigy.components.stream import get_stream
from prodigy.core import Arg, recipe
from prodigy.models.matcher import PatternMatcher
from prodigy.protocols import ControllerComponentsDict
from prodigy.types import ExistingFilePath, LabelsType, SourceType, StreamType, TaskType
from prodigy.util import (
    get_pipe_labels,
    log,
    msg,
    set_hashes,
)


@recipe(
    "my_recipe",
    # fmt: off
    dataset=Arg(help="Dataset to save annotations to"),
    nlp=Arg(help="Loadable spaCy pipeline for tokenization or blank:lang (e.g. blank:en)"),
    source=Arg(help="Data to annotate (file path or '-' to read from standard input)"),
    loader=Arg("--loader", "-lo", help="Loader (guessed from file extension if not set)"),
    label=Arg("--label", "-l", help="Comma-separated label(s) to annotate or text file with one label per line"),
    patterns=Arg("--patterns", "-pt", help="Path to match patterns file"),
    exclude=Arg("--exclude", "-e", help="Comma-separated list of dataset IDs whose annotations to exclude"),
    highlight_chars=Arg("--highlight-chars", "-C", help="Allow highlighting individual characters instead of tokens"),
    suggester=Arg("--suggester", "-sg", help="Name of suggester function registered in spaCy's 'misc' registry. Will be used to validate annotations as they're submitted. Use the -F option to provide a custom Python file"),
    use_annotations=Arg("--use-annotations", "-A", help="Use annotations from the specified spaCy model."),
    # fmt: on
)
def my_recipe(
    dataset: str,
    nlp: Language,
    source: SourceType,
    loader: Optional[str] = None,
    label: Optional[LabelsType] = None,
    patterns: Optional[ExistingFilePath] = None,
    exclude: List[str] = [],
    highlight_chars: bool = False,
    suggester: Optional[str] = None,
    use_annotations: bool = False,
) -> ControllerComponentsDict:
"""
Annotate potentially overlapping and nested spans in the data. If
patterns are provided, their matches are highlighted in the example, if
available. The tokenizer is used to tokenize the incoming texts so the
selection can snap to token boundaries. You can also set --highlight-chars
for character-based highlighting.
"""
log("RECIPE: Starting recipe my_recipe", locals())
labels = get_pipe_labels(label, nlp.pipe_labels.get("spancat", []))
log(f"RECIPE: Annotating with {len(labels)} labels", labels)
stream = get_stream(
source, loader=loader, rehash=True, dedup=True, input_key="text"
)
if patterns is not None:
pattern_matcher = PatternMatcher(
nlp, combine_matches=True, all_examples=True, allow_overlap=True
)
pattern_matcher = pattern_matcher.from_disk(patterns)
stream.apply(lambda d: (eg for _, eg in pattern_matcher(d)))
# Add "tokens" key to the tasks, either with words or characters
stream.apply(add_tokens, nlp=nlp, stream=stream)
validate_func = (
validate_with_suggester(nlp, suggester, use_annotations=use_annotations)
if suggester
else None
)
stream.apply(custom_input_hashes, stream=stream, keys=("text", "meta"))
return {
"view_id": "spans_manual",
"dataset": dataset,
"stream": stream,
"exclude": exclude,
"validate_answer": validate_func,
"config": {
"lang": nlp.lang,
"labels": labels,
"exclude_by": "input",
"ner_manual_highlight_chars": highlight_chars,
"auto_count_stream": True,
},
}


def validate_with_suggester(
    nlp: Language,
    suggester_name: str,
    *,
    use_annotations: bool,
) -> Callable[[TaskType], None]:
    msg.info(f"Validating annotations against suggester function '{suggester_name}'")
    suggester = spacy_registry.get("misc", suggester_name)()

    def validate_answer(answer: TaskType) -> None:
        spans = answer.get("spans", [])
        if not spans:
            # No need to run the suggester if there are no spans to validate
            return
        # Don't allow spans that are not compatible with the provided suggester
        words = [t["text"] for t in answer["tokens"]]
        spaces = [t.get("ws", True) for t in answer["tokens"]]
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        # Add annotations from other components
        if use_annotations:
            doc = nlp(doc)
        suggested_spans = suggester([doc])
        suggested_span_tuples = [(s[0], s[1]) for s in suggested_spans.data]
        text = answer["text"]
        annotated = {
            (s["token_start"], s["token_end"] + 1): text[s["start"] : s["end"]]
            for s in spans
        }
        for annotated_tuple, span_text in annotated.items():
            if annotated_tuple not in suggested_span_tuples:
                start, end = annotated_tuple
                err = (
                    f"Span with token offsets {start}:{end} ({span_text}) "
                    f"is not compatible with the provided suggester function "
                    f"'{suggester_name}'."
                )
                raise ValueError(err)

    return validate_answer


def custom_input_hashes(stream: StreamType, keys: Tuple[str, ...]) -> StreamType:
    # Re-hash each incoming task so the input hash covers all of the given keys
    for eg in stream:
        eg = set_hashes(eg, input_keys=keys, overwrite=True)
        yield eg
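For completeness, this is how I'm starting the server (the dataset name and file path here are placeholders, not my real ones):

prodigy my_recipe my_dataset blank:en ./my_data.jsonl -F my_recipe.py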
What am I doing wrong?