Hi!
I'm using a custom recipe for NER annotations, since this way I can plug in a custom tokenizer. It works fine when I don't feed patterns into it, but doesn't load anything when I do.
Here's the recipe:
from typing import List, Optional

import prodigy
import spacy
from prodigy.components.loaders import get_stream
from prodigy.components.preprocess import add_tokens
from prodigy.models.matcher import PatternMatcher
from prodigy.util import log, split_string

# CTokenizer (my custom tokenizer) is defined elsewhere in this file


@prodigy.recipe(
    "ner.custom",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL or CSV file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    patterns=("Optional match patterns", "option", "p", str),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
)
def ner_manual(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Mark spans manually by token. Requires only a tokenizer and no entity
    recognizer, and doesn't do any active learning.
    """
    log("RECIPE: Starting recipe ner.custom", locals())
    # Load the spaCy model for tokenization and swap in the custom tokenizer
    nlp = spacy.load(spacy_model)
    log(f"RECIPE: Loaded model {spacy_model}")
    nlp.tokenizer = CTokenizer(nlp.vocab)
    log("RECIPE: Tokenizing")
    # Load the stream from a CSV or JSONL file and return a generator that
    # yields a dictionary for each example in the data.
    loader = "csv" if source.endswith(".csv") else "jsonl"
    stream = get_stream(source, None, loader, rehash=True, dedup=True, input_key="text")
    if patterns is not None:
        pattern_matcher = PatternMatcher(nlp, combine_matches=True, all_examples=True)
        pattern_matcher = pattern_matcher.from_disk(patterns)
        # PatternMatcher yields (score, example) tuples; keep just the examples
        stream = (eg for _, eg in pattern_matcher(stream))
    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)
    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,  # Selectable label options
            "exclude_by": "input",
        },
    }
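For context, the real CTokenizer is defined elsewhere in ner_custom.py. If you want to reproduce this without it, a minimal whitespace-based stand-in like this (a simplified sketch, not my actual class) is enough to run the recipe:

from spacy.tokens import Doc

class CTokenizer:
    # Simplified, hypothetical stand-in -- the real tokenizer is more involved
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        # Split on whitespace and build a Doc from the resulting words
        words = text.split()
        return Doc(self.vocab, words=words)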
So this works:
PRODIGY_LOGGING=verbose python3 -m prodigy ner.custom my_dataset xx_ent_wiki_sm my_file.csv -F ner_custom.py --label my_labels
And this works, but it doesn't suit me because it uses the default tokenisation instead of my custom tokenizer:
PRODIGY_LOGGING=verbose python3 -m prodigy ner.manual my_dataset xx_ent_wiki_sm my_file.csv --label my_labels --patterns my_patterns.jsonl
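For reference, my_patterns.jsonl is in the standard Prodigy patterns format, one JSON object per line. A simplified example of the kind of entry it contains (not one of my real patterns):

{"label": "MY_LABEL", "pattern": [{"lower": "new"}, {"lower": "york"}]}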
While this doesn't:
PRODIGY_LOGGING=verbose python3 -m prodigy ner.custom my_dataset xx_ent_wiki_sm my_file.csv -F ner_custom.py --label my_labels --patterns my_patterns.jsonl
With the following logging:
Any help much appreciated
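P.S. In case it helps with reproducing: the pattern-matching step can be exercised on its own, outside the recipe. This is a minimal sketch using the same PatternMatcher settings as in the recipe; the example text is just a placeholder:

import spacy
from prodigy.models.matcher import PatternMatcher

nlp = spacy.load("xx_ent_wiki_sm")
nlp.tokenizer = CTokenizer(nlp.vocab)  # same custom tokenizer as in the recipe
matcher = PatternMatcher(nlp, combine_matches=True, all_examples=True)
matcher = matcher.from_disk("my_patterns.jsonl")
# PatternMatcher yields (score, example) tuples
examples = [{"text": "some placeholder text"}]
for score, eg in matcher(examples):
    print(score, eg.get("spans"))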