Hi, I am using a custom ner.manual recipe (nothing fancy) to annotate a small dataset (just four examples for the sake of trying out the recipe). All seems to run fine and I click save and close the webpage when completing the annotation. After a few minutes I type Ctrl+C in my terminal and get an asyncio error: asyncio.exceptions.CancelledError
Nothing is saved to my sqlite dataset.
Is there something in my recipe that makes the syncing with the database very slow? Am I misunderstanding something really basic when it comes to how you complete a session?
This is my recipe:
def get_stream(nlp, source, patterns, highlight_chars):
    """Yield hashed annotation tasks read from a JSONL source.

    If *patterns* is given, a PatternMatcher pre-highlights matching spans
    (keeping examples without matches, too). Each task is then tokenized
    for the ner_manual UI, guaranteed to carry a "spans" list, and given
    Prodigy input/task hashes before being yielded.
    """
    stream = JSONL(source)
    if patterns is not None:
        # Combine all pattern matches and keep all examples, matched or not.
        matcher = PatternMatcher(nlp, combine_matches=True, all_examples=True)
        matcher = matcher.from_disk(patterns)
        # The matcher yields (score, example) pairs; we only need the example.
        stream = (example for _, example in matcher(stream))
    stream = add_tokens(nlp, stream, use_chars=highlight_chars)
    for example in stream:
        example.setdefault("spans", [])
        yield prodigy.set_hashes(example)
def get_stream_loop(nlp, source, dataset, patterns, highlight_chars):
    """Loop over the source until every task has been annotated.

    On each pass the source is re-read and any task whose task hash is
    already stored in *dataset* is skipped. Once a full pass sends out
    nothing new, the generator stops, which ends the stream.
    """
    db = connect()
    while True:
        sent_any = False
        # Refresh the hash set each pass so newly saved answers are seen.
        known_hashes = db.get_task_hashes(dataset)
        for task in get_stream(nlp, source, patterns, highlight_chars):
            if task["_task_hash"] in known_hashes:
                continue  # already annotated on an earlier pass
            sent_any = True
            yield task
        if not sent_any:
            break  # nothing left to send out
# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
@prodigy.recipe(
    "custom_ner_manual",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    patterns=("The match patterns file", "option", "p", str),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
    highlight_chars=("Allow for highlighting individual characters instead of tokens", "flag", "C", bool),
)
def custom_ner_manual(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    highlight_chars: bool = False,
):
    """Manual NER annotation with optional pattern suggestions and
    optional character-level highlighting.

    Returns the components dict consumed by the Prodigy server.
    """
    import spacy  # local import so this block works even if the header lacks it

    # BUG FIX: `nlp` was referenced below but never defined, which raises
    # NameError as soon as the stream is consumed. Load the base model
    # from the spacy_model argument first.
    nlp = spacy.load(spacy_model)
    # Looping stream: keeps re-reading the source, skipping tasks whose
    # hashes are already saved, until nothing new is left to send out.
    stream = get_stream_loop(nlp, source, dataset, patterns, highlight_chars)
    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": dataset,       # Name of dataset to save annotations
        "stream": stream,
        "exclude": exclude,       # List of dataset names to exclude
        # Remove token information to permit highlighting individual characters
        "before_db": remove_tokens if highlight_chars else None,
        "config": {               # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,      # Selectable label options
        },
    }  # BUG FIX: the return dict/function were never closed in the original
In advance, thanks for all suggested solutions!