I've created my textcat recipe which only uses patterns (2K patterns). It worked with 1000 annotation records. However, the server reports the following error when I try with 10K annotation records. The reason I am trying 10K records or more is that there are very few positive cases in my data.
Here is the error -
File "/home/ec2-user/anaconda3/envs/prodigy_sense2vec/lib/python3.8/site-packages/prodigy/app.py", line 370, in _shared_get_questions
tasks = controller.get_questions(session_id=session_id, excludes=excludes)
File "cython_src/prodigy/core.pyx", line 138, in prodigy.core.Controller.get_questions
File "cython_src/prodigy/components/feeds.pyx", line 68, in prodigy.components.feeds.SharedFeed.get_questions
File "cython_src/prodigy/components/feeds.pyx", line 73, in prodigy.components.feeds.SharedFeed.get_next_batch
File "cython_src/prodigy/components/feeds.pyx", line 153, in prodigy.components.feeds.SessionFeed.get_session_stream
File "cython_src/prodigy/components/feeds.pyx", line 135, in prodigy.components.feeds.SessionFeed.validate_stream
File "/home/ec2-user/anaconda3/envs/prodigy_sense2vec/lib/python3.8/site-packages/toolz/itertoolz.py", line 376, in first
return next(iter(seq))
RuntimeError: cannot re-enter the tee iterator
Here is the teach code:
def textcat_pattern_teach(
    dataset: str,
    spacy_model: str,
    source: Union[str, Iterable[dict]] = "-",
    label: Optional[List[str]] = None,
    api: Optional[str] = None,
    patterns: Optional[str] = None,
    init_tok2vec: Optional[Union[str, Path]] = None,
    loader: Optional[str] = None,
    long_text: bool = False,
    exclude: Optional[List[str]] = None,
):
    """Pattern-only textcat teach recipe.

    Streams examples from `source`, scores them with a PatternMatcher
    loaded from `patterns`, and serves the highest-scoring candidates
    for classification annotation.

    dataset: Prodigy dataset to save annotations to.
    spacy_model: loadable spaCy model name, or "blank:<lang>".
    source: input file path (or "-" for stdin) in JSONL format.
    label: labels to filter pattern matches by.
    patterns: path to the patterns file (required for this recipe).
    exclude: dataset names whose annotations should be excluded.

    Returns the recipe component dict (view_id, dataset, stream,
    exclude, update).
    """
    # NOTE: the original code first called the built-in teach() recipe
    # (result unused) and then re-read `source` below. teach() tees the
    # source iterator, so consuming the source a second time is what
    # raised "RuntimeError: cannot re-enter the tee iterator" once the
    # feed grew past one batch. The source must be read exactly once.
    if spacy_model.startswith("blank:"):
        nlp = spacy.blank(spacy_model.replace("blank:", ""))
    else:
        # Fixed: original had `space.load(...)` (NameError) in the
        # patterns-is-None branch; `spacy.load` is the correct call,
        # and one load here is sufficient.
        nlp = spacy.load(spacy_model)

    if patterns is None:
        # The original left `predict`/`update` unbound on this path,
        # which crashed with NameError at the return. This recipe is
        # pattern-only, so fail fast with an actionable message.
        raise ValueError(
            "This recipe requires a patterns file: pass one via --patterns."
        )

    matcher = PatternMatcher(
        nlp,
        prior_correct=5.0,
        prior_incorrect=5.0,
        label_span=False,
        label_task=True,
        filter_labels=label,
        combine_matches=True,
        task_hash_keys=("label",),
    )
    matcher = matcher.from_disk(patterns)
    predict, update = matcher, matcher.update

    # Load the source exactly once, after all other setup, and prefer
    # high-scoring pattern matches (few positives in the data).
    stream = JSONL(source)
    stream = prefer_high_scores(predict(stream))

    return {
        "view_id": "classification",
        "dataset": dataset,
        "stream": stream,
        "exclude": exclude,
        "update": update,
    }