Hi,
I’m finally working on adding this code snippet to the textcat.teach recipe. I appreciate the question from @reb-greazy because it helped me put the filter_stream() function in the right place. However, I’m still seeing duplicate annotations after adding this snippet. Is something else amiss here that I’m not seeing?
@recipe(
    "textcat.teach",
    dataset=recipe_args["dataset"],
    spacy_model=recipe_args["spacy_model"],
    source=recipe_args["source"],
    label=recipe_args["label_set"],
    api=recipe_args["api"],
    loader=recipe_args["loader"],
    patterns=recipe_args["patterns"],
    long_text=("Long text", "flag", "L", bool),
    exclude=recipe_args["exclude"],
)
def teach(dataset, spacy_model, source=None, label=None, api=None, patterns=None, loader=None,
          long_text=False, exclude=None):
    """
    Annotate text-classification training data with the model in the loop.

    Loads the given spaCy model (NER and parser disabled), wraps it in a
    TextClassifier, optionally combines it with a PatternMatcher loaded from
    a patterns file, ranks the incoming stream by uncertainty, and drops
    tasks whose input text was already queued in this session. Returns the
    component dict Prodigy expects (view_id, dataset, stream, exclude,
    update callback, and config).
    """
    log("RECIPE: Starting recipe textcat.teach", locals())

    # At least one label is required; bail out with a usage message otherwise.
    if label is None:
        prints(
            "No label specified",
            "To use the textcat.teach recipe, you "
            "need to provide at least one category label via the --label "
            "or -l argument.",
            error=True,
            exits=1,
        )

    nlp = spacy.load(spacy_model, disable=["ner", "parser"])
    log("RECIPE: Creating TextClassifier with model {}".format(spacy_model))
    model = TextClassifier(nlp, label, long_text=long_text)

    stream = get_stream(source, api, loader, rehash=True, dedup=True, input_key="text")

    if patterns is not None:
        matcher = PatternMatcher(
            model.nlp,
            prior_correct=5.0,
            prior_incorrect=5.0,
            label_span=False,
            label_task=True,
            filter_labels=label,
        )
        matcher = matcher.from_disk(patterns)
        log("RECIPE: Created PatternMatcher and loaded in patterns", patterns)
        # Annotate both pattern matches and model predictions, and feed
        # answers back into both models.
        predict, update = combine_models(model, matcher)
    else:
        predict = model
        update = model.update

    def _dedupe_by_input(examples):
        # Skip any task whose original input (e.g. the text) has already
        # been yielded, identified by the precomputed input hash.
        # NOTE(review): this only guards against duplicates produced within
        # this process's stream — duplicates from re-sent batches or prior
        # sessions would not be caught here; verify if duplicates persist.
        yielded = set()
        for task in examples:
            input_hash = task["_input_hash"]
            if input_hash in yielded:
                continue
            yielded.add(input_hash)
            yield task

    # Rank the stream by uncertainty. The ranking is continuous: model() is
    # a generator, so each model.update() call reshuffles what comes next.
    # The dedupe filter must run on the ranked stream, after prefer_uncertain.
    ranked = prefer_uncertain(predict(stream))
    stream = _dedupe_by_input(ranked)

    components = {
        "view_id": "classification",
        "dataset": dataset,
        "stream": stream,
        "exclude": exclude,
        "update": update,
        "config": {"lang": nlp.lang, "labels": model.labels},
    }
    return components