Hi -- thanks for the reply! I've tried implementing this solution, but I haven't been able to get Prodigy running. I keep hitting a ValueError: Failed to load task (invalid JSON) error, which causes my custom recipe to fail. Apologies for the code dump, but I'm just going to share everything with you.
To briefly fill you in on the project, I'm trying to tag a bunch of Twitter data according to whether each tweet evinces one of three concepts: integrity, competence, and trust.
My custom recipe looks like this:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from collections import defaultdict
import srsly
from prodigy.core import recipe, recipe_args
from prodigy.components.preprocess import add_label_options
from prodigy.components.loaders import get_stream
from prodigy.util import log
def get_stream_with_matches(stream, patterns, nlp):
    patterns = srsly.read_jsonl(patterns)
    patterns_by_label = defaultdict(list)
    for pattern in patterns:
        patterns_by_label[pattern["label"]].append(pattern["pattern"])
    matcher = Matcher(nlp.vocab)
    for label, rules in patterns_by_label.items():
        matcher.add(label, *rules)
    data_tuples = ((eg["text"], eg) for eg in stream)
    for doc, eg in nlp.pipe(data_tuples, as_tuples=True):
        spans = []  # matched spans
        matched_labels = set()  # all labels that were matched
        for match_id, start, end in matcher(doc):
            span = Span(doc, start, end, label=match_id)
            matched_labels.add(span.label_)
            spans.append({"start": span.start_char, "end": span.end_char, "label": span.label_})
        eg["spans"] = spans
        eg["accept"] = list(matched_labels)
        yield eg
@recipe(
    "custom.textcat.manual",
    dataset=recipe_args["dataset"],
    spacy_model=recipe_args["spacy_model"],
    source=recipe_args["source"],
    api=recipe_args["api"],
    loader=recipe_args["loader"],
    label=recipe_args["label_set"],
    exclusive=recipe_args["exclusive"],
    exclude=recipe_args["exclude"],
    patterns=recipe_args["patterns"],
)
def manual(
    dataset,
    spacy_model,
    source=None,
    api=None,
    loader=None,
    label=None,
    exclusive=False,
    exclude=None,
    patterns=None,
):
    """
    Manually annotate categories that apply to a text. If more than one label
    is specified, categories are added as multiple choice options. If the
    --exclusive flag is set, categories become mutually exclusive, meaning that
    only one can be selected during annotation.
    """
    log("RECIPE: Starting recipe textcat.manual", locals())
    nlp = spacy.load(spacy_model)
    log("RECIPE: Loaded model {}".format(spacy_model))
    labels = label
    has_options = len(labels) > 1
    log("RECIPE: Annotating with {} labels".format(len(labels)), labels)
    stream = get_stream(
        source, api=api, loader=loader, rehash=True, dedup=True, input_key="text")
    stream = get_stream_with_matches(stream, patterns, nlp)
    if has_options:
        stream = add_label_options(stream, label)
    return {
        "view_id": "choice" if has_options else "classification",
        "dataset": dataset,
        "stream": stream,
        "exclude": exclude,
        "config": {
            "lang": nlp.lang,
            "labels": labels,
            "choice_style": "single" if exclusive else "multiple",
        },
    }
And my jsonl of patterns looks like this:
{"label": "TRUST", "pattern": [{"lower": "commitment"}]}
{"label": "TRUST", "pattern": [{"lower": "confidence"}]}
{"label": "TRUST", "pattern": [{"lower": "trust"}]}
{"label": "TRUST", "pattern": [{"lower": "believe"}]}
{"label": "TRUST", "pattern": [{"lower": "trusting"}]}
{"label": "TRUST", "pattern": [{"lower": "truth"}]}
{"label": "TRUST", "pattern": [{"lower": "dependability"}]}
{"label": "TRUST", "pattern": [{"lower": "responsibility"}]}
{"label": "INTEGRITY", "pattern": [{"lower": "integrity"}]}
{"label": "INTEGRITY", "pattern": [{"lower": "reliability"}]}
{"label": "INTEGRITY", "pattern": [{"lower": "honesty"}]}
{"label": "COMPETENCE", "pattern": [{"lower": "competence"}]}
{"label": "COMPETENCE", "pattern": [{"lower": "capability"}]}
I call it as follows:
prodigy custom.textcat.manual trust en_core_web_sm --label INTEGRITY,TRUST,COMPETENCE -F custom_teach.py --patterns trust.jsonl
The full error traceback is:
Exception when serving /get_session_questions
Traceback (most recent call last):
File "cython_src/prodigy/components/loaders.pyx", line 145, in prodigy.components.loaders.JSONL
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/srsly/_json_api.py", line 38, in json_loads
return ujson.loads(data)
ValueError: Expected object or value
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/waitress/channel.py", line 336, in service
task.service()
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/waitress/task.py", line 175, in service
self.execute()
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/waitress/task.py", line 452, in execute
app_iter = self.channel.server.application(env, start_response)
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/hug/api.py", line 451, in api_auto_instantiate
return module.__hug_wsgi__(*args, **kwargs)
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/falcon/api.py", line 244, in __call__
responder(req, resp, **params)
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/hug/interface.py", line 789, in __call__
raise exception
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/hug/interface.py", line 762, in __call__
self.render_content(self.call_function(input_parameters), context, request, response, **kwargs)
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/hug/interface.py", line 698, in call_function
return self.interface(**parameters)
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/hug/interface.py", line 100, in __call__
return __hug_internal_self._function(*args, **kwargs)
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/prodigy/_api/hug_app.py", line 228, in get_session_questions
tasks = controller.get_questions(session_id=session_id)
File "cython_src/prodigy/core.pyx", line 130, in prodigy.core.Controller.get_questions
File "cython_src/prodigy/components/feeds.pyx", line 58, in prodigy.components.feeds.SharedFeed.get_questions
File "cython_src/prodigy/components/feeds.pyx", line 63, in prodigy.components.feeds.SharedFeed.get_next_batch
File "cython_src/prodigy/components/feeds.pyx", line 140, in prodigy.components.feeds.SessionFeed.get_session_stream
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/toolz/itertoolz.py", line 376, in first
return next(iter(seq))
File "cython_src/prodigy/components/preprocess.pyx", line 237, in add_label_options
File "custom_teach.py", line 30, in get_stream_with_matches
for doc, eg in nlp.pipe(data_tuples, as_tuples=True):
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/spacy/language.py", line 723, in pipe
for doc, context in izip(docs, contexts):
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/spacy/language.py", line 751, in pipe
for doc in docs:
File "nn_parser.pyx", line 221, in pipe
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/spacy/util.py", line 463, in minibatch
batch = list(itertools.islice(items, int(batch_size)))
File "nn_parser.pyx", line 221, in pipe
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/spacy/util.py", line 463, in minibatch
batch = list(itertools.islice(items, int(batch_size)))
File "pipes.pyx", line 397, in pipe
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/spacy/util.py", line 463, in minibatch
batch = list(itertools.islice(items, int(batch_size)))
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/spacy/language.py", line 726, in <genexpr>
docs = (self.make_doc(text) for text in texts)
File "/anaconda3/envs/prodigy/lib/python3.6/site-packages/spacy/language.py", line 715, in <genexpr>
texts = (tc[0] for tc in text_context1)
File "custom_teach.py", line 29, in <genexpr>
data_tuples = ((eg["text"], eg) for eg in stream)
File "cython_src/prodigy/components/filters.pyx", line 35, in filter_duplicates
File "cython_src/prodigy/components/filters.pyx", line 16, in filter_empty
File "cython_src/prodigy/components/loaders.pyx", line 22, in _rehash_stream
File "cython_src/prodigy/components/loaders.pyx", line 152, in JSONL
ValueError: Failed to load task (invalid JSON).
I've tried reducing the number of labels in the patterns file to one, loading the patterns file directly from within the custom recipe, etc., but the error persists. I don't fully understand what the loading code is doing under the hood, so it's hard to debug. Any help appreciated.
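In case it helps narrow things down, this is the kind of line-by-line check I'm planning to run over the JSONL files to look for an offending line (standard library only, nothing Prodigy-specific; the default file name is just the patterns file from above):

# check_jsonl.py -- report any line of a JSONL file that isn't valid JSON
import json
import sys

def check_jsonl(path):
    with open(path, encoding="utf8") as f:
        for i, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                print("{}: line {} is empty".format(path, i))
            else:
                try:
                    json.loads(line)
                except ValueError as err:
                    print("{}: line {} is not valid JSON ({})".format(path, i, err))

for path in sys.argv[1:] or ["trust.jsonl"]:
    check_jsonl(path)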