Hi there. I'm using a variant of your cat facts custom recipe with a ner and a textcat to train an initial data set:
@prodigy.recipe("ares")
def ares(dataset, lang="en"):
blocks = [
{"view_id": "ner_manual"},
{"view_id": "choice", "text": None, "choice_style": "multiple"},
]
options = [
{"id": 5, "text": "Meeting tag"},
{"id": 4, "text": "Document tag"},
{"id": 3, "text": "Agenda tag"},
{"id": 2, "text": "OPGA tag"},
{"id": 1, "text": "Substantive tag"},
{"id": 0, "text": "No tag"},
]
def get_stream():
with Session(engine) as session:
res = session.exec(select(Paragraph)).all()
for fact in res:
yield {"text": fact.text, "options": options}
nlp = spacy.load("en_core_web_sm")
# nlp = spacy.blank(lang) # blank spaCy pipeline for tokenization
stream = get_stream() # set up the stream
stream = add_tokens(nlp, stream) # tokenize the stream for ner_manual
return {
"dataset": dataset, # the dataset to save annotations to
"view_id": "blocks", # set the view_id to "blocks"
"stream": stream, # the stream of incoming examples
"config": {
"labels": [
"SESSION",
"MANDATEE",
],
"blocks": blocks
},
}
The resulting data set is then passed through prodigy train
and saved as a model. I have then modified your textcat.teach to continue training with the model 'in the loop':
@prodigy.recipe("ares-in-the-loop")
def ares(dataset, lang="en"):
blocks = [
{"view_id": "ner_manual"},
{"view_id": "choice", "text": None, "choice_style": "multiple"},
]
options = [
{"id": 5, "text": "Meeting tag"},
{"id": 4, "text": "Document tag"},
{"id": 3, "text": "Agenda tag"},
{"id": 2, "text": "OPGA tag"},
{"id": 1, "text": "Substantive tag"},
{"id": 0, "text": "No tag"},
]
def get_stream():
with Session(engine) as session:
res = session.exec(select(Paragraph)).all()
for fact in res:
yield {"text": fact.text, "options": options}
stream = get_stream() # set up the stream
nlp = spacy.load("v1/model-best")
label = "5", "4", "3", "2", "1", "0"
model = TextClassifier(nlp, label)
predict = model
update = model.update
stream = add_tokens(nlp, stream) # tokenize the stream for ner_manual
stream = prefer_uncertain(predict(stream))
# stream = add_tokens(nlp, stream) # tokenize the stream for ner_manual
return {
"dataset": dataset, # the dataset to save annotations to
"view_id": "classification", # set the view_id to "blocks"
"stream": stream, # the stream of incoming examples
"update": update # a function for updating the model
}
Unfortunately, the above code returns an error:
File "/Users/lt/python/recipe-in-the-loop.py", line 42, in ares model = TextClassifier(nlp, label)
File "cython_src/prodigy/models/textcat.pyx", line 87, in prodigy.models.textcat.TextClassifier.__init__
TypeError: __init__() takes exactly 4 positional arguments (3 given)
Would anyone be able to tell me what is happening here? There might be other bugs in the above code - I have not been able to move beyond this above. Thanks a mil, and also for a great product!