Hi, I'd like to modify the behavior of the receive_answers method from within a custom recipe. The reason is to avoid duplicate annotations in the database. I tried setting the exclude_by and exclude settings, but they don't seem to work. I assumed Prodigy would, by default, avoid storing duplicate entries in the DB, but that filtering doesn't seem to happen. So I modified the update() callback to save only distinct answers to the DB. However, it seems that Prodigy actually saves the answers to the DB before calling update().
Here's my recipe code:
def custom_recipe(
        dataset: str,
        view_id: str,
        spacy_model: str,
        source: str,
        label: Optional[List[str]],
        patterns: Optional[str] = None,
        exclude: Optional[List[str]] = None):
    """Build the components of a text-classification annotation recipe.

    The returned dictionary wires a ``blocks`` interface (an HTML block plus
    a free-text comment field) to an ``update`` callback that stores only
    answers whose ``meta.id`` has not been stored before.

    Args:
        dataset: Name of the dataset returned in the recipe components and
            used as the only entry of the ``exclude`` list.
        view_id: Accepted for interface compatibility; the returned
            ``view_id`` is always ``'blocks'``.
        spacy_model: Accepted but not used in this function body.
        source: Path of the JSONL file handed to ``JSONL``.
        label: Accepted but not used in this function body.
        patterns: Accepted but not used in this function body.
        exclude: Accepted but not used; the returned ``exclude`` entry is
            always ``[dataset]``.

    Returns:
        The recipe components dictionary (dataset, view_id, stream, update,
        on_exit, exclude, config, db).
    """
    LOGGER.info('RECIPE: Starting recipe textcat.custom-recipe')
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    print(source)
    # NOTE(review): `stream` and `ctrl` are not referenced again in this
    # function; they are kept because their construction may have side
    # effects that are not visible from here -- confirm before removing.
    stream = JSONL(source)
    ctrl = Controller('MRO_Monitoring_June', 'text', [], None, True, None, None, None, None, None, {})

    def on_save(answers):
        """Report how many answers remain after dropping already-saved ids.

        NOTE(review): this callback is not registered in the returned
        components and its filtered list is discarded.
        """
        print(f"\non_save received {len(answers)} annotations!")
        answers = [ans for ans in answers if ans["meta"]['id'] not in saved_data]
        print(f"\non_save filtered to {len(answers)} annotations!")

    def update(answers):
        """Store each newly received answer at most once per ``meta.id``."""
        # This function is triggered when Prodigy receives annotations
        print(f"\nReceived {len(answers)} annotations!")
        fresh = []
        for ans in answers:
            example_id = ans["meta"]['id']
            if example_id in saved_data:
                continue
            # Record the id immediately so a second answer with the same id
            # later in this same batch is also treated as a duplicate.
            saved_data.add(example_id)
            ans['answer_timestamp'] = int(time.time())
            # Remove the html coming from the view id; pop() tolerates
            # answers that do not carry these keys.
            ans.pop('html', None)
            ans.pop('id', None)
            fresh.append(ans)
        print(f"\nSaving {len(fresh)} annotations!")
        if fresh:
            # NOTE(review): the dataset name is hard-coded here while the
            # recipe returns the `dataset` argument -- confirm they are
            # meant to be the same dataset.
            db.add_examples(fresh, datasets=['MRO_Monitoring_June'])
        # db_answers = [{"id": eg["id"], "answer": eg["answer"]} for eg in answers]
        # save_to_your_custom_db(db_answers)

    def on_exit(controller):
        """Hook invoked when the Prodigy server is stopped (e.g. Ctrl+C).

        Receives the Controller, which gives access to the database.
        Currently a no-op. Planned work, not implemented yet: split the
        data into train/validation/test and write it in flashtext format.
        """

    return {
        "dataset": dataset,
        "view_id": 'blocks',
        "stream": get_stream_loop(),  # list conversion for progress bar on UI
        "update": update,
        "on_exit": on_exit,
        'exclude': [dataset],
        # 'db': False,
        "config": {
            'exclude_by': 'input',
            "blocks": [
                {"view_id": "html"},
                {
                    "view_id": "text_input",
                    "field_id": "answer_comment",
                    "field_label": "Enter your comments here (Optional)",
                    "field_rows": 5,
                    "field_placeholder": "e.g Description is too short, and contains undefined abbreviations",
                },
            ],
        },
        # NOTE(review): with a database configured here, Prodigy may also
        # store the received answers itself, in addition to the explicit
        # add_examples() call in update() -- verify against the Prodigy
        # recipe-components docs whether this causes double storage.
        'db': db,
    }