I am using the script below (recipe.py):
import prodigy
import srsly
from typing import Dict
from functools import partial
import logging
# Configure the root logger at DEBUG level so that debug, info, warning and
# error messages logged through the `logging` module are all emitted.
logging.basicConfig(level=logging.DEBUG)
# Fields a gold record must carry before an answer can be compared against it.
_REQUIRED_GOLD_FIELDS = ("text", "label1", "label2")
# Fields an incoming task must carry to be served to the annotator.
_REQUIRED_TASK_FIELDS = ("_input_hash", "text")


def _empty_components(dataset: str) -> Dict:
    """Return recipe components with an empty stream (used when loading fails)."""
    return {"dataset": dataset, "stream": [], "view_id": "blocks"}


def _correct_answers(eg: Dict, gold_answer: Dict) -> bool:
    """Return True if ``eg`` matches ``gold_answer`` on text, label1 and label2.

    A gold record that lacks any of the required fields is logged and treated
    as a mismatch. Fields missing from ``eg`` are compared as ``None`` instead
    of raising ``KeyError``.
    """
    logging.debug(f"Evaluating example: {eg}")
    logging.debug(f"Gold answer: {gold_answer}")
    for field in _REQUIRED_GOLD_FIELDS:
        if field not in gold_answer:
            logging.warning(f"Gold answer missing '{field}' field: {gold_answer}")
            return False
    return (
        eg.get("text") == gold_answer["text"]
        and eg.get("label1") == gold_answer["label1"]
        and eg.get("label2") == gold_answer["label2"]
    )


def _validate_answer(eg: Dict, gold_answers: Dict) -> None:
    """Check a submitted answer against its gold record, if one exists.

    Args:
        eg: The answered example.
        gold_answers: Gold records keyed by ``_input_hash``.

    Raises:
        AssertionError: If a gold record exists for the example and the answer
            does not match it.
    """
    logging.debug(f"Validating example: {eg}")
    if "_input_hash" not in eg:
        logging.warning("Example does not contain '_input_hash'")
        return
    input_hash = eg["_input_hash"]
    if input_hash not in gold_answers:
        logging.warning(f"No gold answer found for _input_hash: {input_hash}")
        return
    if not _correct_answers(eg, gold_answers[input_hash]):
        # Raised explicitly (same exception type as the former `assert`) so the
        # check is not stripped when Python runs with -O.
        raise AssertionError("You need to pay more attention!")


@prodigy.recipe(
    "my-custom-recipe",
    dataset=("Dataset to save answers to", "positional", None, str),
    jsonl_file=("Jsonl File to Label", "positional", None, str),
    gold_file=("Gold Answers Jsonl File", "positional", None, str),
)
def my_custom_recipe(dataset, jsonl_file, gold_file):
    """Serve examples from a JSONL file and validate answers against gold data.

    Args:
        dataset: Name of the dataset to save answers to.
        jsonl_file: Path to the JSONL file with the examples to label.
        gold_file: Path to the JSONL file with gold answers; each record is
            keyed by its ``_input_hash``.

    Returns:
        The recipe components dict. If either file cannot be loaded or is
        empty, the error is logged and components with an empty stream are
        returned instead.
    """
    # Load the examples to annotate.
    try:
        stream = list(srsly.read_jsonl(jsonl_file))
    except Exception as e:
        logging.exception(f"Failed to load stream from {jsonl_file}: {e}")
        return _empty_components(dataset)
    logging.info(f"Loaded {len(stream)} examples from {jsonl_file}")
    if not stream:
        logging.error("No examples found in the JSONL file.")
        return _empty_components(dataset)

    # Load the gold answers, indexed by input hash. A record without
    # "_input_hash" raises KeyError, which is handled like any other load error.
    try:
        gold_answers = {
            item["_input_hash"]: item for item in srsly.read_jsonl(gold_file)
        }
    except Exception as e:
        logging.exception(f"Failed to load gold answers from {gold_file}: {e}")
        return _empty_components(dataset)
    logging.info(f"Loaded {len(gold_answers)} gold answers from {gold_file}")
    if not gold_answers:
        logging.error("No gold answers found in the JSONL file.")
        return _empty_components(dataset)

    # Keep only examples that carry the required fields. Invalid examples are
    # logged and dropped rather than being served to the annotator.
    valid_stream = []
    for example in stream:
        if any(field not in example for field in _REQUIRED_TASK_FIELDS):
            logging.error(f"Example missing required fields: {example}")
            continue
        example.setdefault("label1", None)
        example.setdefault("label2", None)
        example.setdefault("accept", [])
        example.setdefault("reject", [])
        example.setdefault("ignore", [])
        valid_stream.append(example)

    # NOTE(review): the Prodigy docs appear to say that the "choice" interface
    # reads its options from an "options" list on each task and stores the
    # selected ids under "accept", and that multiple blocks of the same type
    # are only supported for "text_input" and "html". If so, the "choices" and
    # "field_id" keys below are not picked up, "label1"/"label2" are never
    # filled in by the UI, and gold validation will always fail - confirm
    # against the Prodigy documentation. Gold label values must also use the
    # same vocabulary as the choice ids ("easy"/"medium"/"hard") to compare
    # equal.
    blocks = [
        {"view_id": "html", "html_template": "{{text}}"},
        {
            "view_id": "choice",
            "field_id": "label1",
            "text": "اختر المستوى الأول:",
            "choices": [
                {"id": "easy", "text": "سهل"},
                {"id": "medium", "text": "متوسط"},
                {"id": "hard", "text": "صعب"},
            ],
        },
        {
            "view_id": "choice",
            "field_id": "label2",
            "text": "اختر المستوى الثاني:",
            "choices": [
                {"id": "easy", "text": "سهل"},
                {"id": "medium", "text": "متوسط"},
                {"id": "hard", "text": "صعب"},
            ],
        },
    ]

    return {
        "dataset": dataset,
        "view_id": "blocks",
        "stream": valid_stream,
        "config": {"blocks": blocks},
        "validate_answer": partial(_validate_answer, gold_answers=gold_answers),
    }
However, I ran into an error.
My data samples look like this:
(gold.jsonl)
{"_input_hash": 3, "text": "ميرزاجن3 هو واحد من مجموعة من الأدوية تسمى مضادات الاكتئاب.تستخدم أقراص ميرزاجن لعلاج مرض الاكتئاب.", "label1": "صعب", "label2": "متوسط"}
(sample.jsonl)
{"_input_hash": 1, "text": "ميرزاجن1 هو واحد من مجموعة من الأدوية تسمى مضادات الاكتئاب.تستخدم أقراص ميرزاجن لعلاج مرض الاكتئاب."}
{"_input_hash": 2, "text": "ميرزاجن2 هو واحد من مجموعة من الأدوية تسمى مضادات الاكتئاب.تستخدم أقراص ميرزاجن لعلاج مرض الاكتئاب."}
{"_input_hash": 3, "text": "ميرزاجن3 هو واحد من مجموعة من الأدوية تسمى مضادات الاكتئاب.تستخدم أقراص ميرزاجن لعلاج مرض الاكتئاب."}
{"_input_hash": 4, "text": "ميرزاجن4 هو واحد من مجموعة من الأدوية تسمى مضادات الاكتئاب.تستخدم أقراص ميرزاجن لعلاج مرض الاكتئاب."}
!python -m prodigy my-custom-recipe testing_new_1 sample.jsonl gold.jsonl -F recipe.py