Hey everyone,
I'm currently working on a dataset with bank transaction data. I use a pre-calculation step that clusters examples into JSONL files. I review these files and group the examples into JSONL files that apply to a certain label or label-set. So I end up with JSONL files that I want to import into Prodigy with a given set of labels.
I've written my own custom recipe for this:
import json
from collections.abc import Iterable
from typing import Union
from prodigy.components.db import connect
from prodigy.components.stream import get_stream
from prodigy.core import Arg, recipe
@recipe(
    "bulk-import",
    # fmt: off
    dataset=Arg(help="Dataset to save annotations to"),
    source=Arg(help="Data to import (file path or '-' to read from standard input)"),
    label=Arg(
        "--label", "-l", help="Comma-separated label(s) to assign to all examples"
    ),
    loader=Arg(
        "--loader", "-lo", help="Loader (guessed from file extension if not set)"
    ),
    # fmt: on
)
def bulk_import(
    dataset: str,
    source: Union[str, Iterable[dict]],  # noqa
    label: Union[str, None] = None,  # noqa
    loader: Union[str, None] = None,  # noqa
) -> None:
    """
    Bulk import JSON data into a dataset with fixed labels.

    Validates the source (when it is a file path) as line-delimited JSON so
    malformed input fails with a precise line number, loads it through
    Prodigy's stream loader, stamps every example with the given label(s) as
    a pre-accepted annotation, and writes the examples straight into the
    target dataset without a manual review step.
    """
    print("RECIPE: Starting recipe bulk-import", locals())
    # Keep the parsed labels in a separate variable instead of rebinding
    # `label` from str to list; drop empty segments ("A,,B" -> ["A", "B"]).
    labels: list = []
    if label:
        labels = [part.strip() for part in label.split(",") if part.strip()]
    # Validate the source as a JSONL file before handing it to the loader.
    if isinstance(source, str) and source != "-":
        try:
            with open(source, encoding="utf-8") as f:
                for i, line in enumerate(f, start=1):
                    # Tolerate blank lines; they carry no example data.
                    if not line.strip():
                        continue
                    try:
                        json.loads(line)
                    except json.JSONDecodeError as e:
                        print(f"RECIPE: Invalid JSON on line {i} in file '{source}': {e}")
                        raise ValueError(
                            f"Invalid JSON on line {i} in file '{source}': {e}"
                        ) from e
        except FileNotFoundError as e:
            raise ValueError(f"Source file '{source}' not found.") from e
        except ValueError:
            # Re-raise our own invalid-JSON error unchanged; without this,
            # the broad handler below would double-wrap it.
            raise
        except Exception as e:
            raise ValueError(f"Error reading source file '{source}': {e}") from e
    stream = get_stream(source, loader=loader, rehash=True, input_key="text")
    # Materialize so we can detect an empty source before touching the DB.
    stream = list(stream)
    if not stream:
        raise ValueError("No examples loaded from the source. Make sure each JSONL line has a 'text' field.")
    print(f"Loaded {len(stream)} examples")

    def add_labels_to_stream(examples, labels):
        # Mark each example as already accepted with the fixed label set.
        for example in examples:
            # Copy the list so examples don't all share one mutable object.
            example["accept"] = list(labels)
            example["answer"] = "accept"
            yield example

    if labels:
        # Only announce (and apply) labels when some were actually given;
        # previously this printed "Adding labels None to examples".
        print(f"Adding labels {labels} to examples")
        stream = add_labels_to_stream(stream, labels)
    db = connect()
    if dataset not in db.datasets:
        db.add_dataset(dataset)
    db.add_examples(stream, [dataset])
    print(f"RECIPE: Successfully imported data into dataset '{dataset}'")
Since my model performed worse after my last imports, I now want to make sure that I'm not missing anything in the recipe.
Is the above correct? Do I need to keep anything in mind when adding new categories through this process? Do they need to be added to the options
somehow?
Thanks for your help!