Hello!
I have encountered the issue mentioned above and could not find a solution. I have searched this forum and Google but found nothing that helped.
I tried running with both PRODIGY_LOGGING=basic and PRODIGY_LOGGING=verbose. Neither log shows any errors. I also tried swapping SQLite for PostgreSQL and got the same error.
Specs:
prodigy==1.12.7
Ubuntu 23.10
Python 3.11.6
Custom recipe, run with:
script:
- >-
prodigy image.assisted images7
datasets/images7
datasets/images7.pred.beta.mini.json
--label TITLE,AUTHORS,AUTHOR_AFFILIATIONS,ABSTRACT,THANKS,BODY,FIGURE,FIGURE.CAPTION,TABLE,TABLE.CAPTION,EQUATION,PAGE_NO,BIBLIOGRAPHY,FOOTNOTES,META,MISC,HEADER,FOOTER,COPYRIGHT
-F scripts/recipes/assisted.py
recipe in question:
import copy
import json
import typing as t
from typing import List, Optional
import prodigy
from prodigy.components.loaders import Images
from prodigy.types import StreamType
from prodigy.util import get_labels, set_hashes, split_string
from scripts.constants import CLASS_NAMES
from scripts.recipes.loaders import _to_poly
class PredictedSpan(t.TypedDict):
    """Typed shape of one bounding-box span dict built by ``make_spans``."""

    # Class name looked up from CLASS_NAMES.
    label: str
    # First two coordinates of the predicted box (x0, y0).
    x: float
    y: float
    # Box extent: x1 - x0 and y1 - y0.
    width: float
    height: float
    # Midpoint of the box as (x, y).
    center: t.Tuple[float, float]
    # Polygon points produced by ``_to_poly`` for the box.
    points: t.List[t.Tuple[float, float]]
def make_spans(
    boxes: t.List[t.Tuple[float, float, float, float]],
    labels: List[int],
    scores: List[float],
) -> List[PredictedSpan]:
    """Turn predicted corner boxes into span dicts.

    Each box is unpacked as ``(x0, y0, x1, y1)``. The three inputs are walked
    in lockstep, so one span is produced per complete (box, label, score)
    triple. Scores only take part in the pairing; they are not stored in the
    resulting spans.
    """

    def to_span(bounds, class_id):
        left, top, right, bottom = bounds
        return {
            # NOTE(review): the `- 1` suggests label ids are 1-based; an id
            # of 0 would pick the last entry of CLASS_NAMES -- confirm
            # against the model output.
            "label": CLASS_NAMES[class_id - 1],
            "x": left,
            "y": top,
            "width": right - left,
            "height": bottom - top,
            "center": ((left + right) / 2, (top + bottom) / 2),
            "points": _to_poly(bounds),
        }

    return [
        to_span(bounds, class_id)
        for bounds, class_id, _score in zip(boxes, labels, scores)
    ]
def make_labels(bbox_path: str, stream: StreamType, threshold: float) -> StreamType:
    """Attach predicted bounding-box spans to the examples of a stream.

    Args:
        bbox_path: Path to a JSON file. It is read as a mapping from an
            example's ``"path"`` value to a prediction holding ``"boxes"``,
            ``"labels"`` and ``"scores"`` lists.
        stream: Incoming examples; each one must carry a ``"path"`` key.
        threshold: Only predictions whose score is strictly greater than
            this value are kept.

    Yields:
        A deep copy of each incoming example, passed through ``set_hashes``.
        When the predictions file has a (non-empty) entry for the example's
        path, the copy gets a ``"spans"`` list built by ``make_spans``.
    """
    with open(bbox_path, "r", encoding="utf-8") as f:
        predictions = json.load(f)

    # Walk the stream lazily: materialising it first would hold every loaded
    # example in memory before the first task could be served.
    for eg in stream:
        task = copy.deepcopy(eg)
        # NOTE(review): the lookup key is the example's "path" verbatim, so
        # the keys in the predictions file must match it exactly -- confirm.
        if prediction := predictions.get(task["path"]):
            # Filter boxes, labels and scores together in one pass so the
            # three lists cannot drift out of step.
            boxes, labels, scores = [], [], []
            for box, label, score in zip(
                prediction["boxes"], prediction["labels"], prediction["scores"]
            ):
                if score > threshold:
                    boxes.append(box)
                    labels.append(label)
                    scores.append(score)
            task["spans"] = make_spans(boxes, labels, scores)
        # NOTE(review): examples without a prediction are assumed to be
        # yielded as well (without "spans"); the pasted source lost its
        # indentation, so confirm this matches the intended behaviour.
        yield set_hashes(task)
@prodigy.recipe(
    "image.assisted",
    # fmt: off
    dataset=("Dataset to save annotations to", "positional", None, str),
    source=("Data to assisted/annotate (directory of images, file path or '-' to read from standard input)", "positional", None, str),  # noqa: E501
    bbox_path=("Path to the bounding box annotations file (this model doesn't have OCR installed)", "positional", None, str),  # noqa: E501
    label=("Comma-separated label(s) to annotate or text file with one label per line", "option", "l", get_labels),  # noqa: E501
    exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string),  # noqa: E501
    threshold=("Threshold to filter the predictions (0 - 1)", "option", "t", float),
    darken=("Darken image to make boxes stand out more", "flag", "D", bool),
    # fmt: on
)
def assisted(
    dataset: str,
    source: str,
    bbox_path: str,
    label: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    threshold: float = 0.7,
    darken: bool = False,
):
    """
    Annotate documents with the help of a layout model.

    Images are loaded from ``source`` and passed through ``make_labels``,
    which adds the predicted boxes read from ``bbox_path`` that score above
    ``threshold``. The result is served in the ``image_manual`` interface.
    """
    # Much of the following code is based on image.manual
    # Source: https://github.com/explosion/prodigy-recipes/blob/master/image/image_manual.py
    stream = Images(source)
    # Add the predicted spans (from the fine-tuned model's predictions file)
    # to each incoming image task.
    stream = make_labels(bbox_path, stream, threshold)

    def before_db(examples):
        # Replace inline "data:" image URIs with the image's path (when one
        # is present) before the examples are stored.
        for eg in examples:
            if eg["image"].startswith("data:") and "path" in eg:
                eg["image"] = eg["path"]
        return examples

    return {
        "view_id": "image_manual",  # Annotation interface to use
        "before_db": before_db,  # Function to call before the examples are added to the database
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            "label": ", ".join(label) if label is not None else "all",
            "labels": label,  # Selectable label options
            "darken_image": 0.3 if darken else 0,
        },
    }
An example of the data that is causing the error can be found here.
Any help would be greatly appreciated!