Hello,
Thanks a lot for your message. I did not receive any notifications.
We are indeed using pdf.image.manual.
@prodigy.recipe(
    "custom.image.manual",
    document_type=("The document type to annotate", "positional", None, DocumentType),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
    darken=("Darken image to make boxes stand out more", "flag", "D", bool),
)
def image_manual(
    document_type: DocumentType,
    exclude: Optional[List[str]] = None,
    darken: bool = False,
) -> ControllerComponentsDict:
    """Manually annotate images by drawing rectangular bounding boxes or
    polygon shapes on the image.

    Collects the PDFs found for ``document_type``, asks the user which field
    to annotate, loads the precomputed boxes for that field and returns the
    recipe components built from them.

    Args:
        document_type: The document type whose PDFs should be annotated.
        exclude: Names of datasets to exclude, forwarded as ``"exclude"``.
        darken: When set, ``"darken_image"`` is configured to ``0.3``
            (otherwise ``0``) so that boxes stand out more.

    Returns:
        The recipe components: dataset name, view id, ``before_db`` callback,
        example stream, excluded datasets and UI config.
    """
    pdf_2_images = {
        pdf_path: pdf_to_images(pdf_path)
        for pdf_path in find_category_pdfs(document_type)
    }
    logger.info(
        "Found images to annotate.",
        num_pdfs=len(pdf_2_images),
        num_images=sum(len(images) for images in pdf_2_images.values()),
    )

    # Have the user choose the field to annotate
    field = choose_field(document_type)

    # Load LLM-generated bounding boxes. The keys of ``pdf_2_images`` are
    # reused so both mappings are guaranteed to cover the same PDFs and the
    # PDF lookup is not performed a second time.
    pdf_2_boxes = {
        pdf_path: load_precomputed_boxes(pdf_path, field)
        for pdf_path in pdf_2_images
    }

    # Build the examples to annotate
    examples = image_generator(pdf_2_images, pdf_2_boxes, field)
    logger.info(
        "Generated examples to annotate.",
        num_examples=len(examples),
        field_chosen=field,
    )

    # NOTE(review): image_generator emits tasks shaped as
    # {"pages": [...], "config": {"view_id": "pages"}} while the recipe-level
    # view id below is "image_manual". Confirm against the Prodigy "pages"
    # interface documentation which of the two belongs at recipe level.
    return {
        "dataset": get_dataset_name(document_type, field),  # Where annotations are saved
        "view_id": "image_manual",
        "before_db": remove_b64,
        "stream": examples,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            "labels": [field],
            # Honour the --darken flag; it was previously accepted but ignored.
            "darken_image": 0.3 if darken else 0,
            # "pages_show_thumbnails": True,
            "image_manual_stroke_width": 2,
            "custom_theme": {
                "labels": {
                    field: LABEL_COLOR,
                }
            },
        },
    }
This is our recipe code. The config seems to be here?
As well as our image generator:
def image_generator(
    pdf_to_images: dict[Path, List[Path]],
    pdf_to_boxes: dict[Path, List[BoundingBox]],
    field: str,
) -> List[dict]:
    """Build one multi-page annotation task per PDF.

    Args:
        pdf_to_images: Maps each PDF path to the image paths of its pages.
        pdf_to_boxes: Maps each PDF path to its precomputed bounding boxes;
            PDFs missing from this mapping get no pre-filled spans.
        field: The field being annotated; used as the label of every span
            and recorded in each page's ``meta``.

    Returns:
        A list with one hashed task per PDF. Each task holds its page
        examples under ``"pages"``, ordered by page number.
    """

    def make_example(
        parsed_path,
        pdf_path: str,
        img_path: Path,
        page,
        page_boxes: list[BoundingBox],
    ) -> dict:
        """Build the example dict for a single page image."""
        with Image.open(img_path) as img:
            width, height = img.size
        return {
            "image": file_to_b64(img_path),
            "view_id": "image_manual",
            "width": width,
            "height": height,
            "spans": [
                infer_span_from_bbox(box, width, height, field) for box in page_boxes
            ],
            "meta": {
                "page": page,
                "parsed_path": parsed_path,
                "pdf_path": pdf_path,
                "field": field,
            },
            "path": img_path.as_posix(),
        }

    # Process all PDFs and collect one task per PDF
    all_pdfs = []
    for pdf_path, img_paths in pdf_to_images.items():
        pdf_boxes = pdf_to_boxes.get(pdf_path, [])
        pdf_posix = pdf_path.as_posix()

        # The parse path depends only on the PDF, so resolve it once per PDF
        # rather than once per page.
        parsed_path = TestDocument.get_cached_parse_path(
            PARSED_DOCUMENTS_FOLDER.as_posix(), pdf_posix
        )
        # Task dicts must be JSON-serializable: store a plain string in
        # ``meta`` if a Path object came back; any other value is left as is.
        if isinstance(parsed_path, Path):
            parsed_path = parsed_path.as_posix()

        all_examples = []
        for img_path in sorted(img_paths, key=infer_page_from_image_path):
            page = infer_page_from_image_path(img_path)
            page_boxes = [box for box in pdf_boxes if box.page == page]
            all_examples.append(
                make_example(parsed_path, pdf_posix, img_path, page, page_boxes)
            )

        # NOTE(review): a PDF without any page image still yields a task with
        # an empty "pages" list - confirm the UI can render that, or filter
        # such PDFs out before they reach this function.
        all_pdfs.append(
            set_hashes(
                {
                    "pages": all_examples,
                    "config": {
                        "view_id": "pages",
                    },
                }
            )
        )
    return all_pdfs
Do you spot any issues? The error message is quite broad.
Thanks a lot for your help,
Pierre