import prodigy
from prodigy.components.loaders import JSONL
from prodigy.util import split_spans
@prodigy.recipe("custom_ner_recipe",
dataset=("Menus", "positional", None, str),
label1=("menu_item", "positional", None, str),
label2=("price", "positional", None, str),
span_label=("Menu_group", "positional", None, str))
def custom_ner_annotation(dataset, Menu_Item, price, menu_group):
def add_tokens_to_spans(examples):
for eg in examples:
text = eg["text"] # Assuming your data has a "text" field
menu_items = eg.get("MENU_ITEM", ["Meat Lovers"])
prices = eg.get("PRICE", ["1.99"]) # Assuming you have a PRICE field
menu_groups = eg.get("Menu_group", ["Extra Large Pizza"]) # Assuming you have a Menu_group field
spans = []
for menu_item in menu_items:
spans.append({
"start": text.index(menu_item),
"end": text.index(menu_item) + len(menu_item),
"label": menu_item # Replace with the appropriate label for menu items
})
for price in prices:
spans.append({
"start": text.index(price),
"end": text.index(price) + len(price),
"label": price # Replace with the appropriate label for prices
})
for menu_group in menu_groups:
spans.append({
"start": text.index(menu_group),
"end": text.index(menu_group) + len(menu_group),
"label": menu_group # Replace with the appropriate label for menu groups
})
eg["spans"] = spans
yield eg
stream = JSONL("C:\Users\matt\Downloads\json-fixer (3).jsonl") # Corrected path
components = [
add_tokens_to_spans,
"ner_manual", # Built-in NER annotation UI
{
"label": menu_group, # Use span_label instead of menu_group
"pattern": [{"label": menu_group}], # Corrected pattern label
"on_exit": prodigy.set_hashes,
},
]
return {
"dataset": dataset,
"stream": stream,
"view_id": "ner_manual",
"config": {
"labels": [Menu_Item, price,menu_group],
"exclude_by": input,
},
"update": None,
"before_db": None,
"after_db": None,
"on_exit": None,
"config_auto": False,
"progress": None,
"total": None,
"get_session_id": None,
"components": components,
}
The first few lines of the jsonl file look like this
[
{"restaurant_id": "restaurant_id", "category": "category", "name": "name", "description": "description", "price": "price"},
{"restaurant_id": "1", "category": "Extra Large Pizza", "name": "Extra Large Meat Lovers", "description": "Whole pie.", "price": "15.99 USD"},
{"restaurant_id": "1", "category": "Extra Large Pizza", "name": "Extra Large Supreme", "description": "Whole pie.", "price": "15.99 USD"},
{"restaurant_id": "1", "category": "Extra Large Pizza", "name": "Extra Large Pepperoni", "description": "Whole pie.", "price": "14.99 USD"},
The error I am getting is
Using 18 labels from model: CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC,
MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART
✘ Error while validating stream: no first example.
This likely means that your stream is empty. This can also mean all the examples
in your stream have been annotated in datasets included in your --exclude recipe
parameter.
This is my very first NER project for spacy forgive me if I am missing something basic. Any help would be wonderful.