I searched but did not identify a way to customize the text displayed in the rows of history; I have a use case where snippets are extracted from larger documents and numbered snippet are displayed for labeling within a NER task. It's helpful to know which offset the numbered snippet is from. Is it possible to display additional or custom information in the history pane for each instance of labeled data?
EDIT
Following this thread I used blocks
to define a ner_manual that pulled text from a defined text.input
field. However ner_manual
still pulls from text
and not text.input
as expected, e.g.
import prodigy
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens
from prodigy.util import split_string
import spacy
from typing import List, Optional
# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
@prodigy.recipe(
"ner.form-ui",
dataset=("The dataset to use", "positional", None, str),
spacy_model=("The base model", "positional", None, str),
source=("The source data as a JSONL file", "positional", None, str),
label=("One or more comma-separated labels", "option", "l", split_string),
exclude=("Names of datasets to exclude", "option", "e", split_string),
)
def ner_form_ui(
dataset: str,
spacy_model: str,
source: str,
label: Optional[List[str]] = None,
exclude: Optional[List[str]] = None,
):
"""
Mark spans manually by token. Requires only a tokenizer and no entity
recognizer, and doesn't do any active learning.
"""
# Load the spaCy model for tokenization
nlp = spacy.load(spacy_model)
# Load the stream from a JSONL file and return a generator that yields a
# dictionary for each example in the data.
stream = JSONL(source)
# Tokenize the incoming examples and add a "tokens" property to each
# example. Also handles pre-defined selected spans. Tokenization allows
# faster highlighting, because the selection can "snap" to token boundaries.
stream = add_tokens(nlp, stream)
blocks = [
{"view_id": "ner_manual", "text":"input.text"}
#{"view_id": "ner_manual"}
]
return {
"dataset": dataset, # Name of dataset to save annotations
"view_id": "blocks", # set the view_id to "blocks"
"stream": stream, # Incoming stream of examples
"exclude": exclude, # List of dataset names to exclude
"config": { # Additional config settings, mostly for app UI
"lang": nlp.lang,
"labels": label, # Selectable label options,
"blocks": blocks # add the blocks to the config
},
}
With,
{"text": "BOOGER", "meta":{"form_name":"12588114345_wehf_SampleProjectSupportProposal.txt","page_number":0,"n":200,"jsonl_version":"0.01"},"input.text":"\n\nWALTER & ELISE ...
but, prodigy ner.form-ui ner_forms en_core_web_lg ./data/processed/form_snippets.jsonl --label FIELD,DESCRIPTION,ANSWER -F ./src/models/form_ui.py
displays "BOOGER" instead of "\n\nWALTER & ELISE ..." as expected.