Hello!
I am writing a custom recipe that uses multiple NER blocks, each relying on a different set of tokens. The task is such that all blocks need to be annotated at the same time; I can't easily break it into multiple tasks. However, I have only found how to make all the NER blocks rely on the same set of tokens, read from the "tokens" key of the example. Ideally, each NER block would pull its tokens from a different key, configured per block the way "field_id" is for "text_input".
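For reference, this is the kind of per-block binding I mean, using the "field_id" option that "text_input" blocks already support (the key name here is made up):

{"view_id": "text_input", "field_id": "user_notes", "field_label": "Notes"}  # writes to example["user_notes"]

I'd like the "ner_manual" equivalent, but for reading tokens rather than writing text.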
Here's a simplified version of my recipe:
from pathlib import Path
from typing import Union, Dict, Any

import srsly
from prodigy.core import recipe


@recipe(
    "ner.double",
    dataset=("Dataset to save annotations to", "positional", None, str),
    example_file=("JSONL file with examples", "positional", None, str),
)
def ner_double(
    dataset: str,
    example_file: Union[str, Path],
) -> Dict[str, Any]:
    def get_stream(examples):
        # One task per input line, with a separate token list per field.
        for example in examples:
            yield {
                "id": example["id"],
                "tokens_1": make_prodigy_tokens(example["key_1"]),
                "tokens_2": make_prodigy_tokens(example["key_2"]),
            }

    examples = srsly.read_jsonl(example_file)
    stream = get_stream(examples)
    blocks = [
        {
            "view_id": "ner_manual",
            "field_id": "tokens_1",  # Does not work.
            "labels": ["label_1"],
        },
        {
            "view_id": "ner_manual",
            "field_id": "tokens_2",  # Does not work.
            "labels": ["label_2"],
        },
    ]
    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "blocks",
        "config": {
            "blocks": blocks,
        },
    }
class Tokenizer:
    """Stand-in for my real tokenizer: naive whitespace splitting."""

    def tokenize(self, text):
        return text.split()


def make_prodigy_tokens(
    string: str,
    tokenizer: Tokenizer = Tokenizer(),
):
    def wrap_token(i, token, last_end):
        # Find the token's character offsets, searching from the end of the
        # previous token so repeated tokens map to successive occurrences.
        start = last_end + string[last_end:].find(token)
        end = start + len(token)
        return {
            "id": i,
            "text": token,
            "start": start,
            "end": end,
        }

    token_list = []
    for i, token in enumerate(tokenizer.tokenize(string)):
        last_end = 0 if i == 0 else token_list[-1]["end"]
        token_dict = wrap_token(i, token, last_end)
        token_list.append(token_dict)
    return token_list
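For context, each line of my input JSONL looks roughly like this (values made up):

{"id": 1, "key_1": "first text to annotate", "key_2": "second text to annotate"}

and make_prodigy_tokens just turns a string into Prodigy-style token dicts:

make_prodigy_tokens("hello world")
# [{"id": 0, "text": "hello", "start": 0, "end": 5},
#  {"id": 1, "text": "world", "start": 6, "end": 11}]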
Is there any way to do this?
Thank you in advance!