ENV
pandas==1.4.2
transformers==4.17.0
spacy==3.2.4
spacy-alignments==0.8.5
spacy-legacy==3.0.9
spacy-loggers==1.0.2
spacy-sentence-bert==0.1.2
spacy-transformers==1.1.5
cupy-cuda113==10.5.0
I am using the bert.ner.manual recipe:
@prodigy.recipe(
    "bert.ner.manual",
    # fmt: off
    dataset=("Dataset to save annotations to", "positional", None, str),
    source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
    loader=("Loader (guessed from file extension if not set)", "option", "lo", str),
    label=("Comma-separated label(s) to annotate or text file with one label per line", "option", "l", get_labels),
    tokenizer_vocab=("Tokenizer vocab file", "option", "tv", str),
    lowercase=("Set lowercase=True for tokenizer", "flag", "LC", bool),
    hide_special=("Hide SEP and CLS tokens visually", "flag", "HS", bool),
    hide_wp_prefix=("Hide wordpieces prefix like ##", "flag", "HW", bool),
    suggest_model=("Model to predict labels", "option", "sm", str),
    # fmt: on
)
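For reference, the rest of the file is essentially the example recipe. Abbreviated, the imports and the part of the recipe function that define the names the helpers below close over look roughly like this (the function name here is a placeholder, and I've hard-coded the BERT defaults for the special tokens and the ## prefix):

import copy

import prodigy
import spacy
from prodigy import set_hashes
from prodigy.components.loaders import get_stream
from prodigy.util import get_labels
from tokenizers import BertWordPieceTokenizer


def bert_ner_manual(
    dataset, source, loader=None, label=None, tokenizer_vocab=None,
    lowercase=False, hide_special=False, hide_wp_prefix=False,
    suggest_model=None,
):
    # Wordpiece tokenizer built from the vocab file passed on the CLI
    tokenizer = BertWordPieceTokenizer(tokenizer_vocab, lowercase=lowercase)
    # Special tokens and continuation prefix (BERT defaults)
    special_tokens = ("[SEP]", "[CLS]")
    wp_prefix = "##"
    ...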
def add_tokens(stream):
    for eg in stream:
        eg_tokens = BertTokenizer(eg)
        eg["tokens"] = eg_tokens
        yield eg
def BertTokenizer(eg):
    tokens = tokenizer.encode(eg["text"])
    eg_tokens = []
    idx = 0
    for (text, (start, end), tid) in zip(
        tokens.tokens, tokens.offsets, tokens.ids
    ):
        # If we don't want to see special tokens, don't add them
        if hide_special and text in special_tokens:
            continue
        # If we want to strip out the word piece prefix, remove it from the text
        if hide_wp_prefix and wp_prefix is not None:
            if text.startswith(wp_prefix):
                text = text[len(wp_prefix):]
        token = {
            "text": text,
            "id": idx,
            "start": start,
            "end": end,
            # This is the encoded ID returned by the tokenizer
            "tokenizer_id": tid,
            # Don't allow selecting special SEP/CLS tokens
            "disabled": text in special_tokens,
        }
        eg_tokens.append(token)
        idx += 1
    for i, token in enumerate(eg_tokens):
        # If the next start offset != the current end offset, we
        # assume there's whitespace in between
        if i < len(eg_tokens) - 1 and token["text"] not in special_tokens:
            next_token = eg_tokens[i + 1]
            token["ws"] = (
                next_token["start"] > token["end"]
                or next_token["text"] in special_tokens
            )
        else:
            token["ws"] = True
    return eg_tokens
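(For what it's worth, this is how I've been eyeballing the output of that helper; the text is just a made-up sample.)

# Print the character offsets BertTokenizer assigns to each wordpiece.
example = {"text": "Acme Corp hired Jane Doe in 2020."}
for tok in BertTokenizer(example):
    print(tok["id"], repr(tok["text"]), tok["start"], tok["end"], tok["ws"])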
Together with this make_tasks function:
def make_tasks(nlp, stream, labels):
    """Add a 'spans' key to each example, with predicted entities."""
    # Process the stream using spaCy's nlp.pipe, which yields doc objects.
    # If as_tuples=True is set, you can pass in (text, context) tuples.
    texts = ((eg["text"], eg) for eg in stream)
    for doc, eg in nlp.pipe(texts, as_tuples=True):
        task = copy.deepcopy(eg)
        spans = []
        for ent in doc.ents:
            # Skip the predicted entity if its label isn't in the selected labels
            if labels and ent.label_ not in labels:
                continue
            # Create a span dict for the predicted entity
            try:
                if len(doc._.trf_data.align[ent.start].data) == 0:
                    continue
                spans.append(
                    {
                        "token_start": int(doc._.trf_data.align[ent.start].data[0][0]),
                        "token_end": int(doc._.trf_data.align[ent.end - 1].data[-1][0]),
                        "start": ent.start_char,
                        "end": ent.end_char,
                        "text": ent.text,
                        "label": ent.label_,
                    }
                )
            except Exception as e:
                print(e)
                # import code; code.interact(local=locals())
                raise e
        task["spans"] = spans
        # Rehash the newly created task so that hashes reflect the added data
        task = set_hashes(task)
        yield task
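At the end of the recipe function everything is wired together roughly like this (get_stream is Prodigy's loader helper imported above; I've trimmed the config down to the labels):

    # Load the suggestion pipeline and build the stream:
    # predicted spans first, then wordpiece tokens for the UI.
    nlp = spacy.load(suggest_model)
    stream = get_stream(source, loader=loader, input_key="text")
    stream = make_tasks(nlp, stream, label)
    stream = add_tokens(stream)

    return {
        "view_id": "ner_manual",
        "dataset": dataset,
        "stream": stream,
        "config": {"labels": label},
    }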
It seems like the alignment is off: each labeled token appears to be shifted two characters to the right. I think it has something to do with the word piece prefix, but I'm drawing a blank on how to adjust for it.
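In case it helps, this is the kind of check I've been running to see where the two sets of character offsets stop agreeing (again with a made-up sample text; suggest_model is the pipeline I pass in with -sm):

# Compare the wordpiece offsets with the character offsets of the
# entities predicted by the spaCy transformer pipeline.
nlp = spacy.load(suggest_model)
text = "Acme Corp hired Jane Doe in 2020."
doc = nlp(text)

for tok in BertTokenizer({"text": text}):
    print("wordpiece:", tok["id"], repr(tok["text"]), tok["start"], tok["end"])
for ent in doc.ents:
    print("spacy ent:", repr(ent.text), ent.start_char, ent.end_char, ent.label_)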