I have annotation data that was generated on paragraphs in the UI. I tried training a sentence-level NER model with batch-train, but discovered that Prodigy's split_sentences function has two unexpected behaviors:
- It drops all spans from every sentence except the first.
- It copies the full set of original tokens into every sentence.
I have code below that solves the issue for my use case (reading in JSONL files). I've tested it in the UI, and the new spans all line up correctly, but it would probably need to be altered for a general-purpose Prodigy solution.
Edit: added a newline character ("\n") to the sample text to demonstrate the need for the strip() call.
import copy

def split_task_sentences(nlp, tasks):
    """
    Split paragraph-level tasks into one task per sentence.
    Throws away spans across sentence boundaries.
    TODO: option to trim spans across boundaries instead of dropping them.
    """
    def _clean_toks(toks, start_char, end_char):
        # Keep only the tokens inside the sentence, then re-base their
        # character offsets and token ids relative to the sentence start.
        toks = [tok for tok in toks if tok['start'] >= start_char and tok['end'] <= end_char]
        id_offset = toks[0]['id']
        for tok in toks:
            tok['start'] -= start_char
            tok['end'] -= start_char
            tok['id'] -= id_offset
        return id_offset, toks

    def _clean_ents(ents, start_char, end_char, id_offset):
        # Drop spans that cross the sentence boundary and re-base the rest.
        ents = [ent for ent in ents if ent['start'] >= start_char and ent['end'] <= end_char]
        for ent in ents:
            ent['start'] -= start_char
            ent['end'] -= start_char
            ent['token_start'] -= id_offset
            ent['token_end'] -= id_offset
        return ents

    for task in tasks:
        start_offset = 0
        text = task['text']
        for sent in nlp(text).sents:
            st = str(sent).strip()
            sent_task = copy.deepcopy(task)
            sent_task.update({'text': st})
            # Find the stripped sentence in the remaining text so the character
            # offsets stay correct even when the sentence has surrounding whitespace.
            start_char = start_offset + text.index(st)
            end_char = start_char + len(st)
            start_offset += len(str(sent))
            text = text[len(str(sent)):]
            id_offset, sent_task['tokens'] = _clean_toks(sent_task['tokens'], start_char, end_char)
            sent_task['spans'] = _clean_ents(sent_task['spans'], start_char, end_char, id_offset)
            yield sent_task
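For reference, this is roughly how I apply it to an exported dataset. The file names here are just placeholders for my own data:

import json
import spacy

nlp = spacy.load('en_core_web_sm')

# Placeholder file names -- substitute your own export.
with open('paragraph_annotations.jsonl', encoding='utf8') as f:
    tasks = [json.loads(line) for line in f]

with open('sentence_annotations.jsonl', 'w', encoding='utf8') as f:
    for sent_task in split_task_sentences(nlp, tasks):
        f.write(json.dumps(sent_task) + '\n')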
Demonstration of the bug and the fix:
import pprint

import spacy
from prodigy.components.preprocess import split_sentences

nlp = spacy.load('en_core_web_sm')

text = "Hello world.\nGoodbye world."
task = {
    "text": text,
    "tokens": [{"text": str(tok), "start": tok.idx, "end": tok.idx + len(tok), "id": tok.i}
               for tok in nlp(text)],
    "spans": [{"text": "Hello", "token_start": 0, "token_end": 0, "start": 0, "end": 5, "label": "LOC"},
              {"text": "Goodbye", "token_start": 4, "token_end": 4, "start": 13, "end": 20, "label": "LOC"}],
}

print("Task")
pprint.pprint(task)
print("\nProdigy Split Sentences")
pprint.pprint(list(split_sentences(nlp, [task])))
print("\nNew Split Sentences")
pprint.pprint(list(split_task_sentences(nlp, [task])))
Output:
Task
{'spans': [{'end': 5,
            'label': 'LOC',
            'start': 0,
            'text': 'Hello',
            'token_end': 0,
            'token_start': 0},
           {'end': 20,
            'label': 'LOC',
            'start': 13,
            'text': 'Goodbye',
            'token_end': 4,
            'token_start': 4}],
 'text': 'Hello world.\nGoodbye world.',
 'tokens': [{'end': 5, 'id': 0, 'start': 0, 'text': 'Hello'},
            {'end': 11, 'id': 1, 'start': 6, 'text': 'world'},
            {'end': 12, 'id': 2, 'start': 11, 'text': '.'},
            {'end': 13, 'id': 3, 'start': 12, 'text': '\n'},
            {'end': 20, 'id': 4, 'start': 13, 'text': 'Goodbye'},
            {'end': 26, 'id': 5, 'start': 21, 'text': 'world'},
            {'end': 27, 'id': 6, 'start': 26, 'text': '.'}]}
Prodigy Split Sentences
[{'_input_hash': 2075819077,
  '_task_hash': 2067075801,
  'spans': [{'end': 5,
             'label': 'LOC',
             'start': 0,
             'text': 'Hello',
             'token_end': 0,
             'token_start': 0}],
  'text': 'Hello world.\n',
  'tokens': [{'end': 5, 'id': 0, 'start': 0, 'text': 'Hello'},
             {'end': 11, 'id': 1, 'start': 6, 'text': 'world'},
             {'end': 12, 'id': 2, 'start': 11, 'text': '.'},
             {'end': 13, 'id': 3, 'start': 12, 'text': '\n'},
             {'end': 20, 'id': 4, 'start': 13, 'text': 'Goodbye'},
             {'end': 26, 'id': 5, 'start': 21, 'text': 'world'},
             {'end': 27, 'id': 6, 'start': 26, 'text': '.'}]},
 {'_input_hash': -860371345,
  '_task_hash': 793178080,
  'spans': [],
  'text': 'Goodbye world.',
  'tokens': [{'end': 5, 'id': 0, 'start': 0, 'text': 'Hello'},
             {'end': 11, 'id': 1, 'start': 6, 'text': 'world'},
             {'end': 12, 'id': 2, 'start': 11, 'text': '.'},
             {'end': 13, 'id': 3, 'start': 12, 'text': '\n'},
             {'end': 20, 'id': 4, 'start': 13, 'text': 'Goodbye'},
             {'end': 26, 'id': 5, 'start': 21, 'text': 'world'},
             {'end': 27, 'id': 6, 'start': 26, 'text': '.'}]}]
New Split Sentences
[{'spans': [{'end': 5,
             'label': 'LOC',
             'start': 0,
             'text': 'Hello',
             'token_end': 0,
             'token_start': 0}],
  'text': 'Hello world.',
  'tokens': [{'end': 5, 'id': 0, 'start': 0, 'text': 'Hello'},
             {'end': 11, 'id': 1, 'start': 6, 'text': 'world'},
             {'end': 12, 'id': 2, 'start': 11, 'text': '.'}]},
 {'spans': [{'end': 7,
             'label': 'LOC',
             'start': 0,
             'text': 'Goodbye',
             'token_end': 0,
             'token_start': 0}],
  'text': 'Goodbye world.',
  'tokens': [{'end': 7, 'id': 0, 'start': 0, 'text': 'Goodbye'},
             {'end': 13, 'id': 1, 'start': 8, 'text': 'world'},
             {'end': 14, 'id': 2, 'start': 13, 'text': '.'}]}]
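On the TODO in the docstring: here is a rough, untested sketch of what a trimming variant could look like (_trim_ents is just a name I made up). Instead of dropping a span that crosses the sentence boundary, it clips the character offsets to the sentence and recomputes the token indices from the tokens that survive:

def _trim_ents(ents, start_char, end_char, toks):
    # Untested sketch of the TODO above. `toks` are the sentence tokens
    # after _clean_toks has re-based them, so their offsets are already
    # relative to the sentence start.
    trimmed = []
    for ent in ents:
        # Skip spans that lie entirely outside this sentence.
        if ent['end'] <= start_char or ent['start'] >= end_char:
            continue
        # Clip the character offsets to the sentence and re-base them.
        new_start = max(ent['start'], start_char) - start_char
        new_end = min(ent['end'], end_char) - start_char
        # Recompute the token indices from the tokens the clipped span overlaps.
        overlapping = [tok['id'] for tok in toks
                       if tok['start'] < new_end and tok['end'] > new_start]
        if not overlapping:
            continue
        trimmed.append(dict(ent, start=new_start, end=new_end,
                            token_start=overlapping[0], token_end=overlapping[-1]))
    return trimmed

One thing to watch: after clipping, ent['text'] no longer matches the new offsets, so it would need to be recomputed from the sentence text before the task goes back into Prodigy.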