Partially Fixed: ner.batch-train's split_sentences does not properly handle tokens and spans

I have annotation data that was generated on paragraphs in the UI. I tried training a sentence-level NER model with batch-train, but discovered that Prodigy’s split_sentences function has two unexpected behaviors:

  • It drops the spans from every sentence except the first.
  • It copies all of the original tokens into every sentence.

I have code below that solves the issue for my use case (reading in JSONL files). I’ve tested it in the UI and the new spans all come out correctly, but it would probably need to be adapted for a general-purpose Prodigy solution.

Edit: added a newline character “\n” to the sample text to demonstrate the need for the strip() call (see the quick check below).
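For reference, here’s a quick check of why the strip() is needed (a minimal sketch, using the same en_core_web_sm model as the demo below): the newline token stays inside the first sentence, so str(sent) carries a trailing “\n” that would otherwise end up in the task text.

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("Hello world.\nGoodbye world.")
for sent in doc.sents:
    # with this model the trailing "\n" is attached to the first sentence
    print(repr(str(sent)))
# 'Hello world.\n'
# 'Goodbye world.'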

import copy
def split_task_sentences(nlp, tasks):
    """
    Throws away spans across sentence boundaries. TODO: option to trim spans across boundaries
    """
    def _clean_toks(toks, start_char, end_char):
        # Keep only the tokens that fall inside this sentence's character range
        toks = [tok for tok in toks if tok['start'] >= start_char and tok['end'] <= end_char]
        # The id of the first remaining token becomes the offset for re-numbering below
        id_offset = toks[0]['id']
        for tok in toks:
            tok['start'] -= start_char
            tok['end'] -= start_char
            tok['id'] -= id_offset
        return id_offset, toks
    
    def _clean_ents(ents, start_char, end_char, id_offset):
        # Keep only spans that fall entirely inside the sentence; spans crossing the boundary are dropped
        ents = [ent for ent in ents if ent['start'] >= start_char and ent['end'] <= end_char]
        for ent in ents:
            ent['start'] -= start_char
            ent['end'] -= start_char
            ent['token_start'] -= id_offset
            ent['token_end'] -= id_offset
        return ents
    
    for task in tasks:
        # Character offset of the sentences consumed so far within the original text
        start_offset = 0
        text = task['text']
        for sent in nlp(text).sents:
            # Strip leading/trailing whitespace (e.g. the "\n") from the sentence text
            st = str(sent).strip()
            sent_task = copy.deepcopy(task)
            sent_task['text'] = st
            # Absolute character range of the stripped sentence within the original text
            start_char = start_offset + text.index(st)
            end_char = start_char + len(st)
            # Advance past the raw (unstripped) sentence before the next iteration
            start_offset += len(str(sent))
            text = text[len(str(sent)):]
            
            id_offset, sent_task['tokens'] = _clean_toks(sent_task['tokens'], start_char, end_char)
            sent_task['spans'] = _clean_ents(sent_task['spans'], start_char, end_char, id_offset)
            yield sent_task
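And roughly how I’m applying it for my use case: read the exported paragraph-level JSONL, write out a sentence-level file, and then import that with db-in for ner.batch-train (the file names here are just placeholders for my own data).

import json
import spacy

nlp = spacy.load('en_core_web_sm')

# placeholder file names for the exported annotations
with open('paragraph_annotations.jsonl', encoding='utf8') as f:
    tasks = [json.loads(line) for line in f if line.strip()]

with open('sentence_annotations.jsonl', 'w', encoding='utf8') as f:
    for sent_task in split_task_sentences(nlp, tasks):
        f.write(json.dumps(sent_task) + '\n')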

Demonstration of bug and fix:

import pprint
from prodigy.components.preprocess import split_sentences
import spacy
nlp = spacy.load('en_core_web_sm')
text = "Hello world.\nGoodbye world."
task = {
    "text": text,
    "tokens": [{"text": str(tok), "start": tok.idx, "end": len(tok)+tok.idx, 'id': tok.i} for tok in nlp(text)],
    "spans": [{ "text": "Hello", "token_start":0, "token_end":0, "start":0, "end":5, "label": 'LOC'},
              { "text": "Goodbye", "token_start":3, "token_end":3, "start":13, "end":20, "label": 'LOC'}]
}
print("Task")
pprint.pprint(task)
print("\nProdigy Split Sentences")
pprint.pprint(list(split_sentences(nlp, [task])))
print("\nNew Split Sentences")
pprint.pprint(list(split_task_sentences(nlp, [task])))

Output:

Task
{'spans': [{'end': 5,
            'label': 'LOC',
            'start': 0,
            'text': 'Hello',
            'token_end': 0,
            'token_start': 0},
           {'end': 20,
            'label': 'LOC',
            'start': 13,
            'text': 'Goodbye',
            'token_end': 4,
            'token_start': 4}],
 'text': 'Hello world.\nGoodbye world.',
 'tokens': [{'end': 5, 'id': 0, 'start': 0, 'text': 'Hello'},
            {'end': 11, 'id': 1, 'start': 6, 'text': 'world'},
            {'end': 12, 'id': 2, 'start': 11, 'text': '.'},
            {'end': 13, 'id': 3, 'start': 12, 'text': '\n'},
            {'end': 20, 'id': 4, 'start': 13, 'text': 'Goodbye'},
            {'end': 26, 'id': 5, 'start': 21, 'text': 'world'},
            {'end': 27, 'id': 6, 'start': 26, 'text': '.'}]}

Prodigy Split Sentences
[{'_input_hash': 2075819077,
  '_task_hash': 2067075801,
  'spans': [{'end': 5,
             'label': 'LOC',
             'start': 0,
             'text': 'Hello',
             'token_end': 0,
             'token_start': 0}],
  'text': 'Hello world.\n',
  'tokens': [{'end': 5, 'id': 0, 'start': 0, 'text': 'Hello'},
             {'end': 11, 'id': 1, 'start': 6, 'text': 'world'},
             {'end': 12, 'id': 2, 'start': 11, 'text': '.'},
             {'end': 13, 'id': 3, 'start': 12, 'text': '\n'},
             {'end': 20, 'id': 4, 'start': 13, 'text': 'Goodbye'},
             {'end': 26, 'id': 5, 'start': 21, 'text': 'world'},
             {'end': 27, 'id': 6, 'start': 26, 'text': '.'}]},
 {'_input_hash': -860371345,
  '_task_hash': 793178080,
  'spans': [],
  'text': 'Goodbye world.',
  'tokens': [{'end': 5, 'id': 0, 'start': 0, 'text': 'Hello'},
             {'end': 11, 'id': 1, 'start': 6, 'text': 'world'},
             {'end': 12, 'id': 2, 'start': 11, 'text': '.'},
             {'end': 13, 'id': 3, 'start': 12, 'text': '\n'},
             {'end': 20, 'id': 4, 'start': 13, 'text': 'Goodbye'},
             {'end': 26, 'id': 5, 'start': 21, 'text': 'world'},
             {'end': 27, 'id': 6, 'start': 26, 'text': '.'}]}]

New Split Sentences
[{'spans': [{'end': 5,
             'label': 'LOC',
             'start': 0,
             'text': 'Hello',
             'token_end': 0,
             'token_start': 0}],
  'text': 'Hello world.',
  'tokens': [{'end': 5, 'id': 0, 'start': 0, 'text': 'Hello'},
             {'end': 11, 'id': 1, 'start': 6, 'text': 'world'},
             {'end': 12, 'id': 2, 'start': 11, 'text': '.'}]},
 {'spans': [{'end': 7,
             'label': 'LOC',
             'start': 0,
             'text': 'Goodbye',
             'token_end': 0,
             'token_start': 0}],
  'text': 'Goodbye world.',
  'tokens': [{'end': 7, 'id': 0, 'start': 0, 'text': 'Goodbye'},
             {'end': 13, 'id': 1, 'start': 8, 'text': 'world'},
             {'end': 14, 'id': 2, 'start': 13, 'text': '.'}]}]

Thanks so much for the detailed analysis and suggestion! :pray: Will have a look at this in more detail and implement the fix for the next version.