Prodigy crash caused by spaCy component, not sure why

Hello,

I've been building a custom pipeline off of scispacy, retraining the NER component with Prodigy data and adding a set of custom spaCy components that we've built in-house. Most of this works fine, but at the end of a long series of steps I'm suddenly hamstrung by a crash in Prodigy. When I add the following component to my pipeline to expand abbreviations:

[..snip..]
abbreviation_rules = { "AAA": "American Academy of Audiology", ... snip long object ... }

@Language.component("abbreviation_expander")
def abbreviation_expander(doc):
    whitespaces = [token.whitespace_ for token in doc]
    new_tokens = []
    for idx, token in enumerate(doc):
        if token.text in abbreviation_rules:
            expanded_text = abbreviation_rules[token.text]
            white_spaces = re.findall(r"\s+", expanded_text)
            expanded_tokens = expanded_text.split()
            new_tokens.extend(expanded_tokens)
            for space in white_spaces:
                whitespaces.insert(idx, space)
        else:
            new_tokens.append(token.text)
    
    new_doc = Doc(doc.vocab, words=new_tokens, spaces=whitespaces)

    return new_doc

add the component to the model...

nlp = spacy.load(base_model) # our model with the spacy-trained NER component here
# this model works fine if you just run a text through it now

nlp.add_pipe("abbreviation_expander", first=True)
# This model will work fine if I run it locally in a Python REPL
# import spacy; nlp = spacy.load("model_name_here"); doc = nlp("some text") --> sensible results

nlp.to_disk(nlp_dir)

and I try to do:

prodigy ner.correct dataset_name_here model_name_here data.jsonl --label ENTITY

and ... I get:

Using 1 label(s): ENTITY
Traceback (most recent call last):
  File "/Users/winawer/.pyenv/versions/3.10.12/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Users/winawer/.pyenv/versions/3.10.12/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/prodigy/__main__.py", line 50, in <module>
    main()
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/prodigy/__main__.py", line 44, in main
    controller = run_recipe(run_args)
  File "cython_src/prodigy/cli.pyx", line 135, in prodigy.cli.run_recipe
  File "cython_src/prodigy/core.pyx", line 155, in prodigy.core.Controller.from_components
  File "cython_src/prodigy/core.pyx", line 307, in prodigy.core.Controller.__init__
  File "cython_src/prodigy/components/stream.pyx", line 191, in prodigy.components.stream.Stream.is_empty
  File "cython_src/prodigy/components/stream.pyx", line 230, in prodigy.components.stream.Stream.peek
  File "cython_src/prodigy/components/stream.pyx", line 343, in prodigy.components.stream.Stream._get_from_iterator
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/prodigy/recipes/ner.py", line 266, in preprocess_stream
    for doc, eg in nlp.pipe(texts, as_tuples=True, batch_size=10):
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/language.py", line 1574, in pipe
    for doc in docs:
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/language.py", line 1618, in pipe
    for doc in docs:
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
    yield from proc.pipe(docs, **kwargs)
  File "spacy/pipeline/transition_parser.pyx", line 245, in pipe
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1650, in minibatch
    batch = list(itertools.islice(items, int(batch_size)))
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
    yield from proc.pipe(docs, **kwargs)
  File "spacy/pipeline/transition_parser.pyx", line 245, in pipe
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1650, in minibatch
    batch = list(itertools.islice(items, int(batch_size)))
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
    yield from proc.pipe(docs, **kwargs)
  File "spacy/pipeline/pipe.pyx", line 55, in pipe
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
    yield from proc.pipe(docs, **kwargs)
  File "spacy/pipeline/pipe.pyx", line 55, in pipe
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
    yield from proc.pipe(docs, **kwargs)
  File "spacy/pipeline/trainable_pipe.pyx", line 73, in pipe
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1650, in minibatch
    batch = list(itertools.islice(items, int(batch_size)))
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
    yield from proc.pipe(docs, **kwargs)
  File "spacy/pipeline/trainable_pipe.pyx", line 73, in pipe
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1650, in minibatch
    batch = list(itertools.islice(items, int(batch_size)))
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1713, in _pipe
    for doc in docs:
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/language.py", line 1615, in <genexpr>
    docs = (self._ensure_doc(text) for text in texts)
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/language.py", line 1564, in <genexpr>
    docs_with_contexts = (
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/prodigy/recipes/ner.py", line 265, in <genexpr>
    texts = ((eg["text"], eg) for eg in stream)
  File "cython_src/prodigy/components/decorators.pyx", line 121, in inner
  File "cython_src/prodigy/components/decorators.pyx", line 50, in prodigy.components.decorators._is_structured_stream
  File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/toolz/itertoolz.py", line 999, in peek
    item = next(iterator)
  File "cython_src/prodigy/components/decorators.pyx", line 165, in inner
  File "cython_src/prodigy/components/preprocess.pyx", line 66, in split_sentences
  File "cython_src/prodigy/components/preprocess.pyx", line 272, in prodigy.components.preprocess._add_tokens
TypeError: argument of type 'NoneType' is not iterable

I'm not sure how to debug this. It's pretty clear that the error is localised to, or at least set off by, the component code: removing the component from the model stops the crash, and none of the other half-dozen components we've built cause it. I'm guessing it's because I'm rewriting the doc with the new text, but we're doing that because we need the NER to run on the full expanded text instead of adding a separate attribute the way scispacy's abbreviation detector does (https://github.com/allenai/scispacy/blob/b32d8e29de552dd3613e166cdd0d4482bb4278f5/scispacy/abbreviation.py).

Is there a way to fix this, or am I back to square one?

Hi @Winawer,

Apologies for the delay in response!
Admittedly, it's difficult to debug without being able to step through the entire stack.
You've also hit a bit of an edge case, as it is currently assumed that the nlp pipeline won't change the actual text to be annotated. With that in mind we might consider improving the error handling in future, but for now let me explain what went wrong and how to fix it.

The main reason the recipe is failing is that the abbreviation_expander component is not propagating the ._context attribute of the Doc, which is where the recipe stores the actual content of the annotation task.
All the nlp processing that Prodigy recipes do, such as splitting sentences, adding tokens and adding span annotations, is done via spaCy's Language.pipe method with as_tuples set to True. This means that the input is tuples of the text to be piped and the Python dictionary with the annotation task (the context), and the output is the Doc object and a copy of the context.
As you can see here, spaCy's pipe stores this context in the ._context attribute.
Since abbreviation_expander creates a Doc object from scratch, the context that nlp.pipe is supposed to return is None, which propagates down the stack and ultimately leads to the error you see.
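
To see this in isolation, here's a minimal sketch, assuming spaCy v3 and a hypothetical rebuild_doc component that, like your abbreviation_expander, returns a freshly built Doc:

import spacy
from spacy.language import Language
from spacy.tokens import Doc

@Language.component("rebuild_doc")
def rebuild_doc(doc):
    # A freshly constructed Doc has ._context set to None, so the
    # annotation task that pipe(as_tuples=True) attached is lost here.
    return Doc(doc.vocab, words=[token.text for token in doc])

nlp = spacy.blank("en")
nlp.add_pipe("rebuild_doc")

pairs = [("some text", {"text": "some text"})]
for doc, context in nlp.pipe(pairs, as_tuples=True):
    print(context)  # None: the task dict did not survive the new Doc
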
So the first thing to fix would be to copy over this ._context attribute in your component:

@Language.component("abbreviation_expander", retokenizes=True)
def abbreviation_expander(doc):
    whitespaces = [token.whitespace_ for token in doc]
    new_tokens = []
    for idx, token in enumerate(doc):
        if token.text in abbreviation_rules:
            expanded_text = abbreviation_rules[token.text]
            white_spaces = re.findall(r"\s+", expanded_text)
            expanded_tokens = expanded_text.split()
            new_tokens.extend(expanded_tokens)
            for space in white_spaces:
                whitespaces.insert(idx, space)
        else:
            new_tokens.append(token.text)
    
    new_doc = Doc(doc.vocab, words=new_tokens, spaces=whitespaces)
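    # carry over the annotation task that nlp.pipe(as_tuples=True) stored on the original Doc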
    new_doc._context = doc._context
    return new_doc
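
With that line in place, a quick check outside of Prodigy (hypothetical example text; base_model as in your snippet above) should show the context surviving the component:

nlp = spacy.load(base_model)
nlp.add_pipe("abbreviation_expander", first=True)

examples = [("The AAA reviewed the case.", {"text": "The AAA reviewed the case."})]
for doc, context in nlp.pipe(examples, as_tuples=True):
    print(doc.text)  # the expanded text
    print(context)   # the task dict, no longer None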

I also set retokenizes to True in the @Language.component decorator, as retokenizing is what the component does.
However, the component not only retokenizes, it also changes the words as it expands the abbreviations, so the Prodigy recipe should also overwrite the text attribute of the annotation task based on the outcome of nlp.pipe. As I mentioned above, this is an edge case that the current recipes do not take into account, so you'll need to add a custom stream processing function that does it.
The easiest way would be to edit the built-in ner.correct recipe. You can find the local copy in your Prodigy installation path (run prodigy stats to recall where that was).
You'll find ner.correct in recipes/ner.py; in this file, before the call to preprocess_stream on line 234, you can add your extra preprocessing function like so:

    def expand_abbreviations(stream: StreamType, nlp: Language) -> StreamType:
        texts = ((eg["text"], eg) for eg in stream)
        for doc, eg in nlp.pipe(texts, as_tuples=True, batch_size=10):
            task = copy.deepcopy(eg)
            task["text"] = doc.text
            yield task

    stream.apply(expand_abbreviations, stream=stream, nlp=nlp)
    # the rest of the recipe as is:
    stream = preprocess_stream(stream, nlp, labels=labels, unsegmented=unsegmented)
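
Note that with this change the pipeline effectively runs twice: once in expand_abbreviations to rewrite each task's "text" so that it matches the expanded Doc, and again inside preprocess_stream to add the tokens and the model's entity spans on that expanded text. The copy.deepcopy keeps the original incoming example untouched; just make sure copy is imported at the top of recipes/ner.py if it isn't already. After that, the same prodigy ner.correct command from your post should run without the TypeError.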