I've been building a custom pipeline on top of scispacy: retraining the NER component with Prodigy data and adding a set of custom spaCy components that we've built in house. Most of this works fine, but I'm at the end of a long series of steps and am suddenly hamstrung by a crash in Prodigy. When I add the following component to my pipeline to expand abbreviations:
abbreviation_rules = { "AAA": "American Academy of Audiology", ... snip long object ... }
def abbreviation_expander(doc):
    """Return a new Doc in which known abbreviations are replaced by their expansions.

    Each token whose exact text is a key of ``abbreviation_rules`` is replaced
    by the whitespace-separated words of its expansion; every other token is
    kept unchanged.

    Args:
        doc: The tokenized input Doc.

    Returns:
        A freshly built Doc sharing ``doc.vocab``. It is constructed from
        ``words`` and ``spaces`` only; nothing else is copied over from the
        input doc.
    """
    words = []
    # Doc's `spaces` argument takes one boolean per word ("is this word
    # followed by a space?"), not the whitespace strings themselves, so the
    # two lists are built in lock-step and always have equal length.
    spaces = []
    for token in doc:
        expansion = abbreviation_rules.get(token.text)
        pieces = expansion.split() if expansion else []
        trailing_space = bool(token.whitespace_)
        if not pieces:
            # Not an abbreviation (or an empty expansion): keep the token as is.
            words.append(token.text)
            spaces.append(trailing_space)
            continue
        words.extend(pieces)
        # Words inside an expansion are separated by a single space; the last
        # one inherits whatever followed the original abbreviation token.
        spaces.extend([True] * (len(pieces) - 1))
        spaces.append(trailing_space)
    return Doc(doc.vocab, words=words, spaces=spaces)
Then I add the component to the model:
nlp = spacy.load(base_model) # our model, including the spaCy-trained NER component
# At this point the model works fine if you just run a text through it.
nlp.add_pipe("abbreviation_expander", first=True)
# With the component added, the model still works fine when run locally in a Python REPL:
# import spacy; nlp = spacy.load("model_name_here"); doc = nlp("some text") --> sensible results
and I try to do:
prodigy ner.correct dataset_name_here model_name_here data.jsonl --label ENTITY
and ... I get:
Using 1 label(s): ENTITY
Traceback (most recent call last):
File "/Users/winawer/.pyenv/versions/3.10.12/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/Users/winawer/.pyenv/versions/3.10.12/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/prodigy/__main__.py", line 50, in <module>
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/prodigy/__main__.py", line 44, in main
controller = run_recipe(run_args)
File "cython_src/prodigy/cli.pyx", line 135, in prodigy.cli.run_recipe
File "cython_src/prodigy/core.pyx", line 155, in prodigy.core.Controller.from_components
File "cython_src/prodigy/core.pyx", line 307, in prodigy.core.Controller.__init__
File "cython_src/prodigy/components/stream.pyx", line 191, in prodigy.components.stream.Stream.is_empty
File "cython_src/prodigy/components/stream.pyx", line 230, in prodigy.components.stream.Stream.peek
File "cython_src/prodigy/components/stream.pyx", line 343, in prodigy.components.stream.Stream._get_from_iterator
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/prodigy/recipes/ner.py", line 266, in preprocess_stream
for doc, eg in nlp.pipe(texts, as_tuples=True, batch_size=10):
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/language.py", line 1574, in pipe
for doc in docs:
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/language.py", line 1618, in pipe
for doc in docs:
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
yield from proc.pipe(docs, **kwargs)
File "spacy/pipeline/transition_parser.pyx", line 245, in pipe
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1650, in minibatch
batch = list(itertools.islice(items, int(batch_size)))
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
yield from proc.pipe(docs, **kwargs)
File "spacy/pipeline/transition_parser.pyx", line 245, in pipe
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1650, in minibatch
batch = list(itertools.islice(items, int(batch_size)))
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
yield from proc.pipe(docs, **kwargs)
File "spacy/pipeline/pipe.pyx", line 55, in pipe
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
yield from proc.pipe(docs, **kwargs)
File "spacy/pipeline/pipe.pyx", line 55, in pipe
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
yield from proc.pipe(docs, **kwargs)
File "spacy/pipeline/trainable_pipe.pyx", line 73, in pipe
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1650, in minibatch
batch = list(itertools.islice(items, int(batch_size)))
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1703, in _pipe
yield from proc.pipe(docs, **kwargs)
File "spacy/pipeline/trainable_pipe.pyx", line 73, in pipe
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1650, in minibatch
batch = list(itertools.islice(items, int(batch_size)))
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/util.py", line 1713, in _pipe
for doc in docs:
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/language.py", line 1615, in <genexpr>
docs = (self._ensure_doc(text) for text in texts)
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/spacy/language.py", line 1564, in <genexpr>
docs_with_contexts = (
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/prodigy/recipes/ner.py", line 265, in <genexpr>
texts = ((eg["text"], eg) for eg in stream)
File "cython_src/prodigy/components/decorators.pyx", line 121, in inner
File "cython_src/prodigy/components/decorators.pyx", line 50, in prodigy.components.decorators._is_structured_stream
File "/Users/winawer/.pyenv/versions/spacy-test/lib/python3.10/site-packages/toolz/itertoolz.py", line 999, in peek
item = next(iterator)
File "cython_src/prodigy/components/decorators.pyx", line 165, in inner
File "cython_src/prodigy/components/preprocess.pyx", line 66, in split_sentences
File "cython_src/prodigy/components/preprocess.pyx", line 272, in prodigy.components.preprocess._add_tokens
TypeError: argument of type 'NoneType' is not iterable
I'm not sure how to debug this. It's pretty clear that the error is somehow localised - or is at least started off by - the component code; removing the component from the model stops the crash, and none of the other half-dozen components we've built cause the crash. I'm guessing it's because I'm rewriting the doc with the new text, but we're doing that because we need the NER to run on the full text instead of adding a separate attribute like SciSpacy's abbreviation detector does (scispacy/scispacy/abbreviation.py at b32d8e29de552dd3613e166cdd0d4482bb4278f5 · allenai/scispacy · GitHub).
Is there a way to fix this, or am I back to square one?