I'm trying to run ner.make-gold on a rather large dataset of about half a million examples, and Prodigy crashes every time:
user@user-Syntaxnet:~$ python3 -m prodigy ner.make-gold big_dataset en_core_web_lg --label ~/Documents/labels.txt
Using 9 labels from /home/user/Documents/labels.txt
Starting the web server at http://localhost:8080 ...
Open the app in your browser and start annotating!

16:16:46 - Exception when serving /get_questions
Traceback (most recent call last):
File "cython_src/prodigy/components/loaders.pyx", line 117, in prodigy.components.loaders.JSONL
ValueError: Expected object or value

During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/waitress/channel.py", line 338, in service
task.service()
File "/usr/local/lib/python3.6/dist-packages/waitress/task.py", line 169, in service
self.execute()
File "/usr/local/lib/python3.6/dist-packages/waitress/task.py", line 399, in execute
app_iter = self.channel.server.application(env, start_response)
File "/usr/local/lib/python3.6/dist-packages/hug/api.py", line 423, in api_auto_instantiate
return module.hug_wsgi(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/falcon/api.py", line 244, in call
responder(req, resp, **params)
File "/usr/local/lib/python3.6/dist-packages/hug/interface.py", line 793, in call
raise exception
File "/usr/local/lib/python3.6/dist-packages/hug/interface.py", line 766, in call
self.render_content(self.call_function(input_parameters), context, request, response, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/hug/interface.py", line 703, in call_function
return self.interface(**parameters)
File "/usr/local/lib/python3.6/dist-packages/hug/interface.py", line 100, in call
return __hug_internal_self._function(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/prodigy/app.py", line 105, in get_questions
tasks = controller.get_questions()
File "cython_src/prodigy/core.pyx", line 109, in prodigy.core.Controller.get_questions
File "cython_src/prodigy/components/feeds.pyx", line 56, in prodigy.components.feeds.SharedFeed.get_questions
File "cython_src/prodigy/components/feeds.pyx", line 61, in prodigy.components.feeds.SharedFeed.get_next_batch
File "cython_src/prodigy/components/feeds.pyx", line 130, in prodigy.components.feeds.SessionFeed.get_session_stream
File "/home/user/.local/lib/python3.6/site-packages/toolz/itertoolz.py", line 368, in first
return next(iter(seq))
File "/usr/local/lib/python3.6/dist-packages/prodigy/recipes/ner.py", line 209, in make_tasks
for doc, eg in nlp.pipe(texts, as_tuples=True):
File "/home/user/.local/lib/python3.6/site-packages/spacy/language.py", line 548, in pipe
for doc, context in izip(docs, contexts):
File "/home/user/.local/lib/python3.6/site-packages/spacy/language.py", line 572, in pipe
for doc in docs:
File "nn_parser.pyx", line 367, in pipe
File "cytoolz/itertoolz.pyx", line 1047, in cytoolz.itertoolz.partition_all.next
File "nn_parser.pyx", line 367, in pipe
File "cytoolz/itertoolz.pyx", line 1047, in cytoolz.itertoolz.partition_all.next
File "pipeline.pyx", line 431, in pipe
File "cytoolz/itertoolz.pyx", line 1047, in cytoolz.itertoolz.partition_all.next
File "/home/user/.local/lib/python3.6/site-packages/spacy/language.py", line 551, in
docs = (self.make_doc(text) for text in texts)
File "/home/user/.local/lib/python3.6/site-packages/spacy/language.py", line 544, in
texts = (tc[0] for tc in text_context1)
File "/usr/local/lib/python3.6/dist-packages/prodigy/recipes/ner.py", line 208, in
texts = ((eg['text'], eg) for eg in stream)
File "cython_src/prodigy/components/preprocess.pyx", line 118, in add_tokens
File "cython_src/prodigy/components/preprocess.pyx", line 36, in split_sentences
File "/home/user/.local/lib/python3.6/site-packages/spacy/language.py", line 548, in pipe
for doc, context in izip(docs, contexts):
File "/home/user/.local/lib/python3.6/site-packages/spacy/language.py", line 572, in pipe
for doc in docs:
File "nn_parser.pyx", line 367, in pipe
File "cytoolz/itertoolz.pyx", line 1047, in cytoolz.itertoolz.partition_all.next
File "nn_parser.pyx", line 367, in pipe
File "cytoolz/itertoolz.pyx", line 1047, in cytoolz.itertoolz.partition_all.next
File "pipeline.pyx", line 431, in pipe
File "cytoolz/itertoolz.pyx", line 1047, in cytoolz.itertoolz.partition_all.next
File "/home/user/.local/lib/python3.6/site-packages/spacy/language.py", line 551, in
docs = (self.make_doc(text) for text in texts)
File "/home/user/.local/lib/python3.6/site-packages/spacy/language.py", line 544, in
texts = (tc[0] for tc in text_context1)
File "cython_src/prodigy/components/preprocess.pyx", line 35, in genexpr
File "cython_src/prodigy/components/filters.pyx", line 35, in filter_duplicates
File "cython_src/prodigy/components/filters.pyx", line 16, in filter_empty
File "cython_src/prodigy/components/loaders.pyx", line 22, in _rehash_stream
File "cython_src/prodigy/components/loaders.pyx", line 125, in JSONL
ValueError: Failed to load task (invalid JSON)....
Note that I was able to use this dataset to train a model. Any idea how I could debug this?
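The only thing I've come up with so far is validating the source outside of Prodigy before streaming it in. Here's a minimal sketch of what I mean, assuming the stream comes from a JSONL file on disk ("data.jsonl" below is just a placeholder for my actual source); it reports the line numbers that fail to parse:

import json

# Rough check: read the JSONL source line by line and collect any lines
# that are not valid JSON. "data.jsonl" is a placeholder for the actual
# file passed to ner.make-gold.
bad_lines = []
with open("data.jsonl", encoding="utf8") as f:
    for i, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue  # skip blank lines
        try:
            json.loads(line)
        except ValueError as e:
            bad_lines.append((i, str(e), line[:80]))

for line_no, err, preview in bad_lines:
    print("line {}: {} -> {}".format(line_no, err, preview))
print("{} invalid line(s) found".format(len(bad_lines)))

If that comes back clean, I'm not sure where else to look, so any pointers would be appreciated.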