I have 40 text files, each with fewer than 200,000 words; the complete dataset is 11MB in size. My task is to train a language model with custom entity types. For this I have:
- created a my_patterns.jsonl file containing examples for each custom entity type, to point the model in the right direction,
- created an empty dataset my_dataset_ner where the new annotations for each text file will be saved,
- taken the pre-trained spaCy language model en_core_web_lg,
- written a custom.ner.teach recipe on top of the built-in ner.teach recipe.
The recipe_train.py recipe:
from pathlib import Path
from argparse import ArgumentParser, ArgumentTypeError
import prodigy
from prodigy.components.loaders import TXT
from prodigy.recipes.ner import teach
from prodigy.components.db import connect


@prodigy.recipe('custom.ner.teach',
                dataset=prodigy.recipe_args['dataset'],
                spacy_model=prodigy.recipe_args['spacy_model'],
                file_path=("The path to the directory of text files", "positional", None, str),
                label_set=prodigy.recipe_args['label_set'],
                patterns=prodigy.recipe_args['patterns'])
def custom_ner_teach(dataset, spacy_model, file_path, label_set, patterns):
    """Wrap the built-in ner.teach recipe with a custom directory loader."""
    stream = custom_txt_loader(file_path)
    # Set up the built-in ner.teach recipe with the custom stream.
    components = teach(dataset=dataset,
                       spacy_model=spacy_model,
                       source=stream,
                       label=label_set,
                       patterns=patterns)
    return components


def custom_txt_loader(file_path):
    # Stream in the data: yield one task per text file in the directory.
    for item in Path(file_path).iterdir():
        yield {'text': list(TXT(str(item)))[0]['text']}
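
For illustration, here is a sketch of a chunked variant of the loader that would yield paragraph-sized tasks instead of whole files, since each file currently becomes one single, very long task. The function name, the max_chars threshold and the paragraph-splitting heuristic are assumptions made for this sketch only, not part of my current recipe or of the Prodigy API.

from pathlib import Path

def chunked_txt_loader(file_path, max_chars=5000):
    # Hypothetical variant of custom_txt_loader: split each file into
    # paragraph-sized chunks so that no single task carries ~200,000 words.
    for item in Path(file_path).iterdir():
        text = item.read_text(encoding='utf8')
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
        for para in paragraphs:
            # Also cap the chunk length, in case a single paragraph
            # is itself extremely long.
            for start in range(0, len(para), max_chars):
                yield {'text': para[start:start + max_chars]}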
my_patterns.jsonl (hand-created, with 60 examples of 8 different custom entity types):
{"label":"LABEL-1","pattern":[{"lower":"%%%%%%%%%"}]}
{"label":"lABEL-2","pattern":[{"lower":"XXXXXXXXXXXXX"}]}
{"label":"lABEL-2","pattern":[{"lower":"^^^^^^^^^^^"}]}
...
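
For reference, a token pattern of the form {"lower": "..."} corresponds to spaCy's Matcher attribute "LOWER". The sketch below is only a standalone sanity check of that pattern format, outside Prodigy; the label and the example token are placeholders standing in for the masked values above, and the Matcher.add signature shown is the spaCy 2.x one.

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
# 'LOWER' is the Matcher-side equivalent of the lowercase 'lower' key
# used in my_patterns.jsonl; the token text is a placeholder.
matcher.add('LABEL-1', None, [{'LOWER': 'exampleterm'}])

doc = nlp('This sentence mentions ExampleTerm once.')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)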
Then I used the following command to run the custom.ner.teach recipe:
$ prodigy custom.ner.teach my_dataset_ner en_core_web_lg "Path/to/my/40/text/files/" --label "my_custom_label" --patterns my_patterns.jsonl -F recipe_train.py
After waiting for about 2 minutes, the script failed with a memory error. The stack trace is as follows:
Traceback (most recent call last):
File "/usr/lib64/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib64/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/prodigy/__main__.py", line 259, in <module>
controller = recipe(*args, use_plac=True)
File "cython_src/prodigy/core.pyx", line 178, in prodigy.core.recipe.recipe_decorator.recipe_proxy
File "cython_src/prodigy/core.pyx", line 55, in prodigy.core.Controller.__init__
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/toolz/itertoolz.py", line 368, in first
return next(iter(seq))
File "cython_src/prodigy/core.pyx", line 84, in iter_tasks
File "cython_src/prodigy/components/sorters.pyx", line 136, in __iter__
File "cython_src/prodigy/components/sorters.pyx", line 51, in genexpr
File "cython_src/prodigy/util.pyx", line 263, in predict
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/toolz/itertoolz.py", line 716, in partition_all
prev = next(it)
File "cython_src/prodigy/components/preprocess.pyx", line 36, in split_sentences
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/spacy/language.py", line 554, in pipe
for doc, context in izip(docs, contexts):
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/spacy/language.py", line 578, in pipe
for doc in docs:
File "nn_parser.pyx", line 367, in pipe
File "cytoolz/itertoolz.pyx", line 1046, in cytoolz.itertoolz.partition_all.__next__ (cytoolz/itertoolz.c:14538)
File "nn_parser.pyx", line 367, in pipe
File "cytoolz/itertoolz.pyx", line 1046, in cytoolz.itertoolz.partition_all.__next__ (cytoolz/itertoolz.c:14538)
File "pipeline.pyx", line 433, in pipe
File "pipeline.pyx", line 438, in spacy.pipeline.Tagger.predict
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/neural/_classes/model.py", line 161, in __call__
return self.predict(x)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/api.py", line 55, in predict
X = layer(X)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/neural/_classes/model.py", line 161, in __call__
return self.predict(x)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/api.py", line 293, in predict
X = layer(layer.ops.flatten(seqs_in, pad=pad))
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/neural/_classes/model.py", line 161, in __call__
return self.predict(x)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/api.py", line 55, in predict
X = layer(X)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/neural/_classes/resnet.py", line 15, in __call__
return X + self._layers[0](X)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/neural/_classes/model.py", line 161, in __call__
return self.predict(x)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/api.py", line 55, in predict
X = layer(X)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/neural/_classes/model.py", line 161, in __call__
return self.predict(x)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/neural/_classes/convolution.py", line 25, in predict
return self.ops.seq2col(X, self.nW)
File "ops.pyx", line 466, in thinc.neural.ops.NumpyOps.seq2col
File "ops.pyx", line 162, in thinc.neural.ops.Ops.allocate
MemoryError
Even after reducing the loaded data size to only 4 text files (4MB in total), the script still fails:
Traceback (most recent call last):
File "/usr/lib64/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib64/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/prodigy/__main__.py", line 259, in <module>
controller = recipe(*args, use_plac=True)
File "cython_src/prodigy/core.pyx", line 178, in prodigy.core.recipe.recipe_decorator.recipe_proxy
File "cython_src/prodigy/core.pyx", line 55, in prodigy.core.Controller.__init__
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/toolz/itertoolz.py", line 368, in first
return next(iter(seq))
File "cython_src/prodigy/core.pyx", line 84, in iter_tasks
File "cython_src/prodigy/components/sorters.pyx", line 136, in __iter__
File "cython_src/prodigy/components/sorters.pyx", line 51, in genexpr
File "cython_src/prodigy/util.pyx", line 263, in predict
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/toolz/itertoolz.py", line 716, in partition_all
prev = next(it)
File "cython_src/prodigy/components/preprocess.pyx", line 36, in split_sentences
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/spacy/language.py", line 554, in pipe
for doc, context in izip(docs, contexts):
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/spacy/language.py", line 578, in pipe
for doc in docs:
File "nn_parser.pyx", line 367, in pipe
File "cytoolz/itertoolz.pyx", line 1046, in cytoolz.itertoolz.partition_all.__next__ (cytoolz/itertoolz.c:14538)
File "nn_parser.pyx", line 374, in pipe
File "nn_parser.pyx", line 401, in spacy.syntax.nn_parser.Parser.parse_batch
File "nn_parser.pyx", line 729, in spacy.syntax.nn_parser.Parser.get_batch_model
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/api.py", line 61, in begin_update
X, inc_layer_grad = layer.begin_update(X, drop=drop)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/api.py", line 280, in begin_update
drop=drop)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/api.py", line 61, in begin_update
X, inc_layer_grad = layer.begin_update(X, drop=drop)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/neural/_classes/resnet.py", line 18, in begin_update
y, bp_y = self._layers[0].begin_update(X, drop=drop)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/api.py", line 61, in begin_update
X, inc_layer_grad = layer.begin_update(X, drop=drop)
File "/home/vagrant/.virtualenvs/Cognitive/lib64/python3.6/site-packages/thinc/neural/_classes/convolution.py", line 28, in begin_update
X__bo = self.ops.seq2col(X__bi, self.nW)
File "ops.pyx", line 462, in thinc.neural.ops.NumpyOps.seq2col
File "cymem/cymem.pyx", line 42, in cymem.cymem.Pool.alloc (cymem/cymem.cpp:1091)
MemoryError: Error assigning 315124608 bytesError assigning 315124608 bytesError assigning 315124608 bytesError assigning 315124608 bytes
Could you please suggest what I have not done correctly? I want to move ahead, create more annotations for my dataset, and then see how my annotations perform on a completely new test set.
@honnibal & @ines, using en_core_web_sm instead of en_core_web_lg, I am able to get past the memory error, but I would love to use the en_core_web_lg model as its predictions are far better. Moreover, even with en_core_web_sm I get a memory error if I use more than 3 text files with a total size of ~300MB.
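
To narrow this down, I could run a standalone check like the sketch below, which would confirm whether a single full-length file already exhausts memory under en_core_web_lg independent of Prodigy; the file name here is a placeholder for one of my 40 text files.

import spacy
from pathlib import Path

# Load the large model and parse one complete file outside Prodigy,
# to see whether the document length alone triggers the MemoryError.
nlp = spacy.load('en_core_web_lg')
text = Path('Path/to/my/40/text/files/file_01.txt').read_text(encoding='utf8')
doc = nlp(text)
print(len(doc), 'tokens processed without a MemoryError')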