I'm using joblib to train multiple textcat models at once. I'm getting this warning -- should I be worried?
Warning: Unnamed vectors -- this won't allow multiple vectors models to be loaded. (Shape: (684831, 300))
Here is a rough draft of the code -- is it sensible to use the same nlp
object for all jobs?
import random
from functools import partial

import spacy
from joblib import Parallel, delayed
from spacy.util import compounding, minibatch
def main():
    """Fan out one textcat training job per training set using joblib.

    NOTE(review): `labels`, `n_iter`, `dropout`, `learn_rate`, `batch_start`,
    `batch_max`, `output_dirs`, and `train_data` are assumed to be defined in
    an enclosing/module scope -- confirm against the full file.

    NOTE(review): with backend="multiprocessing", the shared `nlp` object is
    pickled and copied into every worker process, so each of the 4 jobs pays
    the full memory cost of en_core_web_lg's vectors. Loading the model inside
    `trainer` (per worker) avoids both the copy and the "Unnamed vectors"
    collision when multiple vector models are loaded.
    """
    nlp = spacy.load('en_core_web_lg')
    trainer_ = delayed(partial(
        trainer,
        nlp=nlp,
        labels=labels,
        n_iter=n_iter,
        dropout=dropout,
        learn_rate=learn_rate,
        batch_start=batch_start,
        batch_max=batch_max,
    ))
    executor = Parallel(n_jobs=4, backend="multiprocessing", prefer="processes")
    # BUG FIX: the original `for tdata in output_dirs, train_data` iterated
    # over the two-element tuple (output_dirs, train_data) -- i.e. it submitted
    # exactly two jobs, one receiving the whole output_dirs list and one the
    # whole train_data list. That is also why only 2 cores were ever active.
    # Pair each output dir with its training set instead.
    # NOTE(review): `trainer` does not currently take an output dir; the dir is
    # unpacked but unused here -- presumably trainer should save to it. Verify.
    tasks = (trainer_(tdata) for _output_dir, tdata in zip(output_dirs, train_data))
    executor(tasks)
def trainer(
    train_data,
    nlp,
    labels,
    n_iter,
    dropout,
    learn_rate,
    batch_start,
    batch_max,
):
    """Add a textcat pipe to `nlp` and train it on `train_data`.

    Parameters
    ----------
    train_data : list of (text, annotations) pairs; shuffled in place each
        epoch. NOTE(review): spaCy 2.x expects the second element to be a
        dict like {"cats": {...}} -- confirm the shape produced upstream.
    nlp : a loaded spaCy Language pipeline (mutated: a textcat pipe is added).
    labels : iterable of category labels to register on the textcat pipe.
    n_iter : number of training epochs.
    dropout : dropout rate passed to nlp.update.
    learn_rate : currently UNUSED -- it is accepted but never applied to the
        optimizer. TODO(review): set optimizer.learn_rate if this was intended.
    batch_start, batch_max : compounding batch-size schedule bounds.
    """
    config = {"exclusive_classes": False, "architecture": 'bow'}
    textcat = nlp.create_pipe("textcat", config=config)
    nlp.add_pipe(textcat, last=True)
    for label in labels:
        textcat.add_label(label)
    # Batch size grows from batch_start toward batch_max by a factor of 1.001.
    batch_sizes = compounding(batch_start, batch_max, 1.001)
    # BUG FIX: the original condition was `not pipe != "textcat"`, a double
    # negative equivalent to `pipe == "textcat"` -- it disabled ONLY the
    # textcat pipe (the one being trained) and left the tagger/parser/NER
    # running during training. Disable everything EXCEPT textcat.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for epoch in range(1, n_iter + 1):
            losses = {}
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, cats = zip(*batch)
                nlp.update(texts, cats, sgd=optimizer, drop=dropout, losses=losses)
I'm only getting 2 cores active even though n_jobs=4, which is strange.