Hello,
In terms.py it would be easy to support fast custom loaders.
Original code:
def train_vectors(output_model, source=None, loader=None, spacy_model=None,
                  lang='xx', size=128, window=5, min_count=10, negative=5,
                  n_iter=2, n_workers=4, merge_ents=False, merge_nps=False):
    """Train word vectors from a text source."""
    log("RECIPE: Starting recipe terms.train-vectors", locals())
    if spacy_model is None:
        nlp = spacy.blank(lang)
        print("Using blank spaCy model ({})".format(lang))
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        log("RECIPE: Added sentence boundary detector to blank model")
    else:
        nlp = spacy.load(spacy_model)
    if merge_ents:
        nlp.add_pipe(preprocess.merge_entities, name='merge_entities')
        log("RECIPE: Added pipeline component to merge entities")
    if merge_nps:
        nlp.add_pipe(preprocess.merge_noun_chunks, name='merge_noun_chunks')
        log("RECIPE: Added pipeline component to merge noun chunks")
    Word2Vec = get_word2vec()
    if not output_model.exists():
        output_model.mkdir(parents=True)
        log("RECIPE: Created output directory")
    sentences = SentenceIterator(nlp,
        lambda: get_stream(source, loader=loader, input_key='text'))
    w2v = Word2Vec(sentences, size=size, window=window, min_count=min_count,
                   sample=1e-5, iter=n_iter, workers=n_workers,
                   negative=negative)
    log("RECIPE: Resetting vectors with size {}".format(size))
    nlp.vocab.reset_vectors(width=size)
    log("RECIPE: Adding {} vectors to model vocab".format(len(w2v.wv.vocab)))
    for word in w2v.wv.vocab:
        nlp.vocab.set_vector(word, w2v.wv.word_vec(word))
    nlp.to_disk(output_model)
    prints('Trained Word2Vec model', output_model.resolve())
    return False
And change it to something like this (the change is near SentenceIterator):
def train_vectors(output_model, source=None, loader=None, spacy_model=None,
                  lang='xx', size=128, window=5, min_count=10, negative=5,
                  n_iter=2, n_workers=4, merge_ents=False, merge_nps=False):
    """Train word vectors from a text source."""
    log("RECIPE: Starting recipe terms.train-vectors", locals())
    if spacy_model is None:
        nlp = spacy.blank(lang)
        print("Using blank spaCy model ({})".format(lang))
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        log("RECIPE: Added sentence boundary detector to blank model")
    else:
        nlp = spacy.load(spacy_model)
    if merge_ents:
        nlp.add_pipe(preprocess.merge_entities, name='merge_entities')
        log("RECIPE: Added pipeline component to merge entities")
    if merge_nps:
        nlp.add_pipe(preprocess.merge_noun_chunks, name='merge_noun_chunks')
        log("RECIPE: Added pipeline component to merge noun chunks")
    Word2Vec = get_word2vec()
    if not output_model.exists():
        output_model.mkdir(parents=True)
        log("RECIPE: Created output directory")
    if not callable(loader):
        sentences = SentenceIterator(nlp,
            lambda: get_stream(source, loader=loader, input_key='text'))
    else:
        sentences = SentenceIterator(nlp, loader)
    w2v = Word2Vec(sentences, size=size, window=window, min_count=min_count,
                   sample=1e-5, iter=n_iter, workers=n_workers,
                   negative=negative)
    log("RECIPE: Resetting vectors with size {}".format(size))
    nlp.vocab.reset_vectors(width=size)
    log("RECIPE: Adding {} vectors to model vocab".format(len(w2v.wv.vocab)))
    for word in w2v.wv.vocab:
        nlp.vocab.set_vector(word, w2v.wv.word_vec(word))
    nlp.to_disk(output_model)
    prints('Trained Word2Vec model', output_model.resolve())
    return False
That way we could just import the function and call it with our own loader.
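For illustration, here is a minimal sketch of what such a callable could look like (the function name, directory and corpus files are hypothetical; the only requirement I'm assuming is that SentenceIterator calls the loader with no arguments and iterates over dicts with a 'text' key, the same shape get_stream produces with input_key='text'):

from pathlib import Path

def my_corpus_loader():
    # Hypothetical loader: one example per non-empty line of each .txt file.
    corpus_dir = Path('/data/my_corpus')              # hypothetical location
    for path in sorted(corpus_dir.glob('*.txt')):
        with path.open(encoding='utf8') as f:
            for line in f:
                line = line.strip()
                if line:
                    yield {'text': line}

# With the modified recipe, the callable is passed straight in:
# train_vectors(Path('./my_vectors_model'), loader=my_corpus_loader)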
Or even better: write the loader code in a file, use the -F option, and pass the name of the loader we gave to the function (but I don't know how :/)
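Something like this might work for the -F route, but it's only a sketch: I'm assuming the modified train_vectors can be imported from prodigy.recipes.terms and that a plain @prodigy.recipe wrapper is enough, so please correct me if the import path or decorator usage is wrong:

# my_loader_recipe.py  (hypothetical file name, to be loaded with -F)
from pathlib import Path

import prodigy
from prodigy.recipes.terms import train_vectors   # assumed import path

def my_corpus_loader():
    # Same idea as above: yield dicts with a 'text' key.
    with Path('corpus.txt').open(encoding='utf8') as f:   # hypothetical corpus file
        for line in f:
            line = line.strip()
            if line:
                yield {'text': line}

@prodigy.recipe('terms.train-vectors-custom')
def train_vectors_custom(output_model):
    # Delegate to the modified built-in recipe, handing it the callable loader.
    return train_vectors(Path(output_model), loader=my_corpus_loader)

# Then run something like:
#   prodigy terms.train-vectors-custom ./my_vectors_model -F my_loader_recipe.py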