I was reviewing the terms.train-vectors recipe and noticed that it reads the entire dataset into memory before calling word2vec:
for doc in nlp.pipe((eg['text'] for eg in stream)):
    for sent in doc.sents:
        sentences.append([w.text for w in sent])
print("Extracted {} sentences".format(len(sentences)))
w2v = Word2Vec(sentences, size=size, window=window, min_count=min_count,
               sample=1e-5, iter=n_iter, workers=n_workers,
               negative=negative)
Building the full sentences list in memory like this is extremely inefficient and probably accounts for the out-of-memory errors other users have been seeing.
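Word2Vec doesn't actually need the whole corpus up front; it only needs an iterable of token lists that it can run through more than once (one pass to build the vocabulary, then one pass per training epoch). I haven't tested this against the recipe, but assuming the input source is a JSONL file on disk with a "text" field, the block above could in principle be replaced with a restartable iterable along these lines (the SentenceStream class and the source_path variable are placeholders I'm making up here, not things the current recipe has):

import json

class SentenceStream(object):
    """Restartable iterable: re-reads the source file and re-runs the
    spaCy pipeline on every pass Word2Vec makes over the data."""
    def __init__(self, nlp, source_path):
        self.nlp = nlp
        self.source_path = source_path

    def __iter__(self):
        with open(self.source_path) as f:
            texts = (json.loads(line)['text'] for line in f)
            for doc in self.nlp.pipe(texts):
                for sent in doc.sents:
                    yield [w.text for w in sent]

sentences = SentenceStream(nlp, source_path)
w2v = Word2Vec(sentences, size=size, window=window, min_count=min_count,
               sample=1e-5, iter=n_iter, workers=n_workers,
               negative=negative)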
I haven't had a chance to turn that into a proper fix for the Prodigy recipe yet, but in the meantime here's some of my old gensim word2vec code that uses the same streaming pattern; I'm attaching it until I can fix the recipe:
import glob
import logging

import pandas as pd
from gensim import models
from gensim.summarization.textcleaner import get_sentences
from gensim.utils import simple_preprocess

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


class MakeIter(object):
    """Wrap a generator function so the corpus can be iterated over
    repeatedly; Word2Vec makes several passes (vocab scan plus epochs)."""
    def __init__(self, generator_func, *args, **kwargs):
        self.generator_func = generator_func
        self.args = args
        self.kwargs = kwargs

    def __iter__(self):
        # Return a fresh generator every time __iter__ is called.
        return self.generator_func(*self.args, **self.kwargs)


data_dir = 'data_dir'    # placeholder paths
save_dir = 'save_dir'
docs = glob.glob(data_dir + '/*.json')


def tokenize(text):
    # Split the text into sentences, then lowercase/tokenize each one.
    return [simple_preprocess(sent) for sent in get_sentences(text)]


def yield_docs(filenames):
    for fn in filenames:
        with open(fn, 'r') as f:
            df = pd.read_json(f, orient='columns')
        for note in df['TEXT']:
            for sent in tokenize(note):
                yield sent
        del df  # only ever hold one file's DataFrame in memory


doc_stream = MakeIter(yield_docs, docs)
word2vec = models.Word2Vec(doc_stream, workers=5)
word2vec.save(save_dir + '/word2vec.model')
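One caveat with both of these: the tokenization (spaCy in the recipe, pandas plus simple_preprocess above) gets re-run on every pass Word2Vec makes over the data. If that turns out to be the bottleneck, another option (untested sketch, the file name is just a placeholder) is to write the tokenized sentences to a plain text file once, one sentence per line with space-separated tokens, and train from that with gensim's LineSentence, which streams lazily from disk:

from gensim.models.word2vec import LineSentence, Word2Vec

# One-off pass: tokenize everything and dump it to disk,
# reusing yield_docs and docs from the code above.
with open('sentences.txt', 'w') as out:
    for sent in yield_docs(docs):
        out.write(' '.join(sent) + '\n')

# LineSentence re-reads the file lazily on each pass, so nothing
# large stays in memory between epochs.
w2v_from_file = Word2Vec(LineSentence('sentences.txt'), workers=5)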