Hi,
I have trained my own word2vec model on a corpus of documents. I now want to create a blank spacy model that uses those vectors. To do this I have taken code I have seen on this forum - see code below…
Once I have my blank spacy model with custom vectors I attempt to do an ner.batch-train:
python -m prodigy ner.batch-train my_dataset en_core_apparel --output my_output --label MY_LABELS --eval-split 0.2 --n-iter 30 --batch-size 8
The recipe successfully loads the model and prints out that it's using 20% of accept/reject examples. And then it errors out with either a bus error or a segmentation fault. Any ideas on what can be wrong? Am I adding my vectors correctly? - My word2vec model is not large - bin file ~40MB - 19351 entries, 19420 vectors
Code I use to create a blank model:
from gensim import models
import spacy
import numpy as np
from prodigy.util import export_model_data
from spacy.lang.en import English
from spacy.pipeline import EntityRecognizer
from spacy.pipeline import SentenceSegmenter
from spacy.pipeline import DependencyParser
import logging
def pkl_to_bin(pkl_file, bin_file):
    """Convert a pickled gensim Word2Vec model into word2vec export format.

    Loads the model saved by ``Word2Vec.save`` from *pkl_file* and writes its
    keyed vectors to *bin_file* via ``save_word2vec_format``.
    NOTE(review): without ``binary=True`` gensim writes the *text* word2vec
    format despite the ``.bin`` extension — confirm the downstream reader
    expects text (the loop in ``add_vectors_to_model`` does).
    """
    logging.info("converting word2vec to bin file")
    w2v_model = models.Word2Vec.load(pkl_file)
    w2v_model.wv.save_word2vec_format(bin_file)
def create_blank_spacy(entities):
    """Build a blank English spaCy pipeline with an NER component.

    Args:
        entities: iterable of entity label strings to register on the NER.

    Returns:
        An untrained spaCy ``Language`` object with NER, sentence segmenter
        and dependency parser in the pipeline.
    """
    logging.info("creating blank model")
    nlp = spacy.blank('en')
    # (fixed) the original also built a throwaway tokenizer here via
    # English().Defaults.create_tokenizer(nlp) and never used it — removed.
    ner = EntityRecognizer(nlp.vocab)
    for entity in entities:
        ner.add_label(entity)
    nlp.add_pipe(ner)
    nlp.add_pipe(SentenceSegmenter(nlp.vocab))
    # NOTE(review): this parser is added completely untrained. In spaCy 2.0.x
    # an uninitialized statistical component saved into a model can crash
    # (bus error / segfault) when the model is later loaded and run — if the
    # parser is not actually needed for ner.batch-train, consider dropping it.
    nlp.add_pipe(DependencyParser(nlp.vocab))
    return nlp
def add_vectors_to_model(spacy_model, w2v_bin_file):
    """Load word2vec *text-format* vectors into a spaCy model's vocab.

    The first line of the file is the "<rows> <cols>" header; each following
    line is "<word> <v1> <v2> ...".

    Args:
        spacy_model: spaCy ``Language`` whose vocab receives the vectors.
        w2v_bin_file: path to a word2vec text-format vectors file.
    """
    logging.info("adding vectors to blank model")
    # 'with' ensures the file handle is closed (the original leaked it).
    with open(w2v_bin_file, 'r', encoding='utf-8') as vec_file:
        # Parse the header once, outside the row loop.
        rows, cols = (int(part) for part in vec_file.readline().split())
        spacy_model.vocab.reset_vectors(shape=(rows, cols))
        for line in vec_file:
            word, *values = line.split()
            # float32, matching spaCy's vector dtype (float64 caused a
            # silent down-cast on every set_vector call in the original).
            vec = np.asarray(values, dtype='float32')
            spacy_model.vocab.set_vector(word, vec)
def model_to_disk(model, model_name, model_outfile):
    """Initialize the model's pipeline weights, name it, and serialize it.

    Args:
        model: spaCy ``Language`` object to save.
        model_name: value written into ``model.meta['name']``.
        model_outfile: directory path passed to ``model.to_disk``.
    """
    logging.info("saving model to disk")
    # Fixed: the original called the *global* `nlp` here instead of the
    # `model` parameter — it only worked because __main__ happened to bind
    # a global with that name. begin_training initializes component weights
    # so the saved pipeline is not left with uninitialized models.
    model.begin_training(lambda: [])
    model.meta['name'] = model_name
    model.to_disk(model_outfile)
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    # Input: pickled gensim Word2Vec model; intermediate: word2vec export.
    word2vec_model_file = "../apparel/models/w2v/product_word2vec.pkl"
    word2vec_bin_file = "../apparel/models/w2v/product_word2vec.bin"
    # Used both as the model's meta name and as the output directory.
    spacy_word2vec = "en_core_apparel"
    pkl_to_bin(word2vec_model_file, word2vec_bin_file)
    # NOTE: `nlp` is a module-level global here; model_to_disk (as written
    # above) references it directly, so this binding must keep this name.
    nlp = create_blank_spacy(['BRAND', 'PRODUCT_TYPE', 'SIZE', 'MATERIAL', 'AGE', 'COLOUR', 'GENDER'])
    add_vectors_to_model(nlp, word2vec_bin_file)
    model_to_disk(nlp, spacy_word2vec, spacy_word2vec)
A bit more info:
I am on spacy version 2.0.11. Running python 3.6.1