Hi… I hit this webpage when I was looking to resolve an issue that I faced during custom NER training. The error is the same as what the original post says:
['For', 'Endorsement', 'SRM', 'phase', '6', 'role', 'EA', ' ', 'if', 'it', 'is', 'not', 'a', 'Central', 'Site', 'Transaction', 'or', ' ', 'if', 'it', '’s', 'a', 'Central', 'Site', 'Transaction', 'and', 'Local', 'Central', 'Site', 'Transaction', 'remove', 'the', 'refferal', 'code']
['O', 'B-CUSTOM', 'I-CUSTOM', 'L-CUSTOM', 'O', 'B-CUSTOM', 'L-CUSTOM', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CUSTOM', 'I-CUSTOM', 'L-CUSTOM', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CUSTOM', 'I-CUSTOM', 'L-CUSTOM', 'O', 'B-CUSTOM', 'I-CUSTOM', 'I-CUSTOM', 'L-CUSTOM', 'O', 'O', 'O', 'O']
[72, 73, 74, 75, 72, 73, 75, 72, 72, 72, 72, 72, 72, 73, 74, 75, 72, 72, 72, 72, 72, 72, 73, 74, 75, 72, 73, 74, 74, 75, 72, 72, 72, 72]
[5, 1, 2, 3, 5, 1, 3, 5, 5, 5, 5, 5, 5, 1, 2, 3, 5, 5, 5, 5, 5, 5, 1, 2, 3, 5, 1, 2, 2, 3, 5, 5, 5, 5]
[0, 6084221849022979412, 6084221849022979412, 6084221849022979412, 0, 6084221849022979412, 6084221849022979412, 0, 0, 0, 0, 0, 0, 6084221849022979412, 6084221849022979412, 6084221849022979412, 0, 0, 0, 0, 0, 0, 6084221849022979412, 6084221849022979412, 6084221849022979412, 0, 6084221849022979412, 6084221849022979412, 6084221849022979412, 6084221849022979412, 0, 0, 0, 0]
('Self labels', [378, 394, 381, 382, 9191306739292312949, 391, 379, 388, 389, 393, 390, 384, 387, 383, 392, 386, 385, 448, 378, 394, 381, 382, 9191306739292312949, 391, 379, 388, 389, 393, 390, 384, 387, 383, 392, 386, 385, 448, 378, 394, 381, 382, 9191306739292312949, 391, 379, 388, 389, 393, 390, 384, 387, 383, 392, 386, 385, 448, 378, 394, 381, 382, 9191306739292312949, 391, 379, 388, 389, 393, 390, 384, 387, 383, 392, 386, 385, 448, 0, 6084221849022979412, 6084221849022979412, 6084221849022979412, 6084221849022979412])
Traceback (most recent call last):
File "psuedo-rehearsal.py", line 163, in <module>
train_model(revision_texts,matches_dict)
File "psuedo-rehearsal.py", line 124, in train_model
nlp.update(docs, golds, sgd=optimizer, drop=0.35, losses=losses)
File "D:\Ananth\python-modules-custom\spacy\language.py", line 415, in update
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
File "nn_parser.pyx", line 555, in spacy.syntax.nn_parser.Parser.update
File "nn_parser.pyx", line 676, in spacy.syntax.nn_parser.Parser._init_gold_batch
File "transition_system.pyx", line 75, in spacy.syntax.transition_system.TransitionSystem.get_oracle_sequence
File "transition_system.pyx", line 140, in spacy.syntax.transition_system.TransitionSystem.set_costs
ValueError: Could not find a gold-standard action to supervise the entity recognizer. The transition system has 77 actions.
Below is my code that creates this problem. I have made sure the Doc
object is not reused, so I have used two separate nlp models.
import spacy
from spacy.gold import GoldParse
import random, time
from toolz import itertoolz
from pathlib import Path
from ner_build_goldparse import BuildGoldParse
# Load the existing pretrained English model; this is the model that gets fine-tuned.
nlp = spacy.load("D:/Ananth/xxxxxxxx/spacy/models/en_core_web_sm-2.0.0/en_core_web_sm/en_core_web_sm-2.0.0")
# (doc, GoldParse) pairs produced from the pretrained model's own predictions
# (pseudo-rehearsal examples, filled by create_revision_data()).
revision_data = []
# (doc, GoldParse) pairs carrying the new CUSTOM entity annotations
# (filled by train_model() from matches_dict).
training_data = []
# Names of pipeline components other than 'ner'; intended to be populated by
# initialize_nlp() so training can disable everything but the NER.
other_pipes = []
OUT_DIR = 'D:/Ananth/xxxxxxxx/spacy/models/trained'
NEW_MODEL_NAME = 'custom_trained'
#entity_label
LABEL = 'CUSTOM'
def initialize_nlp(nlp):
    """Ensure *nlp* has an NER pipe that knows LABEL, and record the
    names of every other pipe in the module-level ``other_pipes`` list.

    Parameters
    ----------
    nlp : spacy.language.Language
        The loaded pipeline that will be fine-tuned.

    Side effects
    ------------
    Mutates *nlp* (adds the 'ner' pipe and/or the new label) and rebinds
    the module-level ``other_pipes`` so train_model() can disable the
    non-NER components during training.
    """
    # BUG FIX: without `global`, the assignment below created a function
    # local and the module-level `other_pipes` stayed [], so
    # nlp.disable_pipes(*other_pipes) in train_model() disabled nothing
    # and the tagger/parser were updated (and degraded) during NER training.
    global other_pipes
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        # Reuse the existing recognizer so we can extend its label set.
        ner = nlp.get_pipe('ner')
    ner.add_label(LABEL)  # add new entity label to entity recognizer
    # Names of the other pipes, to disable them during training.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
def get_revision_text(file_name, n_samples=1005):
    """Randomly sample up to *n_samples* non-blank lines from *file_name*.

    Sampling is with replacement and blank lines are discarded after
    sampling, so the result may contain duplicates and fewer than
    *n_samples* entries — this mirrors the original selection behaviour.

    Parameters
    ----------
    file_name : str
        Path of a UTF-8 text corpus, one sentence per line.
    n_samples : int, optional
        Number of draws to attempt (previously hard-coded to 1005).

    Returns
    -------
    list of str
        The sampled, non-blank sentences.
    """
    revision_texts = []
    with open(file_name, encoding="utf-8") as file_data:
        all_data = file_data.read()
    if all_data:
        # BUG FIX: the original seeded with time.clock(), which was
        # deprecated in Python 3.3 and removed in 3.8. Seeding with no
        # argument uses the system time/entropy, which is what was intended.
        random.seed()
        all_data_splitted = all_data.split('\n')
        for _ in range(n_samples):
            candidate = random.choice(all_data_splitted)
            # ignore any blank sentences
            if candidate.strip():
                revision_texts.append(candidate)
    return revision_texts
def get_training_text(file_name):
    """Read *file_name* (UTF-8) and return its contents split on newlines.

    Note: a file ending in a newline yields a trailing '' element,
    because the split is on '\\n' rather than splitlines().
    """
    with open(file_name, encoding="utf-8") as handle:
        contents = handle.read()
    return contents.split('\n')
def create_revision_data(revision_texts,
                         model_path="D:/Ananth/xxxxxxxx/spacy/models/en_core_web_sm-2.0.0/en_core_web_sm/en_core_web_sm-2.0.0"):
    """Build pseudo-rehearsal examples from the pretrained model's own output.

    Runs a throwaway copy of the pretrained model over *revision_texts*
    and appends (doc, GoldParse) pairs — annotated with the model's own
    entity predictions — to the module-level ``revision_data`` list.

    Parameters
    ----------
    revision_texts : iterable of str
        Raw sentences to annotate.
    model_path : str, optional
        Pretrained model to load (generalised from the previously
        hard-coded path; the default preserves the old behaviour).

    Returns
    -------
    list
        The shared ``revision_data`` list, extended in place.
    """
    nlp_training = spacy.load(model_path, disable=['parser'])
    for doc in nlp_training.pipe(revision_texts):
        n = len(doc)
        # None placeholders: only the entity annotations are supervised here.
        tags = [None] * n
        heads = [None] * n
        deps = [None] * n
        # The model's own predictions become the "gold" entities (rehearsal).
        entities = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
        revision_data.append((doc, GoldParse(doc, tags=tags, heads=heads,
                                             deps=deps, entities=entities)))
    # Drop the throwaway model before training to free memory.
    print('deleteing nlp_training model (1)')
    del nlp_training
    return revision_data
def _build_training_examples(matches_dict, model_path):
    """Convert matches_dict into (doc, GoldParse) pairs in ``training_data``.

    NER is disabled on the helper model because the gold entities come
    from matches_dict, not from the model's predictions.
    """
    nlp_training = spacy.load(model_path, disable=['ner'])
    for text, spans in matches_dict.items():
        doc = nlp_training(text)
        n = len(doc)
        # None placeholders: only the entity annotations are supervised.
        tags = [None] * n
        heads = [None] * n
        deps = [None] * n
        # Each span is (entity_text, start_char, end_char); all get LABEL.
        # NOTE(review): spans whose character offsets do not align exactly
        # with token boundaries, or that overlap each other, make spaCy's
        # transition oracle fail with "Could not find a gold-standard
        # action..." — validating offsets here is the first place to look
        # for the reported ValueError.
        entities = [(start, end, LABEL) for _, start, end in spans]
        training_data.append((doc, GoldParse(doc, tags=tags, heads=heads,
                                             deps=deps, entities=entities)))
    # Drop the throwaway model before training to free memory.
    print('deleting nlp_training model....(2)')
    del nlp_training


def _run_training(n_epoch=5, batch_size=32):
    """Run the NER-only update loop over rehearsal + new examples."""
    nlp.entity.add_label(LABEL)  # ensure the new label exists on the recognizer
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): in spaCy 2.0 begin_training() re-initialises the
        # model weights; for fine-tuning a pretrained model, consider
        # nlp.entity.create_optimizer() instead — confirm against the
        # spaCy 2.0 training docs.
        optimizer = nlp.begin_training()
        for epoch in range(n_epoch):
            examples = revision_data + training_data
            losses = {}
            random.shuffle(examples)
            for batch_no, batch in enumerate(
                    itertoolz.partition_all(batch_size, examples), start=1):
                docs, golds = zip(*batch)
                # BUG FIX: the original printed i+1*batch_size, which by
                # operator precedence is i + batch_size — not a batch
                # counter. Print the actual batch number instead.
                print('progress... training batch:', batch_no)
                nlp.update(docs, golds, sgd=optimizer, drop=0.35, losses=losses)
        print('training completed... losses:', losses)


def _evaluate_and_save():
    """Smoke-test the trained model, save it, and verify the saved copy."""
    test_text = ('What are different Product Type that comes after '
                 'Conviction date or occurence data of an endorsement '
                 'insurance? This is New york')
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
    # Save the model to a directory.
    output_dir = Path(OUT_DIR)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta['name'] = NEW_MODEL_NAME  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)
    # Reload the saved model and confirm it round-trips.
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    for ent in doc2.ents:
        print(ent.label_, ent.text)


def train_model(revision_texts, matches_dict,
                model_path="D:/Ananth/xxxxxxxx/spacy/models/en_core_web_sm-2.0.0/en_core_web_sm/en_core_web_sm-2.0.0"):
    """Fine-tune the global ``nlp`` model on the CUSTOM entity examples.

    Mixes the new annotations (*matches_dict*) with the pseudo-rehearsal
    examples already in the module-level ``revision_data`` so pretrained
    entities are not forgotten, then smoke-tests and saves the model.

    Parameters
    ----------
    revision_texts : list of str
        Kept for interface compatibility; the rehearsal examples actually
        come from the module-level ``revision_data`` (the in-function
        creation was commented out in the original).
    matches_dict : dict
        Maps raw text to a list of (entity_text, start_char, end_char)
        tuples marking CUSTOM entity spans.
    model_path : str, optional
        Pretrained model used to build the training docs (generalised
        from the previously hard-coded path).
    """
    _build_training_examples(matches_dict, model_path)
    _run_training()
    _evaluate_and_save()
#*-------------------------------------------*
# Script entry point: build gold annotations, then train.
#
bgp = BuildGoldParse()
# Add the CUSTOM label to the global model and collect the non-NER pipe names.
initialize_nlp(nlp)
training_file = 'D:/Ananth/xxxxxxxx/spacy/training/custom-sentdetec-model - Copy.train'
revision_file = 'D:/Ananth/xxxxxxxx/spacy/training/corpus - many geners - limited'
entities_file = 'D:/Ananth/xxxxxxxx/spacy/source/out - Copy.csv'
# Sample raw sentences for pseudo-rehearsal.
revision_texts = get_revision_text(revision_file)
#training_texts = get_training_text(training_file)
# matches_dict: text -> list of (entity_text, start_char, end_char) tuples.
matches_dict = bgp.build(training_file, entities_file)
train_model(revision_texts,matches_dict)
Can you tell me what this issue is about? A point to note is that the iteration or batch at which I get this error varies from run to run. So I am not sure if this has something to do with my data or with the program itself. Also, if I reduce the amount of training data to a very low number, the error doesn’t happen. So I tend to believe that the issue could be with the data, but I just wanted to get your opinion as well.
The variable matches_dict has a structure like the one below. The key is the text for which entities are to be annotated, and the value is a list of tuples, each tuple marking one entity:
{'For New Business, Years with current employer should be numeric.' : [('New Business', 4, 16), ('employer', 37, 45), ('current', 29, 36), ('numeric', 56, 63), ('Years', 18, 23)]}
I am using Python 3.6.1 (32-bit) and SpaCy 2.0.7. I am running Windows 10.
Thanks,