Thank you for the example. I’m trying to implement it now, but I have a quick question: what is nr_class?
I was thinking about using this method:
# Replace the model of the 'textcat' pipe with the custom network.
# NOTE: build_text_classifier takes nr_class as a required first argument.
nlp.get_pipe('textcat').model = build_text_classifier()
However, it requires that I provide an nr_class, but I have no idea what this argument does or is supposed to be.
EDIT: I think I figured out that nr_class is the number of classes to output. Correct? So that should be 1 for this case (only categorizing NOUN_NOUN_SPLIT_ERROR).
EDIT 2:
There were some imports missing to get the model working. I used these:
from spacy._ml import concatenate_lists, SpacyVectors, zero_init, logistic
And there was a typo in the code (a missing parenthesis):
def build_text_classifier(nr_class, width=64, **cfg):
    """Build a CNN text classifier whose token vectors include TAG and DEP.

    Args:
        nr_class: Number of output classes (size of the final Affine layer).
        width: Width of the token vectors and of the hidden layers.
        **cfg: Optional settings. ``nr_vector`` (default 5000) is passed as
            the table size to the lexical HashEmbed layers.
            ``pretrained_dims`` (default 0) is the input width of the
            pre-trained vectors; when it is 0 or missing, no pre-trained
            vector branch is added to the model.

    Returns:
        The assembled model, with ``nO`` set to ``nr_class``.
    """
    nr_vector = cfg.get('nr_vector', 5000)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
                                 '**': clone}):
        # Define a vector table that embeds values from each of these
        # columns. These are the columns used by default in spaCy.
        cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
        lower = HashEmbed(width, nr_vector, column=cols.index(LOWER))
        prefix = HashEmbed(width//2, nr_vector, column=cols.index(PREFIX))
        suffix = HashEmbed(width//2, nr_vector, column=cols.index(SUFFIX))
        shape = HashEmbed(width//2, nr_vector, column=cols.index(SHAPE))

        # Define tables for our new features.
        cols.extend([TAG, DEP])
        tag = HashEmbed(64, 500, column=cols.index(TAG))
        dep = HashEmbed(64, 500, column=cols.index(DEP))

        # Add the tag and dep features to the token embedding.
        # Note that there's an important change here from the model
        # definition within spaCy. spaCy's models wrap the vectors in the
        # function `uniqued()`, which caches the vector constructed for each
        # word type in a batch. This works because if a word has the same
        # ORTH, it must necessarily have the same PREFIX, SUFFIX, SHAPE, etc.
        # But this isn't true for TAG and DEP, so we must not cache the
        # vectors.
        # Overall this layer takes a Doc object, extracts numeric IDs with
        # doc.to_array(), embeds each ID into a vector using a separate table
        # per ID, concatenates the vectors, and then uses a Maxout layer to
        # reduce the dimensionality back down. Layer normalization is applied
        # after the Maxout operation.
        trained_vectors = (
            FeatureExtracter(cols)
            >> with_flatten(
                (lower | prefix | suffix | shape | tag | dep)
                >> LN(Maxout(width, width+(width//2)*3 + 64 + 64))
            )
        )

        if pretrained_dims:
            # Here we add features from pre-trained vectors as well. This is
            # only done when a vector width was configured: an Affine layer
            # with zero input columns has nothing to project.
            static_vectors = (
                SpacyVectors
                >> with_flatten(Affine(width, pretrained_dims))
            )
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width*2
        else:
            # No pre-trained vectors: the CNN reads the trained embeddings
            # only, so its input is `width` wide rather than `width*2`.
            vectors = trained_vectors
            vectors_width = width

        # Now that we have our word representations, we pass them through
        # the CNN. You might want to try changing this to be deeper --- that
        # would give you more context.
        model = (
            vectors
            >> with_flatten(
                LN(Maxout(width, vectors_width))
                >> Residual(
                    (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
                ) ** 2, pad=2)
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
            >> logistic
        )
    # Set the output dimension for the model, for future reference.
    model.nO = nr_class
    return model
After running this for a few iterations, it seems like I have failed to attach the model correctly, as the model never improves. So this is not working:
# Create the TextClassifier wrapper for the pipeline, then overwrite the
# model of the 'textcat' pipe with the custom network (one output class).
# NOTE(review): `model` is created before the swap; if TextClassifier keeps
# its own reference to the original textcat model, the replacement may not be
# the network that actually gets trained -- confirm against TextClassifier.
model = TextClassifier(nlp, labels, long_text=long_text,
low_data=len(examples) < 1000)
nlp.get_pipe('textcat').model = build_text_classifier(1)
Here is the complete batch classifier recipe (nothing except for one line is changed):
@recipe('textcat.batch-train-tag',
        dataset=recipe_args['dataset'],
        input_model=recipe_args['spacy_model'],
        output_model=recipe_args['output'],
        lang=recipe_args['lang'],
        factor=recipe_args['factor'],
        dropout=recipe_args['dropout'],
        n_iter=recipe_args['n_iter'],
        batch_size=recipe_args['batch_size'],
        eval_id=recipe_args['eval_id'],
        eval_split=recipe_args['eval_split'],
        long_text=("Long text", "flag", "L", bool),
        silent=recipe_args['silent'])
def batch_train(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=10, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False):
    """
    Batch train a new text classification model from annotations. Prodigy will
    export the best result to the output directory, and include a JSONL file of
    the training and evaluation examples. You can either supply a dataset ID
    containing the evaluation data, or choose to split off a percentage of
    examples for evaluation.

    This variant replaces the model of the 'textcat' pipe with the network
    returned by build_text_classifier(), sized to the number of distinct
    labels found in the dataset. Returns the best accuracy reached (0 if no
    evaluation improved on it).
    """
    # Log under this recipe's registered name; locals() holds only the
    # arguments at this point.
    log("RECIPE: Starting recipe textcat.batch-train-tag", locals())
    DB = connect()
    print_ = get_print(silent)
    # Fixed seed so the shuffles below are reproducible between runs.
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
    else:
        nlp = spacy.blank(lang, pipeline=[])
        print_('\nLoaded blank model')
    examples = DB.get_dataset(dataset)
    labels = sorted({eg['label'] for eg in examples})
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)
    # This is where the change is: swap in the custom network. The output
    # size follows the labels in the dataset instead of being fixed at 1, so
    # the recipe also works when more than one label was annotated.
    # NOTE(review): `model` was created before this swap; if TextClassifier
    # holds its own reference to the original textcat model, this assignment
    # may not affect what model.update() trains -- confirm.
    nlp.get_pipe('textcat').model = build_text_classifier(len(labels))
    log('RECIPE: Initialised TextClassifier with model {}'
        .format(input_model), model.nlp.meta)
    random.shuffle(examples)
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        examples, evals, eval_split = split_evals(examples, eval_split)
        print_("Using {}% of examples ({}) for evaluation"
               .format(round(eval_split * 100), len(evals)))
    random.shuffle(examples)
    # Keep only the requested fraction of the training examples.
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    if long_text:
        examples = list(split_sentences(nlp, examples))
    for i in range(n_iter):
        loss = 0.
        random.shuffle(examples)
        for batch in cytoolz.partition_all(batch_size,
                                           tqdm.tqdm(examples, leave=False)):
            batch = list(batch)
            loss += model.update(batch, revise=False, drop=dropout)
        if len(evals) > 0:
            # Evaluate (and snapshot) with the optimizer's averaged
            # parameters swapped in.
            with nlp.use_params(model.optimizer.averages):
                acc = model.evaluate(tqdm.tqdm(evals, leave=False))
                if acc['accuracy'] > best_acc['accuracy']:
                    best_acc = dict(acc)
                    best_model = nlp.to_bytes()
            print_(printers.tc_update(i, loss, acc))
    if len(evals) > 0:
        print_(printers.tc_result(best_acc))
    if output_model is not None:
        # Restore the best snapshot, if any, before exporting.
        if best_model is not None:
            nlp = nlp.from_bytes(best_model)
        msg = export_model_data(output_model, nlp, examples, evals)
        print_(msg)
    return best_acc['accuracy']