We have a Prodigy annotation workflow where we label data with a custom transformer model and then correct the annotations in Prodigy, using the custom word-level tokenization that the tokenizer produces.
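Concretely, by word-level tokenization I mean that the Docs are built directly from the tokenizer's word-level output instead of spaCy's own rule-based tokenizer; a simplified sketch (not our exact code, with made-up tokens) looks like this:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")

# Hypothetical word-level tokens as they might come out of a custom tokenizer;
# constructing the Doc from them directly bypasses spaCy's tokenizer.
words = ["Patient", "was", "given", "200", "mg", "of", "ibuprofen", "."]
doc = Doc(nlp.vocab, words=words)
print([t.text for t in doc])
```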
I'm playing around with some of the example projects in explosion/projects. I've adapted the ner-drug workflow to our dataset, and it trains pretty well out of the box with the spaCy tok2vec MultiHashEmbed. If I try to use a transformer tok2vec instead, though, it performs very poorly.
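The comparison I'm making is just the dev-set NER scores from each run; a rough sketch for reproducing that outside the training loop (placeholder model and corpus paths, not our exact setup) would be:

```python
import spacy
from spacy.training import Corpus

# Placeholder paths: the dev corpus from the project and the two trained pipelines.
dev = Corpus("corpus/dev.spacy")

for model_dir in ["training/tok2vec/model-best", "training/trf/model-best"]:
    nlp = spacy.load(model_dir)
    examples = list(dev(nlp))  # gold annotations aligned to this pipeline's tokens
    scores = nlp.evaluate(examples)
    print(model_dir, scores["ents_p"], scores["ents_r"], scores["ents_f"])
```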
Here's my config:
[paths]
train = ""
dev = ""
raw = null
init_tok2vec = null
vectors = null
[system]
gpu_allocator = null
seed = 0
[nlp]
lang = "en"
pipeline = ["transformer","ner"]
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
before_creation = null
after_creation = null
after_pipeline_creation = null
disabled = []
batch_size = 1000
[components]
[components.ner]
factory = "ner"
moves = null
update_with_oracle_cut_size = 100
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
pooling = {"@layers":"reduce_mean.v1"}
upstream = "*"
[components.transformer]
factory = "transformer"
max_batch_items = 4096
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "roberta-base"
[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96
[components.transformer.model.tokenizer_config]
use_fast = true
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 2000
gold_preproc = false
limit = 0
augmenter = null
[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 10000
max_epochs = 0
max_steps = 50000
eval_frequency = 200
frozen_components = []
before_to_disk = null
[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 3000
buffer = 256
get_length = null
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = true
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.00005
[training.score_weights]
ents_per_type = null
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.tokenizer]
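As a sanity check that this config builds the pipeline I expect, i.e. that the ner component's TransformerListener is connected to the transformer component, something like this sketch can be run against the file above (assuming it's saved as config.cfg and spacy-transformers is installed):

```python
from spacy.util import load_config, load_model_from_config

# Build an (uninitialized) pipeline straight from the training config above,
# just to confirm the components and the listener wiring look right.
config = load_config("config.cfg")
nlp = load_model_from_config(config, auto_fill=True)
print(nlp.pipe_names)  # expecting ['transformer', 'ner']
print(nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"])
```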
And here is some of the training progress:
ℹ Pipeline: ['transformer', 'ner']
ℹ Initial learn rate: 0.0
E    #       LOSS TRANS...  LOSS NER    ENTS_F  ENTS_P  ENTS_R  SCORE
---  ------  -------------  ----------  ------  ------  ------  ------
  0       0           0.00      867.11    0.00    0.00    0.00    0.00
 12     200         302.03   276816.16    0.00    0.00    0.00    0.00
 25     400        1539.83   204429.24    0.00    0.00    0.00    0.00
 38     600        2471.37   196584.01    2.04   22.22    1.07    0.02
 51     800        3390.00   194100.00    2.18   22.73    1.15    0.02
 64    1000        3997.73   195599.19    2.45   21.79    1.30    0.02
 77    1200        4636.88   195115.86    1.48   21.28    0.76    0.01
 91    1400        5466.31   193555.61    2.85   21.28    1.53    0.03
103    1600        7179.97   193318.07    2.99   21.65    1.61    0.03
117    1800        8837.21   194556.30    3.12   21.15    1.68    0.03
130    2000        8623.35   194332.85    3.12   21.15    1.68    0.03
143    2200        9406.39   192675.97    0.60   23.53    0.31    0.01
156    2400        9127.69   191244.26    2.01   17.07    1.07    0.02
169    2600        9972.05   192307.53    2.59   21.69    1.38    0.03
182    2800       16974.89   191566.55    3.02   25.30    1.61    0.03
This is unusually low for any model, so I feel that something must be going subtly wrong.
Here's what the hash embed was able to train to on the same data, for comparison: