Hi @wpm I’m trying your code and I’ve some editing as:
import prodigy
import spacy
from prodigy.util import log
import spacy.gold
import spacy.vocab
import spacy.tokens
import copy
from spacy.tokens import Span
from prodigy.components.preprocess import split_sentences, add_tokens
from prodigy.components.loaders import get_stream
from prodigy.core import recipe_args
from prodigy.util import split_evals, get_labels_from_ner, get_print, combine_models
from prodigy.util import read_jsonl,write_jsonl, set_hashes, log, prints
from prodigy.util import INPUT_HASH_ATTR
@prodigy.recipe('ner.make-gold',
dataset=recipe_args['dataset'],
spacy_model=recipe_args['spacy_model'],
source=recipe_args['source'],
api=recipe_args['api'],
loader=recipe_args['loader'],
patterns=recipe_args['patterns'],
labels=recipe_args['label_set'],
exclude=recipe_args['exclude'],
unsegmented=recipe_args['unsegmented'])
def make_gold(dataset, spacy_model, source=None, api=None, loader=None,
patterns=None, labels=None, exclude=None, unsegmented=False):
"""
Create gold data for NER by correcting a model's suggestions.
"""
log("RECIPE: Starting recipe ner.make-gold", locals())
nlp = spacy.load(spacy_model)
log("RECIPE: Loaded model {}".format(spacy_model))
patterns_by_label = {}
for entry in prodigy.util.read_jsonl(patterns):
patterns_by_label.setdefault(entry['label'], []).append(entry['pattern'])
matcher = spacy.matcher.Matcher(nlp.vocab)
for pattern_label, patterns in patterns_by_label.items():
matcher.add(pattern_label, None, *patterns)
# Get the label set from the `label` argument, which is either a
# comma-separated list or a path to a text file. If labels is None, check
# if labels are present in the model.
if labels is None:
labels = set(get_labels_from_ner(nlp) + patterns_by_label.keys())
print("Using {} labels from model: {}"
.format(len(labels), ', '.join(labels)))
log("RECIPE: Annotating with {} labels".format(len(labels)), labels)
stream = get_stream(source, api=api, loader=loader, rehash=True,
dedup=True, input_key='text')
# Split the stream into sentences
if not unsegmented:
stream = split_sentences(nlp, stream)
# Tokenize the stream
stream = add_tokens(nlp, stream)
def make_tasks():
"""Add a 'spans' key to each example, with predicted entities."""
texts = ((eg['text'], eg) for eg in stream)
for doc, eg in nlp.pipe(texts, as_tuples=True):
task = copy.deepcopy(eg)
spans = []
matches = matcher(doc)
pattern_matches = tuple(Span(doc, start, end, label) for label, start, end in matches)
for ent in doc.ents + pattern_matches:
if labels and ent.label_ not in labels:
continue
spans.append({
'token_start': ent.start,
'token_end': ent.end - 1,
'start': ent.start_char,
'end': ent.end_char,
'text': ent.text,
'label': ent.label_,
'source': spacy_model,
'input_hash': eg[INPUT_HASH_ATTR]
})
task['spans'] = spans
task = set_hashes(task)
yield task
return {
'view_id': 'ner_manual',
'dataset': dataset,
'stream': make_tasks(),
'exclude': exclude,
'update': None,
'config': {'lang': nlp.lang, 'labels': labels}
}
And run it on 1.6.1 I get this warnings:
Traceback (most recent call last):
File "/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/anaconda3/lib/python3.6/site-packages/prodigy/__main__.py", line 259, in <module>
controller = recipe(*args, use_plac=True)
File "cython_src/prodigy/core.pyx", line 253, in prodigy.core.recipe.recipe_decorator.recipe_proxy
File "/anaconda3/lib/python3.6/site-packages/plac_core.py", line 328, in call
cmd, result = parser.consume(arglist)
File "/anaconda3/lib/python3.6/site-packages/plac_core.py", line 207, in consume
return cmd, self.func(*(args + varargs + extraopts), **kwargs)
File "prodigy-ner-mg.py", line 57, in make_gold
labels = set(get_labels_from_ner(nlp) + patterns_by_label.keys())
TypeError: can only concatenate list (not "dict_keys") to list
I would like to understand to fix it
My best
C.