Yes, testingof22kto27kdatasm is the name of the database to store our annotations, and my recipe is:

recipe.py:
import copy

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

import prodigy
from prodigy.components.preprocess import split_sentences, add_tokens
from prodigy.components.loaders import get_stream
from prodigy.core import recipe_args
from prodigy.util import (read_jsonl, set_hashes, log, get_labels_from_ner,
                          INPUT_HASH_ATTR)
@prodigy.recipe('ner.make-silver',
                dataset=recipe_args['dataset'],
                spacy_model=recipe_args['spacy_model'],
                source=recipe_args['source'],
                api=recipe_args['api'],
                loader=recipe_args['loader'],
                patterns=recipe_args['patterns'],
                labels=recipe_args['label_set'],
                exclude=recipe_args['exclude'],
                unsegmented=recipe_args['unsegmented'])
def make_silver(dataset, spacy_model, source=None, api=None, loader=None,
                patterns=None, labels=None, exclude=None, unsegmented=False):
    """
    Create silver data for NER by correcting the combined suggestions of a
    statistical model and a pattern matcher.
    """
    log("RECIPE: Starting recipe ner.make-silver", locals())
    nlp = spacy.load(spacy_model)
    log("RECIPE: Loaded model {}".format(spacy_model))
    # Group the patterns by label, so each label can be added to the matcher
    # as a single rule set
    patterns_by_label = {}
    for entry in read_jsonl(patterns):
        patterns_by_label.setdefault(entry['label'], []).append(entry['pattern'])
    matcher = Matcher(nlp.vocab)
    for pattern_label, label_patterns in patterns_by_label.items():
        matcher.add(pattern_label, None, *label_patterns)
    # Get the label set from the `labels` argument, which is either a
    # comma-separated list or a path to a text file. If labels is None,
    # fall back to the labels in the model plus the pattern labels. Use a
    # sorted list so the set is JSON-serializable for the front-end config.
    if labels is None:
        labels = sorted(set(get_labels_from_ner(nlp)) | set(patterns_by_label))
        print("Using {} labels from model and patterns: {}"
              .format(len(labels), ', '.join(labels)))
    log("RECIPE: Annotating with {} labels".format(len(labels)), labels)
    stream = get_stream(source, api=api, loader=loader, rehash=True,
                        dedup=True, input_key='text')
    # Split the stream into sentences
    if not unsegmented:
        stream = split_sentences(nlp, stream)
    # Tokenize the stream
    stream = add_tokens(nlp, stream)
    def make_tasks():
        """Add a 'spans' key to each example, with predicted entities."""
        texts = ((eg['text'], eg) for eg in stream)
        for doc, eg in nlp.pipe(texts, as_tuples=True):
            task = copy.deepcopy(eg)
            spans = []
            # The matcher returns (match_id, start, end) triples, where the
            # match_id encodes the pattern's label
            matches = matcher(doc)
            pattern_matches = tuple(Span(doc, start, end, label)
                                    for label, start, end in matches)
            for ent in doc.ents + pattern_matches:
                if labels and ent.label_ not in labels:
                    continue
                spans.append({
                    'token_start': ent.start,
                    'token_end': ent.end - 1,
                    'start': ent.start_char,
                    'end': ent.end_char,
                    'text': ent.text,
                    'label': ent.label_,
                    'source': spacy_model,
                    'input_hash': eg[INPUT_HASH_ATTR]
                })
            task['spans'] = spans
            task = set_hashes(task)
            yield task
    return {
        'view_id': 'ner_manual',
        'dataset': dataset,
        'stream': make_tasks(),
        'exclude': exclude,
        'update': None,
        'config': {'lang': nlp.lang, 'labels': labels}
    }
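
For reference, the recipe reads each line of the patterns file as a JSON object with a 'label' and a 'pattern' key, where the pattern is a token pattern for spaCy's Matcher. The labels and tokens below are just made-up examples of that shape:

{"label": "DRUG", "pattern": [{"lower": "aspirin"}]}
{"label": "DRUG", "pattern": [{"lower": "vitamin"}, {"lower": "c"}]}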
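
Assuming the recipe is saved as recipe.py and the input is a JSONL file, it can then be run with something like the following (the model name and file paths are placeholders):

prodigy ner.make-silver testingof22kto27kdatasm en_core_web_sm ./data.jsonl --patterns ./patterns.jsonl -F recipe.py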