NER Training for Corporate Names

Yes, check out this user-contributed recipe that implements a ner.manual-to-patterns workflow:

Basically, all the recipe does is read in a dataset annotated with ner.manual, and then create a pattern for each of the annotated spans in it. You might want to adjust the code to only create patterns once and if they don't exist yet (otherwise, you'll get duplicates).

Another idea could be to implement a custom recipe that uses spaCy's Matcher or PhraseMatcher to find and pre-highlight spans and use the manual NER interface. If any entities are missing, you can highlight them manually. When the examples are sent back to the server, you can extract those and add them to the Matcher. So you'd be using an update callback just like the active learning recipes – only that you're not updating your model, but a matcher in the loop. The new matcher will then be applied to the stream and the longer you annotate, the fewer spans you'd ideally have to highlight manually.

Here's a code example to illustrate the idea – haven't tested it yet, but something like this should work:

import prodigy
import spacy
import srsly
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens
from spacy.matcher import Matcher
from spacy.tokens import Span


@prodigy.recipe("manual-match")
def manual_match(dataset, source, spacy_model):
    """Manual NER annotation with a token matcher that is updated in the loop.

    Spans found by the matcher are pre-highlighted in the ``ner_manual``
    interface. Spans the annotator adds by hand are turned into lowercase
    token patterns and added to the matcher, so later examples in the stream
    get them highlighted automatically.

    dataset: Name of the Prodigy dataset to save annotations to.
    source: Path to the JSONL file with the examples to annotate.
    spacy_model: Name or path of the spaCy model to load (used here for its
        tokenizer and vocab).
    Returns the dict of recipe components.
    """
    nlp = spacy.load(spacy_model)
    matcher = Matcher(nlp.vocab)
    # Patterns added during this session, in the order they were created.
    # Kept separately so on_exit doesn't need to read matcher internals.
    collected_patterns = []
    # (label, lowercased tokens) of every pattern added so far, so annotating
    # the same phrase twice doesn't produce duplicate patterns.
    seen_patterns = set()

    def add_spans_to_stream(stream):
        # Run matcher over each example in the stream and add a "spans" property
        # to each task that includes the matched spans (so they're pre-highlighted).
        for eg in stream:
            doc = nlp.make_doc(eg["text"])
            matched_spans = [
                Span(doc, start, end, label=match_id)
                for match_id, start, end in matcher(doc)
            ]
            eg["spans"] = [
                {
                    "start": span.start_char,
                    "end": span.end_char,
                    "label": span.label_,
                    # Indicate that this was added automatically
                    "by_matcher": True,
                }
                for span in matched_spans
            ]
            yield eg

    def update(answers):
        # Update the matcher with patterns based on highlighted spans in the
        # annotations that come back
        for answer in answers:
            text = answer["text"]
            for span in answer.get("spans", []):
                # Only add new manually added spans, not the ones that were set
                # automatically
                if span.get("by_matcher"):
                    continue
                doc = nlp.make_doc(text[span["start"]:span["end"]])
                pattern = [{"lower": token.lower_} for token in doc]
                if not pattern:
                    # Zero-token span: there is nothing to match on
                    continue
                label = span["label"]
                key = (label, tuple(token["lower"] for token in pattern))
                if key in seen_patterns:
                    continue
                seen_patterns.add(key)
                # NOTE(review): this is the spaCy v2 call signature
                # (key, on_match, *patterns) – confirm against the installed
                # spaCy version.
                matcher.add(label, None, pattern)
                collected_patterns.append({"label": label, "pattern": pattern})

    def on_exit(ctrl):
        # When the Prodigy server is stopped, serialize the patterns to a file
        srsly.write_jsonl("/path/to/patterns.jsonl", collected_patterns)

    stream = JSONL(source)
    stream = add_tokens(nlp, stream)
    stream = add_spans_to_stream(stream)

    return {
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "on_exit": on_exit,
        "view_id": "ner_manual",
        "config": {
            "batch_size": 3  # low batch size so we see results faster
        },
    }

Edit: Actually, here's a much simpler version using the EntityRuler. Newly annotated spans are added to the entity ruler as patterns, and in the end, it's serialized out as a JSONL file.

import prodigy
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens
import spacy

@prodigy.recipe("manual-match")
def manual_match(dataset, source, spacy_model):
    """Manual NER annotation with an entity ruler that is updated in the loop.

    Entities found by the loaded pipeline are pre-highlighted in the
    ``ner_manual`` interface. Spans the annotator adds by hand are turned
    into lowercase token patterns and added to the pipeline's entity ruler.
    When the server stops, the ruler is written to disk.

    dataset: Name of the Prodigy dataset to save annotations to.
    source: Path to the JSONL file with the examples to annotate.
    spacy_model: Name or path of a spaCy model whose pipeline contains an
        entity ruler.
    Returns the dict of recipe components.
    """
    nlp = spacy.load(spacy_model)  # Let's assume this has an entity ruler
    # NOTE(review): spaCy's default name for this component may be
    # "entity_ruler" – confirm the name used in the loaded pipeline.
    ruler = nlp.get_pipe("entityruler")
    # (label, lowercased tokens) of every pattern added so far, so annotating
    # the same phrase twice doesn't add duplicate patterns to the ruler.
    seen_patterns = set()

    def add_spans_to_stream(stream):
        # Add a "spans" property to each task that includes the doc.ents
        # (so they're pre-highlighted).
        for eg in stream:
            # Run the full pipeline: nlp.make_doc only tokenizes, so the
            # entity ruler would never run and doc.ents would stay empty.
            doc = nlp(eg["text"])
            eg["spans"] = [
                {
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "label": ent.label_,
                    # Indicate that this was added automatically
                    "by_model": True,
                }
                for ent in doc.ents
            ]
            yield eg

    def update(answers):
        # Update the entity ruler with patterns based on highlighted spans in
        # the annotations that come back
        for answer in answers:
            text = answer["text"]
            patterns = []
            for span in answer.get("spans", []):
                # Only add new manually added spans, not the ones that were set
                # automatically
                if span.get("by_model"):
                    continue
                # Tokenizing the span text is all that's needed here
                doc = nlp.make_doc(text[span["start"]:span["end"]])
                pattern = [{"lower": token.lower_} for token in doc]
                if not pattern:
                    # Zero-token span: there is nothing to match on
                    continue
                label = span["label"]
                key = (label, tuple(token["lower"] for token in pattern))
                if key in seen_patterns:
                    continue
                seen_patterns.add(key)
                patterns.append({"label": label, "pattern": pattern})
            if patterns:
                ruler.add_patterns(patterns)

    def on_exit(ctrl):
        # When the Prodigy server is stopped, serialize the patterns to a file
        ruler.to_disk("/path/to/patterns.jsonl")

    stream = JSONL(source)
    stream = add_tokens(nlp, stream)
    stream = add_spans_to_stream(stream)

    return {
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "on_exit": on_exit,
        "view_id": "ner_manual",
        "config": {
            "batch_size": 3  # low batch size so we see results faster
        },
    }
2 Likes