Skip a sentence and log it when there is an error in that sentence

(Vajja) #1

Please, can you help me with labeling for NER? If there are special characters like ‘/’ or ‘-’, my NER recipe throws an error and stops. Is there a way we can write those errors to the logs and continue annotating the other sentences?

import prodigy
from prodigy import recipe_args
import spacy
from prodigy.util import read_jsonl
from spacy.matcher import Matcher

import prodigy
import spacy
from prodigy.util import log

import spacy.vocab
import spacy.tokens
import copy

from spacy.tokens import Span
from prodigy.components.preprocess import split_sentences, add_tokens
from prodigy.components.loaders import get_stream
from prodigy.core import recipe_args
from prodigy.util import split_evals, get_labels_from_ner, get_print, combine_models
from prodigy.util import read_jsonl,write_jsonl, set_hashes, log, prints
from prodigy.util import INPUT_HASH_ATTR

# NOTE(review): the original post almost certainly had a @prodigy.recipe(...)
# decorator here that was lost when pasting into the forum — confirm against
# the actual recipe file before running.
def make_gold(dataset, spacy_model, source=None, api=None, loader=None,
              patterns=None, labels=None, exclude=None, unsegmented=False):
    """Create gold data for NER by correcting a model's suggestions.

    dataset: ID of the dataset to save annotations to.
    spacy_model: loadable spaCy model name or path.
    source: input data (e.g. a JSONL file of {"text": ...} examples).
    api / loader: passed through to Prodigy's stream loader.
    patterns: path to a JSONL file of {"label": ..., "pattern": ...} entries.
    labels: label set to annotate; if None, taken from the model + patterns.
    exclude: dataset IDs whose annotations should be excluded from the stream.
    unsegmented: if True, do not split incoming texts into sentences.

    Returns a Prodigy recipe components dict for the ner_manual interface.
    """
    log("RECIPE: Starting recipe ner.make-gold", locals())
    nlp = spacy.load(spacy_model)
    log("RECIPE: Loaded model {}".format(spacy_model))

    # Group the patterns by label so each label becomes one Matcher entry.
    patterns_by_label = {}
    for entry in read_jsonl(patterns):
        patterns_by_label.setdefault(entry['label'], []).append(entry['pattern'])
    matcher = Matcher(nlp.vocab)
    # Renamed loop variable so it doesn't shadow the `patterns` argument.
    for pattern_label, label_patterns in patterns_by_label.items():
        matcher.add(pattern_label, None, *label_patterns)

    # Get the label set from the `label` argument, which is either a
    # comma-separated list or a path to a text file. If labels is None, check
    # if labels are present in the model.
    if labels is None:
        labels = set(get_labels_from_ner(nlp) + list(patterns_by_label.keys()))
        print("Using {} labels from model: {}"
              .format(len(labels), ', '.join(labels)))
    log("RECIPE: Annotating with {} labels".format(len(labels)), labels)
    stream = get_stream(source, api=api, loader=loader, rehash=True,
                        dedup=True, input_key='text')
    # Split the stream into sentences
    if not unsegmented:
        stream = split_sentences(nlp, stream)
    # Tokenize the stream
    stream = add_tokens(nlp, stream)

    def make_tasks():
        """Add a 'spans' key to each example, with predicted entities."""
        texts = ((eg['text'], eg) for eg in stream)
        for doc, eg in nlp.pipe(texts, as_tuples=True):
            try:
                task = copy.deepcopy(eg)
                spans = []
                matches = matcher(doc)
                # Matcher results are (match_id, start, end); match_id doubles
                # as the span label because we registered one rule per label.
                pattern_matches = tuple(Span(doc, start, end, label)
                                        for label, start, end in matches)
                for ent in doc.ents + pattern_matches:
                    # Skip entities whose label we are not annotating.
                    if labels and ent.label_ not in labels:
                        continue
                    spans.append({
                        'token_start': ent.start,
                        'token_end': ent.end - 1,
                        'start': ent.start_char,
                        'end': ent.end_char,
                        'text': ent.text,
                        'label': ent.label_,
                        'source': spacy_model,
                        'input_hash': eg[INPUT_HASH_ATTR],
                    })
                task['spans'] = spans
                task = set_hashes(task)
            except Exception as err:
                # Don't let one bad example (e.g. a span alignment error on
                # texts with characters like '/' or '-') kill the whole
                # session: log it and move on to the next sentence.
                log("RECIPE: Skipping example due to error: {}".format(err), eg)
                continue
            yield task

    return {
        'view_id': 'ner_manual',
        'dataset': dataset,
        'stream': make_tasks(),
        'exclude': exclude,
        'update': None,
        'config': {'lang': nlp.lang, 'labels': labels}
    }

(Vajja) #2

This is the command I am using to start the server

prodigy ner.make-silver test_data en_core_web_md data.jsonl --label label_name --patterns skill_patt.jsonl -F recipe.py &

Can you also let me know how to add logs so I can check the reason for error


(Ines Montani) #3

What’s the error you’re seeing? Special characters shouldn’t really be a problem – unless the data you load in is actually invalid JSON or something like that.

If it’s actually invalid JSON, you could write your loader like this and add an except statement that prints the line if json.loads fails to load it (i.e. if it’s not valid JSON).

from pathlib import Path
import json

def load_jsonl(file_path):
    """Yield one parsed example per line of a JSONL file.

    Lines that are not valid JSON are printed and skipped instead of
    raising, so one bad line doesn't stop the whole stream.
    """
    with Path(file_path).open("r", encoding="utf8") as file_:
        for line in file_:
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                print("Couldn't load line:", line)

If you want to add your own log statements, you can use the util.log helper:

from prodigy.util import log

log("I will be added to the log")
log("Message", "Second arg will only be shown in verbose mode")

To show the log, you can set the PRODIGY_LOGGING environment variable:

export PRODIGY_LOGGING=basic    # only show basic logs
export PRODIGY_LOGGING=verbose  # show more detailed info