Alright, I confirm that the recipe code el_recipe.py
is slightly outdated (I hadn't noticed before - sorry!) in the way it uses the KnowledgeBase
and how it reads the csv file.
I have updated and tested the version of the script I posted before. Note that this script assumes a jsonl
file as input (as per your use case discussed before) and not the txt
file that the demo uses.
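For reference, each line of the source jsonl is expected to be a pre-annotated Prodigy task with the text plus NER spans (the recipe reads task["text"] and the "start", "end" and "label" of each span), and each row of the entities csv should hold the QID, the name and the description. Purely as an illustration (the concrete values below are made up):

{"text": "Emerson was born in 1976.", "spans": [{"start": 0, "end": 7, "label": "PERSON"}]}

Q312545,Emerson Ferreira da Rosa,Brazilian footballer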
"""
Custom Prodigy recipe to perform manual annotation of entity links,
given an existing NER model and a knowledge base performing candidate generation.
You can run this project without having Prodigy or using this recipe:
sample results are stored in assets/emerson_annotated_text.jsonl
"""
import spacy
from spacy.kb import InMemoryLookupKB, get_candidates
import prodigy
from prodigy.models.ner import EntityRecognizer
from prodigy.components.stream import get_stream #UPDATED
from prodigy.components.filters import filter_duplicates
from prodigy.components.preprocess import split_spans
import csv
from pathlib import Path

@prodigy.recipe(
    "entity_linker.manual",
    dataset=("The dataset to use", "positional", None, str),
    source=("The source data as a .jsonl file", "positional", None, Path),  # UPDATED
    nlp_dir=("Path to the NLP model with a pretrained NER component", "positional", None, Path),
    kb_loc=("Path to the KB", "positional", None, Path),
    entity_loc=("Path to the file with additional information about the entities", "positional", None, Path),
)
def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc):
    # Load the NLP and KB objects from file
    nlp = spacy.load(nlp_dir)
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=1)  # UPDATED
    kb.from_disk(kb_loc)

    # Read the pre-defined CSV file into a dictionary mapping QIDs to the full names and descriptions
    id_dict = dict()
    with Path(entity_loc).open("r", encoding="utf8") as csvfile:  # UPDATED
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            id_dict[row[0]] = (row[1], row[2])

    # Initialize the Prodigy stream by loading the pre-annotated dataset
    stream = get_stream(source)  # UPDATED

    # For each NER mention, add the candidates from the KB to the annotation task
    stream.apply(_add_options, stream=stream, kb=kb, nlp=nlp, id_dict=id_dict)  # UPDATED to use the newer API
    stream.apply(split_spans, stream=stream)  # NEW: we want one entity per task
    stream.apply(filter_duplicates, stream=stream, by_input=False, by_task=True)  # UPDATED to use the newer API

    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",  # the choice view will render the NER spans if present in the input
        "config": {"choice_auto_accept": True},
    }

def _add_options(stream, kb, nlp, id_dict):
    """Define the options the annotator will be given, by consulting the candidates from the KB for each NER span."""
    for task in stream:
        text = task["text"]
        for mention in task["spans"]:
            start_char = int(mention["start"])
            end_char = int(mention["end"])
            doc = nlp(text)
            span = doc.char_span(start_char, end_char, mention["label"])
            candidates = get_candidates(kb, span)
            if candidates:
                options = [
                    {"id": c.entity_, "html": _print_url(c.entity_, id_dict)}
                    for c in candidates
                ]
                # we sort the options by ID
                options = sorted(options, key=lambda r: int(r["id"][1:]))
                # we add in a few additional options in case a correct ID can not be picked
                options.append({"id": "NIL_otherLink", "text": "Link not in options"})
                options.append({"id": "NIL_ambiguous", "text": "Need more context"})
                task["options"] = options
                yield task

def _print_url(entity_id, id_dict):
    """For each candidate QID, create a link to the corresponding Wikidata page and print the description"""
    url_prefix = "https://www.wikidata.org/wiki/"
    name, descr = id_dict.get(entity_id)
    option = "<a href='" + url_prefix + entity_id + "'>" + entity_id + "</a>: " + descr
    return option
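With the recipe saved as el_recipe.py, you would run it roughly like this (the dataset name and paths are just placeholders for your own files):

python -m prodigy entity_linker.manual my_el_dataset ./annotated_input.jsonl ./my_nlp ./my_kb ./entities.csv -F el_recipe.py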
I also added the splitting of the spans we discussed in another thread, so that the annotator has to deal with one entity at a time.
I will for sure update the demo next week to make sure it's fully compatible with the latest spaCy and Prodigy API, but hopefully this unblocks you for now.