Alright, I confirm that the recipe code el_recipe.py
is slightly outdated (I hadn't noticed before - sorry!) in the way it uses the KnowledgeBase
and how it reads the csv file.
I have updated and tested the version of the script I posted before. Note that this script assumes a jsonl
file as input (as per your use case discussed before) and not the txt
file that the demo uses.
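For reference, each line of the source jsonl is expected to be a pre-annotated Prodigy task with the text plus NER spans (the recipe reads task["text"] and the "start", "end" and "label" of each span), and each row of the entities csv should hold the QID, the name and the description. Purely as an illustration (the concrete values below are made up):

{"text": "Emerson was born in 1976.", "spans": [{"start": 0, "end": 7, "label": "PERSON"}]}

Q312545,Emerson Ferreira da Rosa,Brazilian footballer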
"""
Custom Prodigy recipe to perform manual annotation of entity links,
given an existing NER model and a knowledge base performing candidate generation.
You can run this project without having Prodigy or using this recipe:
sample results are stored in assets/emerson_annotated_text.jsonl
"""
import spacy
from spacy.kb import InMemoryLookupKB, get_candidates
import prodigy
from prodigy.models.ner import EntityRecognizer
from prodigy.components.stream import get_stream #UPDATED
from prodigy.components.filters import filter_duplicates
from prodigy.components.preprocess import split_spans
import csv
from pathlib import Path

@prodigy.recipe(
    "entity_linker.manual",
    dataset=("The dataset to use", "positional", None, str),
    source=("The source data as a .jsonl file", "positional", None, Path),  # UPDATED
    nlp_dir=("Path to the NLP model with a pretrained NER component", "positional", None, Path),
    kb_loc=("Path to the KB", "positional", None, Path),
    entity_loc=("Path to the file with additional information about the entities", "positional", None, Path),
)
def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc):
    # Load the NLP and KB objects from file
    nlp = spacy.load(nlp_dir)
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=1)  # UPDATED
    kb.from_disk(kb_loc)

    # Read the pre-defined CSV file into a dictionary mapping QIDs to the full names and descriptions
    id_dict = dict()
    with Path(entity_loc).open("r", encoding="utf8") as csvfile:  # UPDATED
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            id_dict[row[0]] = (row[1], row[2])

    # Initialize the Prodigy stream by loading the pre-annotated dataset
    stream = get_stream(source)  # UPDATED

    # For each NER mention, add the candidates from the KB to the annotation task
    stream.apply(_add_options, stream=stream, kb=kb, nlp=nlp, id_dict=id_dict)  # UPDATED to use the newer API
    stream.apply(split_spans, stream=stream)  # NEW: we want one entity per task
    stream.apply(filter_duplicates, stream=stream, by_input=False, by_task=True)  # UPDATED to use the newer API

    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",  # the choice view will render the NER spans if present in the input
        "config": {"choice_auto_accept": True},
    }

def _add_options(stream, kb, nlp, id_dict):
    """Define the options the annotator will be given, by consulting the candidates from the KB for each NER span."""
    for task in stream:
        text = task["text"]
        for mention in task["spans"]:
            start_char = int(mention["start"])
            end_char = int(mention["end"])
            doc = nlp(text)
            span = doc.char_span(start_char, end_char, mention["label"])
            candidates = get_candidates(kb, span)
            if candidates:
                options = [
                    {"id": c.entity_, "html": _print_url(c.entity_, id_dict)}
                    for c in candidates
                ]
                # we sort the options by ID
                options = sorted(options, key=lambda r: int(r["id"][1:]))
                # we add in a few additional options in case a correct ID can not be picked
                options.append({"id": "NIL_otherLink", "text": "Link not in options"})
                options.append({"id": "NIL_ambiguous", "text": "Need more context"})
                task["options"] = options
                yield task

def _print_url(entity_id, id_dict):
    """For each candidate QID, create a link to the corresponding Wikidata page and print the description"""
    url_prefix = "https://www.wikidata.org/wiki/"
    name, descr = id_dict.get(entity_id)
    option = "<a href='" + url_prefix + entity_id + "'>" + entity_id + "</a>: " + descr
    return option
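With the recipe saved as el_recipe.py, you would run it roughly like this (the dataset name and paths are just placeholders for your own files):

python -m prodigy entity_linker.manual my_el_dataset ./annotated_input.jsonl ./my_nlp ./my_kb ./entities.csv -F el_recipe.py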
I also added the splitting of the spans we discussed in another thread, so that the annotator has to deal with one entity at a time.
I will for sure update the demo next week to make sure it's fully compatible with the latest spaCy and Prodigy API, but hopefully this unblocks you for now.