Annotating an extractive QA dataset à la SQuAD

Hi, again!

I found a solution to show both the question and the context, and to be able to select the answer span.

Captura de pantalla de 2019-10-24 11-11-09

Captura de pantalla de 2019-10-24 11-10-28

Just add this custom JavaScript to your ~/.prodigy/prodigy.json file:

{
"javascript": "document.addEventListener('prodigyupdate', event => {const container = document.querySelector('.prodigy-title'); container.innerHTML=window.prodigy.content.question; });document.addEventListener('prodigymount', event => {const container = document.querySelector('.prodigy-title'); container.innerHTML=window.prodigy.content.question; })"
}

And my custom recipe looks like:

import prodigy
import spacy
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens
from prodigy.util import split_string


@prodigy.recipe(
    "qa",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
)
def qa(dataset, spacy_model, source, label="answer_span"):
    """Custom recipe to annotate a dataset to train and evaluate an extractive Question Answering system.

    Args:
        dataset: Name of the Prodigy dataset the annotations are saved to.
        spacy_model: Name or path of the spaCy model loaded for tokenization.
        source: Path to a JSONL file of samples containing question and
            text pairs.
        label: Span label(s) offered in the interface; either a single
            string or a list of strings (e.g. as produced by
            ``split_string``).

    Returns:
        The recipe components dict: the ``ner_manual`` view, the target
        dataset, the tokenized stream and the interface config.
    """
    # Load the source dataset, made of samples containing question and text pairs.
    stream = JSONL(source)
    # Load the spaCy model and use it to add tokens to every sample.
    nlp = spacy.load(spacy_model)
    stream = add_tokens(nlp, stream)

    # `label` can be a bare string (the default value is one), while the
    # "labels" setting is meant to hold a list of labels. Wrap a lone string
    # so it is not handed over as a plain string; other values pass through.
    labels = [label] if isinstance(label, str) else label

    return {
        "view_id": "ner_manual",
        "dataset": dataset,
        "stream": stream,
        "config": {"lang": nlp.lang, "label": label, "labels": labels},
    }

Thanks a lot!

1 Like