Dear Prodigy team,
I have a question regarding duplicated annotations in multi-session mode.
Our setup is as follows: we have several annotators, and we run sessions on different days on the same input data, saving all annotations to the same dataset (we stop the server each evening and restart it the next morning).
We want each annotation task to be labeled only once: if an example was labeled on day 1 by one of the annotators, it should not be presented to any of the other annotators, neither on that day nor on any day in the future.
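(For context, each annotator opens the app under their own named session, e.g. http://localhost:8080/?session=alice; the host, port and session name here are just placeholders.)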
Until recently, we had been running our own annotation recipe on Prodigy version 1.10.7.
To ensure that each example is annotated only once, we set the following in our .prodigy.json:
"feed_overlap": false
In the past we did not see many duplicated annotations, but since we upgraded to Prodigy version 1.11.8 last week, we are seeing a lot of duplicates. Especially after we stop and restart the server, the stream seems to start again from the beginning.
We used to create our stream with:

stream = JSONL(source)
stream = add_tokens(nlp, stream)

which did not seem to generate duplicates in the old version.
Do we instead need to use:

stream = get_stream(
    source, loader=loader, rehash=True, dedup=True, input_key="text"
)
stream = add_tokens(nlp, stream)

to ensure that we don't get the same examples multiple times for annotation?
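If that is the right approach, I assume our recipe's stream setup would become something like this (assuming get_stream can be imported from prodigy.components.loaders in v1.11 and that the loader can be passed as a string name; please correct me if that's wrong):

from prodigy.components.loaders import get_stream

# Rehash and deduplicate the raw examples before tokenizing them
stream = get_stream(source, loader="jsonl", rehash=True, dedup=True, input_key="text")
stream = add_tokens(nlp, stream)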
Or do we need to use the "exclude" option in the recipe and pass in the dataset name, in order to exclude annotations from the previous days?
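If it's the "exclude" mechanism we need, I imagine the dictionary of components we return from the recipe would look roughly like this (with dataset being the same dataset we write to every day, and all other components unchanged):

return {
    "view_id": "ner_manual",
    "dataset": dataset,
    "stream": stream,
    "exclude": [dataset],  # don't re-ask anything already annotated in this dataset
    ...
}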
I am a bit confused about which setting to use for what, and I would really appreciate your help.
Thanks a lot.
PS: Here is the code for our custom manual NER recipe:
import configparser
import logging
import os
from collections import Counter
from pathlib import Path
from typing import List, Optional

import prodigy
import spacy
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens
from prodigy.util import split_string

@prodigy.recipe(
    "ner.manual_stats",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
)
def ner_manual_stats(
    dataset: str,
    spacy_model: str,
    source: str,
    label: List[str],
):
    """
    Mark spans manually by token. Requires only a tokenizer and no entity
    recognizer, and doesn't do any active learning.
    """
    # Load the spaCy model for tokenization
    nlp = spacy.load(spacy_model)
    stats_session = Counter()
    stats_dataset_db = Counter()
    # Load the stream from a JSONL file and return a generator that yields
    # a dictionary for each example in the data.
    stream = JSONL(source)
    # Tokenize the incoming examples and add a "tokens" property to each example.
    stream = add_tokens(nlp, stream)

    def on_load(controller):
        # Check if the current dataset is available in the database. The
        # on_load callback receives the controller as an argument, which
        # exposes the database via controller.db
        if dataset in controller.db:
            examples = controller.db.get_dataset(dataset)
            for eg in examples:
                stats_dataset_db[eg["answer"]] += 1
                if "spans" in eg:
                    for span in eg["spans"]:
                        stats_dataset_db[span["label"]] += 1

    def update(answers):
        # Update the running session statistics with each batch of answers
        for eg in answers:
            stats_session[eg["answer"]] += 1
            if "spans" in eg:
                for span in eg["spans"]:
                    stats_session[span["label"]] += 1

    def on_exit(controller):
        # Set up a logger that prints the collected statistics to the console
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
        sh = logging.StreamHandler()
        sh.setLevel(logging.DEBUG)
        logger.addHandler(sh)
        logger.info("Annotations previously stored in DB for dataset {}:".format(dataset))
        total_db = stats_dataset_db["accept"] + stats_dataset_db["reject"] + stats_dataset_db["ignore"]
        logger.info("Total:\t {}".format(total_db))
        if stats_dataset_db:
            logger.info("Annotated entities in DB:")
            for key, value in stats_dataset_db.items():
                if key not in ("accept", "reject", "ignore"):
                    logger.info("{}:\t {}".format(key, value))
        logger.info("Annotations for this session:")
        total_session = stats_session["accept"] + stats_session["reject"] + stats_session["ignore"]
        logger.info("Total:\t {}".format(total_session))
        if stats_session:
            logger.info("Annotated entities for this session:")
            for key, value in stats_session.items():
                if key not in ("accept", "reject", "ignore"):
                    logger.info("{}:\t {}".format(key, value))

    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "update": update,  # Called with answers received from the web app
        "on_load": on_load,  # Called on first load
        "on_exit": on_exit,  # Called when Prodigy server is stopped
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,  # Selectable label options
        },
    }
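
For completeness, we start the server like this (the dataset name, model and file paths below are placeholders):

prodigy ner.manual_stats ner_dataset en_core_web_sm ./data.jsonl -l PERSON,ORG -F recipe.py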