How to modify behavior of receive_answers( )

snd507 · June 10, 2020, 4:25pm

Hi, I'd like to modify the behavior of the receive_answers method with a custom recipe. The reason for this is to avoid duplicate annotations in the database. I tried setting the exclude_by and exclude flags, but they don't seem to work. I imagined Prodigy would, by default, avoid storing duplicate entries in the DB, but that filtering doesn't seem to work. So I modified the update() method to only save distinct answers to the DB. But it seems like Prodigy actually saves the answers to the DB before calling update().

Here's my recipe code:

def custom_recipe(
        dataset: str,
        view_id: str,
        spacy_model: str,
        source: str,
        label: Optional[List[str]],
        patterns: Optional[str] = None,
        exclude: Optional[List[str]] = None):
    """Build the component dictionary for the custom annotation recipe.

    Only ``dataset`` and ``source`` are read by the body shown here;
    ``view_id``, ``spacy_model``, ``label``, ``patterns`` and ``exclude`` are
    accepted but never referenced. The function also relies on names that are
    defined outside this snippet: ``LOGGER``, ``JSONL``, ``Controller``,
    ``saved_data``, ``db``, ``time`` and ``get_stream_loop``.

    Returns a dict with the keys "dataset", "view_id", "stream", "update",
    "on_exit", "exclude", "config" and "db".
    """
    LOGGER.info('RECIPE: Starting recipe textcat.custom-recipe')

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    print(source)
    # NOTE(review): `stream` is not referenced again in this function; the
    # stream that is returned comes from get_stream_loop(), which is not shown.
    stream = JSONL(source)

    # NOTE(review): `ctrl` is not referenced again in this function, and the
    # dataset name is hard-coded rather than taken from `dataset` — confirm
    # whether this Controller is needed at all.
    ctrl = Controller('MRO_Monitoring_June', 'text', [], None, True, None, None, None, None, None, {})

    def on_save(answers):
        """Print how many answers remain after dropping already-saved meta ids.

        NOTE(review): the filtered list is only rebound locally and never
        returned, and this function is not included in the returned
        components, so it has no effect beyond the two prints.
        """
        print(f"\non_save received {len(answers)} annotations!")
        answers = [ans for ans in answers if ans["meta"]['id'] not in saved_data]
        print(f"\non_save filtered to {len(answers)} annotations!")

    def update(answers):
        """Store the answers whose meta id has not been seen before.

        Each kept answer is stamped with the current epoch time, stripped of
        its 'html' and 'id' keys, recorded in ``saved_data``, and then the
        whole batch is passed to ``db.add_examples``.
        """
        # This function is triggered when Prodigy receives annotations
        print(f"\nReceived {len(answers)} annotations!")
        # Drop answers whose meta id is already tracked in saved_data.
        answers = [ans for ans in answers if ans["meta"]['id'] not in saved_data]
        print(f"\nSaving {len(answers)} annotations!")
        for ans in answers:
            epoch_time = int(time.time())
            ans['answer_timestamp'] = epoch_time
            # NOTE(review): assuming each answer is a dict, `del` raises
            # KeyError when the key is absent — confirm both keys always exist.
            del ans['html'] #remove html coming from view id
            del ans['id']
            saved_data.add(ans["meta"]['id'])
        # NOTE(review): the dataset name is hard-coded here instead of using
        # `dataset`. The post reports that Prodigy already stores answers
        # before calling update(), so this call may write them a second time —
        # confirm.
        db.add_examples(answers, datasets=['MRO_Monitoring_June'])
        # db_answers = [{"id": eg["id"], "answer": eg["answer"]} for eg in answers]
        #save_to_your_custom_db(db_answers)

    def on_exit(controller):
        """
        Callback for when the Prodigy server is stopped (e.g. Ctrl+C).

        It takes the Controller as an argument, giving you access to the database.
        This is currently a no-op: the body consists of this docstring only.
        The original notes list the following steps, none of which is
        implemented here:
        - Split data into train, validation, test
        - Write data in flashtext format
        """

    return {
        "dataset": dataset,
        # The interface is fixed to 'blocks'; the `view_id` argument is ignored.
        "view_id": 'blocks',
        # NOTE(review): no list() conversion is applied on the next line
        # despite what its trailing comment says.
        "stream": get_stream_loop(), #list conversion for progress bar on UI
        "update": update,
        "on_exit": on_exit,
        # Uses the target dataset itself; the `exclude` argument is ignored.
        'exclude': [dataset],
        # 'db': False,
        "config": {
            'exclude_by': 'input',
            # Two stacked blocks: the HTML view, then a free-text comment field.
            "blocks": [
                {"view_id": "html"},
                {"view_id": "text_input",
                 "field_id": "answer_comment",
                 "field_label": "Enter your comments here (Optional)",
                 "field_rows": 5,
                 "field_placeholder": "e.g Description is too short, and contains undefined abbreviations"
                }
            ]
        },
        'db': db
    }

justindujardin · June 10, 2020, 5:43pm

~~@snd507 you can use the before_db component of your recipe, instead of update~~

~~It's called with the list of answers that are about to be added to the database and expects a list of answers to be returned. You can modify the list and return it.~~

snd507 · June 10, 2020, 6:36pm

I'm getting an error with this approach:

before_db extra fields not permitted

@recipe("custom-recipe")
def custom_recipe():
    """Return recipe components that include a ``before_db`` filter callback.

    This is an abbreviated snippet: ``dataset``, ``get_stream_loop``,
    ``update``, ``on_exit``, ``db`` and ``saved_data`` are expected to be
    defined elsewhere.
    """

    def before_db(answers):
        """Return only the answers whose meta id is not yet in ``saved_data``."""
        unsaved = []
        for answer in answers:
            if answer["meta"]['id'] in saved_data:
                continue
            unsaved.append(answer)
        return unsaved

    # Free-text comment field rendered below the HTML view.
    comment_block = {
        "view_id": "text_input",
        "field_id": "answer_comment",
        "field_label": "Enter your comments here (Optional)",
        "field_rows": 5,
        "field_placeholder": "e.g Description is too short, and contains undefined abbreviations",
    }
    interface_config = {
        'exclude_by': 'input',
        "blocks": [{"view_id": "html"}, comment_block],
    }

    return {
        "dataset": dataset,
        "view_id": 'blocks',
        "stream": get_stream_loop(),
        "update": update,
        "on_exit": on_exit,
        "before_db": before_db,
        "config": interface_config,
        'db': db,
    }

justindujardin · June 10, 2020, 6:48pm

Sorry, I referenced my development version of prodigy when I gave you this tip. The changes for using "before_db" aren't in a released version of prodigy.

For now, you can use this override of the Database class to accomplish the same thing:

from prodigy.components.db import Database

from prodigy import recipe
from prodigy.components import db
from prodigy.components.loaders import JSONL
from prodigy.util import LOGGER


class CustomDatabase(Database):
    """Database subclass that can screen examples before they are stored."""

    def add_examples(self, examples, datasets=()):
        """Pass the (optionally filtered) examples on to the parent class.

        As written every example is kept; add a condition inside the loop to
        skip the ones that should not be stored.
        """
        kept = []
        for example in examples:
            # Filter out unwanted examples here, e.g. with `continue`.
            kept.append(example)
        return super().add_examples(kept, datasets)


# Monkey-patch: rebind the `Database` name on the prodigy.components.db module
# to the CustomDatabase subclass.
# NOTE(review): presumably db.connect() looks up this module-level name when it
# builds the database object — confirm for the installed Prodigy version.
db.Database = CustomDatabase


@recipe(
    "custom-recipe",
    dataset=("Dataset to save answers to", "positional", None, str),
    view_id=("Annotation interface", "option", "v", str),
    source=("The source data as a JSON file", "positional", None, str),
)
def custom_recipe(dataset: str, view_id: str, source: str):
    """Return the components for a minimal "blocks" classification recipe.

    The whole source file is read into a list up front. ``view_id`` is
    accepted but not used: the interface is always "blocks".
    """
    LOGGER.info("RECIPE: Starting recipe custom-recipe")

    # Materialise the stream so it has a known length.
    examples = list(JSONL(source))
    interface_config = {"blocks": [{"view_id": "classification"}]}
    database = db.connect()

    components = {
        "dataset": dataset,
        "view_id": "blocks",
        "stream": examples,
        "config": interface_config,
        "db": database,
    }
    return components

snd507 · June 11, 2020, 6:57pm

@justindujardin I got it to work with this approach. So I basically have a solution for the other thread regarding refreshing the browser. It uses an infinite-loop stream with caching, and also avoids duplicates in the DB. However, because I'm using an infinite-loop stream, I can't wrap list() around the stream, and that's causing the UI to display the infinity symbol in the progress bar. I added a get_progress method and return it from the recipe, but it doesn't seem to get called.

justindujardin · June 12, 2020, 1:52pm

@snd507 I'm glad you found a workaround. Here's a working example of using a custom progress callable in your recipe:

from prodigy import recipe
from prodigy.components import db
from prodigy.components.loaders import JSONL
from prodigy.util import LOGGER


class CustomProgress:
    """Callable meant to be passed as a recipe's "progress" component."""

    def __call__(self, session: str, total: int) -> float:
        """Return the progress as a fraction between 0.0 and 1.0.

        Both arguments are ignored: this example always reports the same
        fixed value.
        """
        fixed_fraction = 0.1337
        return fixed_fraction


@recipe(
    "custom-recipe",
    dataset=("Dataset to save answers to", "positional", None, str),
    view_id=("Annotation interface", "option", "v", str),
    source=("The source data as a JSON file", "positional", None, str),
)
def custom_recipe(dataset: str, view_id: str, source: str):
    """Return the components for a "blocks" recipe with a custom progress callable.

    The stream is handed over as returned by ``JSONL(source)`` without being
    turned into a list, and progress comes from a ``CustomProgress`` instance.
    ``view_id`` is accepted but not used: the interface is always "blocks".
    """
    LOGGER.info("RECIPE: Starting recipe custom-recipe")

    examples = JSONL(source)
    progress_callback = CustomProgress()
    interface_config = {"blocks": [{"view_id": "classification"}]}
    database = db.connect()

    components = {
        "dataset": dataset,
        "view_id": "blocks",
        "stream": examples,
        "progress": progress_callback,
        "config": interface_config,
        "db": database,
    }
    return components

Topic		Replies	Views
Getting access to annotations before placed in db usage , database , custom , solved	8	2038	October 31, 2019
Custom templates with custom DB and exclude logic usage , custom , solved	20	3056	January 29, 2018
Saving and retrieving annotations usage , database , custom , solved	7	5108	June 13, 2018
Duplicated annotation when changing version ner , spacy	6	556	November 9, 2022
Few records in in the db for the same example usage	26	630	June 13, 2023

How to modify behavior of receive_answers( )

Related topics