How to modify the behavior of receive_answers()

Hi, I'd like to modify the behavior of the receive_answers method with a custom recipe. The reason for this is to avoid duplicate annotations in the database. I tried setting the exclude_by and exclude flags, but they don't seem to work. I assumed Prodigy would, by default, avoid storing duplicate entries in the DB, but that filtering doesn't seem to work. So I modified the update() method to save only distinct answers to the DB. But it seems like Prodigy actually saves the answers to the DB before calling update().

Here's my recipe code:

def custom_recipe(
        dataset: str,
        view_id: str,
        spacy_model: str,
        source: str,
        label: Optional[List[str]],
        patterns: Optional[str] = None,
        exclude: Optional[List[str]] = None):
    """Build the components dictionary for a custom 'blocks' annotation recipe.

    The returned dictionary carries the keys "dataset", "view_id", "stream",
    "update", "on_exit", "exclude", "config" and "db".

    NOTE(review): ``view_id``, ``spacy_model``, ``label``, ``patterns`` and
    ``exclude`` are accepted but never read in this body; the view id is
    hard-coded to 'blocks' and the exclude list to ``[dataset]``.
    NOTE(review): ``saved_data``, ``db`` and ``get_stream_loop`` are not
    defined in this snippet; they are presumably defined elsewhere in the
    module -- confirm.
    """
    LOGGER.info('RECIPE: Starting recipe textcat.custom-recipe')

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    # NOTE(review): `stream` is assigned here but is not what the recipe
    # returns below; the "stream" component comes from get_stream_loop().
    print(source)
    stream = JSONL(source)

    # NOTE(review): `ctrl` is never used afterwards in this function, and the
    # dataset name is hard-coded instead of using the `dataset` argument.
    ctrl = Controller('MRO_Monitoring_June', 'text', [], None, True, None, None, None, None, None, {})

    def on_save(answers):
        """Print how many answers remain after dropping ids already in saved_data.

        NOTE(review): the filtered list is only bound to a local name and is
        not returned, and this function is not referenced in the returned
        components, so it has no effect on what gets stored.
        """
        print(f"\non_save received {len(answers)} annotations!")
        answers = [ans for ans in answers if ans["meta"]['id'] not in saved_data]
        print(f"\non_save filtered to {len(answers)} annotations!")

    def update(answers):
        """Filter out already-saved answers, clean them up and add them to the DB.

        Answers whose ``meta.id`` is already in ``saved_data`` are skipped. Each
        remaining answer gets an ``answer_timestamp``, loses its ``html`` and
        ``id`` keys, and has its ``meta.id`` recorded in ``saved_data``.
        """
        # This function is triggered when Prodigy receives annotations
        print(f"\nReceived {len(answers)} annotations!")
        answers = [ans for ans in answers if ans["meta"]['id'] not in saved_data]
        print(f"\nSaving {len(answers)} annotations!")
        for ans in answers:
            # Timestamp in whole seconds since the epoch.
            epoch_time = int(time.time())
            ans['answer_timestamp'] = epoch_time
            # Assumes every answer has 'html' and 'id' keys; `del` raises
            # KeyError otherwise -- TODO confirm against the incoming schema.
            del ans['html'] #remove html coming from view id
            del ans['id']
            saved_data.add(ans["meta"]['id'])
        # NOTE(review): writes to the hard-coded dataset name rather than the
        # `dataset` argument of the recipe.
        db.add_examples(answers, datasets=['MRO_Monitoring_June'])
        # db_answers = [{"id": eg["id"], "answer": eg["answer"]} for eg in answers]
        #save_to_your_custom_db(db_answers)

    def on_exit(controller):
        """
        Triggered when the server is stopped, i.e. Ctrl+C.
        A function that is invoked when you stop the Prodigy server.
        It takes the Controller as an argument, giving you access to the database.

        NOTE(review): the body is currently empty; the steps below are listed
        as intent only and are not implemented here:
        - Split data into train, validation, test
        - Write data in flashtext format
        """

    return {
        "dataset": dataset,
        "view_id": 'blocks',
        # NOTE(review): no list() conversion is applied at this call site;
        # get_stream_loop is defined outside this snippet.
        "stream": get_stream_loop(), #list conversion for progress bar on UI
        "update": update,
        "on_exit": on_exit,
        'exclude': [dataset],
        # 'db': False,
        "config": {
            'exclude_by': 'input',
            # Two stacked blocks: the HTML content and a free-text comment field.
            "blocks": [
                {"view_id": "html"},
                {"view_id": "text_input",
                 "field_id": "answer_comment",
                 "field_label": "Enter your comments here (Optional)",
                 "field_rows": 5,
                 "field_placeholder": "e.g Description is too short, and contains undefined abbreviations"
                }
            ]
        },
        # Same `db` object that update() writes to via add_examples().
        'db': db
    }

@snd507 you can use the before_db component of your recipe, instead of update

It's called with the list of answers that are about to be added to the database and expects a list of answers to be returned. You can modify the list and return it.

I'm getting an error with this approach:

before_db extra fields not permitted

@recipe("custom-recipe")
def custom_recipe():
    """Return recipe components that include a ``before_db`` filter callback.

    ``dataset``, ``get_stream_loop``, ``update``, ``on_exit``, ``db`` and
    ``saved_data`` are looked up from the enclosing scope; they are not
    defined in this snippet.
    """

    def before_db(answers):
        """Return only the answers whose ``meta.id`` is not in ``saved_data``."""
        return [
            answer
            for answer in answers
            if answer["meta"]["id"] not in saved_data
        ]

    # Free-text comment field shown underneath the HTML block.
    comment_block = {
        "view_id": "text_input",
        "field_id": "answer_comment",
        "field_label": "Enter your comments here (Optional)",
        "field_rows": 5,
        "field_placeholder": "e.g Description is too short, and contains undefined abbreviations",
    }
    interface_config = {
        "exclude_by": "input",
        "blocks": [{"view_id": "html"}, comment_block],
    }

    return {
        "dataset": dataset,
        "view_id": "blocks",
        "stream": get_stream_loop(),
        "update": update,
        "on_exit": on_exit,
        "before_db": before_db,
        "config": interface_config,
        "db": db,
    }

Sorry, I referenced my development version of prodigy when I gave you this tip. The changes for using "before_db" aren't in a released version of prodigy.

For now, you can use this override of the Database class to accomplish the same thing:

from prodigy.components.db import Database

from prodigy import recipe
from prodigy.components import db
from prodigy.components.loaders import JSONL
from prodigy.util import LOGGER


class CustomDatabase(Database):
    """``Database`` subclass that sees every batch of examples before storage."""

    def add_examples(self, examples, datasets=tuple()):
        """Copy ``examples`` into a new list and hand it to the parent class.

        The comprehension is the filtering hook: add an ``if`` clause to it to
        drop examples that should not be stored. As written, every example is
        kept. Returns whatever the parent ``add_examples`` returns.
        """
        kept = [example for example in examples]
        return super().add_examples(kept, datasets)


# Monkey-patch: rebind the `Database` attribute on the prodigy.components.db
# module to the subclass. Presumably this makes db.connect() build a
# CustomDatabase instead of the stock class -- confirm against the installed
# Prodigy version.
db.Database = CustomDatabase


@recipe(
    "custom-recipe",
    dataset=("Dataset to save answers to", "positional", None, str),
    view_id=("Annotation interface", "option", "v", str),
    source=("The source data as a JSON file", "positional", None, str),
)
def custom_recipe(
    dataset: str,
    view_id: str,
    source: str,
):
    """Return the components for a 'blocks' recipe with one classification block.

    The JSONL file at ``source`` is read fully into a list for the stream, and
    the database comes from ``db.connect()``. ``view_id`` is accepted but not
    read; the interface is fixed to "blocks".
    """
    LOGGER.info("RECIPE: Starting recipe custom-recipe")

    # Materialise the whole source up front so the stream is a list.
    examples = list(JSONL(source))
    interface_config = {"blocks": [{"view_id": "classification"}]}
    database = db.connect()

    components = {
        "dataset": dataset,
        "view_id": "blocks",
        "stream": examples,
        "config": interface_config,
        "db": database,
    }
    return components

@justindujardin I got it to work with this approach. So I basically have a solution for the other thread regarding refreshing the browser. It uses an infinite-loop stream with caching, and also avoids duplicates in the DB. However, because I'm using an infinite-loop stream, I can't wrap list() around the stream, and that's causing the UI to display the infinity symbol in the progress bar. I added a get_progress method and return it within the recipe, but it doesn't seem to get called.

@snd507 I'm glad you found a workaround. Here's a working example of using a custom progress callable in your recipe:

from prodigy import recipe
from prodigy.components import db
from prodigy.components.loaders import JSONL
from prodigy.util import LOGGER


class CustomProgress:
    """Callable that reports annotation progress as a fixed fraction."""

    def __call__(self, session: str, total: int) -> float:
        """Return the progress value, expected to lie in the range 0.0 - 1.0.

        This example ignores ``session`` and ``total`` and always reports the
        same constant.
        """
        fixed_progress = 0.1337
        return fixed_progress


@recipe(
    "custom-recipe",
    dataset=("Dataset to save answers to", "positional", None, str),
    view_id=("Annotation interface", "option", "v", str),
    source=("The source data as a JSON file", "positional", None, str),
)
def custom_recipe(
    dataset: str,
    view_id: str,
    source: str,
):
    """Return the components for a 'blocks' recipe with a custom progress callable.

    The stream is the ``JSONL(source)`` loader result passed through as-is
    (not wrapped in ``list``), and progress is reported by a
    ``CustomProgress`` instance. ``view_id`` is accepted but not read; the
    interface is fixed to "blocks".
    """
    LOGGER.info("RECIPE: Starting recipe custom-recipe")

    examples = JSONL(source)
    progress_callback = CustomProgress()
    interface_config = {"blocks": [{"view_id": "classification"}]}
    database = db.connect()

    components = {
        "dataset": dataset,
        "view_id": "blocks",
        "stream": examples,
        "progress": progress_callback,
        "config": interface_config,
        "db": database,
    }
    return components