extract Session ID dataset as users are curating (validate curation)?

I noticed that `validate_answer` doesn't receive the session ID in multi-user mode. Is there a way to get it?

Hi Adrian,

It's a bit unclear to me what exactly you tried to achieve and what went wrong. Could you share the recipe that you ran, what you expected to see, and what happened instead? If you could share any relevant configuration from prodigy.json, that'd be helpful too.

# Does not receive session id
    # Answers accepted so far; countAnswer reaches this list through its
    # enclosing scope.
    answers = []

    def countAnswer(answer):
        """Store ``answer`` while fewer than five answers have been recorded.

        Raises:
            ValueError: with the message "Limit Reached" once five answers
                are already stored; the rejected answer is not recorded.
        """
        if len(answers) < 5:
            answers.append(answer)
            return
        raise ValueError("Limit Reached")
        
    
    dataset = dataset
    view_id = "ner_manual"
    stream = stream
    update = None
    db = None
    progress = None
    on_load = lambda i: print("On Load!")
    on_exit = lambda i: print("On Exit!")
    before_db = remove_tokens
    validate_answer = countAnswer
    get_session_id = None
    exclude = None
    config = {
        "labels": label
    }
​
    ctrl = Controller(dataset, view_id, stream, update, db,
                        progress, on_load, on_exit, before_db,
                        validate_answer, get_session_id, exclude,
                        config, None)
​
    return ctrl

Is it possible to get a session ID so that I can look up the user's dataset info? What is returned doesn't include it. I was thinking of changing the controller, but that is not the best idea.

Could you share the full recipe? It seems like you're only sending a part of the Python script. Also, could you share the command that you used to run the recipe?

from typing import List, Optional
from requests import session
import spacy
import prodigy
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens
from prodigy.models.matcher import PatternMatcher
from prodigy.util import split_string
from prodigy.core import Controller
from prodigy.components.db import connect

def remove_tokens(answers):
    """Strip tokenization data from annotated examples.

    Removes the ``"tokens"`` key from every example and the
    ``"token_start"`` / ``"token_end"`` keys from each of its spans. The
    dicts are edited in place. Keys that are absent are skipped instead of
    raising ``KeyError``, so examples without token information pass
    through untouched.

    Args:
        answers: Iterable of example dicts; an example may carry a
            ``"spans"`` list of span dicts.

    Returns:
        The same ``answers`` object that was passed in.
    """
    for eg in answers:
        eg.pop("tokens", None)
        # `or ()` also covers an explicit `"spans": None`.
        for span in eg.get("spans") or ():
            span.pop("token_start", None)
            span.pop("token_end", None)
    return answers

@prodigy.recipe(
    "entity_curation",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    patterns=("The match patterns file", "option", "p", str),
    n_examples=(
        "Number of examples to randomly review, -1 for all",
        "option",
        "n",
        int,
    ),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
    highlight_chars=(
        "Allow for highlighting individual characters instead of tokens",
        "flag",
        "C",
        bool,
    ),
)
def entity_curation(
    dataset: str,
    spacy_model: str,
    n_examples: int,
    source: str,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    highlight_chars: bool = False,
):
    """Build a ``ner_manual`` Controller that caps the number of answers.

    The stream is read from the JSONL file ``source``, optionally run
    through a ``PatternMatcher`` loaded from ``patterns``, and tokenized
    with the spaCy pipeline ``spacy_model``. Token data is stripped again
    by ``remove_tokens`` before answers are stored.

    Args:
        dataset: Name of the dataset handed to the Controller.
        spacy_model: Name or path passed to ``spacy.load``.
        n_examples: Maximum number of distinct examples (by
            ``_input_hash``) that may be answered. ``None``, zero or a
            negative value such as ``-1`` disables the limit.
        source: Path of the JSONL file to stream examples from.
        label: Labels placed in the ``"labels"`` config entry.
        patterns: Optional match patterns file for the ``PatternMatcher``.
        exclude: Dataset names forwarded to the Controller's exclude slot.
        highlight_chars: Forwarded to ``add_tokens`` as ``use_chars``.

    Returns:
        The configured ``Controller`` instance.
    """
    # Input hashes of the answers accepted so far.
    # NOTE(review): this set lives in the recipe closure and only the
    # answer's `_input_hash` is consulted, so the count is not tracked per
    # annotator/session.
    answered = set()
    limit_enabled = n_examples is not None and n_examples > 0

    nlp = spacy.load(spacy_model)
    stream = JSONL(source)

    def validate_answer(answer):
        """Reject answers for new examples once ``n_examples`` is reached.

        Raises:
            ValueError: if the limit is enabled, ``n_examples`` distinct
                examples are already recorded and ``answer`` belongs to an
                example that has not been seen before.
        """
        input_hash = answer["_input_hash"]
        if input_hash in answered:
            # Re-submitting an example that was already counted is fine.
            return
        # Check before recording and compare with >= so that a rejected
        # answer is never counted and the limit cannot be stepped over.
        if limit_enabled and len(answered) >= n_examples:
            raise ValueError(
                f'You have reached {n_examples} examples, please save and exit')
        answered.add(input_hash)

    if patterns is not None:
        pattern_matcher = PatternMatcher(
            nlp, combine_matches=True, all_examples=True)
        pattern_matcher = pattern_matcher.from_disk(patterns)
        # The matcher yields (score, example) pairs; only the example is kept.
        stream = (eg for _, eg in pattern_matcher(stream))

    stream = add_tokens(nlp, stream, use_chars=highlight_chars)

    # Remaining components. They are passed to the Controller positionally,
    # so the order of the arguments in the call below is significant.
    view_id = "ner_manual"
    update = None
    db = None
    progress = None
    on_load = lambda i: print("On Load!")
    on_exit = lambda i: print("On Exit!")
    before_db = remove_tokens
    get_session_id = None
    config = {
        "labels": label
    }

    # `exclude` is forwarded as given on the command line (None if unset).
    ctrl = Controller(dataset, view_id, stream, update, db,
                      progress, on_load, on_exit, before_db,
                      validate_answer, get_session_id, exclude,
                      config, None)

    return ctrl

@ines is this possible?

hi @a.arranz!

Have you seen this post?

Unfortunately, since the session ID isn't assigned until execution, there's no way to access it within the recipe. However, the post outlines how to get it from separate instances of Prodigy.

Hope this helps!