No open code for the dep.teach recipe?

Hi!
Is there an open code for dep_teach recipe like the ner ones?
I am a little confused about how to write a custom stream for a dependency parser. The default recipe doesn't seem to suit my case, as it doesn't use my custom tokenizer. And I don't really know what to change if I can't see what the code for the dep_teach recipe looks like.

Edit:
What I actually did is the following:

# Load a pretrained SentencePiece model; the module-level processor `s`
# is used by CTokenizer.__call__ below for subword segmentation.
import sentencepiece as spm
s = spm.SentencePieceProcessor()

# NOTE(review): hard-coded absolute path — consider making this configurable.
s.Load("/home/marina_mitrofanova/data/TokenData/my.model")

class CTokenizer(object):
    """Custom spaCy tokenizer that segments text with a pretrained
    SentencePiece model (the module-level ``s`` processor).

    Implements the disk/bytes serialization protocol (``to_disk``,
    ``from_disk``, ``to_bytes``, ``from_bytes``) so the tokenizer
    survives ``nlp.to_disk`` / ``spacy.load`` round trips.
    """

    def __init__(self, vocab):
        # Shared spaCy Vocab; Docs produced by __call__ are bound to it.
        self.vocab = vocab

    def __call__(self, text):
        # Segment with SentencePiece; each piece becomes one spaCy token.
        words = s.EncodeAsPieces(text)
        # NOTE(review): Doc defaults every token to "owns a trailing space",
        # so detokenized text will not exactly match the input — confirm
        # this is acceptable for the annotation workflow.
        return Doc(self.vocab, words=words)

    def to_disk(self, path, **kwargs):
        # Receives the directory path + /my_component.
        with path.open("wb") as file_:
            file_.write(self.to_bytes(**kwargs))

    def from_disk(self, path, **kwargs):
        # Receives the directory path + /my_component.
        with path.open("rb") as file_:
            bytes_data = file_.read()
            self.from_bytes(bytes_data, **kwargs)
        return self

    def to_bytes(self, exclude=tuple(), **kwargs):
        serializers = OrderedDict((
            ("vocab", lambda: self.vocab.to_bytes()),
        ))
        exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
        return util.to_bytes(serializers, exclude)

    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
        # BUG FIX: the original omitted the trailing comma, so OrderedDict
        # received the single ("vocab", fn) tuple itself instead of an
        # iterable OF key/value pairs, and raised ValueError when iterating
        # the string "vocab". The shape now mirrors to_bytes above.
        deserializers = OrderedDict((
            ("vocab", lambda b: self.vocab.from_bytes(b)),
        ))
        exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
        util.from_bytes(bytes_data, deserializers, exclude)
        return self

@prodigy.recipe(
    "dep.custom",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    patterns=("Optional match patterns", "option", "p", str),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
    unsegmented=("Don't split sentences", "flag", "U", bool),
)
def dep_teach(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    unsegmented: bool = False,
):
    """Custom dep.teach recipe that swaps in the SentencePiece-based
    CTokenizer before streaming examples to the annotator.

    Returns the recipe component dict Prodigy expects (view_id, dataset,
    stream, exclude, config).
    """
    # Pick the loader from the file extension; JSONL is the default.
    if source.endswith(".csv"):
        stream = CSV(source)
    else:
        stream = JSONL(source)

    # Load the spaCy model and replace its tokenizer with the custom one
    # so suggestions use the same segmentation as the trained pipeline.
    nlp = spacy.load(spacy_model)
    nlp.tokenizer = CTokenizer(nlp.vocab)

    model = DependencyParser(nlp, label=label)

    if patterns is None:
        # No patterns: let the parser model suggest examples and use its
        # update method as the update callback.
        predict = model
        update = model.update
    else:
        # Combine model predictions with pattern matches from the JSONL
        # patterns file.
        matcher = PatternMatcher(nlp).from_disk(patterns)
        predict, update = combine_models(model, matcher)

    if not unsegmented:
        stream = split_sentences(nlp, stream)

    # Sort the stream so the most uncertain predictions come first.
    stream = prefer_uncertain(predict(stream))

    return {
        # BUG FIX: the view_id must be "dep", not "dep_teach" — an unknown
        # view_id renders blank cards in the Prodigy UI.
        "view_id": "dep",
        "dataset": dataset,  # the dataset to save annotations to
        "stream": stream,  # the stream of examples
        "exclude": exclude,
        "config": {"lang": nlp.lang, "labels": label},
    }

And now there is no problem with tokenization, but there is a problem with display. There are no texts displayed on the main screen; however, if you press accept or something else, the texts appear on the left:


What is it exactly that I'm doing wrong?

Edit:
I got it! The problem was in view_id being set to "dep_teach", as I assumed it would be analogous to "ner_teach". When I changed it to just "dep", everything worked.

Glad you got it working :slightly_smiling_face: Btw, you can find an overview of the available interfaces here: Annotation interfaces · Prodigy · An annotation tool for AI, Machine Learning & NLP

We also ship the source of the recipes with Prodigy, so you can see how they're implemented. If you run prodigy stats, it should show you the exact path to your Prodigy installation.

See my answer here: Dep.Teach doesn't use same tokenenization as pretrained model - #13 by ines

Thank you!!

1 Like