Hi!
Is there open source code for the dep.teach recipe, like there is for the ner ones?
I'm a little confused about how to write a custom stream for a dependency parser. The default recipe doesn't seem to suit my case, as it doesn't use my custom tokenizer, and I don't really know what to change without seeing what the code for dep.teach looks like.
Edit:
What I actually did is the following:
from collections import OrderedDict
from typing import List, Optional

import prodigy
import sentencepiece as spm
import spacy
from prodigy.components.loaders import CSV, JSONL
from prodigy.components.preprocess import split_sentences
from prodigy.components.sorters import prefer_uncertain
from prodigy.models.dep import DependencyParser  # assuming the same module layout as prodigy.models.ner
from prodigy.models.matcher import PatternMatcher
from prodigy.util import combine_models, split_string
from spacy import util
from spacy.tokens import Doc

s = spm.SentencePieceProcessor()
s.Load("/home/marina_mitrofanova/data/TokenData/my.model")

class CTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = s.EncodeAsPieces(text)
        # Doc assumes every token is followed by a space unless told otherwise
        return Doc(self.vocab, words=words)

    def to_disk(self, path, **kwargs):
        # This will receive the directory path + /my_component
        with path.open("wb") as file_:
            file_.write(self.to_bytes(**kwargs))

    def from_disk(self, path, **kwargs):
        # This will receive the directory path + /my_component
        with path.open("rb") as file_:
            bytes_data = file_.read()
        self.from_bytes(bytes_data, **kwargs)
        return self

    def to_bytes(self, exclude=tuple(), **kwargs):
        serializers = OrderedDict((
            ("vocab", lambda: self.vocab.to_bytes()),
        ))
        exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
        return util.to_bytes(serializers, exclude)

    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
        deserializers = OrderedDict((
            # trailing comma matters: without it OrderedDict gets a bare tuple and raises
            ("vocab", lambda b: self.vocab.from_bytes(b)),
        ))
        exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
        util.from_bytes(bytes_data, deserializers, exclude)
        return self
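(A quick way to sanity-check the tokenizer on its own before wiring it into the recipe; the sample sentence and the blank "en" pipeline are just placeholders for whatever language you're working with:)

nlp = spacy.blank("en")
nlp.tokenizer = CTokenizer(nlp.vocab)
doc = nlp("This is a test sentence")
print([t.text for t in doc])  # should print the sentencepiece pieces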
@prodigy.recipe(
    "dep.custom",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    patterns=("Optional match patterns", "option", "p", str),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
    unsegmented=("Don't split sentences", "flag", "U", bool),
)
def dep_teach(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    unsegmented: bool = False,
):
    if source.endswith(".csv"):
        stream = CSV(source)
    else:
        stream = JSONL(source)
    # Load the spaCy model and swap in the custom tokenizer
    nlp = spacy.load(spacy_model)
    nlp.tokenizer = CTokenizer(nlp.vocab)
    model = DependencyParser(nlp, label=label)
    if patterns is None:
        # No patterns are used, so just use the parser model to suggest examples
        # and only use the model's update method as the update callback
        predict = model
        update = model.update
    else:
        # Initialize the pattern matcher and load in the JSONL patterns
        matcher = PatternMatcher(nlp).from_disk(patterns)
        predict, update = combine_models(model, matcher)
    if not unsegmented:
        stream = split_sentences(nlp, stream)
    stream = prefer_uncertain(predict(stream))
    return {
        "view_id": "dep_teach",
        "dataset": dataset,  # the dataset to save annotations to
        "stream": stream,  # the stream of examples
        "update": update,  # update callback so the model learns from answers
        "exclude": exclude,
        "config": {"lang": nlp.lang, "labels": label},
    }
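(For completeness, this is roughly how I start it; the dataset, model, and file names are placeholders, and -F points Prodigy at the file containing this recipe:)

prodigy dep.custom my_dataset en_core_web_sm ./examples.jsonl --label nsubj,dobj -F ./dep_custom.py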
And now there is no problem with tokenization, but there is a problem with the display. No text is shown on the main screen; however, if you press accept (or any other button), the texts appear in the history on the left.
What exactly am I doing wrong?
Edit:
I got it! The problem was the view_id set to "dep_teach", as I assumed it would be analogous to the ner recipes. When I changed it to just "dep", everything worked.