Hi!
I have written a custom recipe to load data from an existing dataset in my Postgres DB. The problem I'm having is that the stream seems to be looped over completely before any example is served:
import prodigy
import spacy
from prodigy.components.loaders import get_stream
from prodigy.components.preprocess import add_tokens
from prodigy.types import StreamType
# make the config work for rel_component
from custom_training.rel_component.scripts.rel_model import (
create_classification_layer, create_instances, create_relation_model)
from custom_training.rel_component.scripts.rel_pipe import (
make_relation_extractor, score_relations)
# Highlight colors for each relation label in the "relations" UI.
color_map = dict(
    SIBLING="#ffd882",
    PARENT="#c5bdf4",
    SAME_AS="#d9fbad",
)

# NER model provides the entities; the relation-extraction model is loaded
# separately and its "relation_extractor" component is sourced into the NER
# pipeline so a single nlp(...) call yields both entities and relations.
nlp = spacy.load("ref_ner/training/model-best")
rel_model = spacy.load("custom_training/rel_component/training/model-best")
nlp.add_pipe("relation_extractor", name="relation_extractor", source=rel_model, after="ner")
def add_relations_to_stream(stream) -> StreamType:
    """Lazily enrich incoming examples with predicted spans and relations.

    For each example, runs the combined NER + relation-extraction pipeline
    on ``eg["text"]``, stores the predicted entities in ``eg["spans"]`` and
    every relation whose best label scores >= 0.5 in ``eg["relations"]``.

    NOTE(review): this is a plain generator, so examples are processed one
    at a time as they are pulled; if the whole stream appears to be consumed
    up front, the exhaustion happens in a downstream consumer, not here.
    """

    def span_dict(ent):
        # Serialize a spaCy entity span into the dict shape the
        # "relations" UI expects.
        return dict(
            start=ent.start_char, end=ent.end_char,
            token_start=ent.start, token_end=ent.end, label=ent.label_,
        )

    for eg in stream:
        doc = nlp(eg["text"])
        eg["spans"] = [span_dict(ent) for ent in doc.ents]
        eg["relations"] = []
        # Map token start offset -> entity so relation endpoints (which are
        # token offsets) can be resolved back to entity spans.
        ent_map = {ent.start: ent for ent in doc.ents}
        for (head, child), rel in doc._.rel.items():
            if not rel:
                # No label scores predicted for this pair.
                continue
            # Pick the best-scoring label directly. The original code
            # inverted the dict to {score: label}, which silently drops a
            # label whenever two labels happen to share the same score.
            best_label, score = max(rel.items(), key=lambda item: item[1])
            if score < 0.5:
                continue
            head_ent = ent_map.get(head)
            child_ent = ent_map.get(child)
            if head_ent is None or child_ent is None:
                # A relation endpoint doesn't line up with a predicted
                # entity (the NER and rel models can disagree); skip it
                # instead of crashing with a KeyError.
                continue
            label = best_label.upper()
            eg["relations"].append({
                "head": head,
                "head_span": span_dict(head_ent),
                "child": child,
                "child_span": span_dict(child_ent),
                "label": label,
                # NOTE(review): assumes every predicted label has an entry
                # in color_map — confirm the rel model's label set.
                "color": color_map[label],
            })
        yield eg
@prodigy.recipe(
    "ref-rel",
    dataset=("Dataset to save answers to", "positional", None, str),
    source=("Source texts", "positional", None, str)
)
def custom_dep_recipe(dataset, source):
    """Annotate relations on examples enriched with model predictions.

    dataset: Prodigy dataset the annotations are saved to.
    source: any loadable source, e.g. "dataset:my_existing_set".
    """
    stream = get_stream(
        source, None, None, rehash=True, dedup=True, input_key="text", is_binary=False
    )
    # stream = add_tokens(spacy.blank("it"), stream) # data comes from an existing dataset with tokens
    stream = add_relations_to_stream(stream)  # add custom relations
    return {
        "dataset": dataset,  # dataset to save annotations to
        "stream": stream,    # the incoming stream of examples (generator)
        "view_id": "relations",  # annotation interface to use
        "config": {
            "labels": ["PARENT", "SIBLING", "SAME_AS"],  # relation labels to annotate
            # The relations interface reads its span labels from
            # "relations_span_labels"; "span-labels" is not a recognized
            # config key and was silently ignored.
            "relations_span_labels": ["J-REF", "L-REF"],
        },
    }
If I comment out the line `stream = add_relations_to_stream(stream)`,
everything works fine (except that I don't get my relations); otherwise it seems to loop through the entire existing dataset instead of yielding one example at a time.
In fact, if I place a `print(eg)` right below the `yield eg` in `add_relations_to_stream`, it immediately starts printing every example.
Is it possible that the `stream` generator returned by the recipe gets converted to a list somewhere?
Or am I missing something very obvious?
I am running this with:
prodigy ref-rel mydataset_ref_rel dataset:mydataset_ref -F ref-rel.py
Thanks!
Specs:
macOS Monterey
Python==3.9.5
prodigy==1.11.7
spacy==3.2.2