Hello everyone. I'm very new to the tool and to Python in general, so bear with me if I did something really stupid in the code :). I tried multiple things and none of them seem to work, so let me explain what I'm trying to do.
Currently I'm writing a custom recipe (mostly following the chatbot video tutorial) that combines manual span annotation (spans_manual) with text classification (choice). Here's the code:
import spacy
import prodigy
from prodigy.components.preprocess import add_tokens
from prodigy.components.loaders import JSONL, TXT
from prodigy.models.matcher import PatternMatcher, PhraseMatcher
import codecs
import json
@prodigy.recipe(
    "smart-prompt-training",
    dataset=("Dataset to save annotations into", "positional", None, str),
    lang=("Language to use", "positional", None, str),
    file_in=("Path to example prompt file", "positional", None, str),
    label_file=("Path to labels file", "positional", None, str),
    patterns_file=("Path to patterns file", "positional", None, str),
    intents_file=("Path to intents file", "positional", None, str),
)
def custom_recipe(dataset, lang, file_in, label_file, patterns_file, intents_file):
    """Annotate spans and pick a single intent for each line of a text file.

    Each task combines a ``spans_manual`` block (labels read from
    ``label_file``) with a ``choice`` block (options read from
    ``intents_file``). When ``patterns_file`` is given, the stream is also
    passed through a ``PatternMatcher`` loaded from that file.

    Args:
        dataset: Name of the dataset to save annotations into.
        lang: Name passed to ``spacy.load`` to obtain the pipeline.
        file_in: Path to the plain-text file with one example per line.
        label_file: Path to a text file with one span label per line.
        patterns_file: Path to the patterns file, or ``None`` to skip
            pattern matching. A leading UTF-8 BOM is removed from this file
            in place before it is loaded.
        intents_file: Path to a text file with one intent label per line.

    Returns:
        The recipe components dict (``view_id``, ``dataset``, ``stream``,
        ``config``).
    """

    def read_labels(path):
        """Return the non-blank lines of ``path``, stripped of whitespace."""
        # ``readlines()`` would keep the trailing "\n" on every label, so a UI
        # label like "Management\n" could never equal the "Management" label
        # used in the patterns file. "utf-8-sig" also drops a leading BOM,
        # which would otherwise end up glued to the first label.
        with open(path, "r", encoding="utf-8-sig") as file:
            stripped = (line.strip() for line in file)
            return [label for label in stripped if label]

    def add_options(stream):
        """Attach the intent labels as choice options to every example."""
        for ex in stream:
            ex["options"] = [{"id": lab, "text": lab} for lab in intent_labels]
            yield ex

    def write_without_bom(file_path):
        """Remove a leading UTF-8 BOM from ``file_path`` in place, if present."""
        bom = codecs.BOM_UTF8.decode("utf-8")
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        # Only touch the file when there is actually something to remove.
        if text.startswith(bom):
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(text[len(bom):])

    span_labels = read_labels(label_file)
    intent_labels = read_labels(intents_file)

    nlp = spacy.load(lang)
    stream = TXT(file_in)
    stream = add_tokens(nlp, stream, use_chars=None)
    stream = add_options(stream)

    if patterns_file is not None:
        write_without_bom(patterns_file)
        pattern_matcher = PatternMatcher(nlp, combine_matches=True, all_examples=True)
        pattern_matcher = pattern_matcher.from_disk(patterns_file)
        # The matcher yields pairs; only the example (second item) is kept.
        stream = (eg for _, eg in pattern_matcher(stream))

    blocks = [
        {"view_id": "spans_manual"},
        # "text": None overrides the task text for the choice block.
        # NOTE(review): presumably so the text is not shown twice — confirm.
        {"view_id": "choice", "text": None},
    ]

    return {
        "view_id": "blocks",
        "dataset": dataset,
        "stream": stream,
        "config": {
            "lang": nlp.lang,
            "labels": span_labels,
            "blocks": blocks,
            "choice_style": "single",
        },
    }
I'm exporting my labels (entities) and choices (intents) from somewhere else and placing them in a file. My pattern file looks like this currently:
{"label":"Management","pattern":"littell properties"}
{"label":"Management","pattern":"pallas realty advisors"}
{"label":"Management","pattern":"abbey residential"}
So, what I want to achieve is to add a phrase matcher to the custom recipe that matches phrases using the patterns provided in the file. If I have a line like "Give me ... managed by abbey residential", I'd like Prodigy to automatically highlight "abbey residential" and assign it the "Management" label.
EDIT: I am using the following cmd line to run it:
python -m prodigy smart-prompt-training smart-prompts en_core_web_sm prompt_data.txt entities.txt patterns.jsonl intents.txt -F smartRecipe.py
EDIT2: Just to clarify, since I did not mention it above - I am currently using PatternMatcher in the code; I tried multiple ways of replacing it with PhraseMatcher and couldn't get any of them to work.
Thanks for being awesome!