Hi,
I am annotating text for a binary classification task and I would like to make it so that a set of words that I specify appears highlighted in the Prodigy session to make annotation easier. I am using Prodigy v1.8.5. What I have tried to do is modify the textcat.manual
recipe on GitHub using patterns, but after many hours of trial and error, I still get a "No tasks available" message in the Prodigy session. Below is the code for the textcat.manual
recipe, along with the code I added (marked with `# ADDED`).
from typing import List, Optional
import prodigy
from prodigy.components.loaders import JSONL
from prodigy.util import split_string
from prodigy.models.matcher import PatternMatcher # ADDED
import spacy # ADDED: v.2.1.9
# Helper functions for adding user provided labels to annotation tasks.
def add_label_options_to_stream(stream, labels):
    """Attach one multiple-choice option per label to every task.

    The option list is built once, up front, and the same list object is
    assigned to the "options" key of each task. Tasks are modified in
    place and yielded lazily, in their original order.
    """
    choices = []
    for name in labels:
        choices.append({"id": name, "text": name})
    for example in stream:
        example["options"] = choices
        yield example
def add_labels_to_stream(stream, labels):
    """Set the "label" key of every task to the first entry of ``labels``.

    Tasks are modified in place and yielded lazily. ``labels`` is only
    indexed once a task is actually pulled from the stream.
    """
    for example in stream:
        # ADDED: original code has `label[0]` instead of `labels[0]`, which is likely an error
        example.update(label=labels[0])
        yield example
# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
@prodigy.recipe(
    "textcat.manual_pattern",  # ADDED: original was "textcat.manual"
    dataset=("The dataset to use", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    exclusive=("Treat classes as mutually exclusive", "flag", "E", bool),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
)
# ADDED: original was `def textcat_manual(`
def textcat_manual_pattern(
    dataset: str,
    source: str,
    label: Optional[List[str]] = None,
    exclusive: bool = False,
    exclude: Optional[List[str]] = None,
):
    """
    Manually annotate categories that apply to a text, with pattern matches
    highlighted. If more than one label is specified, categories are added as
    multiple choice options. If the --exclusive flag is set, categories become
    mutually exclusive, meaning that only one can be selected during
    annotation.

    dataset: Name of the dataset the annotations are saved to.
    source: Path to the JSONL file the examples are loaded from.
    label: One or more category labels; at least one is required.
    exclusive: Use single-choice instead of multiple-choice options.
    exclude: Names of datasets whose examples should be excluded.

    Returns the recipe components dictionary ("view_id", "dataset",
    "stream", "exclude", "config").

    Raises ValueError if no label is given.
    """
    # Fail early with a readable message. Without this check, a missing
    # --label surfaces as a TypeError from len(None), and an empty label
    # list as an IndexError raised later from inside the stream generator.
    if not label:
        raise ValueError(
            "textcat.manual_pattern needs at least one label, e.g. --label LABEL1"
        )

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # ADDED: Adding patterns
    nlp = spacy.blank("en")  # ADDED
    matcher = PatternMatcher(nlp, label_span=True)  # ADDED
    # Placeholder pattern: replace with the words that should be highlighted.
    patterns = [{"label": "LABEL1", "pattern": "pattern1"}]  # ADDED
    matcher.add_patterns(patterns)  # ADDED
    # The matcher yields (score, example) tuples rather than plain task
    # dictionaries, so drop the score before the label helpers below assign
    # keys on each task.
    # NOTE(review): the matcher appears to emit only examples that contain a
    # match, so a pattern that never matches would leave the stream empty
    # ("No tasks available") -- confirm against the Prodigy docs for the
    # installed version.
    stream = (eg for _score, eg in matcher(stream))  # ADDED

    # Add labels to each task in stream
    has_options = len(label) > 1
    if has_options:
        stream = add_label_options_to_stream(stream, label)
    else:
        stream = add_labels_to_stream(stream, label)

    return {
        "view_id": "choice" if has_options else "classification",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            "choice_style": "single" if exclusive else "multiple",  # Style of choice interface
            "exclude_by": "input" if has_options else "task",  # Hash value used to filter out already seen examples
        },
    }
Am I missing something obvious? Is this just not possible in my version of Prodigy? I'd appreciate any insights on the matter.