Context: I am trying to create quite a complex NER workflow in prodigy, and am struggling to get the recipe right. Ultimately, I would like to create a kind of active learning loop that uses both:
- The default spaCy NER model "en_core_web_md", and
- A third-party LLM,
to pre-label my texts. The goals are:
- The human labeller should receive helpful suggestions from both of these models to ease cognitive load.
- The spaCy NER model should update itself every N batches with the human-labelled data.
- The same spaCy model should select which documents to label in each batch; as the model learns, the choice of the next texts to label can be tuned to optimise learning speed (a rough sketch of what I mean follows this list).
- The LLM should help overcome the cold-start problem, and can also serve as a baseline against which to measure the local model's improvement.
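To make the document-selection goal concrete, here is a rough, hypothetical sketch of what I mean. The `rank_next_texts` helper and its "fewest predicted entities first" scoring are placeholder assumptions to illustrate the idea, not something I am committed to:

```python
import spacy


def rank_next_texts(nlp: spacy.language.Language, texts: list[str]) -> list[str]:
    """Order candidate texts so that (hopefully) informative ones come first."""
    docs = list(nlp.pipe(texts))
    # Placeholder heuristic (an assumption, not a recommendation): prefer texts
    # where the current model predicts few entities, i.e. where it is likely to
    # be least confident. Any better uncertainty score could be swapped in here.
    return [t for _, t in sorted(zip(docs, texts), key=lambda pair: len(pair[0].ents))]
```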
My best attempt to do this: Here is the code for my recipe:
import copy
import spacy
import prodigy
import configparser
from spacy.training import Example
from typing import Optional, Iterable
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens, split_sentences
from prodigy.util import split_string, set_hashes
from spacy_llm.util import assemble
import groq_llm_model # noqa
BATCH_SIZE = 10
CONFIG_PATH = "./groq_config.cfg"
def make_tasks_dual(
    nlp_local: spacy.language.Language,
    nlp_llm: spacy.language.Language,
    stream: Iterable[dict],
    labels: Optional[list[str]],
):
    texts = ((example["text"], example) for example in stream)
    for i, (doc_local, example) in enumerate(nlp_local.pipe(texts, as_tuples=True)):
        print(f"{i}. Document is being processed in make_tasks")
        task = copy.deepcopy(example)
        spans_local = []
        for ent in doc_local.ents:
            if labels and ent.label_ not in labels:
                continue
            spans_local.append(
                {
                    "token_start": ent.start,
                    "token_end": ent.end - 1,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "text": ent.text,
                    "label": ent.label_,
                }
            )
        # Get LLM suggestions
        spans_llm = []
        text = example["text"]
        tokens = example["tokens"]
        doc_llm = nlp_llm(text)
        for ent in doc_llm.ents:
            if labels and ent.label_ not in labels:
                continue
            spans_llm.append(
                {
                    "token_start": ent.start,
                    "token_end": ent.end - 1,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "text": ent.text,
                    "label": ent.label_,
                }
            )
        task["versions"] = [
            {
                "text": text,
                "tokens": tokens,
                "spans": spans_local,
                "answer": "accept",
                "sessions": ["spacy_medium"],
                "default": True,
            },
            {
                "text": text,
                "tokens": tokens,
                "spans": spans_llm,
                "answer": "accept",
                "sessions": ["groq_llm"],
                "default": True,
            },
        ]
        task = set_hashes(task)
        yield task
@prodigy.recipe(
    "ner.correct.v2",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    labels=("One or more comma-separated labels", "option", "l", split_string),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
    unsegmented=("Don't split sentences", "flag", "U", bool),
    component=("Name of NER component in the pipeline", "option", "c", str),
)
def ner_correct(
    dataset: str,
    spacy_model: str,
    source: str,
    labels: Optional[list[str]] = None,
    exclude: Optional[list[str]] = None,
    unsegmented: bool = False,
    component: Optional[str] = "ner",
    config_path: str = CONFIG_PATH,
):
    nlp_local = spacy.load(spacy_model)
    nlp_llm = assemble(config_path)
    stream = JSONL(source)
    config = configparser.ConfigParser()
    config.read(config_path)
    if component not in nlp_local.pipe_names:
        raise ValueError(
            f"Can't find component '{component}' in the provided pipeline."
        )
    if not unsegmented:
        stream = split_sentences(nlp_local, stream)
    all_model_labels = config.get("components.llm.task", "labels").split(",")
    if labels is None:
        specified_labels = all_model_labels
    else:
        specified_labels = labels
    task_labels = list(set(specified_labels).intersection(set(all_model_labels)))
    stream = add_tokens(nlp_local, stream)
    stream = make_tasks_dual(
        nlp_local,
        nlp_llm,
        stream,
        task_labels,
    )

    def make_update(answers):
        examples = []
        for example in answers:
            if example["answer"] == "accept":
                pred = nlp_local.make_doc(example["text"])
                ref = nlp_local.make_doc(example["text"])
                spans = [
                    pred.char_span(span["start"], span["end"], label=span["label"])
                    for span in example.get("spans", [])
                ]
                ref.set_ents(spans)
                examples.append(Example(pred, ref))
        nlp_local.update(examples)

    return {
        "view_id": "ner_manual",
        # "view_id": "review",
        "dataset": dataset,
        "stream": stream,
        "update": make_update,
        "exclude": exclude,
        "config": {
            "lang": nlp_local.lang,
            "labels": labels,
            "exclude_by": "input",
            "batch_size": BATCH_SIZE,
            "custom_theme": {
                "cardMaxWidth": 1500,
                "cardMinHeight": 250,
                "windowColor": "#f8f9fa",
                "cardBackgroundColor": "#ffffff",
                "cardBorderColor": "#ebebeb",
                "show_task_ids": True,
            },
            "blocks": [
                {"view_id": "ner_manual", "text": "Local Model", "labels": labels},
                {"view_id": "ner_manual", "text": "LLM Model", "labels": labels},
            ],
        },
    }
    # return {
    #     "view_id": "ner_manual",
    #     "dataset": dataset,
    #     "stream": stream,
    #     "update": make_update,
    #     "exclude": exclude,
    #     "config": {
    #         "lang": nlp_local.lang,
    #         "labels": labels,
    #         "exclude_by": "input",
    #         "batch_size": BATCH_SIZE,
    #     },
    # }
There are two local dependencies:
### groq_config.cfg
[nlp]
lang = "en"
pipeline = ["llm"]
batch_size = 128
[components]
[components.llm]
factory = "llm"
[components.llm.task]
@llm_tasks = "spacy.NER.v2"
labels = PERSON,ORG,LOC
[components.llm.model]
@llm_models = "groq_llm_model.v1"
and
### groq_llm_model.py
from groq import Groq
from decouple import config
from typing import Iterable
from spacy_llm.registry import registry # type: ignore
GROQ_CHAT_MODEL = "mixtral-8x7b-32768"
GROQ_API_KEY = config("GROQ_API_KEY") # type: ignore
GROQ_CLIENT = Groq(
    api_key=GROQ_API_KEY,
)


def groq_complete_chat(
    prompts_for_doc: Iterable[str],
) -> Iterable[str]:
    result: list[str] = []
    for prompt in prompts_for_doc:
        messages = [
            {
                "role": "user",
                "content": prompt,
            }
        ]
        chat_completion_object = GROQ_CLIENT.chat.completions.create(
            messages=messages,  # type: ignore
            model=GROQ_CHAT_MODEL,
        )
        answer = chat_completion_object.choices[0].message.content
        result.append(answer)
    return result


@registry.llm_models("groq_llm_model.v1")
def make_groq_model():
    def _query_groq_llm(
        prompts: Iterable[Iterable[str]],
    ) -> Iterable[Iterable[str]]:
        result = []
        for prompts_for_doc in prompts:
            answers_for_doc = groq_complete_chat(prompts_for_doc)
            result.append(answers_for_doc)
        return result

    return _query_groq_llm
Note: The latter of these requires a free API key from Groq, called GROQ_API_KEY, which is read from the environment via decouple.config.
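For what it's worth, the custom model registration can be checked in isolation with something like the snippet below (the sample sentence is arbitrary, and this is just a sanity check, not part of the recipe):

```python
# Standalone check of the spacy-llm pipeline assembled from groq_config.cfg.
# Requires GROQ_API_KEY in the environment; the sample sentence is arbitrary.
from spacy_llm.util import assemble

import groq_llm_model  # noqa: F401  # registers "groq_llm_model.v1" with spacy-llm

nlp_llm = assemble("./groq_config.cfg")
doc = nlp_llm("Tim Cook visited the Apple office in Berlin.")
print([(ent.text, ent.label_) for ent in doc.ents])
```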
My goal: I want the UI for my recipe to look like this review UI in the prodigy UI documentation:
Specifically, I want to see "parallel" labelling results for both the local model (call it spacy.NER) and the LLM (call it groq_llm), just as in the review UI image above.
The roadblocks: I think there are two big things I don't understand:
- As you can see, I have attempted to create such a UI using the "blocks" paradigm, but this does not yield the required results (see image below). This was a copy-paste-pray kinda solution; I don't really understand how this works.
- I have struggled to understand the correct format for the JSON task object defined in make_tasks_dual. This may be defined somewhere in the prodigy documentation, but I could not find it in sufficient detail to feel confident about it (but perhaps I have overlooked this?). So I resorted to following this helpful tutorial, implementing the code outlined there for a model-based annotation task, and then using the PRODIGY_LOGGING=verbose command-line flag to force prodigy to print the task data to the terminal. This is how I arrived at the implicit task object definition in the make_tasks_dual function. It seemed from this output that the correct task format might be:
{
    "text": <str>,
    "meta": <dict>,
    "tokens": <list>,
    "versions": [
        {
            "text": <str>,
            "tokens": <list>,
            "spans": <list>,
            "answer": "accept",
            "sessions": ["model_name"],
            "default": <bool>,
        }
    ],
    "_input_hash": <int>,
    "_task_hash": <int>,
}
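For concreteness, here is one task written out by hand in that guessed shape. The sentence, tokens, and spans are invented purely for illustration, and I am not claiming this is the confirmed Prodigy schema:

```python
# Hand-written illustration of the guessed task shape above. The sentence,
# tokens, and spans are invented; this is NOT a confirmed Prodigy schema.
text = "Ada Lovelace lived in London."
tokens = [
    {"text": "Ada", "start": 0, "end": 3, "id": 0, "ws": True},
    {"text": "Lovelace", "start": 4, "end": 12, "id": 1, "ws": True},
    {"text": "lived", "start": 13, "end": 18, "id": 2, "ws": True},
    {"text": "in", "start": 19, "end": 21, "id": 3, "ws": True},
    {"text": "London", "start": 22, "end": 28, "id": 4, "ws": False},
    {"text": ".", "start": 28, "end": 29, "id": 5, "ws": False},
]
task = {
    "text": text,
    "meta": {},
    "tokens": tokens,
    "versions": [
        {
            "text": text,
            "tokens": tokens,
            "spans": [
                {"token_start": 0, "token_end": 1, "start": 0, "end": 12,
                 "text": "Ada Lovelace", "label": "PERSON"},
            ],
            "answer": "accept",
            "sessions": ["spacy_medium"],
            "default": True,
        },
        {
            "text": text,
            "tokens": tokens,
            "spans": [
                {"token_start": 4, "token_end": 4, "start": 22, "end": 28,
                 "text": "London", "label": "LOC"},
            ],
            "answer": "accept",
            "sessions": ["groq_llm"],
            "default": True,
        },
    ],
    # _input_hash / _task_hash are added by prodigy.util.set_hashes
}
```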
Of course, I am not sure about any of this. Here is what my UI looks like at present:
As you can see, nothing at all is highlighted, although the system appears to be doing some labelling, based on the verbose terminal output:
(prodigy) stephenenrightward@Stephens-MacBook-Pro-3 prodigy-docker % ./run_prodigy_simple.sh
✔ Removed 'ner_terminal' from database SQLite
22:22:18: CLI: Importing file ./ner_dual_recipe.py
22:22:19: RECIPE: Calling recipe 'ner.correct.v2'
/Users/stephenenrightward/miniconda3/envs/prodigy/lib/python3.11/site-packages/spacy_llm/pipeline/llm.py:143: UserWarning: Task supports sharding, but model does not provide context length. Data won't be sharded, prompt might exceed the model's context length. Set context length in your config. If you think spacy-llm should provide the context length for this model automatically, report this to https://github.com/explosion/spacy-llm/issues.
warnings.warn(
22:22:20: /Users/stephenenrightward/.prodigy/prodigy.json
22:22:20: VALIDATE: Validating components returned by recipe
22:22:20: CONTROLLER: Initialising from recipe
22:22:20: CONTROLLER: Recipe Config
22:22:20: {'lang': 'en', 'labels': ['PERSON', 'ORG', 'GPE', 'LOC'], 'exclude_by': 'input', 'batch_size': 10, 'custom_theme': {'cardMaxWidth': 1500, 'cardMinHeight': 250, 'windowColor': '#f8f9fa', 'cardBackgroundColor': '#ffffff', 'cardBorderColor': '#ebebeb', 'show_task_ids': True}, 'blocks': [{'view_id': 'ner_manual', 'text': 'Local Model', 'labels': ['PERSON', 'ORG', 'GPE', 'LOC']}, {'view_id': 'ner_manual', 'text': 'LLM Model', 'labels': ['PERSON', 'ORG', 'GPE', 'LOC']}], 'dataset': 'ner_terminal', 'recipe_name': 'ner.correct.v2'}
22:22:20: VALIDATE: Creating validator for view ID 'ner_manual'
22:22:20: CONTROLLER: Using `no_overlap` router.
22:22:20: VALIDATE: Validating Prodigy and recipe config
22:22:20: PREPROCESS: Tokenizing examples (running tokenizer only)
0. Document is being processed in make_tasks
22:22:22: /Users/stephenenrightward/.prodigy/prodigy.json
22:22:22: DB: Creating unstructured dataset 'ner_terminal'
Added dataset ner_terminal to database SQLite.
22:22:22: DB: Creating unstructured dataset '2024-05-28_22-22-22'
22:22:22: {'created': datetime.datetime(2024, 5, 28, 22, 22, 22)}
22:22:22: CORS: initialized with wildcard "*" CORS origins
✨ Starting the web server at http://localhost:8080 ...
Open the app in your browser and start annotating!
INFO: Started server process [57265]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://localhost:8080 (Press CTRL+C to quit)
INFO: ::1:53800 - "GET / HTTP/1.1" 200 OK
INFO: ::1:53800 - "GET /bundle.js HTTP/1.1" 200 OK
22:22:23: /Users/stephenenrightward/.prodigy/prodigy.json
22:22:23: GET: /project
22:22:23: {'lang': 'en', 'labels': ['PERSON', 'ORG', 'GPE', 'LOC'], 'exclude_by': 'input', 'batch_size': 10, 'custom_theme': {'cardMaxWidth': 1500, 'cardMinHeight': 250, 'windowColor': '#f8f9fa', 'cardBackgroundColor': '#ffffff', 'cardBorderColor': '#ebebeb', 'show_task_ids': True}, 'blocks': [{'view_id': 'ner_manual', 'text': 'Local Model', 'labels': ['PERSON', 'ORG', 'GPE', 'LOC']}, {'view_id': 'ner_manual', 'text': 'LLM Model', 'labels': ['PERSON', 'ORG', 'GPE', 'LOC']}], 'dataset': 'ner_terminal', 'recipe_name': 'ner.correct.v2', 'view_id': 'ner_manual', 'version': '1.15.3'}
INFO: ::1:53800 - "GET /project HTTP/1.1" 200 OK
INFO: ::1:53801 - "GET /fonts/lato-bold.woff2 HTTP/1.1" 200 OK
INFO: ::1:53800 - "GET /fonts/robotocondensed-bold.woff2 HTTP/1.1" 200 OK
INFO: ::1:53802 - "GET /fonts/lato-regular.woff2 HTTP/1.1" 200 OK
22:22:23: /Users/stephenenrightward/.prodigy/prodigy.json
INFO: ::1:53800 - "GET /favicon.ico HTTP/1.1" 200 OK
22:22:23: POST: /get_session_questions
22:22:23: CONTROLLER: Getting batch of questions for session: None
22:22:23: STREAM: Created queue for 2024-05-28_22-22-22.
22:22:23: ROUTER: Routing item with _input_hash=1617698495 -> ['2024-05-28_22-22-22']
1. Document is being processed in make_tasks
22:22:23: ROUTER: Routing item with _input_hash=-1699702290 -> ['2024-05-28_22-22-22']
2. Document is being processed in make_tasks
22:22:24: ROUTER: Routing item with _input_hash=-1137479422 -> ['2024-05-28_22-22-22']
3. Document is being processed in make_tasks
22:22:24: ROUTER: Routing item with _input_hash=-770276050 -> ['2024-05-28_22-22-22']
4. Document is being processed in make_tasks
22:22:25: ROUTER: Routing item with _input_hash=-2114618586 -> ['2024-05-28_22-22-22']
5. Document is being processed in make_tasks
22:22:25: ROUTER: Routing item with _input_hash=803401808 -> ['2024-05-28_22-22-22']
6. Document is being processed in make_tasks
22:22:25: ROUTER: Routing item with _input_hash=1361785827 -> ['2024-05-28_22-22-22']
7. Document is being processed in make_tasks
22:22:26: ROUTER: Routing item with _input_hash=2123367862 -> ['2024-05-28_22-22-22']
8. Document is being processed in make_tasks
22:22:26: ROUTER: Routing item with _input_hash=-1935862931 -> ['2024-05-28_22-22-22']
9. Document is being processed in make_tasks
22:22:27: ROUTER: Routing item with _input_hash=-4780812 -> ['2024-05-28_22-22-22']
22:22:27: RESPONSE: /get_session_questions (10 examples)
22:22:27: {'tasks': [{'text': "Zhengzhou Technology Trading Market specializes in technology (focusing on Zhengzhou Technology Trading Market).\n\nEditor's Note\nTo improve our province's innovation system and build a first-class innovation ecosystem, Zhengzhou Technology Trading Market was officially launched on December 24, 2021. After its oper...", 'meta': {}, 'tokens': [{'text': 'Zhengzhou', 'start': 0, 'end': 9, 'id': 0, 'ws': True}, {'text': 'Technology', 'start': 10, 'end': 20, 'id': 1, 'ws': True}, {'text': 'Trading', 'start': 21, 'end': 28, 'id': 2, 'ws': True}, {'text': 'Market', 'start': 29, 'end': 35, 'id': 3, 'ws': True}, {'text': 'specializes', 'start': 36, 'end': 47, 'id': 4, 'ws': True}, {'text': 'in', 'start': 48, 'end': 50, 'id': 5, 'ws': True}, {'text': 'technology', 'start': 51, 'end': 61, 'id': 6, 'ws': True}, {'text': '(', 'start': 62, 'end': 63, 'id': 7, 'ws': False}, {'text': 'focusing', 'start': 63, 'end': 71, 'id': 8, 'ws': True}, {'text': 'on', 'start': 72, 'end': 74, 'id': 9, 'ws': True}, {'text': 'Zhengzhou', 'start': 75, 'end': 84, 'id': 10, 'ws': True}, {'text': 'Technology', 'start': 85, 'end': 95, 'id': 11, 'ws': True}, {'text': 'Trading', 'start': 96, 'end': 103, 'id': 12, 'ws': True}, {'text': 'Market', 'start': 104, 'end': 110, 'id': 13, 'ws': False}, {'text': ')', 'start': 110, 'end': 111, 'id': 14, 'ws': False}, {'text': '.', 'start': 111, 'end': 112, 'id': 15, 'ws': False}, {'text': '\n\n', 'start': 112, 'end': 114, 'id': 16, 'ws': False}, {'text': 'Editor', 'start': 114, 'end': 120, 'id': 17, 'ws': False}, {'text': "'s", 'start': 120, 'end': 122, 'id': 18, 'ws': True}, {'text': 'Note', 'start': 123, 'end': 127, 'id': 19, 'ws': False}, {'text': '\n', 'start': 127, 'end': 128, 'id': 20, 'ws': False}, {'text': 'To', 'start': 128, 'end': 130, 'id': 21, 'ws': True}, {'text': 'improve', 'start': 131, 'end': 138, 'id': 22, 'ws': True}, etc
Notes:
- The above terminal output is truncated to fit within the maximum character limit of a question on this form.
- The video tutorial I mention in point 2 above does implement this UI, but in a slightly different use case: all the data is first pre-labelled with both the local and the LLM model via the prodigy command line, and the review UI then lets you "review" the two models' choices in parallel for each labelled data point. This is great, but it is not quite my use case: as I mentioned in the intro, I would like to perform the labelling batch-wise, so that, in the next phase of my project, I can update both the local spacy.NER model, and perhaps also the N-shot examples in the LLM prompt, based on the text labelled so far.
TL;DR: How can I fix the above code to yield the review-style UI I'm looking for, with both local and LLM model suggestions (i.e. text span highlights) shown in parallel, while still labelling on a per-batch basis for an active learning use case? Also, where can I find a detailed and exhaustive description of the task object format for the various labelling use cases (text categorisation, NER etc), so that I can adapt this UI to those different use cases without needing to return to the forum?