Bug description:
I tried to use the ner.llm.correct recipe to annotate a dataset of 465 texts, whose length ranges from roughly 2,000 to 40,000 characters. For the first 60 texts, Prodigy respects the batch configuration (it was set to 3). After that, however, Prodigy tries to send 50 texts to the LLM before showing anything to the annotator. Because the LLM takes too long to respond to such a large batch, the server returns an empty batch and Prodigy shows the annotator the message "No tasks available".
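As a possible stopgap (not a fix), the source file could be sharded so each serving run only loads a small slice of the 465 texts; a minimal sketch with srsly, shard size purely illustrative:
import srsly

# Possible stopgap, not a fix: split data.jsonl into small shards and serve them one at a time.
texts = list(srsly.read_jsonl("path/to/data.jsonl"))
shard_size = 30  # illustrative value
for i in range(0, len(texts), shard_size):
    srsly.write_jsonl(f"path/to/data_shard_{i // shard_size:03d}.jsonl", texts[i : i + shard_size])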
Reproduction steps:
How can we recreate the bug?
I used a custom model that reaches an Azure endpoint through a proxy, using the OpenAI client.
The config.cfg was defined as follows:
[nlp]
lang = "pt"
pipeline = ["llm"]
[components]
[components.llm]
factory = "llm"
[components.llm.task]
@llm_tasks = "spacy.NER.v3"
labels = ["LABEL_1", "LABEL_2", "LABEL_3", "LABEL_4", "LABEL_5"]
description = "DESCRIPTION PLACEHOLDER"
[components.llm.task.label_definitions]
LABEL_1 = "LABEL_1 DESCRIPTION"
LABEL_2 = "LABEL_2 DESCRIPTION"
LABEL_3 = "LABEL_3 DESCRIPTION"
LABEL_4 = "LABEL_4 DESCRIPTION"
LABEL_5 = "LABEL_5 DESCRIPTION"
[components.llm.task.examples]
@misc = "spacy.FewShotReader.v1"
path = "path/to/examples.json"
[components.llm.model]
@llm_models = "NER_custom.v1"
api_type = "azure"
api_key = "APPLICATION_KEY"
api_base = "PROXY_ENDPOINT"
api_version = "API_VERSION"
model = "GPT_MODEL"
deployment_id = "DEPLOYMENT_ID"
[components.llm.cache]
@llm_misc = "spacy.BatchCache.v1"
path = "local-cached"
batch_size = 3
max_batches_in_mem = 10
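For reference, the same config can be loaded outside Prodigy with spacy-llm's assemble helper, which is a quick way to check that the task and model factory resolve; a minimal sketch, assuming the module that registers NER_custom.v1 (see main.py below) has been imported first:
from spacy_llm.util import assemble

# Minimal sketch: resolve config.cfg outside Prodigy to confirm the pipeline builds.
# Requires the @registry.llm_models("NER_custom.v1") registration to have run already.
nlp = assemble("path/to/config.cfg")
doc = nlp("Texto curto de exemplo.")  # triggers a single LLM call
print([(ent.text, ent.label_) for ent in doc.ents])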
The main.py used to serve the Prodigy server is defined as follows:
import os

import prodigy
from spacy_llm.registry import registry

from ner_custom import NER_CUSTOM  # the custom model class shown below (module name illustrative)

EXPERIMENT_NAME = "NAME_OF_EXPERIMENT"
RECIPE_NAME = "ner.llm.correct"
DATASET_NAME = EXPERIMENT_NAME
DATA_PATH = "path/to/data.jsonl"
CONFIG_PATH = "path/to/config.cfg"


def main():
    # Register the custom model factory so config.cfg can resolve "NER_custom.v1"
    @registry.llm_models("NER_custom.v1")
    def ner_custom_v1(
        api_type: str,
        api_key: str,
        api_base: str,
        api_version: str,
        model: str,
        deployment_id: str,
    ):
        return NER_CUSTOM(
            api_type=api_type,
            api_key=api_key,
            api_base=api_base,
            api_version=api_version,
            model=model,
            deployment_id=deployment_id,
        )

    prodigy.serve(f"{RECIPE_NAME} {DATASET_NAME} {CONFIG_PATH} {DATA_PATH}")


if __name__ == "__main__":
    os.environ["PRODIGY_CONFIG"] = "path/to/prodigy.json"
    main()
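Before serving, the registration can be sanity-checked against spacy-llm's registry; a small sketch (assumes the decorator above has already executed):
from spacy_llm.registry import registry

# Sketch: confirm "NER_custom.v1" is visible to spacy-llm before config.cfg is resolved.
assert "NER_custom.v1" in registry.llm_models.get_all()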
The custom model was defined as follows:
import random
import time
from typing import Any, Dict, Iterable, List

import openai
import srsly


class NER_CUSTOM:
    def __init__(self, api_type, api_key, api_base, api_version, model, deployment_id):
        self._api_type = api_type
        self._api_key = api_key
        self._api_base = api_base
        self._api_version = api_version
        self._model = model
        self._deployment_id = deployment_id
        openai.api_type = self._api_type
        openai.api_key = self._api_key
        openai.api_base = self._api_base
        openai.api_version = self._api_version

    def __call__(self, prompts: Iterable[Iterable[str]]) -> Iterable[Iterable[str]]:
        def retry_with_exponential_backoff(
            func,
            initial_delay: float = 1,
            exponential_base: float = 2,
            jitter: bool = True,
            max_retries: int = 10,
            errors: tuple = (openai.error.RateLimitError,),
        ):
            """Retry a function with exponential backoff."""

            def wrapper(*args, **kwargs):
                num_retries = 0
                delay = initial_delay
                while True:
                    try:
                        return func(*args, **kwargs)
                    except errors:
                        num_retries += 1
                        if num_retries > max_retries:
                            raise Exception(
                                f"Maximum number of retries ({max_retries}) exceeded."
                            )
                        delay *= exponential_base * (1 + jitter * random.random())
                        time.sleep(delay)
                    except Exception as e:
                        raise e

            return wrapper

        all_api_responses: List[List[str]] = []
        for prompts_for_doc in prompts:
            api_responses: List[str] = []
            prompts_for_doc = list(prompts_for_doc)

            @retry_with_exponential_backoff
            def _request(json_data: Dict[str, Any]) -> Dict[str, Any]:
                response = openai.ChatCompletion.create(
                    deployment_id=self._deployment_id,
                    model=self._model,
                    messages=[{"role": "user", "content": json_data["prompt"]}],
                )
                return response

            for prompt in prompts_for_doc:
                responses = _request({"prompt": prompt})
                if "error" in responses:
                    return responses["error"]
                assert len(responses["choices"]) == 1
                response = responses["choices"][0]
                api_responses.append(
                    response.get("message", {}).get(
                        "content", srsly.json_dumps(response)
                    )
                )
            all_api_responses.append(api_responses)
        return all_api_responses
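To see how many documents Prodigy actually hands to the model at once, the incoming batches can be logged; the subclass below is a hypothetical debugging aid, not part of the setup above:
import logging

logger = logging.getLogger("ner_custom")


class NER_CUSTOM_LOGGED(NER_CUSTOM):
    # Hypothetical debugging variant: counts how many documents spacy-llm passes
    # to the model per call (this is where the jump from 3 to ~50 texts shows up).
    def __call__(self, prompts):
        prompts = [list(p) for p in prompts]  # materialize so they can be counted
        logger.info("Received prompts for %d document(s)", len(prompts))
        return super().__call__(prompts)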
Lastly, the prodigy.json was defined as follows:
{
  "db": "postgresql",
  "port": 8080,
  "host": "0.0.0.0",
  "ui_lang": "pt",
  "project_info": [
    "dataset",
    "session",
    "lang",
    "recipe_name",
    "view_id",
    "label"
  ],
  "show_stats": true,
  "history_length": 20,
  "feed_overlap": false,
  "allow_newline_highlight": true,
  "ner_manual_highlight_chars": true,
  "show_flag": true,
  "instructions": "path/to/instructions.html",
  "custom_theme": {
    "smallText": 16,
    "cardMaxWidth": "85%",
    "labels": {
      "LABEL_1": "#c5bdf4",
      "LABEL_2": "#ff69b4",
      "LABEL_3": "#d9fbad",
      "LABEL_4": "#96e8ce",
      "LABEL_5": "#ffd882"
    }
  }
}
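One note on batching: Prodigy also has its own batch_size setting (the number of tasks sent to the web app per request), which is separate from the spacy-llm BatchCache batch_size in config.cfg. It is not set in the prodigy.json above; a hedged sketch of overriding it when serving, with the value purely illustrative:
import prodigy

# Hedged sketch: override Prodigy's own "batch_size" (tasks per round trip to the UI),
# which is independent of the BatchCache batch_size in config.cfg. Value illustrative.
prodigy.serve(
    "ner.llm.correct NAME_OF_EXPERIMENT path/to/config.cfg path/to/data.jsonl",
    batch_size=3,
)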
Environment variables:
Please provide prodigy stats or Python version/OS/Prodigy version:
Python version: 3.10
OS: Debian Bullseye (also happens on macOS 14.4)
Prodigy version: 1.15.6