Just to follow up: I get errors when trying to pipe into the source argument for textcat.llm.fetch. My idea was to loop through batches of lines in the JSONL file and progressively add them to the dataset with --resume.
Getting labels from the 'llm' component
Using 5 labels: ['ASSESS', 'BREATHING', 'MOVEMENT', 'OTHER', 'PAIN']
RECIPE: Resuming from previous output file:
dataset:openai_test_batch
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/Users/aaronconway/audiosedstate/venv/lib/python3.10/site-packages/prodigy/main.py", line 50, in
main()
File "/Users/aaronconway/audiosedstate/venv/lib/python3.10/site-packages/prodigy/main.py", line 44, in main
controller = run_recipe(run_args)
File "cython_src/prodigy/cli.pyx", line 123, in prodigy.cli.run_recipe
File "cython_src/prodigy/cli.pyx", line 124, in prodigy.cli.run_recipe
File "/Users/aaronconway/audiosedstate/venv/lib/python3.10/site-packages/prodigy/recipes/llm/textcat.py", line 155, in llm_fetch_textcat
total = sum(1 for _ in stream.copy())
File "cython_src/prodigy/components/stream.pyx", line 374, in prodigy.components.stream.Stream.copy
File "cython_src/prodigy/components/source.pyx", line 371, in prodigy.components.source.GeneratorSource.copy
TypeError
cat test.jsonl | dotenv run -- prodigy textcat.llm.fetch fewshot_openai.cfg - dataset:openai_test_batch --loader jsonl --resume
test.jsonl
{"text": "It's okay.", "meta": {"pid": "P018", "segment": "18", "start_time": "105.537", "end_time": "106.778"}}
{"text": "One chance to get a fresh breath there.", "meta": {"pid": "P018", "segment": "19", "start_time": "106.778", "end_time": "109.279"}}
{"text": "Oh, it feels good.", "meta": {"pid": "P018", "segment": "20", "start_time": "109.279", "end_time": "111.32"}}
{"text": "So this smells like the plastic that it's made of.", "meta": {"pid": "P018", "segment": "21", "start_time": "111.32", "end_time": "113.301"}}
{"text": "All watered up now.", "meta": {"pid": "P018", "segment": "24", "start_time": "116.583", "end_time": "121.606"}}
fewshot_openai.cfg
[nlp]
lang = "en"
pipeline = ["llm"]
[components]
[components.llm]
factory = "llm"
save_io = true
[components.llm.model]
@llm_models = "spacy.GPT-3-5.v2"
config = {"temperature": 0.0}
[components.llm.task]
@llm_tasks = "spacy.TextCat.v3"
labels = MOVEMENT,BREATHING,ASSESS,PAIN,OTHER
exclusive_classes = true
[components.llm.task.label_definitions]
MOVEMENT = "A specific instruction for a person to stay still or stop moving. It may be to stop moving a particular body part like hands or arms or legs, or just to stop moving and stay still in general."
BREATHING = "A specific instruction to either stop breathing or start breathing."
ASSESS = "A question asking a person how they are feeling."
PAIN = "A person describing their pain or discomfort."
OTHER = "Everything else that does not fall into the other categories."
[components.llm.task.examples]
@misc = "spacy.FewShotReader.v1"
path = "examples.jsonl"
[components.llm.task.normalizer]
@misc = "spacy.LowercaseNormalizer.v1"
[components.llm.cache]
@llm_misc = "spacy.BatchCache.v1"
path = "local-cached"
batch_size = 3
max_batches_in_mem = 10