Hi @cheyanneb!
So you're just looking to add the raw span text to each span dict?
I think you could just add this to `db-out`:
# Attach the raw text covered by each span to that span's dict (in place).
for eg in examples:
    # An example may have been saved without a "spans" key; treat that as
    # "no spans" instead of raising KeyError.
    for span in eg.get("spans", []):
        span["text"] = eg["text"][span["start"]:span["end"]]
If you create a new flag argument (`add_span_text`) to turn this on or off (off by default), you could run this:
from pathlib import Path
from typing import Optional, Union
import srsly
from prodigy.components.db import connect
from prodigy.util import msg
def db_out(
    set_id: str,
    out_dir: Optional[Union[str, Path]] = None,
    answer: Optional[str] = None,
    flagged_only: bool = False,
    dry: bool = False,
    add_span_text: bool = False,
) -> None:
    """
    Export annotations from the database. Files will be exported in
    Prodigy's JSONL format.

    set_id: Name of the dataset to export. If it is not found in the
        database, the failure is reported via msg.fail(..., exits=1).
    out_dir: Directory in which "<set_id>.jsonl" is written. If None, each
        example is printed to stdout as one JSON line instead.
    answer: If given (and non-empty), keep only examples whose "answer"
        value equals it.
    flagged_only: If True, keep only examples with a truthy "flagged" value.
    dry: If True, skip writing the output file. The output directory is
        still created and the summary message is still printed.
    add_span_text: If True, add a "text" entry to every span holding the
        slice of the example's "text" between the span's "start" and "end"
        offsets. Examples without string "text" or without "spans", and
        spans without both offsets, are left untouched.
    """
    DB = connect()
    if set_id not in DB:
        msg.fail(f"Can't find '{set_id}' in database {DB.db_name}", exits=1)
    examples = DB.get_dataset_examples(set_id)
    if flagged_only:
        examples = [eg for eg in examples if eg.get("flagged")]
    if answer:
        examples = [eg for eg in examples if eg.get("answer") == answer]
    if add_span_text:
        for eg in examples:
            text = eg.get("text")
            if not isinstance(text, str):
                # Nothing to slice from (e.g. a non-text task); skip rather
                # than crash the whole export.
                continue
            # "spans" may be missing or None on examples that carry no
            # span annotations.
            for span in eg.get("spans") or []:
                if "start" in span and "end" in span:
                    span["text"] = text[span["start"]:span["end"]]
    if out_dir is None:
        for eg in examples:
            print(srsly.json_dumps(eg))
    else:
        out_dir = Path(out_dir)
        if not out_dir.exists():
            # parents=True so a nested, not-yet-existing path also works.
            out_dir.mkdir(parents=True)
        out_file = out_dir / f"{set_id}.jsonl"
        if not dry:
            srsly.write_jsonl(out_file, examples)
        msg.good(
            f"Exported {len(examples)} annotations from '{set_id}' in database {DB.db_name}",
            out_file.resolve(),
        )
Does this work?