spans.manual merge tokens using db-out

Hi @cheyanneb!

So you're just looking to add the raw span text to each span dict?

I think you could just add this to `db-out`:

for eg in examples:
    # Use .get so examples without a "spans" key are skipped instead of
    # raising KeyError.
    for span in eg.get("spans", []):
        # Copy the raw substring covered by the span onto the span dict.
        span["text"] = eg["text"][span["start"]:span["end"]]

If you create a new flag argument (`add_span_text`) to turn this on or off (off by default), you could run this modified version of `db-out`:

from pathlib import Path
from typing import Optional, Union

import srsly
from prodigy.components.db import connect
from prodigy.util import msg

def db_out(
    set_id: str,
    out_dir: Optional[Union[str, Path]] = None,
    answer: Optional[str] = None,
    flagged_only: bool = False,
    dry: bool = False,
    add_span_text: bool = False,
) -> None:
    """
    Export annotations from the database. Files will be exported in
    Prodigy's JSONL format.

    set_id (str): Name of the dataset to export.
    out_dir (str / Path): Directory to write "<set_id>.jsonl" to. If None,
        each example is printed to stdout as one JSON line instead.
    answer (str): If set, only export examples whose "answer" equals this
        value.
    flagged_only (bool): Only export examples with a truthy "flagged" value.
    dry (bool): Skip writing the output file; the summary message is still
        printed.
    add_span_text (bool): Add a "text" key to every span that has "start"
        and "end" offsets, holding the matching slice of the example's
        "text". Examples without "text" or "spans" are left untouched.
    """
    DB = connect()
    if set_id not in DB:
        msg.fail(f"Can't find '{set_id}' in database {DB.db_name}", exits=1)
    examples = DB.get_dataset_examples(set_id)
    if flagged_only:
        examples = [eg for eg in examples if eg.get("flagged")]
    if answer:
        examples = [eg for eg in examples if eg.get("answer") == answer]

    if add_span_text:
        for eg in examples:
            text = eg.get("text")
            if text is None:
                # Nothing to slice from, so leave this example as it is.
                continue
            # "spans" may be absent or None; treat both as "no spans".
            for span in eg.get("spans") or []:
                # Only spans with character offsets can be mapped to text.
                if "start" in span and "end" in span:
                    span["text"] = text[span["start"]:span["end"]]

    if out_dir is None:
        for eg in examples:
            print(srsly.json_dumps(eg))
        return

    out_dir = Path(out_dir)
    # parents=True creates missing parent directories as well; exist_ok=True
    # avoids an error if the directory already exists.
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"{set_id}.jsonl"
    if not dry:
        srsly.write_jsonl(out_file, examples)
    msg.good(
        f"Exported {len(examples)} annotations from '{set_id}' in database {DB.db_name}",
        out_file.resolve(),
    )

Does this work?

1 Like