Hi @Yuri,
I'm trying to reproduce the problem that you're having, and I need some help. I tried to reproduce your export/filter test but the SQL command you gave (select content from example;
) doesn't have the information needed to filter examples to a dataset or session. Can you share the complete SQL command you used?
To further debug, I set up a prodigy recipe file as a test case below. I hope you can help tweak it so that I get similar results to yours. The file contains two recipes (get_dataset_api
and get_dataset_sql
) that query the database to get examples from a generated test dataset. The dataset has answers from multiple sessions, and answers without a session.
recipe.py
import prodigy
dataset = "test-dataset"
num_sessions = 5
examples_per_session = 10
@prodigy.recipe("get_dataset_api")
def get_dataset_using_db_adapter():
db: prodigy.components.db.Database = prodigy.components.db.connect()
make_dataset(db)
# Get all examples
all_examples = db.get_dataset(dataset)
print(f"Database: get_dataset({dataset}) = {len(list(all_examples))}")
# for ex in all_examples:
# print(ex)
# Show sessions individually (0-3)
for i in range(num_sessions):
session_id = f"session_{i}"
session_examples = db.get_dataset(session_id)
print(f"Database: get_dataset({session_id}) == {len(list(session_examples))}")
# for sample in session_examples:
# print(sample)
@prodigy.recipe("get_dataset_sql")
def get_dataset_using_raw_sql():
def query_str(dataset_name: str) -> str:
return f"""
SELECT
eg.id, eg.input_hash, eg.task_hash, eg.content
FROM
example AS eg
INNER JOIN
link AS link ON (eg.id = link.example_id)
INNER JOIN
dataset AS dataset ON (link.dataset_id = dataset.id)
WHERE
(dataset.name = '{dataset_name}');
"""
db: prodigy.components.db.Database = prodigy.components.db.connect()
make_dataset(db)
# Get all examples
cursor = db.db.execute_sql(query_str(dataset))
all_examples = cursor.fetchall()
print(f"SQL: get_dataset({dataset}) = {len(list(all_examples))}")
# for ex in all_examples:
# print(ex)
# Show sessions individually (0-3)
for i in range(num_sessions):
session_id = f"session_{i}"
cursor = db.db.execute_sql(query_str(session_id))
session_examples = cursor.fetchall()
print(f"SQL: get_dataset({session_id}) == {len(list(session_examples))}")
# for sample in session_examples:
# print(sample)
def make_example(session_id=None, i=5):
obj = {
"text": f"Example {i}",
"label": "LABEL",
"answer": "accept",
}
if session_id is not None:
obj[prodigy.util.SESSION_ID_ATTR] = session_id
return prodigy.util.set_hashes(obj)
def make_dataset(db: prodigy.components.db.Database):
"""Make a disposable test dataset in the prodigy database"""
if dataset in db:
db.drop_dataset(dataset)
assert db.get_dataset(dataset) is None
# Insert a batch of examples without a session ID
non_session_examples = [make_example(None, i) for i in range(examples_per_session)]
db.add_examples(non_session_examples, [dataset])
# Insert (n) batches of examples, with different session IDs
for i in range(num_sessions):
session_id = f"session_{i}"
session_examples = [
make_example(session_id, i) for i in range(examples_per_session)
]
db.add_examples(session_examples, [dataset, session_id])
The first recipe queries for the examples using the prodigy Database
class, and you can run it:
$ prodigy get_dataset_api -F recipe.py
Database: get_dataset(test-dataset) = 60
Database: get_dataset(session_0) == 10
Database: get_dataset(session_1) == 10
Database: get_dataset(session_2) == 10
Database: get_dataset(session_3) == 10
Database: get_dataset(session_4) == 10
The second recipe queries for the examples using raw SQL
statements, and you can run it:
$ prodigy get_dataset_sql -F recipe.py
SQL: get_dataset(test-dataset) = 60
SQL: get_dataset(session_0) == 10
SQL: get_dataset(session_1) == 10
SQL: get_dataset(session_2) == 10
SQL: get_dataset(session_3) == 10
SQL: get_dataset(session_4) == 10
Hopefully, you can confirm the same results I see here, and maybe it helps tell us what is different in your setup. I also wonder, what version of prodigy are you using? Thanks!