Thanks so much for providing the debugging information.
It looks, then, like none of the labels you mentioned is missing! There's exactly the same number of PERSON and COURSE labels in the merged dataset as in the constituent datasets.
What is unexpected is actually the number of ORG
labels.
Have you performed any modifications on ner_resumes or the constituent datasets after merging and before computing these stats — such as manual curation of the labels, for example?
The db-merge
recipe just concatenates the examples without any deduplication or span modification.
You also mention there are fewer examples in the resulting merged dataset.
Let's just redo the merging with some extra logging and some extra stats on input hashes and task hashes and the total number of examples.
Could you please run this script and share the terminal output with me?
from typing import List, Set, Dict, NamedTuple
from prodigy.components.db import connect
from prodigy.util import set_hashes
from collections import Counter
class DatasetStats(NamedTuple):
    """Simple container for dataset statistics.

    Attributes:
        input_hashes: unique ``_input_hash`` values. Prodigy hashes are
            integers (produced by ``set_hashes``), so these are ``Set[int]``,
            not ``Set[str]``.
        task_hashes: unique ``_task_hash`` values.
        label_counts: mapping of span label -> number of occurrences.
        total_inputs: number of input hashes before deduplication.
        total_tasks: number of task hashes before deduplication.
    """
    input_hashes: Set[int]
    task_hashes: Set[int]
    label_counts: Dict[str, int]
    total_inputs: int
    total_tasks: int
def count_labels(examples: List[dict]) -> Dict[str, int]:
    """Count label occurrences in dataset examples."""
    # Flatten all span labels across examples; Counter does the tallying.
    all_labels = (
        span["label"]
        for eg in examples
        for span in eg.get("spans", [])
    )
    return dict(Counter(all_labels))
def get_dataset_stats(examples: List[dict]) -> DatasetStats:
    """Calculate statistics for a dataset."""
    # Collect raw (possibly duplicated) hashes in a single pass.
    raw_input_hashes = []
    raw_task_hashes = []
    for eg in examples:
        raw_input_hashes.append(eg["_input_hash"])
        raw_task_hashes.append(eg["_task_hash"])
    return DatasetStats(
        input_hashes=set(raw_input_hashes),
        task_hashes=set(raw_task_hashes),
        label_counts=count_labels(examples),
        total_inputs=len(raw_input_hashes),
        total_tasks=len(raw_task_hashes),
    )
def validate_output_dataset(db, output_dataset: str) -> None:
    """Check if output dataset exists and is empty.

    Raises ValueError if the dataset already exists and contains examples.
    """
    if output_dataset not in db:
        return
    if not db.get_dataset_examples(output_dataset):
        return
    raise ValueError(
        f"Output dataset '{output_dataset}' already exists and includes examples. "
        "Please use a new dataset name to avoid unexpected results."
    )
def merge_datasets(
    input_datasets: List[str],
    output_dataset: str,
    rehash: bool = False
) -> List[dict]:
    """Merge multiple datasets into one.

    Args:
        input_datasets: names of the datasets to concatenate, in order.
        output_dataset: name of the dataset to create; must not already
            contain examples.
        rehash: if True, recompute input/task hashes for every example
            (mirrors the ``--rehash`` flag of ``db-merge``).

    Returns:
        The concatenated list of examples written to the DB.

    Raises:
        ValueError: if ``output_dataset`` already exists and is non-empty.
    """
    db = connect()
    # Fail fast: validate the output dataset *before* fetching all the
    # examples, so a name clash doesn't waste a full read of every input.
    validate_output_dataset(db, output_dataset)
    merged = []
    for dataset_id in input_datasets:
        examples = db.get_dataset_examples(dataset_id)
        if rehash:
            # overwrite=True forces fresh hashes even if already present
            examples = [set_hashes(eg, overwrite=True) for eg in examples]
        merged.extend(examples)
        print(f"Added {len(examples)} examples from '{dataset_id}'")
    db.add_dataset(output_dataset)
    db.add_examples(merged, datasets=[output_dataset])
    return merged
def print_label_stats(stats: Dict[str, int]) -> None:
    """Print label statistics in a formatted table"""
    divider = "-" * 40
    print("\nLabels")
    print(divider)
    # Alphabetical order makes runs comparable across datasets.
    for label in sorted(stats):
        print(f"{label}\t{stats[label]}")
    print(divider)
def print_dataset_stats(dataset_name: str, stats: DatasetStats, total: int) -> None:
    """Print comprehensive dataset statistics"""
    unique_inputs = len(stats.input_hashes)
    unique_tasks = len(stats.task_hashes)
    print(f"\n=== Analysis for {dataset_name} ===")
    print(f"Total examples in the DB: {total}")
    print(f"Input hashes: {stats.total_inputs}")
    print(f"Unique input hashes: {unique_inputs}")
    print(f"Task hashes: {stats.total_tasks}")
    print(f"Unique task hashes: {unique_tasks}")
    print_label_stats(stats.label_counts)
    # A gap between total and unique hash counts means duplicates exist.
    if stats.total_inputs != unique_inputs:
        print(f"Warning: Contains {stats.total_inputs - unique_inputs} duplicate input hashes")
    if stats.total_tasks != unique_tasks:
        print(f"Warning: Contains {stats.total_tasks - unique_tasks} duplicate task hashes")
def compare_datasets(source_stats: DatasetStats, merged_stats: DatasetStats, source_name: str) -> None:
"""Compare source dataset with merged dataset"""
missing = merged_stats.input_hashes - source_stats.input_hashes
extra = source_stats.input_hashes - merged_stats.input_hashes
print(f"\nComparing {source_name} with merged dataset:")
print(f"Missing from merged dataset: {len(missing)} hashes")
print(f"Extra in {source_name} dataset: {len(extra)} hashes")
def analyze_all_datasets(dataset_names: List[str], output_dataset: str) -> Dict[str, DatasetStats]:
    """Analyze all datasets and return their statistics"""
    db = connect()
    # Analyze the sources first, then the merged dataset last.
    all_names = dataset_names + [output_dataset]
    datasets_stats: Dict[str, DatasetStats] = {}
    for name in all_names:
        examples = db.get_dataset_examples(name)
        stats = get_dataset_stats(examples)
        datasets_stats[name] = stats
        print_dataset_stats(name, stats, len(examples))
    return datasets_stats
def main(
    dataset_names: List[str],
    output_dataset: str,
    rehash: bool = False
) -> None:
    """Main function to orchestrate dataset merging and analysis"""
    # Step 1: merge the input datasets into a fresh output dataset.
    print("\n=== Merging datasets ===\n")
    merged_examples = merge_datasets(dataset_names, output_dataset, rehash)
    print(f"\nMerged {len(merged_examples)} examples from {len(dataset_names)} datasets")
    print(f"Created merged dataset '{output_dataset}'")

    # Step 2: list all datasets involved.
    print("\n=== Prodigy Dataset Analysis ===")
    for idx, name in enumerate(dataset_names, 1):
        print(f"Dataset {idx}: {name}")
    print(f"Merged dataset: {output_dataset}")

    # Step 3: per-dataset stats, including the merged dataset.
    datasets_stats = analyze_all_datasets(dataset_names, output_dataset)

    # Step 4: compare each source dataset against the merged result.
    merged_stats = datasets_stats[output_dataset]
    for dataset_name in dataset_names:
        compare_datasets(datasets_stats[dataset_name], merged_stats, dataset_name)
if __name__ == "__main__":
    # Source datasets to merge; edit these names to match your Prodigy DB.
    source_datasets = [
        "ner_resumes_person",
        "ner_resumes_org",
        "ner_resumes_course",
    ]
    main(
        dataset_names=source_datasets,
        output_dataset="test_merged",  # this will create a fresh merged dataset in your db
        rehash=False,  # adjust to how you use rehash flag with `db-merge`
    )
Please adjust the rehash argument to match how you used the --rehash flag in the recipe. It shouldn't really matter, but let's control for it as well.
Finally, could you share with me the output of your prodigy stats command, to make sure this isn't some known issue with an older version (although I don't recall any).
Thank you for your cooperation!