Python script to Convert CSV to JSONL (with metadata support)

I wasted quite some time figuring out that JSONL is the preferred format for prodigy.ai, so I used ChatGPT to make a script.

  1. It asks the user for the location of the CSV file.
  2. It displays the list of all columns and asks you to select the primary text column you want to train on.
  3. It then asks whether you want to include other columns as metadata; you can select several at once by entering their numbers separated by commas.
import pandas as pd
import json


def display_columns_in_table(df, exclude_col=None):
    """Print a numbered list of column names, two entries per output line.

    When ``exclude_col`` is given, that column is left out of the listing and
    the remaining columns are numbered consecutively from 1.

    NOTE: a later definition with the same name in this file replaces this
    one when the module is loaded.
    """
    if exclude_col is None:
        names = df.columns
    else:
        names = [name for name in df.columns if name != exclude_col]

    print("\nColumns:")
    for number, name in enumerate(names, start=1):
        # Every entry ends with a tab; every second entry also ends the line.
        # Change the 2 to alter how many columns are shown per row.
        terminator = "\t\n" if number % 2 == 0 else "\t"
        print(f"{number}. {name}", end=terminator)


def _to_json_value(value):
    """Return a CSV cell value in a form ``json.dumps`` emits as valid JSON."""
    # Missing cells (NaN/NA/None) become None so they are written as ``null``
    # rather than the non-standard ``NaN`` token that strict parsers reject.
    if pd.isna(value):
        return None
    # NumPy scalars (e.g. numpy.int64 from an all-integer frame) are not
    # accepted by json.dumps; unwrap them to plain Python numbers.
    if hasattr(value, "item"):
        return value.item()
    return value


def convert_csv_to_jsonl(csv_file_path, text_col, meta_cols=None, encoding='utf-8'):
    """Convert a CSV file into a JSONL file written next to it.

    Each CSV row becomes one JSON object per line: ``{"text": ...}`` taken
    from ``text_col``, plus a ``"meta"`` object holding the ``meta_cols``
    values when any are given. Rows whose serialized form is identical to an
    earlier row are written only once, in first-seen order.

    Args:
        csv_file_path: Path (string) of the CSV file to read.
        text_col: Name of the column used for the ``"text"`` field.
        meta_cols: Optional list of column names copied into ``"meta"``.
            ``None`` or an empty list omits the ``"meta"`` key.
        encoding: Encoding tried first when reading the CSV; on a
            ``UnicodeDecodeError`` the file is re-read as ``windows-1252``.

    Returns:
        None. If the file cannot be decoded with either encoding, a message
        is printed and nothing is written.
    """
    import os  # local import: only needed to derive the output path

    try:
        df = pd.read_csv(csv_file_path, encoding=encoding)
    except UnicodeDecodeError:
        try:
            df = pd.read_csv(csv_file_path, encoding='windows-1252')
        except UnicodeDecodeError as e:
            print(f"Failed to read CSV file: {e}")
            return

    lines = []
    for _, row in df.iterrows():
        record = {"text": _to_json_value(row[text_col])}
        if meta_cols:
            record["meta"] = {col: _to_json_value(row[col]) for col in meta_cols}
        lines.append(json.dumps(record))

    # dict.fromkeys drops repeated lines while keeping first-seen order.
    unique_lines = dict.fromkeys(lines)

    # Swap only the final extension. A plain str.replace('.csv', ...) would
    # also rewrite '.csv' inside directory names and, for paths without a
    # lowercase '.csv' (e.g. 'data.CSV'), would leave the path unchanged and
    # overwrite the input file.
    root, _ext = os.path.splitext(csv_file_path)
    output_file_path = root + '.jsonl'
    if output_file_path == csv_file_path:
        output_file_path = csv_file_path + '.jsonl'

    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in unique_lines)


def display_columns_in_table(df, exclude_col=None):
    """Print every column of ``df`` as a numbered, tab-separated table.

    Columns are numbered from 1 in their ``df.columns`` order and shown four
    per line. The column named ``exclude_col`` (if any) keeps its number but
    is wrapped in ``~~`` markers to show that it is already taken.
    """
    per_line = 4  # Adjust to change how many columns are shown per row
    total = len(df.columns)
    for number, name in enumerate(df.columns, start=1):
        if name == exclude_col:
            label = f"{number}. ~~{name}~~"
        else:
            label = f"{number}. {name}"
        print(label, end="\t")
        # End the line after each full row, and after the final entry so a
        # partially filled last row is terminated too.
        if number % per_line == 0 or number == total:
            print()

def main():
    """Interactively convert a CSV file to JSONL.

    Prompts for the CSV path, lists its columns, asks which column holds the
    primary text and (optionally) which columns to attach as metadata, then
    calls ``convert_csv_to_jsonl`` with those choices. Returns early, after
    printing a message, if the file is missing or cannot be decoded.
    """
    csv_file_path = input("Enter the path to your CSV file: ").strip()
    encoding_used = 'utf-8'

    # Try UTF-8 first and fall back to windows-1252, remembering which one
    # worked so the conversion step reads the file the same way.
    try:
        df = pd.read_csv(csv_file_path, encoding=encoding_used)
    except FileNotFoundError:
        print(f"File not found: {csv_file_path}")
        return
    except UnicodeDecodeError:
        try:
            df = pd.read_csv(csv_file_path, encoding='windows-1252')
            encoding_used = 'windows-1252'
        except UnicodeDecodeError as e:
            print(f"Failed to read CSV file: {e}")
            return

    column_count = len(df.columns)
    display_columns_in_table(df)

    while True:
        text_col_input = input("\nEnter the column number for the primary text: ").strip()
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. superscript digits, which would make int() raise.
        # A comma-separated list fails this check as well.
        if not text_col_input.isdecimal():
            print("Please enter only one column number for the primary text.")
            continue
        text_col_num = int(text_col_input)
        if not 1 <= text_col_num <= column_count:
            print("Invalid column number. Please try again.")
            continue
        break

    text_col = df.columns[text_col_num - 1]

    include_meta = input("Do you want to include metadata? (Y/N): ").strip().upper()
    meta_cols = []
    if include_meta == 'Y':
        display_columns_in_table(df, exclude_col=text_col)
        meta_col_nums = input("\nEnter the column numbers for metadata, separated by commas: ")
        for token in meta_col_nums.split(','):
            token = token.strip()
            if not token.isdecimal():
                continue  # ignore blanks and non-numeric entries
            index = int(token) - 1
            # The lower bound matters: an entry of "0" gives index -1, which
            # would otherwise wrap around and silently pick the last column.
            if not 0 <= index < column_count:
                continue
            if index == text_col_num - 1:
                continue  # the text column is never repeated as metadata
            col = df.columns[index]
            if col not in meta_cols:
                meta_cols.append(col)

    convert_csv_to_jsonl(csv_file_path, text_col, meta_cols, encoding=encoding_used)

# Run the interactive converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()