"""CSV-to-JSONL converter for Prodigy (prodigy.ai).

I wasted quite some time figuring out that JSONL is the preferred format
for Prodigy, so I used ChatGPT to make this script.

- It asks the user for the location of a CSV file.
- It displays the list of all columns and asks you to select the primary
  column you want to train on.
- It additionally asks whether you wish to convert other columns to
  metadata; you can select several at a time, separated by commas.
"""
import pandas as pd
import json
def display_columns_in_table(df, exclude_col=None):
    """Print the DataFrame's columns as a numbered, tab-separated table.

    Numbering always follows ``df.columns`` positions so a number the user
    types can be mapped back with ``df.columns[n - 1]``.  A column equal to
    *exclude_col* is shown struck through (``~~name~~``) instead of being
    dropped — dropping it would shift the numbering of every later column
    and break that mapping.

    NOTE(review): this definition is shadowed by a later function of the
    same name in this file; only the last definition takes effect.

    :param df: pandas DataFrame whose columns are listed.
    :param exclude_col: optional column name to display as struck through.
    """
    print("\nColumns:")
    for i, col in enumerate(df.columns, start=1):
        label = f"{i}. {col}" if col != exclude_col else f"{i}. ~~{col}~~"
        print(label, end="\t")
        if i % 2 == 0:  # two entries per printed row
            print()
    if len(df.columns) % 2 != 0:  # finish a partially filled last row
        print()
def convert_csv_to_jsonl(csv_file_path, text_col, meta_cols=None, encoding='utf-8'):
    """Convert a CSV file to a Prodigy-style JSONL file next to it.

    Each row becomes one JSON object ``{"text": ..., "meta": {...}}``;
    exact duplicate records are written only once, preserving first-seen
    order.  The output path is the CSV path with its extension replaced
    by ``.jsonl``.

    :param csv_file_path: path to the input CSV file.
    :param text_col: column name used for the ``"text"`` field.
    :param meta_cols: optional list of column names copied into ``"meta"``.
    :param encoding: encoding tried first when reading the CSV; falls back
        to windows-1252 on a UnicodeDecodeError.
    """
    try:
        df = pd.read_csv(csv_file_path, encoding=encoding)
    except UnicodeDecodeError:
        try:
            df = pd.read_csv(csv_file_path, encoding='windows-1252')
        except UnicodeDecodeError as e:
            print(f"Failed to read CSV file: {e}")
            return

    def _native(value):
        # numpy scalars (int64, float64, ...) from iterrows() are not JSON
        # serializable — unwrap them to plain Python; map NaN/NA to None
        # because "NaN" is not a valid JSON literal.
        if pd.isna(value):
            return None
        return value.item() if hasattr(value, "item") else value

    seen = set()  # serialized records already emitted (dedup)
    jsonl_output = []
    for _, row in df.iterrows():
        json_obj = {"text": _native(row[text_col])}
        if meta_cols:
            json_obj["meta"] = {col: _native(row[col]) for col in meta_cols}
        # Serialize once and dedup on the exact string representation.
        json_str = json.dumps(json_obj, ensure_ascii=False)
        if json_str not in seen:
            seen.add(json_str)
            jsonl_output.append(json_str)

    # Swap only the final extension; str.replace('.csv', ...) would also
    # corrupt a '.csv' occurring earlier in the path.
    if csv_file_path.lower().endswith('.csv'):
        output_file_path = csv_file_path[:-4] + '.jsonl'
    else:
        output_file_path = csv_file_path + '.jsonl'
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for item in jsonl_output:
            file.write(item + '\n')
def display_columns_in_table(df, exclude_col=None):
    """Print every column of *df* as a numbered, tab-separated table,
    four entries per printed row.

    Numbers follow ``df.columns`` order, so a number can be mapped back
    via ``df.columns[number - 1]``.  The column matching *exclude_col*
    is shown struck through (``~~name~~``) rather than omitted, which
    keeps the numbering stable for the caller.
    """
    per_row = 4  # entries printed before wrapping to a new line
    for idx, name in enumerate(df.columns, start=1):
        if name == exclude_col:
            cell = f"{idx}. ~~{name}~~"
        else:
            cell = f"{idx}. {name}"
        print(cell, end="\t")
        if idx % per_row == 0:
            print()
    # Close the final, partially filled row with a newline.
    if len(df.columns) % per_row:
        print()
def main():
    """Interactively convert a CSV file to a Prodigy-ready JSONL file.

    Workflow: prompt for the CSV path, display its columns, have the user
    pick one primary text column by number, optionally pick metadata
    columns (comma-separated numbers, the text column being excluded),
    then write ``<name>.jsonl`` next to the CSV.
    """
    csv_file_path = input("Enter the path to your CSV file: ")
    encoding_used = 'utf-8'
    # Try the default UTF-8 read first, then fall back to Windows-1252
    # (common for CSVs exported from Excel on Windows); remember which
    # one worked so the converter can reuse it.
    try:
        df = pd.read_csv(csv_file_path)
    except UnicodeDecodeError:
        try:
            df = pd.read_csv(csv_file_path, encoding='windows-1252')
            encoding_used = 'windows-1252'
        except UnicodeDecodeError as e:
            print(f"Failed to read CSV file: {e}")
            return

    display_columns_in_table(df)

    # Keep prompting until exactly one in-range column number is entered.
    while True:
        text_col_input = input("\nEnter the column number for the primary text: ")
        if "," in text_col_input or not text_col_input.isdigit():
            print("Please enter only one column number for the primary text.")
            continue
        text_col_num = int(text_col_input)
        if not 1 <= text_col_num <= len(df.columns):
            print("Invalid column number. Please try again.")
            continue
        break
    text_col = df.columns[text_col_num - 1]

    meta_cols = []
    include_meta = input("Do you want to include metadata? (Y/N): ").strip().upper()
    if include_meta == 'Y':
        display_columns_in_table(df, exclude_col=text_col)
        meta_col_nums = input("\nEnter the column numbers for metadata, separated by commas: ")
        # Parse "3, 5, 7" into zero-based indices, ignoring non-numeric tokens.
        chosen = [int(token.strip()) - 1
                  for token in meta_col_nums.split(',')
                  if token.strip().isdigit()]
        # Drop out-of-range picks and the primary text column itself.
        meta_cols = [df.columns[idx] for idx in chosen
                     if idx < len(df.columns) and idx != text_col_num - 1]

    convert_csv_to_jsonl(csv_file_path, text_col, meta_cols, encoding=encoding_used)


if __name__ == "__main__":
    main()