I am trying to build a label prediction model that reads text and extracts the labels I annotated in Prodigy. To do that, I created an annotated dataset in Prodigy, trained a model, and am now using Prodigy's best model (model-best) together with a spaCy model and "roberta-large" to predict labels from the texts in my df_text DataFrame. However, the model runs really slowly, and I am not sure whether the problem is with the trained Prodigy model. I would appreciate it if you could let me know whether I am using the trained Prodigy model correctly. For reference, the best model from Prodigy training reached an accuracy of 0.79. Please see the code below. Thanks in advance.
import json
import re

import evaluate
import numpy as np
import pandas as pd
import spacy
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (AutoModelForTokenClassification, AutoTokenizer,
                          Trainer, TrainingArguments, pipeline)

def load_jsonl(file_path):
    """Read a JSONL file into a list of dicts (one JSON object per line)."""
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line))
    return data
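For context, each line in Total.jsonl follows the Prodigy match-pattern format that annotate_text below expects; this is what one entry looks like after parsing (illustrative values, not my real file):

# A single (hypothetical) pattern entry as returned by load_jsonl:
example_entry = {"label": "suv", "pattern": [{"lower": "compact"}, {"lower": "suv"}]}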
def annotate_text(row, patterns):
    """Tag a row with every pattern from the JSONL file that matches its text."""
    text = row['text'].lower()
    annotations = {
        'sedan': [],
        'truck': [],
        'suv': [],
        'crossover': []
    }
    for entry in patterns:
        try:
            # Join the token-level patterns into a single escaped phrase
            pattern_str = ' '.join([re.escape(p['lower']) for p in entry['pattern']])
            if re.search(r'\b' + pattern_str + r'\b', text):
                annotations[entry['label']].append(pattern_str)
        except re.error as e:
            print(f"Regex error with pattern: {pattern_str}")
            print(f"Error: {e}")
    row['sedan_label'] = ', '.join(annotations['sedan'])
    row['truck_label'] = ', '.join(annotations['truck'])
    row['suv_label'] = ', '.join(annotations['suv'])
    row['crossover_label'] = ', '.join(annotations['crossover'])
    return row
# The trained Prodigy model and a general-purpose spaCy model for GPE extraction
prodigy_model_path = './model-best'
ner_model = spacy.load(prodigy_model_path)
spacy_model = spacy.load('en_core_web_lg')

def extract_gpe(text):
    doc = spacy_model(text)
    return ", ".join([ent.text for ent in doc.ents if ent.label_ == "GPE"])
# Load the patterns once; the original call reloaded Total.jsonl for every row,
# which is a significant slowdown on large DataFrames
patterns = load_jsonl('./Total.jsonl')
texts_input_df = df_text.apply(lambda row: annotate_text(row, patterns), axis=1)
print("Columns in DataFrame:", texts_input_df.columns)
required_columns = ['sedan_label', 'truck_label', 'suv_label', 'crossover_label']
missing_columns = [col for col in required_columns if col not in texts_input_df.columns]
if missing_columns:
    print("Missing columns:", missing_columns)
    for col in missing_columns:
        texts_input_df[col] = ""
texts_input_df['geopolitical_label'] = texts_input_df['text'].apply(extract_gpe)
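Since the row-by-row apply above felt slow, I also sketched a batched version with spaCy's nlp.pipe (the batch_size value is a guess on my part); I have not verified it produces identical output:

def extract_gpe_batch(texts, batch_size=64):
    # nlp.pipe streams texts through the pipeline in batches instead of
    # running the full model once per call, which is usually much faster
    results = []
    for doc in spacy_model.pipe(texts, batch_size=batch_size):
        results.append(", ".join(ent.text for ent in doc.ents if ent.label_ == "GPE"))
    return results

# texts_input_df['geopolitical_label'] = extract_gpe_batch(texts_input_df['text'].tolist())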
def prepare_dataset(df):
    def label_encoder(label_dict):
        # One binary flag per class: 1 if any pattern matched, else 0
        return [
            int(bool(label_dict['sedan'])),
            int(bool(label_dict['truck'])),
            int(bool(label_dict['suv'])),
            int(bool(label_dict['crossover']))
        ]
    df['encoded_labels'] = df.apply(lambda row: label_encoder({
        'sedan': row.get('sedan_label', ''),
        'truck': row.get('truck_label', ''),
        'suv': row.get('suv_label', ''),
        'crossover': row.get('crossover_label', '')
    }), axis=1)
    return Dataset.from_pandas(df[['text', 'encoded_labels']])
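To double-check the encoding rule, here is a toy example of what label_encoder produces (my own illustration, not part of the pipeline):

# A row with sedan and suv matches but nothing else encodes to [1, 0, 1, 0]
sample = {'sedan': 'camry, accord', 'truck': '', 'suv': 'rav4', 'crossover': ''}
print([int(bool(sample[k])) for k in ('sedan', 'truck', 'suv', 'crossover')])  # [1, 0, 1, 0]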
train_df, eval_df = train_test_split(texts_input_df, test_size=0.1)
train_dataset = prepare_dataset(train_df)
eval_dataset = prepare_dataset(eval_df)
model_name = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=9, label2id={
'O': 0, 'B-sedan': 1, 'I-sedan': 2, 'B-truck': 3, 'I-truck': 4,
'B-suv': 5, 'I-suv': 6, 'B-crossover': 7,
'I-crossover': 8
})
id2label = {id: label for label, id in model.config.label2id.items()}
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['text'], truncation=True,
                                 padding="max_length", max_length=512)
    labels = []
    for i, doc_labels in enumerate(examples['encoded_labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            # Special tokens, and words beyond the label list, are ignored by the loss
            if word_id is None or word_id >= len(doc_labels):
                label_ids.append(-100)
            else:
                label_ids.append(doc_labels[word_id])
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs
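To make sure I understood the word/token alignment, I checked word_ids() on a toy sentence (exact output depends on the tokenizer; None marks special tokens):

enc = tokenizer("compact suv for sale")
print(enc.word_ids())  # e.g. [None, 0, 1, 2, 3, None]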
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)
metric = load_metric("seqeval")
def compute_metrics(p):
predictions, labels = p
predictions = np.argmax(predictions, axis=2)
true_predictions = [[id2label[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
true_labels = [[id2label[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
return metric.compute(predictions=true_predictions, references=true_labels)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)
trainer.train()
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')  # keep the tokenizer with the weights
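After saving, I reload the model once to confirm the label mapping survived the round trip (just a sanity check on my side):

reloaded = AutoModelForTokenClassification.from_pretrained('./saved_model')
print(reloaded.config.id2label)  # should list the B-/I- labels, not LABEL_0 ...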
# Note: without a device argument the pipeline runs on CPU, which is very slow
# for roberta-large; device=0 would place it on the first CUDA GPU
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def update_df_with_predictions(text):
    predictions = ner_pipeline(text)
    geopolitical_label = extract_gpe(text)
    result = {
        'sedan': ", ".join([pred['word'] for pred in predictions if pred['entity_group'] == 'sedan']),
        'truck': ", ".join([pred['word'] for pred in predictions if pred['entity_group'] == 'truck']),
        'suv': ", ".join([pred['word'] for pred in predictions if pred['entity_group'] == 'suv']),
        'crossover': ", ".join([pred['word'] for pred in predictions if pred['entity_group'] == 'crossover']),
        'geopolitical': geopolitical_label
    }
    return result
# DataFrame.update cannot consume a Series of dicts; expand the dicts into
# columns first and write them back under the matching *_label names
predicted = texts_input_df['text'].apply(update_df_with_predictions).apply(pd.Series)
texts_input_df[['sedan_label', 'truck_label', 'suv_label', 'crossover_label', 'geopolitical_label']] = \
    predicted[['sedan', 'truck', 'suv', 'crossover', 'geopolitical']].values
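Finally, for comparison, this is how I expected the trained Prodigy model (ner_model above, which the code never actually calls) to be used for prediction; whether this is the intended usage is part of my question:

def predict_with_prodigy_model(texts, batch_size=64):
    # Stream all texts through the Prodigy-trained spaCy pipeline in batches
    rows = []
    for doc in ner_model.pipe(texts, batch_size=batch_size):
        row = {'sedan': [], 'truck': [], 'suv': [], 'crossover': []}
        for ent in doc.ents:
            if ent.label_ in row:
                row[ent.label_].append(ent.text)
        rows.append({key: ", ".join(vals) for key, vals in row.items()})
    return rows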