Hi Everyone @magdaaniol you probably can help me the most.
Below is the output of my Custom Recipe
Here is the code to my Custom Recipe Pipeline:
import spacy
import prodigy
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens
import yaml
from datetime import datetime
@prodigy.recipe(
"raw-text-spancat-textcat-pipeline",
dataset=("Dataset to save annotations into", "positional", None, str),
spacy_model=("Loadable spaCy pipeline", "positional", None, str),
source=("Path to examples.jsonl file", "positional", None, str),
annotator_id=("Annotator ID", "option", "a", str)
)
def raw_text_spancat_textcat_pipeline(dataset, spacy_model, source, annotator_id):
# Generate session ID
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
session_id = f"{annotator_id}_{current_time}"
nlp = spacy.load(spacy_model)
print("Pipeline components:", nlp.pipe_names) # Debug print
span_labels = [
"ACTION", "DOSE", "DOSE_UNIT", "ROUTE", "FREQUENCY", "AS_NEEDED", "AS_DIRECTED",
"UNTIL_FINISHED", "DURATION", "TIME_OF_DAY", "DEVICE", "FOOD", "INTERVAL", "SITE",
"DX", "MULTI_STEP", "ADDITIONAL_INFO"
]
textcat_labels = [label for label in span_labels if label not in ["MULTI_STEP", "ADDITIONAL_INFO"]]
# Load the YAML file
with open('smartsig_config.yaml', 'r') as file:
config = yaml.safe_load(file)
def get_normalized_labels(category):
if category in config['normalization_labels']:
return sorted([{"id": k, "text": v.upper()} for k, v in config['normalization_labels'][category].items()], key=lambda x: x['text'])
elif category in ['AS_NEEDED', 'AS_DIRECTED', 'UNTIL_FINISHED']:
return [{"id": "1", "text": "1"}, {"id": "0", "text": "0"}]
elif category == 'DURATION':
return sorted([{"id": k, "text": v} for k, v in config['special_cases']['duration'].items()], key=lambda x: x['text'])
elif category == 'FREQUENCY':
return sorted([{"id": k, "text": v} for k, v in config['special_cases']['frequency'].items()], key=lambda x: x['text'])
elif category in config['numerical_categories']:
return [{"id": "numerical", "text": "Enter a number"}]
else:
return []
def get_category_type(category):
if category == 'DURATION':
return 'duration'
elif category == 'FREQUENCY':
return 'frequency'
elif category in config['numerical_categories']:
return 'numerical'
elif category in ['AS_NEEDED', 'AS_DIRECTED', 'UNTIL_FINISHED']:
return 'binary'
else:
return 'dropdown'
def sort_categories(categories):
return sorted(categories, key=lambda x: (get_category_type(x), x))
def add_spans_and_options(stream):
for eg in stream:
doc = nlp(eg['text'])
eg['spans'] = []
for span in doc.spans["sc"]:
eg['spans'].append({
"start": span.start_char,
"end": span.end_char,
"token_start": span.start,
"token_end": span.end - 1,
"label": span.label_
})
eg['tokens'] = [{"text": t.text, "start": t.idx, "end": t.idx + len(t.text), "id": i}
for i, t in enumerate(doc)]
sorted_labels = sort_categories(textcat_labels)
eg['options'] = {label: get_normalized_labels(label) for label in sorted_labels}
# Initialize all category values to empty strings
for label in sorted_labels:
eg[label] = ""
yield eg
stream = JSONL(source)
stream = add_tokens(nlp, stream)
stream = add_spans_and_options(stream)
return {
"view_id": "blocks",
"dataset": dataset,
"stream": stream,
"config": {
"labels": span_labels,
"exclude_by": "input",
"auto_count_examples": True,
"global_css": """
.prodigy-content {
padding-top: 40px;
}
.prodigy-content mark {
padding: 0.2em 0;
line-height: inherit;
}
.prodigy-buttons {
display: flex;
justify-content: center;
}
#text-categories {
margin-top: 5px;
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 3px;
padding: 5px;
background-color: #f0f0f0;
border-radius: 5px;
}
.category-label {
display: flex;
flex-direction: column;
align-items: flex-start;
background-color: white;
padding: 8px;
border-radius: 5px;
font-size: 14px;
text-align: left;
border: 1px solid #ddd;
transition: all 0.2s ease;
width: 100%;
}
.category-label span {
margin-bottom: 2px;
font-weight: bold;
}
.category-label.selected {
box-shadow: 0 0 5px rgba(0,0,0,0.3);
}
.category-dropdown,
.category-input {
width: 100%;
padding: 5px;
border: 1px solid #ccc;
border-radius: 3px;
font-size: 14px;
background-color: #f8f8f8;
}
#fixed-sig-text {
position: fixed;
top: 0;
left: 0;
right: 0;
background-color: rgba(255, 255, 255, 0.9);
padding: 5px;
z-index: 1000;
border-bottom: 2px solid #583fcf;
font-size: 18px;
font-weight: bold;
text-align: center;
height: auto;
}
.prodigy-content {
padding-top: 5px;
}
#text-categories {
margin-top: 10px;
}
""",
"javascript": """
document.addEventListener('prodigymount', () => {
function setupCategories() {
const categoriesContainer = document.querySelector('#text-categories');
if (!categoriesContainer) return;
const options = prodigy.content.options || {};
const categoryOrder = [
'ACTION', 'DOSE', 'DOSE_UNIT', 'ROUTE', 'FREQUENCY', 'AS_NEEDED', 'AS_DIRECTED',
'UNTIL_FINISHED', 'DURATION', 'TIME_OF_DAY', 'DEVICE', 'FOOD', 'INTERVAL', 'SITE', 'DX'
];
const sortedCategories = categoryOrder.filter(category => options.hasOwnProperty(category));
let html = sortedCategories.map(category => {
const categoryOptions = options[category] || [];
return `
<div class="category-label" data-category="${category}">
<span>${category}</span>
${getCategoryInput(category, categoryOptions)}
</div>
`;
}).join('');
categoriesContainer.innerHTML = html;
categoriesContainer.addEventListener('change', (e) => {
const input = e.target;
const category = input.closest('.category-label').dataset.category;
let value = input.value;
if (category === 'FREQUENCY' && input.type === 'number') {
const dropdown = input.previousElementSibling;
value = dropdown.value || value;
}
prodigy.update({ [category]: value });
updateCategoryHighlight(category, value);
});
// Set initial values based on prodigy.content
sortedCategories.forEach(category => {
const value = prodigy.content[category] || '';
const input = categoriesContainer.querySelector(`[name="${category}"]`);
if (input) {
input.value = value;
updateCategoryHighlight(category, value);
}
});
}
function getCategoryInput(category, categoryOptions) {
if (category === 'DURATION') {
return `<select class="category-dropdown" name="${category}">
<option value="">Select an option</option>
${categoryOptions.map(opt => `<option value="${opt.id}">${opt.text}</option>`).join('')}
</select>`;
} else if (category === 'FREQUENCY') {
return `
<select class="category-dropdown" name="${category}">
<option value="">Select an option</option>
${categoryOptions.map(opt => `<option value="${opt.id}">${opt.text}</option>`).join('')}
</select>
<input type="number" class="category-input" name="${category}_custom" placeholder="Enter a number">
`;
} else if (['DOSE', 'INTERVAL'].includes(category)) {
return `<input type="number" class="category-input" name="${category}" placeholder="Enter a number">`;
} else if (['AS_NEEDED', 'AS_DIRECTED', 'UNTIL_FINISHED'].includes(category)) {
return `<select class="category-dropdown" name="${category}">
<option value="">Select an option</option>
<option value="1">Yes</option>
<option value="0">No</option>
</select>`;
} else {
return `<select class="category-dropdown" name="${category}">
<option value="">Select an option</option>
${categoryOptions.map(opt => `<option value="${opt.id}">${opt.text}</option>`).join('')}
</select>`;
}
}
function updateCategoryHighlight(category, value) {
const categoryLabel = document.querySelector(`.category-label[data-category="${category}"]`);
if (categoryLabel) {
if (value && value !== '') {
categoryLabel.classList.add('selected');
categoryLabel.style.backgroundColor = getSpanColor(category);
} else {
categoryLabel.classList.remove('selected');
categoryLabel.style.backgroundColor = '';
}
}
}
function getSpanColor(category) {
const colorMap = {
'ACTION': '#e8daff',
'DOSE': '#aaffaa',
'DOSE_UNIT': '#ffccaa',
'ROUTE': '#aaccff',
'FREQUENCY': '#ffaaaa',
'AS_NEEDED': '#ffffaa',
'AS_DIRECTED': '#aaffff',
'UNTIL_FINISHED': '#ffaaff',
'DURATION': '#ccffaa',
'TIME_OF_DAY': '#aaffcc',
'DEVICE': '#ffaacc',
'FOOD': '#ccaaff',
'INTERVAL': '#aaccaa',
'SITE': '#ffcccc',
'DX': '#ccccff'
};
return colorMap[category] || '#ffffff';
}
setupCategories();
prodigy.on('update', () => {
console.log('Update event triggered');
setupCategories();
});
const updateFixedSigText = () => {
let fixedSigText = document.getElementById('fixed-sig-text');
if (!fixedSigText) {
fixedSigText = document.createElement('div');
fixedSigText.id = 'fixed-sig-text';
document.body.insertBefore(fixedSigText, document.body.firstChild);
}
fixedSigText.textContent = prodigy.content.text || '';
};
updateFixedSigText();
prodigy.on('update', updateFixedSigText);
console.log('SmartSigs customization applied');
});
""",
"blocks": [
{"view_id": "spans_manual", "labels": span_labels},
{"view_id": "html", "html": "<div id='text-categories'></div>"}
],
"buttons": ["accept", "reject", "ignore"]
},
}
Ideally what im trying to do is 2 fold first is taking a Rx Sig like "Take 1 Tablet by Mouth Every Morning" and use span cat to attempt to find each of these compoennts (in my custom reciepe i loaded a previous spancat model to help with this annoation process) the second part I want to do is use those spancats and help me classify and normalize those sigs to standard components. (what I am attempting to do with the custom css components). Eventually so that lets say I had to Sigs like Take 1 Tablet by mouth every morning and "Take 1 tablet PO qd" they would output the same sig. Im not sure if my approach is the best atm but curently i think with this custom pipeline its really only a spancat pipeline with just custom JS to add textcat labels. Im not sure how to combien these pipelines have the same tokenizer to be used for both. I also have an issue where my JS is static where when I click accept here it repeats the same textcat components without resetting to default see below for exmaple:
Im not sure what I am doing tbh lol. Im not sure if im makign much sense. Im a pharmacist and my goal is to normalize the output of sigs and kinda stuck.
What I tried to do was to change this so that I can get textcat as part of the pipeline but when I change the view_id
{"view_id": "html", "html": "<div id='text-categories'></div>"}
to a view block like how your repo was done
{"view_id": "choice", "text": None},
but when i do that i get this error
I just gave a hodgepodge of so many things im sorry if im confusing.