Hi!
I am running an annotation interface that streams from a Postgres database. This database receives a fixed number of additional examples every week. My annotators do not always manage to keep up with this pace, which creates a backlog, so I would like them to annotate the newest examples first.
I am currently trying to adapt the stream so that my annotators always get the most recent examples from the DB, using ORDER BY in the query to the Postgres DB (see recipe below). However, when I ran this, my annotators were told that there are no more tasks, even though only ~650 (according to `prodigy progress`) of the 4,000+ examples in the DB had been annotated. Any idea what the reason might be?
Recipe
import os
import spacy
import prodigy
from prodigy.components.preprocess import add_tokens
from prodigy.util import set_hashes
import psycopg2
@prodigy.recipe(
    "hs-default-db",
    dataset=("Dataset to save annotations into", "positional", None, str),
    lang=("Language to use", "positional", None, str),
)
def customRecipe(dataset, lang):
    """Serve annotation tasks loaded from PostgreSQL, newest sample first.

    Each task is rendered with four blocks: an HTML card with the article
    context, a manual span interface (label ``target``), a multiple-choice
    classification and a free-text input.

    Args:
        dataset: Name of the Prodigy dataset the annotations are saved into.
        lang: Language code used to create the blank spaCy tokenizer.

    Returns:
        The recipe components dictionary (view id, dataset, stream,
        answer validator and UI config).
    """
    # We can use the blocks to override certain config and content, and set
    # `"text": None` for the choice interface so it doesn't also render the text
    blocks = [
        {"view_id": "html"},
        {"view_id": "ner_manual"},
        {"view_id": "choice", "text": None},
        {"view_id": "text_input"},
    ]
    span_labels = ["target"]
    textcat_labels = [
        "Not toxic",
        "Toxic (excl. hate)",
        "Hate speech - Nationality",
        "Hate speech - Ethnicity",
        "Hate speech - Immigrants",
        "Hate speech - Gender",
        "Hate speech - Religion",
        "Hate speech - Sexuality",
        "Hate speech - Disability",
        "Hate speech - Age",
        "Hate speech - Other",
    ]
    html_template = (
        '<div class="context">'
        '<h4><span style="font-weight: bold;">Article Context</span></h4>'
        '<p style="margin-bottom: 0px; font-weight: bold;">{{article_title}}</p>'
        '<p style="margin-bottom: 5px; font-style: italic;">{{article_title_header}}</p>'
        '<p style="margin-bottom: 0px;">{{article_lead}}</p>'
        # '<p style="margin-bottom: 0px; font-weight: bold;">Parent Comment: {{parent_text}}</p>'
        '</div>'
        '<br>'
        '<h3 style="color: #801414; margin-bottom: 0px; margin-top: 10px; text-align: left;">Comment (find all targets)</h3>'
    )

    def dbLoader():
        """Fetch all samples from ``hs_samples`` and return them as task dicts.

        Connection settings are read from the standard ``PG*`` environment
        variables. Rows are ordered by ``"samplingDatetime"`` descending, so
        the returned list starts with the most recently sampled example.

        Returns:
            A list of hashed task dictionaries, one per database row.
        """
        # Column order must match the SELECT list below.
        columns = (
            "comment_id",
            "text",
            "article_title",
            "article_title_header",
            "article_lead",
        )

        def replace_empty_string(obj):
            """Map SQL NULL (``None``) to the literal ``"NA"``; pass others through."""
            return "NA" if obj is None else obj

        conn = psycopg2.connect(
            dbname=os.getenv("PGDATABASE"),
            user=os.getenv("PGUSER"),
            password=os.getenv("PGPASSWORD"),
            host=os.getenv("PGHOST"),
            port=os.getenv("PGPORT"),
        )
        try:
            # The cursor context manager closes the cursor on exit; the
            # connection itself is closed in `finally` so it is released even
            # when the query fails.
            with conn.cursor() as cur:
                # The column identifier is case-sensitive and therefore needs
                # SQL double quotes, so the Python literal is single-quoted.
                cur.execute(
                    'SELECT comment_id, text, article_title, article_title_header, '
                    'article_lead FROM hs_samples ORDER BY "samplingDatetime" DESC'
                )
                records = cur.fetchall()
        finally:
            conn.close()

        # Prepare data for Prodigy
        stream = []
        for record in records:
            example = {
                column: replace_empty_string(value)
                for column, value in zip(columns, record)
            }
            example["meta"] = {"source": "database"}
            stream.append(set_hashes(example))
        return stream

    def add_options(stream):
        """Attach the classification options to every task in the stream."""
        for task in stream:
            task["options"] = [{"id": lab, "text": lab} for lab in textcat_labels]
            yield task

    nlp = spacy.blank(lang)  # blank spaCy pipeline for tokenization
    stream = dbLoader()
    stream = add_tokens(nlp, stream)
    stream = add_options(stream)

    return {
        "view_id": "blocks",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        # NOTE(review): `validate_answer` is not defined in the visible part of
        # this recipe; it must exist at module level, otherwise this line
        # raises NameError when the recipe starts -- confirm.
        "validate_answer": validate_answer,  # Validate the answers
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": span_labels,
            "blocks": blocks,
            # "keymap_by_label": {
            #     "0": "q",
            #     "1": "w",
            #     "2": "e",
            #     "3": "r",
            #     "product": "1",
            #     "amount": "2",
            #     "size": "3",
            #     "type": "4",
            #     "topping": "5"
            # },
            "choice_style": "multiple",
            "html_template": html_template,
            "custom_theme": {
                "bgCardTitle": "#801414",
                "colorHighlightLabel": "#801414",
            },
            "global_css_dir": "./recipes/style",
            # "javascript_dir": "./recipes/style",
            "instructions": "./recipes/instructions.html",
        },
    }
prodigy.json
{
"buttons": ["accept", "undo"],
"annotations_per_task": 1.1,
"host":"0.0.0.0",
"port":8080,
"db":"postgresql",
"db_settings":{
"postgresql":{
"host":"placeholder",
"dbname":"placeholder",
"user":"placeholder",
"password":"placeholder",
"port":"placeholder"
}
}
}