Hi Vincent,
Thanks for the response. I am using the following command to start the server:
prodigy custom-mark dpc_retrain_games_demo data/dpc_retrain_games_21Q2_manrev_input.jsonl -F custom_domain_top_5.py -v choice -p 82
I have a custom recipe in custom_domain_top_5.py:
import copy
import json
import logging
import os
import sys
import zlib
from collections import Counter
from hashlib import md5
from pprint import pprint
from time import time
import pandas as pd
import prodigy
from boto3 import session
from botocore.client import Config
from bs4 import BeautifulSoup
from prodigy.components.db import connect
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import fetch_images
from prodigy.recipes.generic import mark
from prodigy.util import log
import requests
import config
# Logging level name that is exported to the process environment on the next line.
# NOTE(review): `prodigy` is imported above, before this variable is set --
# confirm the setting is still picked up when assigned this late.
PRODIGY_LOGGING = 'basic'
os.environ['PRODIGY_LOGGING'] = PRODIGY_LOGGING
# NOTE(review): this rebinds `session` (the module imported from boto3 above) to
# a Session instance, so the module is no longer reachable under that name.
session = session.Session()
# S3 client built entirely from values in the local `config` module; add_options
# uses it to generate presigned GET URLs.
client = session.client(
's3',
region_name=config.space_region_name,
endpoint_url=config.space_endpoint_url,
aws_access_key_id=config.space_access_key,
aws_secret_access_key=config.space_secret_key
)
# NOTE(review): BASE_URL and BASE_PATH are not referenced anywhere in the
# visible part of this file -- confirm whether they are still needed.
BASE_URL = config.space_bucket_url
BASE_PATH = 'data/'
# Lifetime of every presigned URL, in seconds (172800 s = 48 hours).
_PRESIGNED_URL_TTL_SECONDS = 172800

# Number of screenshot/source slots read from each task.
_SLOTS_PER_TASK = 5

# Fields stripped from every incoming task before it is re-queued.
_STALE_TASK_FIELDS = ('_input_hash', '_task_hash', 'accept', 'answer')


def _presigned_url(object_key):
    """Return a presigned GET URL for *object_key*, or None if the key is falsy."""
    if not object_key:
        return None
    return client.generate_presigned_url(
        ClientMethod='get_object',
        Params={'Bucket': config.space_bucket_name, 'Key': object_key},
        ExpiresIn=_PRESIGNED_URL_TTL_SECONDS,
    )


def add_options(stream, template):
    """Decorate each task in *stream* with choice options, URLs and the HTML template.

    For every task this generator:

    * removes the ``_input_hash``, ``_task_hash``, ``accept`` and ``answer``
      fields if present;
    * attaches the four fixed choice options under ``options``;
    * sets ``URL`` to ``'http://' + task['Domain']`` and ``text`` to the
      task's ``Key`` value (logging a message when that value is None);
    * for slots 1..5, presigns ``ScreenshotLocationKey{n}`` and
      ``SourceLocationKey{n}`` and stores the result (or None when the key
      is missing or empty) under ``ScreenshotURL{n-1}`` / ``SourceURL{n-1}``;
    * sets ``html`` to *template*.

    Args:
        stream: Iterable of task dicts. Each dict is modified in place.
        template: HTML string stored on every task under ``html``.

    Yields:
        The same task dicts, after modification.

    Raises:
        KeyError: If a task has no ``Domain`` field.
    """
    for task in stream:
        for stale in _STALE_TASK_FIELDS:
            task.pop(stale, None)

        # A fresh list per task so that later mutation of one task's options
        # cannot leak into another task.
        task['options'] = [
            {'id': 'I', 'text': 'Infringing'},
            {'id': 'IN', 'text': 'Infringing (Not Accessible)'},
            {'id': 'N', 'text': 'Not Infringing'},
            {'id': 'U', 'text': 'Insufficient data'},
        ]

        # .get() so that a task without a 'Key' field is reported through the
        # same log message as an explicit null instead of raising KeyError.
        key = task.get('Key')
        task['URL'] = 'http://' + task['Domain']
        if key is None:
            log("KEY IS None!")
        task['text'] = key

        # NOTE(review): input fields are numbered 1..5 but output fields are
        # numbered 0..4. Kept as-is because the HTML template presumably
        # refers to the 0-based names -- confirm before changing.
        for slot in range(_SLOTS_PER_TASK):
            task['ScreenshotURL{}'.format(slot)] = _presigned_url(
                task.get('ScreenshotLocationKey{}'.format(slot + 1))
            )
            task['SourceURL{}'.format(slot)] = _presigned_url(
                task.get('SourceLocationKey{}'.format(slot + 1))
            )

        task['html'] = template
        yield task
@prodigy.recipe(
    'custom_mark',
    dataset=('Dataset ID', 'positional', None, str),
    view_id=('Annotation interface', 'option', 'v', str),
    # memorize=('Enable answer cache', 'flag', 'M', bool),
    port=('Port to run application on', 'option', 'p', int),
    exclude=('Exclude data from dataset', 'option', 'e', str)
)
def my_custom_recipe(dataset, view_id='choice', source=None, port=8080, exclude=None):
    """Build the components for the ``custom_mark`` recipe.

    Reads ``./template_domain_top_5.html``, wraps the JSONL *source* with
    ``add_options`` and passes the resulting stream to Prodigy's ``mark``
    recipe. The ``config`` and ``update`` entries of the components returned
    by ``mark`` are then replaced with the ones built here.

    Args:
        dataset: Dataset ID handed to ``mark``; always part of the exclude list.
        view_id: Annotation interface handed to ``mark``.
        source: Path given to the ``JSONL`` loader.
        port: Port number placed in the returned config.
        exclude: Optional comma-separated dataset names to exclude in
            addition to *dataset*.

    Returns:
        The components dict produced by ``mark``, with ``config`` and
        ``update`` overridden.
    """
    with open('./template_domain_top_5.html') as template_file:
        html_template = template_file.read()

    # load your own streams from anywhere you want
    stream = add_options(JSONL(source), html_template)

    def recv_answers(answers):
        """Drop the ``image`` and ``html`` fields from each received answer."""
        for eg in answers:
            eg.pop('image', None)
            eg.pop('html', None)

    print("Dataset ID:", dataset)

    # The current dataset is always excluded (as before); names passed via
    # -e/--exclude used to be ignored and are now appended to the list.
    exclude_datasets = [dataset]
    if exclude:
        for name in exclude.split(','):
            name = name.strip()
            if name and name not in exclude_datasets:
                exclude_datasets.append(name)

    comp_config = {
        'choice_auto_accept': True,
        'html_template': html_template,
        'instructions': './instructions.html',
        'custom_theme': {
            'cardMaxWidth': '1500px',
        },
        'port': port,
        'host': '0.0.0.0',
        'batch_size': config.prodigy_batch_size,
    }

    components = mark(
        dataset=dataset,
        source=stream,
        # memorize=True,
        exclude=exclude_datasets,
        view_id=view_id
    )
    components['config'] = comp_config
    components['update'] = recv_answers
    return components