I am running this code, but I am getting NO_LABEL
every time. I saw a similar post here on NO_LABEL
, so I tried using ner.manual
instead; but then I do not have the functionality of counting the number of annotations done per annotator (each annotator has their own link). There is no on_load
, update
, or on_exit
option.
import prodigy
import spacy
from multiprocessing import Process
from time import sleep
from prodigy.recipes.ner import batch_train
import atexit
from pathlib import Path
import datetime as dt
from prodigy.components import printers
from prodigy.components.loaders import get_stream
from prodigy.components.preprocess import add_tokens
from prodigy.core import recipe, recipe_args
from prodigy.util import TASK_HASH_ATTR, log, get_labels
from datetime import datetime
from collections import Counter
# It's all going to be run by coder name.
# Config:
# - add list of coders
# - ?? add port per coder?
# - base file name for files
# - recipe, db, model, output
@prodigy.recipe('mark_custom',
dataset=recipe_args['dataset'],
spacy_model=recipe_args['spacy_model'],
source=recipe_args['source'],
api=recipe_args['api'],
loader=recipe_args['loader'],
label=recipe_args['label'],
view_id=recipe_args['view'],
memorize=recipe_args['memorize'],
exclude=recipe_args['exclude'])
def mark_custom(dataset, spacy_model, source=None, view_id=None, label=None, api=None,
                loader=None, memorize=False, exclude=None):
    """
    Click through pre-prepared examples, with no model in the loop.

    Like the built-in ``mark`` recipe, but adds tokens for the
    ``ner_manual`` interface, records per-task load/return timestamps,
    and tracks how many annotations were made so progress can be shown.

    dataset: dataset ID to save answers to.
    spacy_model: loadable spaCy model (used for tokenization and labels).
    source / api / loader: where the stream of tasks comes from.
    label: label(s) to apply; also resolved into the config label set.
    view_id: annotation interface to use (e.g. "ner_manual").
    memorize: if True, pre-load answers from the dataset so previously
        answered tasks are skipped instead of asked again.
    exclude: dataset ID(s) whose task hashes should be excluded.
    """
    log('RECIPE: Starting recipe mark', locals())
    nlp = spacy.load(spacy_model)
    log("RECIPE: Loaded model {}".format(spacy_model))
    stream = get_stream(source, api, loader)
    # Materialize the stream: add_tokens is required by ner_manual, and a
    # list gives us a fixed total for the progress calculation below.
    stream = list(add_tokens(nlp, stream))
    labels = get_labels(label, nlp)
    log("RECIPE: Using labels: {}".format(labels))
    counts = Counter()  # answer value ('accept'/'reject'/'ignore') -> tally
    memory = {}         # task hash -> previously given answer

    def fill_memory(ctrl):
        # Pre-load answers already stored in the dataset so that repeated
        # questions can be skipped in ask_questions().
        if memorize:
            examples = ctrl.db.get_dataset(dataset)
            log("RECIPE: Add {} examples from dataset '{}' to memory"
                .format(len(examples), dataset))
            for eg in examples:
                memory[eg[TASK_HASH_ATTR]] = eg['answer']

    def ask_questions(stream):
        for eg in stream:
            eg['time_loaded'] = datetime.now().isoformat()
            if TASK_HASH_ATTR in eg and eg[TASK_HASH_ATTR] in memory:
                # Already answered in a previous session: count it toward
                # progress, but don't ask it again.
                answer = memory[eg[TASK_HASH_ATTR]]
                counts[answer] += 1
            else:
                if label:
                    eg['label'] = label
                yield eg

    def recv_answers(answers):
        for eg in answers:
            counts[eg['answer']] += 1
            memory[eg[TASK_HASH_ATTR]] = eg['answer']
            eg['time_returned'] = datetime.now().isoformat()

    def print_results(ctrl):
        print(printers.answers(counts))

    def get_progress(session=0, total=0, loss=0):
        # BUG FIX: the original used len(counts), which is the number of
        # *distinct* answer values (at most 3), not the number of annotated
        # tasks. Sum the tallies instead, and guard against an empty stream.
        if not stream:
            return 0.0
        return sum(counts.values()) / len(stream)

    return {
        'view_id': view_id,
        'dataset': dataset,
        'stream': ask_questions(stream),
        'exclude': exclude,
        'update': recv_answers,
        'on_load': fill_memory,
        'on_exit': print_results,
        # BUG FIX: get_progress was defined but never returned, so Prodigy
        # never used it; wire it into the components dict.
        'progress': get_progress,
        'config': {'label': labels}
    }
class MultiProdigy:
    """Run one Prodigy annotation server per coder, each on its own port,
    so every annotator gets a private link and a private input file."""

    def __init__(self, coder_list=None):
        """
        coder_list: list of {"name": str, "port": int} dicts, one per
            annotator. Defaults to two example coders.

        BUG FIX: the original used a mutable default argument
        (``coder_list=[{...}]``), which is created once and shared by
        every instance; use None as the sentinel instead.
        """
        if coder_list is None:
            coder_list = [{"name": "X", "port": 9010},
                          {"name": "Y", "port": 9011}]
        self.coder_list = coder_list
        self.processes = []  # multiprocessing.Process objects, one per coder
        self.spacy_model = '/shopin-data/mohit_pandey/PRODIGY/MODEL/ner_iter2_model_batch_train_1'
        self.label = "Details,Length,Lining,Fabric,Neckline,Occasion,Pattern,Personality,Support"

    def serve(self, coder, port):
        """Start a Prodigy server for *coder* on *port*. Blocks, so it is
        meant to run inside a child process (see make_prodigies)."""
        print(coder)
        base = "/shopin-data/mohit_pandey/PRODIGY/DATA/data_"
        # Each coder annotates their own input file, named after them.
        filename = "{0}{1}.jsonl".format(base, coder)
        prodigy.serve('mark_custom',                      # recipe
                      "ner_iteration_2_manual_tagged",    # db
                      self.spacy_model,                   # model
                      filename,                           # input file
                      "ner_manual",                       # view ID
                      self.label,                         # labels
                      None,                               # api
                      None,                               # loader
                      True,                               # memorize
                      "ner_iteration_2_manual_tagged",    # exclude
                      port=port)                          # port

    def make_prodigies(self):
        """Create (but do not start) one server process per coder."""
        # BUG FIX: the original iterated enumerate(self.coder_list) and then
        # immediately discarded the index ("# wut"); iterate directly.
        for coder_info in self.coder_list:
            proc = Process(target=self.serve,
                           args=(coder_info['name'], coder_info['port']))
            self.processes.append(proc)

    def start_prodigies(self):
        """Start all prepared processes, staggered one second apart."""
        print("Starting Prodigy processes...")
        for p in self.processes:
            p.start()
            sleep(1)

    def kill_prodigies(self):
        """Terminate every child process and clear the process list.
        A Process that was never started raises AttributeError on
        terminate(); that is tolerated deliberately."""
        print("Killing Prodigy threads")
        for p in self.processes:
            try:
                p.terminate()
            except AttributeError:
                print("Process {0} doesn't exist?".format(p))
        self.processes = []
if __name__ == "__main__":
    manager = MultiProdigy()
    #mp.make_retrain_time()
    # Ensure every child Prodigy server is terminated when this process
    # exits, however it exits.
    atexit.register(manager.kill_prodigies)
    manager.make_prodigies()
    manager.start_prodigies()
    # The servers live in child processes; just keep the parent alive.
    while True:
        sleep(5)
        # if dt.datetime.now() > mp.retrain_time:
        #     print("Retraining model and scheduling next retraining for tomorrow")
        #     mp.make_retrain_time()  # bump to tomorrow
        #     mp.train_and_restart()
Is there any way I can use ner.manual
while still getting a count of annotations done per annotator, and also avoiding repetition of questions?