I’m making a classifier for a relatively rare category (less than 1% of documents). After using the seed terms to find a bunch of examples and doing an initial batch train of the model, I’ve been having a hard time finding enough positive examples in the stream once I switch to the model, even with prefer_high_scores. I’m guessing this has to do with the limit on the number of examples the model looks at in the stream at once. So I hacked together a script that classifies an entire JSONL file (in my case, around 10,000 sentences) and writes the n highest-scoring examples out to a new JSONL for use in Prodigy.
Hopefully this helps someone else with a similar problem, or maybe someone can tell me a better way of doing this.
import plac
import spacy
import jsonlines
import operator
import re
from tqdm import tqdm
from random import shuffle


@plac.annotations(
    input_file=("File to get high-probability text from.", "option", "i", str),
    model=("Path to the Prodigy text classification model.", "option", "m", str),
    label=("Label to score.", "option", "l", str),
    max_examples=("Max number of examples to export.", "option", "n", int))
def main(input_file, model, label, max_examples=200):
    """
    When you're looking for rare positive examples, run the model over your
    entire file and pull out the n highest-scoring examples.
    """
    nlp = spacy.load(model)
    print("Calculating scores...")
    with jsonlines.open(input_file) as reader:
        lines = list(reader)
    docs = []
    for obj in tqdm(lines):
        # nlp.pipe would be faster here, but it needs an extra step to
        # match the scores back up with their tasks
        obj['score'] = nlp(obj['text']).cats[label]
        docs.append(obj)
    # sort highest-scoring first and keep the top n
    docs.sort(key=operator.itemgetter('score'), reverse=True)
    filtered_docs = docs[:max_examples]
    # keep things interesting: don't serve the tasks strictly best to worst
    shuffle(filtered_docs)
    outfile = re.sub(r"\.jsonl$", "_high_scores.jsonl", input_file)
    if outfile == input_file:
        # input didn't end in .jsonl, so don't clobber it
        outfile = input_file + "_high_scores.jsonl"
    with jsonlines.open(outfile, mode='w') as writer:
        writer.write_all(filtered_docs)
    print("Wrote {0} tasks to {1}".format(len(filtered_docs), outfile))


if __name__ == "__main__":
    plac.call(main)
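
For anyone who wants to try it: assuming the script is saved as high_scores.py (name it whatever you like), a run looks something like this, with the input file, dataset name, and label swapped for your own:

python high_scores.py -i sentences.jsonl -m /path/to/textcat_model -l MY_LABEL -n 200
prodigy textcat.teach my_dataset /path/to/textcat_model sentences_high_scores.jsonl --label MY_LABEL

The second command just points textcat.teach at the filtered file instead of the full stream.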
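
About the nlp.pipe comment in the code: the reassembly step turns out to be pretty small, since nlp.pipe accepts (text, context) tuples when called with as_tuples=True, so each task dict can ride along with its own text. A sketch of the scoring loop rewritten that way (same variables as in the script above):

for doc, obj in tqdm(nlp.pipe(((o['text'], o) for o in lines), as_tuples=True), total=len(lines)):
    # each Doc comes back paired with the task dict it was built from
    obj['score'] = doc.cats[label]
    docs.append(obj)

Batching the texts through the pipeline this way should be noticeably faster than calling nlp() once per sentence, though I haven't benchmarked it on my data.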