Hi,
I have two annotated data sets from two different annotation tools
- one has been converted into your accepted JSON input format for training
- the other comes from Prodigy annotations
I'm trying to convert the Prodigy output into the JSON input format, like so, but the character offsets are misaligned:
# Blank English pipeline; the script below calls it only to tokenize each page's raw text.
nlp = spacy.blank('en')
def get_entity_tuples(spans: list) -> list:
    """Convert annotation span dicts into ``(start, end, label)`` tuples.

    Args:
        spans: A sequence of span dicts (not a single dict). Each span is
            read through its ``'start'``, ``'end'`` and ``'label'`` keys.

    Returns:
        A list with one ``(start, end, label)`` tuple per span, in input
        order. A key missing from a span yields ``None`` in that position.
    """
    return [
        (span.get('start'), span.get('end'), span.get('label'))
        for span in spans
    ]
# Convert the accepted Prodigy annotations into the JSON training input
# format: one document per page, each with a single paragraph that holds a
# single sentence spanning every token of the page.
json_input = []
for page in accepted_annot:
    # `meta` may be absent on a record; fall back to an empty dict so the
    # filename lookup yields None instead of raising AttributeError.
    file_name = (page.get('meta') or {}).get('filename')
    raw_string = page.get('text')
    doc = nlp(raw_string)
    # Character-offset spans -> tags indexed by token position in `doc`.
    offsets = get_entity_tuples(page.get('spans') or [])
    ner = biluo_tags_from_offsets(doc, offsets)
    tokens = [
        {"id": token.i, "orth": token.text, "ner": ner[token.i]}
        for token in doc
    ]
    # The training format expects "sentences" to be a *list* of sentence
    # dicts; a bare dict here produces malformed training data.
    sentences = [{"id": 0, "tokens": tokens}]
    paragraph = {"raw": raw_string, "sentences": sentences}
    # NOTE: a page without a filename ends up with the literal id "None".
    json_input.append({"id": str(file_name), "paragraphs": [paragraph]})
E.g.
{'text': '11th', 'start': 1965, 'end': 1969, 'id': 433} # Prodigy output
doc[433]
'11th' #correct
#But...
doc[433].idx
2015 # not 1965
Please can you advise where I'm going wrong and if there is a better way of combining the two sets of training data?
Thank you
Anna