Doccano annotated data review in prodigy.

I have a .txt file containing a list of JSON records that were pre-annotated in Doccano. I wrote this script to convert that file into Prodigy-friendly JSONL.


import json
import spacy
from spacy.tokens import Doc


# Define a function to remove invalid entity spans
def remove_invalid_entity_spans(ftext, text, entities, nlp):
    """Keep only the entities that can be aligned to spaCy token boundaries.

    Each ``(start, end, label)`` entity is looked up with ``doc.char_span``.
    When the exact offsets do not line up with tokens, the start is moved
    left by 1-9 characters; if that fails, the end is moved right by 1-9
    characters. The first candidate that aligns wins. Entities that still
    cannot be aligned are reported on stdout and dropped.

    Args:
        ftext: Text used only to show the offending slice in the
            "not parsed" message.
        text: Text that is tokenized and against which offsets are checked.
        entities: Iterable of ``(start_char, end_char, label)`` triples.
        nlp: Callable returning a spaCy ``Doc``; it is invoked as
            ``nlp(text, disable=[...])``.

    Returns:
        A list of ``(start_char, end_char, label, token_start, token_end)``
        tuples read from the aligned span. ``token_end`` is ``span.end``,
        which spaCy defines as exclusive.
    """
    # Only tokenization is needed, so the heavier components are disabled.
    doc = nlp(text, disable=["parser", "tagger", "ner"])
    text_length = len(text)

    valid_entities = []
    for start, end, label in entities:
        # Candidates in the order they are tried: exact offsets, then a
        # widened start, then a widened end.
        candidates = [(start, end)]
        candidates.extend((start - shift, end) for shift in range(1, 10))
        candidates.extend((start, end + shift) for shift in range(1, 10))

        for cand_start, cand_end in candidates:
            # Offsets outside the text can never sit on a token boundary, so
            # they are skipped instead of being handed to char_span.
            if cand_start < 0 or cand_end > text_length:
                continue
            span = doc.char_span(cand_start, cand_end, label=label)
            if span is not None:
                valid_entities.append(
                    (span.start_char, span.end_char, span.label_, span.start, span.end)
                )
                break
        else:
            # No candidate aligned: report the span and leave it out.
            print('not parsed', start, end, '"' + ftext[start:end] + '"', label)

    return valid_entities


# Load the English tokenizer, tagger, parser, and NER model
# (this single pipeline is used both for tokenizing and for span alignment)
nlp = spacy.load("en_core_web_sm")

# Define variables and paths
counter = {}  # NOTE(review): never referenced in the visible script
datas = []  # converted examples; one dict per input record, written out as JSONL
file = [{'id': 83536, 'text': 'Indeed Resume\nVijaykumar L Jeeragi\nSupervisor\nSattur, Dharwad, Karnataka, Karnataka\njeeragivijay@gmail.com\n+91 9880784203\nAn Indian citizen, Bachelors Degree in Arts, (Technical diploma in Computers (JLC), Diploma in Airline\nManagement (IATA) with 20 years experience 16 years in India &4 years in Middle East in handling,\nAsset Control Division as a Assistant Asset Controller in Nesma & Alfadl Const co., Alkhobar Saudi\nArabia and worked as a Supervisor in billing section (front office) in SDM College and Hospital Sattur,\nDharwad. Karnataka. India.\nJOB DESCRIPTION:\nCOMPANY: NESMA & ALFADL CONST. CO. LTD\nALKHOBAR (SAUDI ARABIA)\nDEPARTMENT: Asset Control Division\nJOB TITLE: Asst. Asset Controller\nNAME: Vijaykumar Lingaraj Jeeragi\nPRESENT ADRESS: Vijaykumar L Jeeragi\nPlot No 1097, 36 th Cross, Near SDM Staff\nQuarters, Sattur, Dharwad.\nCell: +91 9880784203, +91 9916187893\nEmail: jeeragivijay@gmail.com\n1. POSITION IN THE ORGANISATION IN NESMA & ALFADL ( 1998 - 2002)\nSUPPPORT SERVICES MANAGER\nASSSET CONTROL MANAGER\nASST. 
ASSET CONTROLLER\nCLERICAL ASSISTANT\nCONTACTS ARISING FROM THE JOB\nA) Within the Company / Department\nSupport Services Manager\nAuto Workshop Manager\nPersonnel / Administration Department\nFinance Department\nProcurement Department\nStores Department\nB) Outside Nesma & Alfadl Co.\nSuppliers\nSub Contractors\nGENERAL ACTIVITY OF THE JOB\nThe function of the job include:\nTagging of all the Fixed Assets of the Company\nKeeping a track of them with respect to their locations\nAllocation of Assets with respect the the requirement on site.\nOn non-availability of assets, arranging for rental through suppliers.\nMajor role in the mobilizing and Demobilizing of the Project.\nArranging daily transportation for the staff and also on emergency basis.\nPreparing of invoices and charging the sites accordingly.\nA) Field of Responsibility\nResponsible to the Support Services Manager for the Control and allocation of assets with specific\nresponsibility for the efficient monetary control of assets.\nB) Planning\nTo plan effective operation of the Assets with respect to the requirement of sites.\nC) Implementation\nTo receive all the fixed Assets and tag them with respect to their categories.\nTo plan out the distribution of fixed Assets like vehicles, Equipment and small tools to the different\nprojects as per their requirement.\nTo co-ordinate with the Project Managers and Engineers in order to fulfill their requirement on site.\nIn case of non-availability of any assets to be provided on site. 
Hiring of assets from Suppliers and\nensuring that the rhythm of work is not altered.\nTo monitor and control all fixed assets.\nTo implement reports for fixed assets and charge the projects accordingly.\nTo ensure that full maintenance is done whenever the Fixed Asset is returned back to us by making a\njob request for the workshop.\nTo arrange transportation for the staff our Company as well as arrange transportation to carry\nmaterials on our trailers and heavy trucks to all our projects.\nAlso to control the transfer of drivers and Heavy Equipment Operators with respect to the\nrequirement from sites.\nPetty cash handled throughout the department.\nRELEVANT INFORMATION REGARDING THE JOB\nStatistics: Tagging/Controlling/ Transportation Coordinator/ Invoicing /Tracking & Updating Records of\nAssets.\nEstablishment: Company Asset Control Department\nStaff Responsibility: 2 Clerks\nPackages worked on: BaaN IV, Fixed Asset Package (Foxpro), MS-Office,\nSalary withdrawn: SR 2000 basic + Overtime +F.A + Transportation.\n2. WORKED IN SDM MEDICAL COLLEGE AND HOSPITAL, SATTUR, DHARWAD. (KARNATAKA) From 2007 to\n2013\nGENERAL ACTIVITY OF THE JOB\nThe function of the job includes:\nWorked as a super wiser\nOut patients Billing / in patients billing\nIn patients Discharge billing\nCash Handling of main billing sections\nInventory management\nOPD billing on rotation basis\nWilling to relocate to: Abroad - dubai - bahrain\nWork Experience\nOffice Assistant later promted as supervisor\nSDM Hospital - Sattur, Dharwad,Karnataka\n2007 to 2013\nof various projects like supervising in Billing work, maintenance, Inventory and as well as computerized\nin/out patients,discharge billing and also main billing cashier. 
( 2007-2013)\nSupervisor\nManoj Jewellery, Koppikar Road Hubli - Hubli, Karnataka\n2002 to 2006\nASSISTANT ASSET CONTROLLER\nNesma and Alfadl Construction Co\n1998 to 2002\nAdministrator in M/s Institute\nStudies on Agriculture & Rural Development (ISARD) - Dharwad, Karnataka\n1996 to 1998\nEducation\nHigh school or equivalent\nSkills\nms office\nCertifications/Licenses\nState council of vocational education\nNovember 2002 to Present\nJLC certificate is issued with the provision to us it as equivalent to preuniversity course for the purpose\nof seeking jobs only\nAdditional Information\n1. Diploma in international airline and travel management from trade wings institute of management\n(IATA)\n2. Karnataka secondary education board certificate.\nVIjaykmar L. Jeeragi', 'label': [[14, 34, 'Name', 3, 6], [46, 83, 'Location', 9, 16], [84, 106, 'Email', 17, 18], [107, 121, 'Mobile_Number', 19, 21], [248, 267, 'Total_Experience', 49, 52], [553, 568, 'Experience_title', 112, 114], [579, 608, 'Company_Name', 118, 126], [579, 608, 'Current_Company', 118, 126], [609, 617, 'Location', 127, 128], [645, 667, 'Current_Designation', 135, 138], [645, 667, 'Position', 135, 138], [679, 701, 'Position', 142, 146], [886, 908, 'Email', 189, 190], [3537, 3569, 'Company_Name', 699, 704], [3571, 3598, 'Exp_Location', 705, 711], [3605, 3617, 'Exp_Duration', 713, 717], [3916, 3931, 'Experience_title', 778, 780], [3932, 3948, 'Position', 781, 783], [3966, 3976, 'Position', 786, 787], [3977, 3989, 'Company_Name', 788, 790], [3992, 4017, 'Exp_Location', 791, 796], [4018, 4030, 'Exp_Duration', 797, 800], [4203, 4212, 'Exp_Duration', 833, 836], [4214, 4224, 'Position', 838, 839], [4225, 4240, 'Company_Name', 840, 842], [4242, 4280, 'Exp_Location', 843, 850], [4281, 4293, 'Exp_Duration', 851, 854], [4294, 4320, 'Position', 855, 858], [4321, 4353, 'Company_Name', 859, 864], [4354, 4366, 'Exp_Duration', 865, 868], [4451, 4469, 'Edu_Location', 886, 889], [4470, 4482, 'Edu_Duration', 890, 893], [4519, 
4525, 'Skills_title', 901, 902], [4778, 4832, 'Degree', 950, 957]], 'relations': [], 'tokens': [{'text': 'Vijaykumar L Jeeragi', 'start': 14, 'end': 34, 'token_start': 3, 'token_end': 6, 'entityLabel': 'Name'}, {'text': 'Sattur, Dharwad, Karnataka, Karnataka', 'start': 46, 'end': 83, 'token_start': 9, 'token_end': 16, 'entityLabel': 'Location'}, {'text': 'jeeragivijay@gmail.com', 'start': 84, 'end': 106, 'token_start': 17, 'token_end': 18, 'entityLabel': 'Email'}, {'text': '+91 9880784203', 'start': 107, 'end': 121, 'token_start': 19, 'token_end': 21, 'entityLabel': 'Mobile_Number'}, {'text': '20 years experience', 'start': 248, 'end': 267, 'token_start': 49, 'token_end': 52, 'entityLabel': 'Total_Experience'}, {'text': 'JOB DESCRIPTION', 'start': 553, 'end': 568, 'token_start': 112, 'token_end': 114, 'entityLabel': 'Experience_title'}, {'text': 'NESMA & ALFADL CONST. CO. LTD', 'start': 579, 'end': 608, 'token_start': 118, 'token_end': 126, 'entityLabel': 'Company_Name'}, {'text': 'NESMA & ALFADL CONST. CO. LTD', 'start': 579, 'end': 608, 'token_start': 118, 'token_end': 126, 'entityLabel': 'Current_Company'}, {'text': 'ALKHOBAR', 'start': 609, 'end': 617, 'token_start': 127, 'token_end': 128, 'entityLabel': 'Location'}, {'text': 'Asset Control Division', 'start': 645, 'end': 667, 'token_start': 135, 'token_end': 138, 'entityLabel': 'Current_Designation'}, {'text': 'Asset Control Division', 'start': 645, 'end': 667, 'token_start': 135, 'token_end': 138, 'entityLabel': 'Position'}, {'text': 'Asst. Asset Controller', 'start': 679, 'end': 701, 'token_start': 142, 'token_end': 146, 'entityLabel': 'Position'}, {'text': 'jeeragivijay@gmail.com', 'start': 886, 'end': 908, 'token_start': 189, 'token_end': 190, 'entityLabel': 'Email'}, {'text': 'SDM MEDICAL COLLEGE AND HOSPITAL', 'start': 3537, 'end': 3569, 'token_start': 699, 'token_end': 704, 'entityLabel': 'Company_Name'}, {'text': 'SATTUR, DHARWAD. 
(KARNATAKA', 'start': 3571, 'end': 3598, 'token_start': 705, 'token_end': 711, 'entityLabel': 'Exp_Location'}, {'text': '2007 to\n2013', 'start': 3605, 'end': 3617, 'token_start': 713, 'token_end': 717, 'entityLabel': 'Exp_Duration'}, {'text': 'Work Experience', 'start': 3916, 'end': 3931, 'token_start': 778, 'token_end': 780, 'entityLabel': 'Experience_title'}, {'text': 'Office Assistant', 'start': 3932, 'end': 3948, 'token_start': 781, 'token_end': 783, 'entityLabel': 'Position'}, {'text': 'supervisor', 'start': 3966, 'end': 3976, 'token_start': 786, 'token_end': 787, 'entityLabel': 'Position'}, {'text': 'SDM Hospital', 'start': 3977, 'end': 3989, 'token_start': 788, 'token_end': 790, 'entityLabel': 'Company_Name'}, {'text': 'Sattur, Dharwad,Karnataka', 'start': 3992, 'end': 4017, 'token_start': 791, 'token_end': 796, 'entityLabel': 'Exp_Location'}, {'text': '2007 to 2013', 'start': 4018, 'end': 4030, 'token_start': 797, 'token_end': 800, 'entityLabel': 'Exp_Duration'}, {'text': '2007-2013', 'start': 4203, 'end': 4212, 'token_start': 833, 'token_end': 836, 'entityLabel': 'Exp_Duration'}, {'text': 'Supervisor', 'start': 4214, 'end': 4224, 'token_start': 838, 'token_end': 839, 'entityLabel': 'Position'}, {'text': 'Manoj Jewellery', 'start': 4225, 'end': 4240, 'token_start': 840, 'token_end': 842, 'entityLabel': 'Company_Name'}, {'text': 'Koppikar Road Hubli - Hubli, Karnataka', 'start': 4242, 'end': 4280, 'token_start': 843, 'token_end': 850, 'entityLabel': 'Exp_Location'}, {'text': '2002 to 2006', 'start': 4281, 'end': 4293, 'token_start': 851, 'token_end': 854, 'entityLabel': 'Exp_Duration'}, {'text': 'ASSISTANT ASSET CONTROLLER', 'start': 4294, 'end': 4320, 'token_start': 855, 'token_end': 858, 'entityLabel': 'Position'}, {'text': 'Nesma and Alfadl Construction Co', 'start': 4321, 'end': 4353, 'token_start': 859, 'token_end': 864, 'entityLabel': 'Company_Name'}, {'text': '1998 to 2002', 'start': 4354, 'end': 4366, 'token_start': 865, 'token_end': 868, 
'entityLabel': 'Exp_Duration'}, {'text': 'Dharwad, Karnataka', 'start': 4451, 'end': 4469, 'token_start': 886, 'token_end': 889, 'entityLabel': 'Edu_Location'}, {'text': '1996 to 1998', 'start': 4470, 'end': 4482, 'token_start': 890, 'token_end': 893, 'entityLabel': 'Edu_Duration'}, {'text': 'Skills', 'start': 4519, 'end': 4525, 'token_start': 901, 'token_end': 902, 'entityLabel': 'Skills_title'}, {'text': 'Diploma in international airline and travel management', 'start': 4778, 'end': 4832, 'token_start': 950, 'token_end': 957, 'entityLabel': 'Degree'}]}]
for example in file:
    # Flatten the text so that newlines and pipes become plain spaces.
    # NOTE(review): '//n' is three characters but is replaced by one space, so
    # any occurrence would shift every later character offset - confirm
    # whether a literal '\\n' was intended.
    text = example["text"].replace('//n', ' ').replace('|', ' ').replace('\n', ' ')
    tokens = nlp(text)

    # Parse the entities from the example, skipping the SKILLS label
    entities = [
        (span["start"], span["end"], span["entityLabel"].upper())
        for span in example["tokens"]
        if span["entityLabel"].upper() != "SKILLS"
    ]

    # Remove invalid entity spans. The cleaned text is passed for both text
    # arguments so that the token indices returned here refer to the same
    # tokenization as the "tokens" list emitted below.
    valid_entities = remove_invalid_entity_spans(text, text, entities, nlp)

    # Create a dictionary for the example and append it to the list
    data = {
        "text": text,
        "label": valid_entities,
        "tokens": [
            {"text": token.text, "id": token.i, "start": token.idx, "end": token.idx + len(token.text)}
            for token in tokens
        ],
        # spaCy's span.end is exclusive, whereas Prodigy's "token_end" is the
        # index of the last token inside the span, hence the "- 1".
        "spans": [
            {"start": start, "end": end, "label": label,
             "token_start": token_start, "token_end": token_end - 1}
            for start, end, label, token_start, token_end in valid_entities
        ],
    }
    datas.append(data)

# Serialize every converted example as one JSON document per line (JSONL).
with open("new_current_add_data_wn.jsonl", "w", encoding="utf-8") as jsonl_out:
    jsonl_out.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in datas
    )

After running the script and this command prodigy db-in dataset_old new_current_add_data_wn.jsonl, I received the following output:

✔ Created dataset 'dataset_old' in database SQLite
✔ Imported 4312 annotations to 'dataset_old' (session
2023-04-04_19-26-16) in database SQLite

I then checked the status of the database by running the prodigy stats command with the dataset_old argument. The output showed that the database was successfully updated with the new annotations.

============================== ✨  Prodigy Stats ==============================

Version          1.11.10                       
Location         ***/prodigy
Prodigy Home     ***/.prodigy
Platform         Linux-5.15.0-1027-gcp-x86_64-with-glibc2.31
Python Version   3.10.8                        
Database Name    SQLite                        
Database Id      sqlite                        
Total Datasets   24                            
Total Sessions   290                           


============================== ✨  Dataset Stats ==============================

Dataset       dataset_old        
Created       2023-04-04 19:26:16
Description   None               
Author        None               
Annotations   4312               
Accept        4312               
Reject        0                  
Ignore        0

Next, I attempted to review the annotated data using the following command:
prodigy review dataset_old_verified dataset_old --view-id ner_manual --label all_lables

However, when I tried to cancel and select a new entity in the UI, I encountered an error:

Oops, something went wrong :(

You might have come across a bug in Prodigy's web app – sorry about that. We'd love to fix this, so feel free to open an issue on the Prodigy Support Forum and include the steps that led to this message.

TypeError: Cannot read properties of undefined (reading 'end')

I am unsure whether I made any mistakes during the process or if this is a bug. Please guide me on how to resolve this issue. Thank you.

After several tries, I solved it myself.
I made some changes to my code so that token_start and token_end are computed properly.


import json
import spacy
from spacy.tokens import Doc


# Define a function to remove invalid entity spans
def remove_invalid_entity_spans(ftext, text, entities, nlp):
    """Keep only the entities that can be aligned to spaCy token boundaries.

    Each ``(start, end, label)`` entity is looked up with ``doc.char_span``.
    When the exact offsets do not line up with tokens, the start is moved
    left by 1-9 characters; if that fails, the end is moved right by 1-9
    characters. The first candidate that aligns wins. Entities that still
    cannot be aligned are reported on stdout and dropped.

    Args:
        ftext: Text used only to show the offending slice in the
            "not parsed" message.
        text: Text that is tokenized and against which offsets are checked.
        entities: Iterable of ``(start_char, end_char, label)`` triples.
        nlp: Callable returning a spaCy ``Doc``; it is invoked as
            ``nlp(text, disable=[...])``.

    Returns:
        A list of ``(start_char, end_char, label)`` tuples whose offsets are
        those of the aligned span.
    """
    # Only tokenization is needed, so the heavier components are disabled.
    doc = nlp(text, disable=["parser", "tagger", "ner"])
    text_length = len(text)

    valid_entities = []
    for start, end, label in entities:
        # Candidates in the order they are tried: exact offsets, then a
        # widened start, then a widened end.
        candidates = [(start, end)]
        candidates.extend((start - shift, end) for shift in range(1, 10))
        candidates.extend((start, end + shift) for shift in range(1, 10))

        for cand_start, cand_end in candidates:
            # Offsets outside the text can never sit on a token boundary, so
            # they are skipped instead of being handed to char_span.
            if cand_start < 0 or cand_end > text_length:
                continue
            span = doc.char_span(cand_start, cand_end, label=label)
            if span is not None:
                valid_entities.append((span.start_char, span.end_char, span.label_))
                break
        else:
            # No candidate aligned: report the span and leave it out.
            print('not parsed', start, end, '"' + ftext[start:end] + '"', label)

    return valid_entities


def token_points(tokens, labels):
    """Convert character-offset labels into span dicts with token indices.

    For every label the function finds the first token whose end lies beyond
    the label start and the first token (from there on) whose end reaches the
    label end. The emitted ``start``/``end`` are the character boundaries of
    those two tokens, and ``token_start``/``token_end`` are their indices
    (``token_end`` is the index of the last covered token, i.e. inclusive).

    A label is dropped, with a message on stdout, when it is malformed, when
    it lies outside the tokenized text, or when neither of its two boundaries
    coincides with the token boundaries that were found.

    Args:
        tokens: Sequence of tokens in document order; each needs an ``idx``
            character offset and a length (e.g. a spaCy ``Doc``).
        labels: Iterable of ``(start_char, end_char, label)`` sequences.

    Returns:
        A list of dicts with the keys ``start``, ``end``, ``label``,
        ``token_start`` and ``token_end``.
    """
    # Local import keeps the function self-contained (standard library only).
    from bisect import bisect_left, bisect_right

    token_start_positions = [token.idx for token in tokens]
    # Ascending because tokens are in document order; bisect relies on that.
    token_end_positions = [token.idx + len(token) for token in tokens]
    token_count = len(token_end_positions)

    label_spans = []
    for label in labels:
        try:
            label_start, label_end, label_type = label[0], label[1], label[2]
            # First token that ends after the label start.
            token_start_index = bisect_right(token_end_positions, label_start)
            # First token, at or after it, whose end reaches the label end.
            token_end_index = bisect_left(token_end_positions, label_end, token_start_index)
        except (IndexError, KeyError, TypeError) as err:
            # Best effort: a malformed label is reported and skipped.
            print("Skipping malformed label", label, err)
            continue

        if token_start_index >= token_count or token_end_index >= token_count:
            # The label starts or ends beyond the last token.
            print("Label outside tokenized text", label)
            continue

        token_start = token_start_positions[token_start_index]
        token_end = token_end_positions[token_end_index]

        # Rejected only when both boundaries are off; a label with one aligned
        # boundary is kept and takes the boundaries of the tokens found.
        if token_start != label_start and token_end != label_end:
            print("Not Proper token start ", (label_start, label_end),
                  tokens[token_start_index:token_end_index + 1], label)
            continue

        label_spans.append({
            "start": token_start,
            "end": token_end,
            "label": label_type,
            "token_start": token_start_index,
            "token_end": token_end_index,
        })
    return label_spans


# Load the English tokenizer, tagger, parser, and NER model
# (this single pipeline is used both for tokenizing and for span alignment)
nlp = spacy.load("en_core_web_sm")

# Define variables and paths
datas = []  # converted examples; one dict per input record, written out as JSONL


file=[{'id':83536,'text':'Indeed Resume\nVijaykumar L Jeeragi\nSupervisor\nSattur, Dharwad, Karnataka, Karnataka\njeeragivijay@gmail.com\n+91 9880784203\nAn Indian citizen, Bachelors Degree in Arts, (Technical diploma in Computers (JLC), Diploma in Airline\nManagement (IATA) with 20 years experience 16 years in India &4 years in Middle East in handling,\nAsset Control Division as a Assistant Asset Controller in Nesma & Alfadl Const co., Alkhobar Saudi\nArabia and worked as a Supervisor in billing section (front office) in SDM College and Hospital Sattur,\nDharwad. Karnataka. India.\nJOB DESCRIPTION:\nCOMPANY: NESMA & ALFADL CONST. CO. LTD\nALKHOBAR (SAUDI ARABIA)\nDEPARTMENT: Asset Control Division\nJOB TITLE: Asst. Asset Controller\nNAME: Vijaykumar Lingaraj Jeeragi\nPRESENT ADRESS: Vijaykumar L Jeeragi\nPlot No 1097, 36 th Cross, Near SDM Staff\nQuarters, Sattur, Dharwad.\nCell: +91 9880784203, +91 9916187893\nEmail: jeeragivijay@gmail.com\n1. POSITION IN THE ORGANISATION IN NESMA & ALFADL ( 1998 - 2002)\nSUPPPORT SERVICES MANAGER\nASSSET CONTROL MANAGER\nASST. 
ASSET CONTROLLER\nCLERICAL ASSISTANT\nCONTACTS ARISING FROM THE JOB\nA) Within the Company / Department\nSupport Services Manager\nAuto Workshop Manager\nPersonnel / Administration Department\nFinance Department\nProcurement Department\nStores Department\nB) Outside Nesma & Alfadl Co.\nSuppliers\nSub Contractors\nGENERAL ACTIVITY OF THE JOB\nThe function of the job include:\nTagging of all the Fixed Assets of the Company\nKeeping a track of them with respect to their locations\nAllocation of Assets with respect the the requirement on site.\nOn non-availability of assets, arranging for rental through suppliers.\nMajor role in the mobilizing and Demobilizing of the Project.\nArranging daily transportation for the staff and also on emergency basis.\nPreparing of invoices and charging the sites accordingly.\nA) Field of Responsibility\nResponsible to the Support Services Manager for the Control and allocation of assets with specific\nresponsibility for the efficient monetary control of assets.\nB) Planning\nTo plan effective operation of the Assets with respect to the requirement of sites.\nC) Implementation\nTo receive all the fixed Assets and tag them with respect to their categories.\nTo plan out the distribution of fixed Assets like vehicles, Equipment and small tools to the different\nprojects as per their requirement.\nTo co-ordinate with the Project Managers and Engineers in order to fulfill their requirement on site.\nIn case of non-availability of any assets to be provided on site. 
Hiring of assets from Suppliers and\nensuring that the rhythm of work is not altered.\nTo monitor and control all fixed assets.\nTo implement reports for fixed assets and charge the projects accordingly.\nTo ensure that full maintenance is done whenever the Fixed Asset is returned back to us by making a\njob request for the workshop.\nTo arrange transportation for the staff our Company as well as arrange transportation to carry\nmaterials on our trailers and heavy trucks to all our projects.\nAlso to control the transfer of drivers and Heavy Equipment Operators with respect to the\nrequirement from sites.\nPetty cash handled throughout the department.\nRELEVANT INFORMATION REGARDING THE JOB\nStatistics: Tagging/Controlling/ Transportation Coordinator/ Invoicing /Tracking & Updating Records of\nAssets.\nEstablishment: Company Asset Control Department\nStaff Responsibility: 2 Clerks\nPackages worked on: BaaN IV, Fixed Asset Package (Foxpro), MS-Office,\nSalary withdrawn: SR 2000 basic + Overtime +F.A + Transportation.\n2. WORKED IN SDM MEDICAL COLLEGE AND HOSPITAL, SATTUR, DHARWAD. (KARNATAKA) From 2007 to\n2013\nGENERAL ACTIVITY OF THE JOB\nThe function of the job includes:\nWorked as a super wiser\nOut patients Billing / in patients billing\nIn patients Discharge billing\nCash Handling of main billing sections\nInventory management\nOPD billing on rotation basis\nWilling to relocate to: Abroad - dubai - bahrain\nWork Experience\nOffice Assistant later promted as supervisor\nSDM Hospital - Sattur, Dharwad,Karnataka\n2007 to 2013\nof various projects like supervising in Billing work, maintenance, Inventory and as well as computerized\nin/out patients,discharge billing and also main billing cashier. 
( 2007-2013)\nSupervisor\nManoj Jewellery, Koppikar Road Hubli - Hubli, Karnataka\n2002 to 2006\nASSISTANT ASSET CONTROLLER\nNesma and Alfadl Construction Co\n1998 to 2002\nAdministrator in M/s Institute\nStudies on Agriculture & Rural Development (ISARD) - Dharwad, Karnataka\n1996 to 1998\nEducation\nHigh school or equivalent\nSkills\nms office\nCertifications/Licenses\nState council of vocational education\nNovember 2002 to Present\nJLC certificate is issued with the provision to us it as equivalent to preuniversity course for the purpose\nof seeking jobs only\nAdditional Information\n1. Diploma in international airline and travel management from trade wings institute of management\n(IATA)\n2. Karnataka secondary education board certificate.\nVIjaykmar L. Jeeragi','label':[[14,34,'Name',3,6],[46,83,'Location',9,16],[84,106,'Email',17,18],[107,121,'Mobile_Number',19,21],[248,267,'Total_Experience',49,52],[553,568,'Experience_title',112,114],[579,608,'Company_Name',118,126],[579,608,'Current_Company',118,126],[609,617,'Location',127,128],[645,667,'Current_Designation',135,138],[645,667,'Position',135,138],[679,701,'Position',142,146],[886,908,'Email',189,190],[3537,3569,'Company_Name',699,704],[3571,3598,'Exp_Location',705,711],[3605,3617,'Exp_Duration',713,717],[3916,3931,'Experience_title',778,780],[3932,3948,'Position',781,783],[3966,3976,'Position',786,787],[3977,3989,'Company_Name',788,790],[3992,4017,'Exp_Location',791,796],[4018,4030,'Exp_Duration',797,800],[4203,4212,'Exp_Duration',833,836],[4214,4224,'Position',838,839],[4225,4240,'Company_Name',840,842],[4242,4280,'Exp_Location',843,850],[4281,4293,'Exp_Duration',851,854],[4294,4320,'Position',855,858],[4321,4353,'Company_Name',859,864],[4354,4366,'Exp_Duration',865,868],[4451,4469,'Edu_Location',886,889],[4470,4482,'Edu_Duration',890,893],[4519,4525,'Skills_title',901,902],[4778,4832,'Degree',950,957]]}]
for example in file:
    # Flatten the text so that newlines and pipes become plain spaces.
    # NOTE(review): '//n' is three characters but is replaced by one space, so
    # any occurrence would shift every later character offset - confirm
    # whether a literal '\\n' was intended.
    text = example["text"].replace('//n', ' ').replace('|', ' ').replace('\n', ' ')
    tokens = nlp(text)

    # Parse the entities from the example, skipping the SKILLS label.
    # Records may carry per-entity dicts under "tokens"; when that key is
    # absent, fall back to the compact "label" rows of the form
    # [start, end, label, token_start, token_end] instead of raising KeyError.
    if "tokens" in example:
        entities = [
            (span["start"], span["end"], span["entityLabel"].upper())
            for span in example["tokens"]
            if span["entityLabel"].upper() != "SKILLS"
        ]
    else:
        entities = [
            (row[0], row[1], row[2].upper())
            for row in example["label"]
            if row[2].upper() != "SKILLS"
        ]

    # Remove invalid entity spans. Offsets are validated against the cleaned
    # text because that is the text written to the output record.
    valid_entities = remove_invalid_entity_spans(text, text, entities, nlp)

    # Derive token indices from the same tokenization as the emitted text
    valid_spans = token_points(tokens, valid_entities)

    # Create a dictionary for the example and append it to the list
    data = {
        "text": text,
        "label": valid_entities,
        "spans": valid_spans}
    datas.append(data)

# Serialize every converted example as one JSON document per line (JSONL).
with open("new_current_add_data_wn.jsonl", "w", encoding="utf-8") as jsonl_out:
    jsonl_out.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in datas
    )

Then

prodigy db-in dataset_old ./new_current_add_data_wn.jsonl
prodigy db-out dataset_old > dataset_old.jsonl
prodigy ner.manual dataset_old_verified en_core_web_sm dataset_old.jsonl --label all_labels --loader jsonl

Thanks!!!

1 Like