Hello there,
I am trying to make NER (18 Entities ) and relation_extractor (7 relations) both together.
Every document has an average of 3k tokens per document. sentence segmentation is not possible because the text is unstructured. Used "prodigy rel.manual" for annotating NE & Relation. after that convert jsonl file in .spacy using this file
And then created config file like this
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null
[system]
gpu_allocator = "pytorch"
seed = 342
[nlp]
lang = "en"
pipeline = ["transformer","ner","relation_extractor"]
batch_size = 32
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
[components]
[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
pooling = {"@layers":"reduce_mean.v1"}
upstream = "*"
[components.transformer]
factory = "transformer"
max_batch_items = 4096
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
name = "roberta-base"
mixed_precision = false
[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 64
[components.transformer.model.grad_scaler_config]
[components.transformer.model.tokenizer_config]
use_fast = true
[components.transformer.model.transformer_config]
[components.relation_extractor]
factory = "relation_extractor"
threshold = 0.5
[components.relation_extractor.model]
@architectures = "rel_model.v1"
[components.relation_extractor.model.create_instance_tensor]
@architectures = "rel_instance_tensor.v1"
[components.relation_extractor.model.create_instance_tensor.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.relation_extractor.model.create_instance_tensor.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.relation_extractor.model.create_instance_tensor.pooling]
@layers = "reduce_mean.v1"
[components.relation_extractor.model.create_instance_tensor.get_instances]
@misc = "rel_instance_generator.v1"
max_length = 1000
[components.relation_extractor.model.classification_layer]
@architectures = "rel_classification_layer.v1"
nI = null
nO = null
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
patience = 1600
max_epochs = 0
max_steps = 100000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null
logger = {"@loggers":"spacy.ConsoleLogger.v1"}
[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256
get_length = null
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.00005
[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.tokenizer]
While training getting error like
Could not determine any instances in doc.
Could not determine any instances in doc.
...
Could not determine any instances in doc.
Could not determine any instances in doc.
and killed.
2. when I am using
[corpora.dev]
@readers = "Gold_ents_Corpus.v1"
file = ${paths.dev}
[corpora.train]
@readers = "Gold_ents_Corpus.v1"
file = ${paths.train}
training is stuck
input is like
{
"text": "Karthik K . . Niveditha M Junior Data Scientist r.niveditha@outlook.com | linkedin.com/in/niveditha-m-4510681aa | +91-9381982821 Junior Data Scientist demonstrating proven success in Data science field as an analyst and scientist spanning 2 years of experience. Adept at using large data sets to drive insights and inform decision making. With a strong foundation in data preprocessing/mining, machine learning, and efficient in building desired model on baselined strategy. Proficient in data visualization with effective reporting skills using Power BI. Areas of Expertise include: Business Analysis Data Analysis Model training Visualization and Reporting Processing large data sets Software Skills & Tools: Machine learning algorithms (Supervised & Unsupervised) Linear regression, Logistic Regression, Decision Tree, Random Forest, Ensemble Techniques K means, clustering NLP, Deep Learning, RNN, Bi-directional LSTM Python (Pandas, NumPy, Scikit- learn, Matplotlib, Seaborn, Flask, Pyre sparser) Power BI, Statistics & SQL Work History (May 2021-Feb 2023) Think bridge software pvt. ltd, PUNE, Maharashtra Dec 2022 -Feb 2023 Project 2 Role: Junior Data Scientist Worked on Project named Einstein with objective to develop automated applicant recommendation system that aids HR professionals by reducing manual intervention needed in reviewing resumes To achieve the goal, I have created a deep learning model that predicts and validates candidate suitability for the job description. Worked on end-to-end Data Science Process i.e., Data Collection, Model Training, Model Analysis and Deployment Collaborated in deploying the model using Flask on Azure App Service. Achieved an accuracy of 77% while recommending apt candidates to recruiters for Customer Service Job Role. Expanded Einstein to five more job roles apart from CS Job role Technologies: (Python, Machine Learning, NLP, Flask, Ensemble techniques, Bidirectional LSTM) Tools: Azure (Logic Apps, Blob Storage, Table Storage, App service) mailto:r.niveditha@outlook.com linkedin.com/in/niveditha-m-4510681aa . . May 2021 Nov 2022 Certification Sep 2020 -Feb 2021 Education Project 1 Role: Power BI Developer Worked on Project Rent-A-Center (RAC) Reformed the reporting structure from legacy platform (SAP BO reports into PowerBI by joining efforts with Data Engineering team Created design documentations in line with Agile methodologies Developed dashboards and tabular visualizations using Power BI and entailed in validating the BI reports and established data quality checks Associated in production deployment by presenting model demo to clients, sourced feedback and enhanced the reporting structure Data Science|360DigiTMG|Hyderabad Completed Data Science certification course at 360DigiTMG As a part of internship, successfully completed internal project at training center related to machine learning basic regression algorithm Predicting the adipose tissue based on waist circumference by using ML linear regression model Master of Science (MSC) June 2018-Dec 2020 Post-Graduation in Computer Science with 90% aggregate Bachelor of Sciences (BSC) June-2015-Apr 2018 Bachelor in Computers, Mathematics, Physics with 87% aggregate",
"meta": {
"file": "file_path"},
"_input_hash": 2111674242, "_task_hash": 1263820275, "_is_binary": false,
"spans": [{"start": 0, "end": 9, "token_start": 0, "token_end": 1, "label": "NAME"},
{"start": 14, "end": 25, "token_start": 4, "token_end": 5, "label": "NAME"},
{"start": 26, "end": 47, "token_start": 6, "token_end": 8, "label": "CURRENT_POSITION"},
{"start": 48, "end": 71, "token_start": 9, "token_end": 9, "label": "EMAIL"},
{"start": 114, "end": 128, "token_start": 13, "token_end": 15, "label": "MOBILE_NUMBER"},
{"start": 711, "end": 766, "token_start": 109, "token_end": 116, "label": "SKILLS"},
{"start": 767, "end": 784, "token_start": 117, "token_end": 118, "label": "SKILLS"},
{"start": 786, "end": 805, "token_start": 120, "token_end": 121, "label": "SKILLS"},
{"start": 807, "end": 820, "token_start": 123, "token_end": 124, "label": "SKILLS"},
{"start": 822, "end": 835, "token_start": 126, "token_end": 127, "label": "SKILLS"},
{"start": 837, "end": 856, "token_start": 129, "token_end": 130, "label": "SKILLS"},
{"start": 857, "end": 864, "token_start": 131, "token_end": 132, "label": "SKILLS"},
{"start": 866, "end": 880, "token_start": 134, "token_end": 135, "label": "SKILLS"},
{"start": 882, "end": 895, "token_start": 137, "token_end": 138, "label": "SKILLS"},
{"start": 897, "end": 900, "token_start": 140, "token_end": 140, "label": "SKILLS"},
{"start": 902, "end": 921, "token_start": 142, "token_end": 145, "label": "SKILLS"},
{"start": 922, "end": 928, "token_start": 146, "token_end": 146, "label": "SKILLS"},
{"start": 930, "end": 936, "token_start": 148, "token_end": 148, "label": "SKILLS"},
{"start": 938, "end": 943, "token_start": 150, "token_end": 150, "label": "SKILLS"},
{"start": 945, "end": 958, "token_start": 152, "token_end": 153, "label": "SKILLS"},
{"start": 960, "end": 970, "token_start": 155, "token_end": 155, "label": "SKILLS"},
{"start": 972, "end": 979, "token_start": 157, "token_end": 157, "label": "SKILLS"},
{"start": 981, "end": 986, "token_start": 159, "token_end": 159, "label": "SKILLS"},
{"start": 988, "end": 1000, "token_start": 161, "token_end": 162, "label": "SKILLS"},
{"start": 1002, "end": 1010, "token_start": 164, "token_end": 165, "label": "SKILLS"},
{"start": 1012, "end": 1022, "token_start": 167, "token_end": 167, "label": "SKILLS"},
{"start": 1025, "end": 1028, "token_start": 169, "token_end": 169, "label": "SKILLS"},
{"start": 1043, "end": 1060, "token_start": 173, "token_end": 177, "label": "TOTAL_EXPERIENCE"},
{"start": 1062, "end": 1092, "token_start": 179, "token_end": 184, "label": "COMPANY_NAME"},
{"start": 1094, "end": 1111, "token_start": 186, "token_end": 188, "label": "EXP_LOCATION"},
{"start": 1112, "end": 1130, "token_start": 189, "token_end": 192, "label": "EXP_DURATION"},
{"start": 1147, "end": 1168, "token_start": 197, "token_end": 199, "label": "POSITION"},
{"start": 2077, "end": 2094, "token_start": 350, "token_end": 353, "label": "EXP_DURATION"},
{"start": 2154, "end": 2172, "token_start": 364, "token_end": 366, "label": "POSITION"},
{"start": 2997, "end": 3020, "token_start": 487, "token_end": 492, "label": "DEGREE"},
{"start": 3021, "end": 3039, "token_start": 493, "token_end": 497, "label": "EDU_DURATION"},
{"start": 3081, "end": 3084, "token_start": 505, "token_end": 506, "label": "MARKS"},
{"start": 3095, "end": 3121, "token_start": 508, "token_end": 513, "label": "DEGREE"},
{"start": 3122, "end": 3140, "token_start": 514, "token_end": 517, "label": "EDU_DURATION"},
{"start": 3190, "end": 3193, "token_start": 526, "token_end": 527, "label": "MARKS"}],
"tokens": [{"text": "Karthik", "start": 0, "end": 7, "id": 0, "ws": true, "disabled": false},
...,
{"text": "aggregate", "start": 3194, "end": 3203, "id": 528, "ws": false, "disabled": false}],
"_view_id": "relations", "relations": [{"head": 184, "child": 192,
"head_span": {"start": 1062, "end": 1092, "token_start": 179,
"token_end": 184, "label": "COMPANY_NAME"},
"child_span": {"start": 1112, "end": 1130, "token_start": 189,
"token_end": 192, "label": "EXP_DURATION"},
"color": "#c5bdf4", "label": "EXPERIENCE_AT"}, {"head": 184, "child": 199,
"head_span": {"start": 1062,
"end": 1092,
"token_start": 179,
"token_end": 184,
"label": "COMPANY_NAME"},
"child_span": {
"start": 1147,
"end": 1168,
"token_start": 197,
"token_end": 199,
"label": "POSITION"},
"color": "#ffd882",
"label": "EXPERIENCE_IN"},
{"head": 184, "child": 188,
"head_span": {"start": 1062, "end": 1092, "token_start": 179,
"token_end": 184, "label": "COMPANY_NAME"},
"child_span": {"start": 1094, "end": 1111, "token_start": 186,
"token_end": 188, "label": "EXP_LOCATION"},
"color": "#d9fbad", "label": "EXPERIENCE_PLACE"},
{"head": 184, "child": 353,
"head_span": {"start": 1062, "end": 1092, "token_start": 179,
"token_end": 184, "label": "COMPANY_NAME"},
"child_span": {"start": 2077, "end": 2094, "token_start": 350,
"token_end": 353, "label": "EXP_DURATION"},
"color": "#c5bdf4", "label": "EXPERIENCE_AT"}, {"head": 184, "child": 366,
"head_span": {"start": 1062,
"end": 1092,
"token_start": 179,
"token_end": 184,
"label": "COMPANY_NAME"},
"child_span": {
"start": 2154,
"end": 2172,
"token_start": 364,
"token_end": 366,
"label": "POSITION"},
"color": "#ffd882",
"label": "EXPERIENCE_IN"},
{"head": 492, "child": 497,
"head_span": {"start": 2997, "end": 3020, "token_start": 487,
"token_end": 492, "label": "DEGREE"},
"child_span": {"start": 3021, "end": 3039, "token_start": 493,
"token_end": 497, "label": "EDU_DURATION"},
"color": "#ffdaf9", "label": "EDUCATION_AT"}, {"head": 492, "child": 506,
"head_span": {"start": 2997,
"end": 3020,
"token_start": 487,
"token_end": 492,
"label": "DEGREE"},
"child_span": {"start": 3081,
"end": 3084,
"token_start": 505,
"token_end": 506,
"label": "MARKS"},
"color": "#b5c6c9",
"label": "EDUCATION_WITH"},
{"head": 513, "child": 517,
"head_span": {"start": 3095, "end": 3121, "token_start": 508,
"token_end": 513, "label": "DEGREE"},
"child_span": {"start": 3122, "end": 3140, "token_start": 514,
"token_end": 517, "label": "EDU_DURATION"},
"color": "#ffdaf9", "label": "EDUCATION_AT"}, {"head": 513, "child": 527,
"head_span": {"start": 3095,
"end": 3121,
"token_start": 508,
"token_end": 513,
"label": "DEGREE"},
"child_span": {"start": 3190,
"end": 3193,
"token_start": 526,
"token_end": 527,
"label": "MARKS"},
"color": "#b5c6c9",
"label": "EDUCATION_WITH"}],
"answer": "accept", "_timestamp": 1676293151}
Guide me if I am doing anything wrong.