I've created my own `Language` subclass since I need a custom tokenizer. I've basically copied the English language and overridden `create_tokenizer`:
```python
from spacy.attrs import LANG
from spacy.lang.en import EnglishDefaults
from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex

currencies = [
    "DKK",
    "SEK",
    "NOK",
    "GBP",
    "EUR",
    "USD",
    "CHF",
]


def _return_en_fin(_):
    return "en_fin"


class EnglishFinanceDefaults(EnglishDefaults):
    # Copy the dict so we don't mutate EnglishDefaults.lex_attr_getters in place
    lex_attr_getters = dict(EnglishDefaults.lex_attr_getters)
    lex_attr_getters[LANG] = _return_en_fin

    @classmethod
    def create_tokenizer(cls, nlp=None) -> Tokenizer:
        prefixes = cls.prefixes + (
            r"[1-4][Qq]",
            r"[Qq][1-4]",
            r"[Hh]1",
            r"1[Hh]",
            *currencies,
            r"[\/'¹\[\]~]",
            r"-(?=\D)",  # split a leading "-" unless it starts a number
        )
        infixes = cls.infixes + (
            r"(?<=\d\d)[a-zA-Z]+",  # 2018Jan
            r"[\/'¹\[\]%]",  # 3/19, 18'4, US$
            r"(?<=\S)-",  # 1-Feb, Jan-30
            r"(?<=\d)(bn|BN|Bn|m|M|b|B)",
            r"[$£€]",
        )
        suffixes = cls.suffixes + (
            r"[1-4][Qq]",
            r"[Qq][1-4]",
            r"[Hh]1",
            r"1[Hh]",
            *currencies,
            r"[-\/'¹\[\]]",
            r"202[01]",
        )
        tokenizer = EnglishDefaults.create_tokenizer(nlp)
        tokenizer.prefix_search = compile_prefix_regex(prefixes).search
        tokenizer.infix_finditer = compile_infix_regex(infixes).finditer
        tokenizer.suffix_search = compile_suffix_regex(suffixes).search
        return tokenizer


class EnglishFinance(Language):
    lang = _return_en_fin("")
    Defaults = EnglishFinanceDefaults
```
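To sanity-check the tokenizer before packaging, I run a quick script like this (the sentence is just an arbitrary example):

```python
nlp = EnglishFinance()
doc = nlp("Revenue rose to USD1.2bn in 1Q2020, up 5% vs 4Q19.")
print([token.text for token in doc])
```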
Then I've saved the language with `to_disk` and packaged it with `spacy package`. I've pip-installed the packaged model and it all works so far. The issue comes when I start training a new NER model with my new packaged model:
```
❯ prodigy train-curve ner ner-period-date-month-year en_fin_model
✔ Starting with model 'en_fin_model'
Training 4 times with 25%, 50%, 75%, 100% of the data
=============================== ✨ Train curve ===============================
%          Accuracy   Difference
----       --------   ----------
/home/nixd/.cache/pypoetry/virtualenvs/annotator-U3km5bEc-py3.8/lib/python3.8/site-packages/spacy/language.py:635: UserWarning: [W033] Training a new parser or NER using a model with no lexeme normalization table. This may degrade the performance of the model to some degree. If this is intentional or the language you're using doesn't have a normalization table, please ignore this warning. If this is surprising, make sure you have the spacy-lookups-data package installed. The languages with lexeme normalization tables are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.
```
I suspect it has to do with `lex_attr_getters` in `EnglishFinanceDefaults`, but I'm not sure how I'm supposed to fix it.
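Would copying the English `lexeme_norm` table into my model's vocab before re-saving be the right direction? A minimal sketch of what I have in mind (untested; assumes spaCy v2.3 with `spacy-lookups-data` installed and `en_fin_model` as the model directory):

```python
import spacy

# My packaged model lacks the "lexeme_norm" table, presumably because
# "en_fin" doesn't match any language registered by spacy-lookups-data
nlp = spacy.load("en_fin_model")

if not nlp.vocab.lookups.has_table("lexeme_norm"):
    # A blank English pipeline picks up the lookup tables from spacy-lookups-data
    en = spacy.blank("en")
    norm_table = en.vocab.lookups.get_table("lexeme_norm")
    nlp.vocab.lookups.add_table("lexeme_norm", dict(norm_table))

# Re-save so the table ships with the packaged model
nlp.to_disk("en_fin_model")
```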
Question number 2: let's say I've annotated four different labels but I only care about the performance on one or two of them. Is there an easy way to ignore some labels (or to just check performance on the labels of interest)?
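For context, I see that spaCy's `Scorer` reports per-entity-type scores, so I could compute metrics for just the labels I care about along these lines (untested sketch, assuming spaCy v2.x; `eval_data` and the `PERIOD` label are hypothetical stand-ins for my evaluation set):

```python
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer

# Hypothetical evaluation examples in spaCy v2's (text, annotations) format
eval_data = [
    ("Revenue was USD 100m in 1Q2020.", {"entities": [(24, 30, "PERIOD")]}),
]

nlp = spacy.load("en_fin_model")
scorer = Scorer()
for text, annotations in eval_data:
    gold = GoldParse(nlp.make_doc(text), entities=annotations["entities"])
    scorer.score(nlp(text), gold)

# scorer.scores["ents_per_type"] maps each label to precision/recall/F-score,
# so I can filter for the one or two labels of interest
for label, metrics in scorer.scores["ents_per_type"].items():
    print(label, metrics)
```

But is there a built-in way to do this in Prodigy, rather than evaluating manually?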