Hi!
I need to customize the tokenizer to correctly handle this text:
Experience with Node.js/Express and Python/Django.
I customized the infixes, but only Python/Django
is tokenized correctly, while Node.js/Express
is treated as a single token.
import os
import spacy
from spacy.tokenizer import Tokenizer
# Build a tokenizer that treats ',', '/', '(' and ')' as infixes, attach it to
# a blank English pipeline, save that pipeline, and print how a sample
# sentence is tokenized.

# Load the pretrained English pipeline; its vocab and its default
# prefix/infix/suffix rules are the starting point for the custom tokenizer.
nlp = spacy.load('en_core_web_sm')

prefix_regex = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)

# Keep the default infix rules (minus any entry that is exactly '#') and add
# the extra separators. The additions are regex patterns, hence the escaped
# parentheses.
# NOTE(review): the '#' filter only removes an entry equal to the string '#';
# confirm the defaults actually contain such an entry.
infixes = tuple(
    [i for i in nlp.Defaults.infixes if i != '#']
    + [r',', r'/', r'\(', r'\)']
)
infix_regex = spacy.util.compile_infix_regex(infixes)

suffix_regex = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)

# No special-case rules are passed, and token_match is explicitly disabled.
# NOTE(review): the OUTPUT pasted after this script still reports
# 'Node.js/Express' as TOKEN_MATCH, i.e. the whole string was accepted as one
# token before the '/' infix was tried. Check which match patterns
# (token/URL matching) the installed spaCy version applies by default.
tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_regex.search,
    suffix_search=suffix_regex.search,
    infix_finditer=infix_regex.finditer,
    token_match=None
)

# Rebind `nlp` to a blank English pipeline that uses only the custom
# tokenizer, then write that pipeline to the 'my-custom-model' directory.
# NOTE(review): the tokenizer above was created with the vocab of the
# 'en_core_web_sm' pipeline, not this blank pipeline's vocab; confirm that
# sharing is intended.
nlp = spacy.blank('en')
nlp.tokenizer = tokenizer
nlp.to_disk('my-custom-model')

# Test
text = 'Experience with Node.js/Express and Python/Django.'
# explain() yields (rule, substring) pairs showing which tokenizer rule
# produced each piece of the text.
for t in nlp.tokenizer.explain(text):
    print(t)
'''
OUTPUT
('TOKEN', 'Experience')
('TOKEN', 'with')
('TOKEN_MATCH', 'Node.js/Express')
('TOKEN', 'and')
('TOKEN', 'Python')
('INFIX', '/')
('TOKEN', 'Django')
('SUFFIX', '.')
'''
How can I make the / infix
work correctly, so that Node.js/Express is also split?