mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
* Recognise multiple infixes in a token.
This commit is contained in:
parent
6df3858dbc
commit
04d0209be9
|
@ -43,7 +43,7 @@ def test_double_hyphen(en_tokenizer):
|
|||
assert tokens[6].text == u'-'
|
||||
# TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter
|
||||
# on infixes.
|
||||
#assert tokens[7].text == u'bred'
|
||||
#assert tokens[8].text == u'--'
|
||||
#assert tokens[9].text == u'people'
|
||||
assert tokens[7].text == u'bred'
|
||||
assert tokens[8].text == u'--'
|
||||
assert tokens[9].text == u'people'
|
||||
|
||||
|
|
|
@ -28,8 +28,9 @@ cdef class Tokenizer:
|
|||
self._suffix_re = suffix_re
|
||||
self._infix_re = infix_re
|
||||
self.vocab = vocab
|
||||
self._load_special_tokenization(rules)
|
||||
self._rules = rules
|
||||
self._rules = {}
|
||||
for chunk, substrings in sorted(rules.items()):
|
||||
self.add_special_case(chunk, substrings)
|
||||
|
||||
def __reduce__(self):
|
||||
args = (self.vocab,
|
||||
|
@ -158,7 +159,8 @@ cdef class Tokenizer:
|
|||
self._attach_tokens(tokens, span, &prefixes, &suffixes)
|
||||
self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
|
||||
|
||||
cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes,
|
||||
cdef unicode _split_affixes(self, Pool mem, unicode string,
|
||||
vector[const LexemeC*] *prefixes,
|
||||
vector[const LexemeC*] *suffixes):
|
||||
cdef size_t i
|
||||
cdef unicode prefix
|
||||
|
@ -215,20 +217,23 @@ cdef class Tokenizer:
|
|||
if string:
|
||||
cache_hit = self._try_cache(hash_string(string), tokens)
|
||||
if not cache_hit:
|
||||
match = self.find_infix(string)
|
||||
if match is None:
|
||||
matches = self.find_infix(string)
|
||||
if not matches:
|
||||
tokens.push_back(self.vocab.get(tokens.mem, string), False)
|
||||
else:
|
||||
split = match.start()
|
||||
end = match.end()
|
||||
# Append the beginning, affix, end of the infix span
|
||||
span = string[:split]
|
||||
# Example: suppose the string is "dyn-o-mite-dave".
|
||||
# The infix regex yields one match per hyphen, giving each hyphen's start and end offsets.
|
||||
start = 0
|
||||
for match in matches:
|
||||
infix_start = match.start()
|
||||
infix_end = match.end()
|
||||
span = string[start:infix_start]
|
||||
tokens.push_back(self.vocab.get(tokens.mem, span), False)
|
||||
|
||||
span = string[split:end]
|
||||
tokens.push_back(self.vocab.get(tokens.mem, span), False)
|
||||
|
||||
span = string[end:]
|
||||
infix_span = string[infix_start:infix_end]
|
||||
tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
|
||||
start = infix_end
|
||||
span = string[start:]
|
||||
tokens.push_back(self.vocab.get(tokens.mem, span), False)
|
||||
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
|
||||
while it != suffixes.rend():
|
||||
|
@ -251,7 +256,7 @@ cdef class Tokenizer:
|
|||
self._cache.set(key, cached)
|
||||
|
||||
def find_infix(self, unicode string):
|
||||
return self._infix_re.search(string)
|
||||
return list(self._infix_re.finditer(string))
|
||||
|
||||
def find_prefix(self, unicode string):
|
||||
match = self._prefix_re.search(string)
|
||||
|
@ -262,17 +267,19 @@ cdef class Tokenizer:
|
|||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def _load_special_tokenization(self, special_cases):
|
||||
'''Add a special-case tokenization rule.
|
||||
'''Add special-case tokenization rules.
|
||||
'''
|
||||
cdef int i
|
||||
cdef list substrings
|
||||
cdef unicode chunk
|
||||
cdef unicode form
|
||||
cdef unicode lemma
|
||||
cdef dict props
|
||||
cdef LexemeC** lexemes
|
||||
cdef hash_t hashed
|
||||
for chunk, substrings in sorted(special_cases.items()):
|
||||
self.add_special_case(chunk, substrings)
|
||||
|
||||
def add_special_case(self, unicode chunk, substrings):
|
||||
'''Add a special-case tokenization rule.
|
||||
|
||||
For instance, "don't" is special-cased to tokenize into
|
||||
["do", "n't"]. The split tokens can have lemmas and part-of-speech
|
||||
tags.
|
||||
'''
|
||||
substrings = list(substrings)
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
cached.length = len(substrings)
|
||||
cached.is_lex = False
|
||||
|
@ -280,3 +287,4 @@ cdef class Tokenizer:
|
|||
key = hash_string(chunk)
|
||||
self._specials.set(key, cached)
|
||||
self._cache.set(key, cached)
|
||||
self._rules[chunk] = substrings
|
||||
|
|
Loading…
Reference in New Issue
Block a user