From 586130891010c4c142f48a4a179dcc733ff18346 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 8 Sep 2019 20:29:59 +0200 Subject: [PATCH] Generalize handling of tokenizer special cases Handle tokenizer special cases more generally by using the Matcher internally to match special cases after the affix/token_match tokenization is complete. Instead of only matching special cases while processing balanced or nearly balanced prefixes and suffixes, this recognizes special cases in a wider range of contexts: * Allows arbitrary numbers of prefixes/affixes around special cases * Allows special cases separated by infixes Existing tests/settings that couldn't be preserved as before: * The emoticon '")' is no longer a supported special case * The emoticon ':)' in "example:)" is a false positive again When merged with #4258 (or the relevant cache bugfix), the affix and token_match properties should be modified to flush and reload all special cases to use the updated internal tokenization with the Matcher. --- spacy/errors.py | 3 + spacy/lang/tokenizer_exceptions.py | 1 - spacy/tests/regression/test_issue1001-1500.py | 5 + spacy/tests/tokenizer/test_exceptions.py | 9 +- spacy/tests/tokenizer/test_tokenizer.py | 15 +++ spacy/tokenizer.pxd | 7 +- spacy/tokenizer.pyx | 119 ++++++++++++------ 7 files changed, 116 insertions(+), 43 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 489f70ca7..41cfc7870 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -452,6 +452,9 @@ class Errors(object): "Make sure that you're passing in absolute token indices, not " "relative token offsets.\nstart: {start}, end: {end}, label: " "{label}, direction: {dir}") + E158 = ("Tokenizer special cases are not allowed to modify the text. 
" + "This would map '{chunk}' to '{orth}' given token attributes " + "'{token_attrs}'.") @add_codes diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 4d5ff4423..5db5bb136 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -120,7 +120,6 @@ emoticons = set( (-: =) (= -") :] :-] [: diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index 9074b34b7..a9647f006 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -33,6 +33,11 @@ def test_issue1061(): doc = tokenizer(text) assert '_MATH_' in [w.text for w in doc] assert 'MATH' not in [w.text for w in doc] + text = '...gimme...? that ...gimme...? or else ...gimme...?!' + + tokenizer = English.Defaults.create_tokenizer() + tokenizer.add_special_case(u'...gimme...?', [{ORTH: u'...gimme...?'}]) + assert [w.text for w in nlp(text)] == ['...gimme...?', 'that', '...gimme...?', 'or', 'else', '...gimme...?', '!'] @pytest.mark.xfail( diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index a79363abb..c2011487e 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -7,7 +7,7 @@ import pytest def test_tokenizer_handles_emoticons(tokenizer): # Tweebo challenge (CMU) - text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" + text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| :> ....""" tokens = tokenizer(text) assert tokens[0].text == ":o" assert tokens[1].text == ":/" @@ -28,12 +28,11 @@ def test_tokenizer_handles_emoticons(tokenizer): assert tokens[16].text == ">:(" assert tokens[17].text == ":D" assert tokens[18].text == "=|" - assert tokens[19].text == '")' - assert tokens[20].text == ":>" - assert tokens[21].text == "...." 
+ assert tokens[19].text == ":>" + assert tokens[20].text == "...." -@pytest.mark.parametrize("text,length", [("example:)", 3), ("108)", 2), ("XDN", 1)]) +@pytest.mark.parametrize("text,length", [("108)", 2), ("XDN", 1)]) def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): tokens = tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 803c31abf..2c8ef73c4 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -108,6 +108,12 @@ def test_tokenizer_add_special_case(tokenizer, text, tokens): assert doc[1].text == tokens[1]["orth"] +@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo"}, {"orth": "re"}])]) +def test_tokenizer_validate_special_case(tokenizer, text, tokens): + with pytest.raises(ValueError): + tokenizer.add_special_case(text, tokens) + + @pytest.mark.parametrize( "text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])] ) @@ -120,3 +126,12 @@ def test_tokenizer_add_special_case_tag(text, tokens): assert doc[0].tag_ == tokens[0]["tag"] assert doc[0].pos_ == "NOUN" assert doc[1].text == tokens[1]["orth"] + + +def test_tokenizer_special_cases_with_affixes(tokenizer): + text = '(((_SPECIAL_ A/B, A/B-A/B")' + tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}]) + tokenizer.add_special_case("A/B", [{"orth": "A/B"}]) + doc = tokenizer(text) + print([token.text for token in doc]) + assert [token.text for token in doc] == ["(", "(", "(", "_SPECIAL_", "A/B", ",", "A/B", "-", "A/B", '"', ")"] diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 919b0928b..ded641b48 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -21,15 +21,18 @@ cdef class Tokenizer: cdef public object suffix_search cdef public object infix_finditer cdef object _rules + cdef object _special_matcher cpdef Doc tokens_from_list(self, list strings) + cdef Doc _tokenize_affixes(self, 
unicode string) + cdef int _apply_special_cases(self, Doc doc) cdef int _try_cache(self, hash_t key, Doc tokens) except -1 cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, - vector[LexemeC*] *suffixes, int* has_special) + vector[LexemeC*] *suffixes) cdef int _attach_tokens(self, Doc tokens, unicode string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1 - cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special, + cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 19029ec05..26d777c61 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -19,6 +19,9 @@ from .compat import unescape_unicode from .errors import Errors, Warnings, deprecation_warning from . import util +from .attrs import intify_attrs +from .matcher import Matcher +from .symbols import ORTH cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment @@ -57,9 +60,8 @@ cdef class Tokenizer: self.infix_finditer = infix_finditer self.vocab = vocab self._rules = {} - if rules is not None: - for chunk, substrings in sorted(rules.items()): - self.add_special_case(chunk, substrings) + self._special_matcher = Matcher(self.vocab) + self._load_special_cases(rules) def __reduce__(self): args = (self.vocab, @@ -74,7 +76,6 @@ cdef class Tokenizer: deprecation_warning(Warnings.W002) return Doc(self.vocab, words=strings) - @cython.boundscheck(False) def __call__(self, unicode string): """Tokenize a string. @@ -83,6 +84,17 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#call """ + doc = self._tokenize_affixes(string) + self._apply_special_cases(doc) + return doc + + @cython.boundscheck(False) + cdef Doc _tokenize_affixes(self, unicode string): + """Tokenize according to affix and token_match settings. 
+ + string (unicode): The string to tokenize. + RETURNS (Doc): A container for linguistic annotations. + """ if len(string) >= (2 ** 30): raise ValueError(Errors.E025.format(length=len(string))) cdef int length = len(string) @@ -145,6 +157,51 @@ cdef class Tokenizer: for k in keys: del self._cache[k] + cdef int _apply_special_cases(self, Doc doc): + """Retokenize doc according to special cases. + + doc (Doc): Document. + """ + cdef int i + # Find all special cases and filter overlapping matches + spans = [doc[match[1]:match[2]] for match in self._special_matcher(doc)] + spans = util.filter_spans(spans) + spans = [(span.text, span.start, span.end) for span in spans] + # Modify tokenization according to filtered special cases + cdef int offset = 0 + cdef int span_length_diff + cdef int idx_offset + for span in spans: + # Allocate more memory for doc if needed + span_length_diff = len(self._rules[span[0]]) - (span[2] - span[1]) + while doc.length + offset + span_length_diff >= doc.max_length: + doc._realloc(doc.length * 2) + # Find special case entries in cache + cached = <_Cached*>self._specials.get(hash_string(span[0])) + if cached == NULL: + continue + # Shift original tokens... 
+ # ...from span position to end if new span is shorter + if span_length_diff < 0: + for i in range(span[2] + offset, doc.length + offset): + doc.c[span_length_diff + i] = doc.c[i] + # ...from end to span position if new span is longer + elif span_length_diff > 0: + for i in range(doc.length + offset - 1, span[2] + offset - 1, -1): + doc.c[span_length_diff + i] = doc.c[i] + # Copy special case tokens into doc and adjust token and character + # offsets + idx_offset = 0 + for i in range(cached.length): + orig_idx = doc.c[span[1] + offset + i].idx + doc.c[span[1] + offset + i] = cached.data.tokens[i] + doc.c[span[1] + offset + i].idx = orig_idx + idx_offset + idx_offset += cached.data.tokens[i].lex.length + # Token offset for special case spans + offset += span_length_diff + doc.length += offset + return True + cdef int _try_cache(self, hash_t key, Doc tokens) except -1: cached = <_Cached*>self._cache.get(key) if cached == NULL: @@ -162,18 +219,15 @@ cdef class Tokenizer: cdef vector[LexemeC*] prefixes cdef vector[LexemeC*] suffixes cdef int orig_size - cdef int has_special = 0 orig_size = tokens.length - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes, - &has_special) + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, has_special, + self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, - vector[const LexemeC*] *suffixes, - int* has_special): + vector[const LexemeC*] *suffixes): cdef size_t i cdef unicode prefix cdef unicode suffix @@ -188,24 +242,10 @@ cdef class Tokenizer: if pre_len != 0: prefix = string[:pre_len] minus_pre = string[pre_len:] - # Check whether we've hit a special-case - if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL: - string = minus_pre - 
prefixes.push_back(self.vocab.get(mem, prefix)) - has_special[0] = 1 - break - if self.token_match and self.token_match(string): - break suf_len = self.find_suffix(string) if suf_len != 0: suffix = string[-suf_len:] minus_suf = string[:-suf_len] - # Check whether we've hit a special-case - if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL): - string = minus_suf - suffixes.push_back(self.vocab.get(mem, suffix)) - has_special[0] = 1 - break if pre_len and suf_len and (pre_len + suf_len) <= len(string): string = string[pre_len:-suf_len] prefixes.push_back(self.vocab.get(mem, prefix)) @@ -216,9 +256,6 @@ cdef class Tokenizer: elif suf_len: string = minus_suf suffixes.push_back(self.vocab.get(mem, suffix)) - if string and (self._specials.get(hash_string(string)) != NULL): - has_special[0] = 1 - break return string cdef int _attach_tokens(self, Doc tokens, unicode string, @@ -280,14 +317,11 @@ cdef class Tokenizer: tokens.push_back(lexeme, False) cdef int _save_cached(self, const TokenC* tokens, hash_t key, - int has_special, int n) except -1: + int n) except -1: cdef int i for i in range(n): if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL: return 0 - # See #1250 - if has_special: - return 0 cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = n cached.is_lex = True @@ -339,10 +373,24 @@ cdef class Tokenizer: match = self.suffix_search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, special_cases): + def _load_special_cases(self, special_cases): """Add special-case tokenization rules.""" - for chunk, substrings in sorted(special_cases.items()): - self.add_special_case(chunk, substrings) + if special_cases is not None: + for chunk, substrings in sorted(special_cases.items()): + self._validate_special_case(chunk, substrings) + self.add_special_case(chunk, substrings) + + def _validate_special_case(self, chunk, substrings): + """Check whether the `ORTH` fields match 
the string. + + chunk (unicode): The string to specially tokenize. + substrings (iterable): A sequence of dicts, where each dict describes + a token and its attributes. + """ + attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings] + orth = "".join([spec[ORTH] for spec in attrs]) + if chunk != orth: + raise ValueError(Errors.E158.format(chunk=chunk, orth=orth, token_attrs=substrings)) def add_special_case(self, unicode string, substrings): """Add a special-case tokenization rule. @@ -354,6 +402,7 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#add_special_case """ + self._validate_special_case(string, substrings) substrings = list(substrings) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = len(substrings) @@ -361,8 +410,8 @@ cdef class Tokenizer: cached.data.tokens = self.vocab.make_fused_token(substrings) key = hash_string(string) self._specials.set(key, cached) - self._cache.set(key, cached) self._rules[string] = substrings + self._special_matcher.add(string, None, [{ORTH: token.text} for token in self._tokenize_affixes(string)]) def to_disk(self, path, **kwargs): """Save the current state to a directory.