From 1748549aebce44b3e3dce42815fd281b4e6894bd Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Wed, 21 Dec 2016 23:16:19 +0100 Subject: [PATCH 1/9] Added exception pattern mechanism to the tokenizer. --- spacy/language.py | 6 ++++- spacy/language_data/__init__.py | 1 + spacy/language_data/special_cases.py | 5 ++++ spacy/tests/tokenizer/test_urls.py | 19 ++++++++++++++ spacy/tokenizer.pxd | 2 ++ spacy/tokenizer.pyx | 39 +++++++++++++++++++++------- spacy/util.py | 5 ++++ 7 files changed, 67 insertions(+), 10 deletions(-) create mode 100644 spacy/language_data/special_cases.py create mode 100644 spacy/tests/tokenizer/test_urls.py diff --git a/spacy/language.py b/spacy/language.py index c6f1376a4..16bffcd7b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -67,6 +67,8 @@ class BaseDefaults(object): @classmethod def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions + if cls.exception_patterns: + rule_match = util.compile_rule_regex(cls.exception_patterns).match if cls.prefixes: prefix_search = util.compile_prefix_regex(cls.prefixes).search else: @@ -80,7 +82,7 @@ class BaseDefaults(object): else: infix_finditer = None vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - return Tokenizer(vocab, rules=rules, + return Tokenizer(vocab, rules=rules, rule_match=rule_match, prefix_search=prefix_search, suffix_search=suffix_search, infix_finditer=infix_finditer) @@ -142,6 +144,8 @@ class BaseDefaults(object): pipeline.append(nlp.entity) return pipeline + exception_patterns = tuple(language_data.EXCEPTION_PATTERNS) + prefixes = tuple(language_data.TOKENIZER_PREFIXES) suffixes = tuple(language_data.TOKENIZER_SUFFIXES) diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index f6aa4317c..aa379d86d 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -3,3 +3,4 @@ from .punctuation import * from .tag_map import * from .entity_rules import * from .util import * +from .special_cases import * diff --git a/spacy/language_data/special_cases.py b/spacy/language_data/special_cases.py new file mode 100644 index 000000000..e7b2be5a5 --- /dev/null +++ b/spacy/language_data/special_cases.py @@ -0,0 +1,5 @@ +from __future__ import unicode_literals + +EXCEPTION_PATTERNS = r''' +((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?) 
+'''.strip().split() diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py new file mode 100644 index 000000000..5d0654d50 --- /dev/null +++ b/spacy/tests/tokenizer/test_urls.py @@ -0,0 +1,19 @@ +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize("text", [ + u"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", + u"www.google.com?q=google", + u"google.com", + u"www.red-stars.com", + pytest.mark.xfail(u"red-stars.com"), + u"http://foo.com/blah_(wikipedia)#cite-1", + u"http://www.example.com/wpstyle/?bar=baz&inga=42&quux", + u"mailto:foo.bar@baz.com", + u"mailto:foo-bar@baz-co.com" +]) +def test_simple_url(en_tokenizer, text): + tokens = en_tokenizer(text) + assert tokens[0].orth_ == text diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index e53b7dbd1..24c76f7ee 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -16,6 +16,7 @@ cdef class Tokenizer: cdef PreshMap _specials cpdef readonly Vocab vocab + cdef public object rule_match cdef public object prefix_search cdef public object suffix_search cdef public object infix_finditer @@ -24,6 +25,7 @@ cdef class Tokenizer: cpdef Doc tokens_from_list(self, list strings) cdef int _try_cache(self, hash_t key, Doc tokens) except -1 + cdef int _match_rule(self, unicode string) cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 66c93528b..ec5b5ea87 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -28,7 +28,7 @@ from .tokens.doc cimport Doc cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod - def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, + def load(cls, path, Vocab vocab, rules=None, rule_match = None, prefix_search=None, suffix_search=None, infix_finditer=None): '''Load a Tokenizer, reading unsupplied components from the path. @@ -39,6 +39,8 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. + rule_match: + Special case matcher. Signature of re.compile(string).match prefix_search: Signature of re.compile(string).search suffix_search: @@ -65,10 +67,9 @@ cdef class Tokenizer: with (path / 'tokenizer' / 'infix.txt').open() as file_: entries = file_.read().split('\n') infix_finditer = util.compile_infix_regex(entries).finditer - return cls(vocab, rules, prefix_search, suffix_search, infix_finditer) + return cls(vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer) - - def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer): + def __init__(self, Vocab vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer): '''Create a Tokenizer, to create Doc objects given unicode text. Arguments: @@ -76,6 +77,9 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. + rule_match: + A function matching the signature of re.compile(string).match + to match special cases for the tokenizer. prefix_search: A function matching the signature of re.compile(string).search to match prefixes. 
@@ -89,6 +93,7 @@ cdef class Tokenizer: self.mem = Pool() self._cache = PreshMap() self._specials = PreshMap() + self.rule_match = rule_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -100,8 +105,9 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, self._rules, - self._prefix_re, - self._suffix_re, + self.rule_match, + self._prefix_re, + self._suffix_re, self._infix_re) return (self.__class__, args, None, None) @@ -202,9 +208,12 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) - self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) + if self._match_rule(span): + tokens.push_back(self.vocab.get(tokens.mem, span), False) + else: + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) + self._attach_tokens(tokens, span, &prefixes, &suffixes) + self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, @@ -314,6 +323,18 @@ cdef class Tokenizer: cached.data.lexemes = lexemes self._cache.set(key, cached) + cdef int _match_rule(self, unicode string): + """Check whether the given string matches any of the patterns. + + string (unicode): The string to segment. + + Returns (int or None): The length of the prefix if present, otherwise None. + """ + if self.rule_match is None: + return 0 + match = self.rule_match(string) + return (match.end() - match.start()) if match is not None else 0 + def find_infix(self, unicode string): """Find internal split points of the string, such as hyphens. diff --git a/spacy/util.py b/spacy/util.py index afed4142e..316e431ad 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -108,6 +108,11 @@ def compile_infix_regex(entries): return re.compile(expression) +def compile_rule_regex(entries): + expression = '|'.join([piece for piece in entries if piece.strip()]) + '$' + return re.compile(expression) + + def normalize_slice(length, start, stop, step=None): if not (step is None or step == 1): raise ValueError("Stepped slices not supported in Span objects." From d9c59c47517843f97ea3e36b0db66879e2af1b5d Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Wed, 21 Dec 2016 23:30:49 +0100 Subject: [PATCH 2/9] Maintaining backward compatibility. --- spacy/tokenizer.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index ec5b5ea87..4aabdb3db 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -28,8 +28,8 @@ from .tokens.doc cimport Doc cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod - def load(cls, path, Vocab vocab, rules=None, rule_match = None, prefix_search=None, suffix_search=None, - infix_finditer=None): + def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, + infix_finditer=None, rule_match = None): '''Load a Tokenizer, reading unsupplied components from the path. 
Arguments: @@ -69,7 +69,7 @@ cdef class Tokenizer: infix_finditer = util.compile_infix_regex(entries).finditer return cls(vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer) - def __init__(self, Vocab vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer): + def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, rule_match=None): '''Create a Tokenizer, to create Doc objects given unicode text. Arguments: @@ -77,9 +77,6 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. - rule_match: - A function matching the signature of re.compile(string).match - to match special cases for the tokenizer. prefix_search: A function matching the signature of re.compile(string).search to match prefixes. @@ -89,6 +86,9 @@ cdef class Tokenizer: infix_finditer: A function matching the signature of re.compile(string).finditer to find infixes. + rule_match: + A function matching the signature of re.compile(string).match + to match special cases for the tokenizer. ''' self.mem = Pool() self._cache = PreshMap() From 3a9be4d485f8f80201ddf8056e16bf39c18ceff3 Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Fri, 23 Dec 2016 23:49:34 +0100 Subject: [PATCH 3/9] Updated token exception handling mechanism to allow the usage of arbitrary functions as token exception matchers. --- spacy/language.py | 10 +++--- spacy/language_data/__init__.py | 2 +- spacy/language_data/special_cases.py | 5 --- spacy/language_data/tokenizer_exceptions.py | 11 +++++++ spacy/tokenizer.pxd | 3 +- spacy/tokenizer.pyx | 35 +++++++-------------- spacy/util.py | 5 --- 7 files changed, 29 insertions(+), 42 deletions(-) delete mode 100644 spacy/language_data/special_cases.py create mode 100644 spacy/language_data/tokenizer_exceptions.py diff --git a/spacy/language.py b/spacy/language.py index 16bffcd7b..bebdeab20 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -67,8 +67,8 @@ class BaseDefaults(object): @classmethod def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions - if cls.exception_patterns: - rule_match = util.compile_rule_regex(cls.exception_patterns).match + if cls.token_match: + token_match = cls.token_match if cls.prefixes: prefix_search = util.compile_prefix_regex(cls.prefixes).search else: @@ -82,9 +82,9 @@ class BaseDefaults(object): else: infix_finditer = None vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - return Tokenizer(vocab, rules=rules, rule_match=rule_match, + return Tokenizer(vocab, rules=rules, prefix_search=prefix_search, suffix_search=suffix_search, - infix_finditer=infix_finditer) + infix_finditer=infix_finditer, token_match=token_match) @classmethod def create_tagger(cls, nlp=None): @@ -144,7 +144,7 @@ class BaseDefaults(object): pipeline.append(nlp.entity) return pipeline - exception_patterns = tuple(language_data.EXCEPTION_PATTERNS) + token_match = language_data.TOKEN_MATCH prefixes = tuple(language_data.TOKENIZER_PREFIXES) diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index aa379d86d..028924796 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -3,4 +3,4 @@ from .punctuation import * from .tag_map import * from .entity_rules import * from .util import * -from .special_cases import * +from .tokenizer_exceptions import * diff --git a/spacy/language_data/special_cases.py b/spacy/language_data/special_cases.py deleted file mode 100644 index e7b2be5a5..000000000 --- 
a/spacy/language_data/special_cases.py +++ /dev/null @@ -1,5 +0,0 @@ -from __future__ import unicode_literals - -EXCEPTION_PATTERNS = r''' -((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?) -'''.strip().split() diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py new file mode 100644 index 000000000..6551440f2 --- /dev/null +++ b/spacy/language_data/tokenizer_exceptions.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals + +import re + +_URL_PATTERN = r''' +^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$ +'''.strip() + +TOKEN_MATCH = re.compile(_URL_PATTERN).match + +__all__ = ['TOKEN_MATCH'] diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 24c76f7ee..1a3e86b49 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -16,7 +16,7 @@ cdef class Tokenizer: cdef PreshMap _specials cpdef readonly Vocab vocab - cdef public object rule_match + cdef public object token_match cdef public object prefix_search cdef public object suffix_search cdef public object infix_finditer @@ -25,7 +25,6 @@ cdef class Tokenizer: cpdef Doc tokens_from_list(self, list strings) cdef int _try_cache(self, hash_t key, Doc tokens) except -1 - cdef int _match_rule(self, unicode string) cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4aabdb3db..63ac84482 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -29,7 +29,7 @@ cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, - infix_finditer=None, rule_match = None): + infix_finditer=None, token_match = None): '''Load a Tokenizer, reading unsupplied components from the path. Arguments: @@ -39,8 +39,8 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. - rule_match: - Special case matcher. Signature of re.compile(string).match + token_match: + A boolean function matching strings that becomes tokens. prefix_search: Signature of re.compile(string).search suffix_search: @@ -67,9 +67,9 @@ cdef class Tokenizer: with (path / 'tokenizer' / 'infix.txt').open() as file_: entries = file_.read().split('\n') infix_finditer = util.compile_infix_regex(entries).finditer - return cls(vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer) + return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match) - def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, rule_match=None): + def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): '''Create a Tokenizer, to create Doc objects given unicode text. Arguments: @@ -86,14 +86,13 @@ cdef class Tokenizer: infix_finditer: A function matching the signature of re.compile(string).finditer to find infixes. - rule_match: - A function matching the signature of re.compile(string).match - to match special cases for the tokenizer. + token_match: + A boolean function matching strings that becomes tokens. 
''' self.mem = Pool() self._cache = PreshMap() self._specials = PreshMap() - self.rule_match = rule_match + self.token_match = token_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -105,10 +104,10 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, self._rules, - self.rule_match, self._prefix_re, self._suffix_re, - self._infix_re) + self._infix_re, + self.token_match) return (self.__class__, args, None, None) @@ -208,7 +207,7 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - if self._match_rule(span): + if self.token_match and self.token_match(span): tokens.push_back(self.vocab.get(tokens.mem, span), False) else: span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) @@ -323,18 +322,6 @@ cdef class Tokenizer: cached.data.lexemes = lexemes self._cache.set(key, cached) - cdef int _match_rule(self, unicode string): - """Check whether the given string matches any of the patterns. - - string (unicode): The string to segment. - - Returns (int or None): The length of the prefix if present, otherwise None. - """ - if self.rule_match is None: - return 0 - match = self.rule_match(string) - return (match.end() - match.start()) if match is not None else 0 - def find_infix(self, unicode string): """Find internal split points of the string, such as hyphens. diff --git a/spacy/util.py b/spacy/util.py index 316e431ad..afed4142e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -108,11 +108,6 @@ def compile_infix_regex(entries): return re.compile(expression) -def compile_rule_regex(entries): - expression = '|'.join([piece for piece in entries if piece.strip()]) + '$' - return re.compile(expression) - - def normalize_slice(length, start, stop, step=None): if not (step is None or step == 1): raise ValueError("Stepped slices not supported in Span objects." From 623d94e14f5d7fc2162529353dbcbde0e4c38564 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 31 Dec 2016 00:30:28 +1100 Subject: [PATCH 4/9] Whitespace --- spacy/tokenizer.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 63ac84482..4fe12fc3f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -29,7 +29,7 @@ cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, - infix_finditer=None, token_match = None): + infix_finditer=None, token_match=None): '''Load a Tokenizer, reading unsupplied components from the path. Arguments: From 3e8d9c772e381f7d4befb4035055c6a867561979 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 31 Dec 2016 00:52:17 +1100 Subject: [PATCH 5/9] Test interaction of token_match and punctuation Check that the new token_match function applies after punctuation is split off. 
--- spacy/tests/tokenizer/test_urls.py | 64 ++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 5d0654d50..1a964d5e5 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -2,8 +2,7 @@ from __future__ import unicode_literals import pytest - -@pytest.mark.parametrize("text", [ +URLS = [ u"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", u"www.google.com?q=google", u"google.com", @@ -13,7 +12,66 @@ import pytest u"http://www.example.com/wpstyle/?bar=baz&inga=42&quux", u"mailto:foo.bar@baz.com", u"mailto:foo-bar@baz-co.com" -]) +] + +# Punctuation we want to check is split away before the URL +PREFIXES = [ + "(", '"', "...", ":", "<", ">", ")" +] + +# Punctuation we want to check is split away after the URL +SUFFIXES = [ + "(", '"', "...", ":", "<", ">"] + +@pytest.mark.parametrize("text", URLS) def test_simple_url(en_tokenizer, text): tokens = en_tokenizer(text) assert tokens[0].orth_ == text + assert len(tokens) == 1 + + +@pytest.mark.parametrize("prefix", PREFIXES) +@pytest.mark.parametrize("url", URLS) +def test_prefixed_url(en_tokenizer, prefix, url): + tokens = en_tokenizer(prefix + url) + assert tokens[0].text == prefix + assert tokens[1].text == url + assert len(tokens) == 2 + +@pytest.mark.parametrize("suffix", SUFFIXES) +@pytest.mark.parametrize("url", URLS) +def test_prefixed_url(en_tokenizer, suffix, url): + tokens = en_tokenizer(url + suffix) + assert tokens[1].text == suffix + assert tokens[0].text == url + assert len(tokens) == 2 + +@pytest.mark.parametrize("prefix", PREFIXES) +@pytest.mark.parametrize("suffix", SUFFIXES) +@pytest.mark.parametrize("url", URLS) +def test_surround_url(en_tokenizer, prefix, suffix, url): + tokens = en_tokenizer(prefix + url + suffix) + assert tokens[0].text == prefix + assert tokens[1].text == url + assert tokens[2].text == suffix + assert len(tokens) == 3 + +@pytest.mark.parametrize("prefix1", PREFIXES) +@pytest.mark.parametrize("prefix2", PREFIXES) +@pytest.mark.parametrize("url", URLS) +def test_two_prefix_url(en_tokenizer, prefix1, prefix2, url): + tokens = en_tokenizer(prefix1 + prefix2 + url) + assert tokens[0].text == prefix1 + assert tokens[1].text == prefix2 + assert tokens[2].text == url + assert len(tokens) == 3 + +@pytest.mark.parametrize("suffix1", SUFFIXES) +@pytest.mark.parametrize("suffix2", SUFFIXES) +@pytest.mark.parametrize("url", URLS) +def test_two_prefix_url(en_tokenizer, suffix1, suffix2, url): + tokens = en_tokenizer(url + suffix1 + suffix2) + assert tokens[0].text == url + assert tokens[1].text == suffix1 + assert tokens[2].text == suffix2 + assert len(tokens) == 3 From 3ba7c167a8c1f51eab20b055540bcffd4c097ae9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Dec 2016 17:10:08 -0600 Subject: [PATCH 6/9] Fix URL tests --- spacy/tests/tokenizer/test_urls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 1a964d5e5..9e0172834 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -16,12 +16,12 @@ URLS = [ # Punctuation we want to check is split away before the URL PREFIXES = [ - "(", '"', "...", ":", "<", ">", ")" + "(", '"', "...", ">" ] # Punctuation we want to check is 
split away after the URL SUFFIXES = [ - "(", '"', "...", ":", "<", ">"] + '"', ":", ">"] @pytest.mark.parametrize("text", URLS) def test_simple_url(en_tokenizer, text): @@ -40,10 +40,10 @@ def test_prefixed_url(en_tokenizer, prefix, url): @pytest.mark.parametrize("suffix", SUFFIXES) @pytest.mark.parametrize("url", URLS) -def test_prefixed_url(en_tokenizer, suffix, url): +def test_suffixed_url(en_tokenizer, url, suffix): tokens = en_tokenizer(url + suffix) - assert tokens[1].text == suffix assert tokens[0].text == url + assert tokens[1].text == suffix assert len(tokens) == 2 @pytest.mark.parametrize("prefix", PREFIXES) From fde53be3b40827ded28f33662c261770bbc2604a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Dec 2016 17:11:50 -0600 Subject: [PATCH 7/9] Move whole token mach inside _split_affixes. --- spacy/tokenizer.pyx | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4fe12fc3f..0e83c4a75 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -207,12 +207,9 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - if self.token_match and self.token_match(span): - tokens.push_back(self.vocab.get(tokens.mem, span), False) - else: - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) - self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) + self._attach_tokens(tokens, span, &prefixes, &suffixes) + self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, @@ -224,6 +221,8 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: + if self.token_match and self.token_match(string): + break last_size = len(string) pre_len = self.find_prefix(string) if pre_len != 0: @@ -234,6 +233,8 @@ cdef class Tokenizer: string = minus_pre prefixes.push_back(self.vocab.get(mem, prefix)) break + if self.token_match and self.token_match(string): + break suf_len = self.find_suffix(string) if suf_len != 0: suffix = string[-suf_len:] @@ -271,7 +272,11 @@ cdef class Tokenizer: tokens.push_back(prefixes[0][i], False) if string: cache_hit = self._try_cache(hash_string(string), tokens) - if not cache_hit: + if cache_hit: + pass + elif self.token_match and self.token_match(string): + tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size()) + else: matches = self.find_infix(string) if not matches: tokens.push_back(self.vocab.get(tokens.mem, string), False) From 1bd53bbf89fe134e20ccb3e6000798c02ef03468 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 11:26:21 +0100 Subject: [PATCH 8/9] Fix typos (resolves #718) --- spacy/en/tokenizer_exceptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 398ae486b..ee6f4675f 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -599,7 +599,7 @@ TOKENIZER_EXCEPTIONS = { ], "She's": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: "'s"} ], @@ -741,7 +741,7 @@ TOKENIZER_EXCEPTIONS = { ], "Shedve": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: 
"PRP"}, {ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "ve", LEMMA: "have", TAG: "VB"} ], From 4fc4d3d0e39054cb5aac0daa1e6417234a00c33b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 15:41:16 +0100 Subject: [PATCH 9/9] Update PULL_REQUEST_TEMPLATE.md --- .github/PULL_REQUEST_TEMPLATE.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e99d6dadc..a55f98646 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -12,8 +12,6 @@ -## Screenshots (if appropriate): - ## Types of changes - [ ] Bug fix (non-breaking change fixing an issue) @@ -27,4 +25,4 @@ - [ ] My change requires a change to spaCy's documentation. - [ ] I have updated the documentation accordingly. - [ ] I have added tests to cover my changes. -- [ ] All new and existing tests passed. \ No newline at end of file +- [ ] All new and existing tests passed.