From 1748549aebce44b3e3dce42815fd281b4e6894bd Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Wed, 21 Dec 2016 23:16:19 +0100 Subject: [PATCH 01/81] Added exception pattern mechanism to the tokenizer. --- spacy/language.py | 6 ++++- spacy/language_data/__init__.py | 1 + spacy/language_data/special_cases.py | 5 ++++ spacy/tests/tokenizer/test_urls.py | 19 ++++++++++++++ spacy/tokenizer.pxd | 2 ++ spacy/tokenizer.pyx | 39 +++++++++++++++++++++------- spacy/util.py | 5 ++++ 7 files changed, 67 insertions(+), 10 deletions(-) create mode 100644 spacy/language_data/special_cases.py create mode 100644 spacy/tests/tokenizer/test_urls.py diff --git a/spacy/language.py b/spacy/language.py index c6f1376a4..16bffcd7b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -67,6 +67,8 @@ class BaseDefaults(object): @classmethod def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions + if cls.exception_patterns: + rule_match = util.compile_rule_regex(cls.exception_patterns).match if cls.prefixes: prefix_search = util.compile_prefix_regex(cls.prefixes).search else: @@ -80,7 +82,7 @@ class BaseDefaults(object): else: infix_finditer = None vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - return Tokenizer(vocab, rules=rules, + return Tokenizer(vocab, rules=rules, rule_match=rule_match, prefix_search=prefix_search, suffix_search=suffix_search, infix_finditer=infix_finditer) @@ -142,6 +144,8 @@ class BaseDefaults(object): pipeline.append(nlp.entity) return pipeline + exception_patterns = tuple(language_data.EXCEPTION_PATTERNS) + prefixes = tuple(language_data.TOKENIZER_PREFIXES) suffixes = tuple(language_data.TOKENIZER_SUFFIXES) diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index f6aa4317c..aa379d86d 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -3,3 +3,4 @@ from .punctuation import * from .tag_map import * from .entity_rules import * from .util import * +from .special_cases import * diff --git a/spacy/language_data/special_cases.py b/spacy/language_data/special_cases.py new file mode 100644 index 000000000..e7b2be5a5 --- /dev/null +++ b/spacy/language_data/special_cases.py @@ -0,0 +1,5 @@ +from __future__ import unicode_literals + +EXCEPTION_PATTERNS = r''' +((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?) 
+'''.strip().split() diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py new file mode 100644 index 000000000..5d0654d50 --- /dev/null +++ b/spacy/tests/tokenizer/test_urls.py @@ -0,0 +1,19 @@ +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize("text", [ + u"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", + u"www.google.com?q=google", + u"google.com", + u"www.red-stars.com", + pytest.mark.xfail(u"red-stars.com"), + u"http://foo.com/blah_(wikipedia)#cite-1", + u"http://www.example.com/wpstyle/?bar=baz&inga=42&quux", + u"mailto:foo.bar@baz.com", + u"mailto:foo-bar@baz-co.com" +]) +def test_simple_url(en_tokenizer, text): + tokens = en_tokenizer(text) + assert tokens[0].orth_ == text diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index e53b7dbd1..24c76f7ee 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -16,6 +16,7 @@ cdef class Tokenizer: cdef PreshMap _specials cpdef readonly Vocab vocab + cdef public object rule_match cdef public object prefix_search cdef public object suffix_search cdef public object infix_finditer @@ -24,6 +25,7 @@ cdef class Tokenizer: cpdef Doc tokens_from_list(self, list strings) cdef int _try_cache(self, hash_t key, Doc tokens) except -1 + cdef int _match_rule(self, unicode string) cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 66c93528b..ec5b5ea87 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -28,7 +28,7 @@ from .tokens.doc cimport Doc cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod - def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, + def load(cls, path, Vocab vocab, rules=None, rule_match = None, prefix_search=None, suffix_search=None, infix_finditer=None): '''Load a Tokenizer, reading unsupplied components from the path. @@ -39,6 +39,8 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. + rule_match: + Special case matcher. Signature of re.compile(string).match prefix_search: Signature of re.compile(string).search suffix_search: @@ -65,10 +67,9 @@ cdef class Tokenizer: with (path / 'tokenizer' / 'infix.txt').open() as file_: entries = file_.read().split('\n') infix_finditer = util.compile_infix_regex(entries).finditer - return cls(vocab, rules, prefix_search, suffix_search, infix_finditer) + return cls(vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer) - - def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer): + def __init__(self, Vocab vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer): '''Create a Tokenizer, to create Doc objects given unicode text. Arguments: @@ -76,6 +77,9 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. + rule_match: + A function matching the signature of re.compile(string).match + to match special cases for the tokenizer. prefix_search: A function matching the signature of re.compile(string).search to match prefixes. 
@@ -89,6 +93,7 @@ cdef class Tokenizer: self.mem = Pool() self._cache = PreshMap() self._specials = PreshMap() + self.rule_match = rule_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -100,8 +105,9 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, self._rules, - self._prefix_re, - self._suffix_re, + self.rule_match, + self._prefix_re, + self._suffix_re, self._infix_re) return (self.__class__, args, None, None) @@ -202,9 +208,12 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) - self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) + if self._match_rule(span): + tokens.push_back(self.vocab.get(tokens.mem, span), False) + else: + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) + self._attach_tokens(tokens, span, &prefixes, &suffixes) + self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, @@ -314,6 +323,18 @@ cdef class Tokenizer: cached.data.lexemes = lexemes self._cache.set(key, cached) + cdef int _match_rule(self, unicode string): + """Check whether the given string matches any of the patterns. + + string (unicode): The string to segment. + + Returns (int or None): The length of the prefix if present, otherwise None. + """ + if self.rule_match is None: + return 0 + match = self.rule_match(string) + return (match.end() - match.start()) if match is not None else 0 + def find_infix(self, unicode string): """Find internal split points of the string, such as hyphens. diff --git a/spacy/util.py b/spacy/util.py index afed4142e..316e431ad 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -108,6 +108,11 @@ def compile_infix_regex(entries): return re.compile(expression) +def compile_rule_regex(entries): + expression = '|'.join([piece for piece in entries if piece.strip()]) + '$' + return re.compile(expression) + + def normalize_slice(length, start, stop, step=None): if not (step is None or step == 1): raise ValueError("Stepped slices not supported in Span objects." From d9c59c47517843f97ea3e36b0db66879e2af1b5d Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Wed, 21 Dec 2016 23:30:49 +0100 Subject: [PATCH 02/81] Maintaining backward compatibility. --- spacy/tokenizer.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index ec5b5ea87..4aabdb3db 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -28,8 +28,8 @@ from .tokens.doc cimport Doc cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod - def load(cls, path, Vocab vocab, rules=None, rule_match = None, prefix_search=None, suffix_search=None, - infix_finditer=None): + def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, + infix_finditer=None, rule_match = None): '''Load a Tokenizer, reading unsupplied components from the path. 
Arguments: @@ -69,7 +69,7 @@ cdef class Tokenizer: infix_finditer = util.compile_infix_regex(entries).finditer return cls(vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer) - def __init__(self, Vocab vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer): + def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, rule_match=None): '''Create a Tokenizer, to create Doc objects given unicode text. Arguments: @@ -77,9 +77,6 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. - rule_match: - A function matching the signature of re.compile(string).match - to match special cases for the tokenizer. prefix_search: A function matching the signature of re.compile(string).search to match prefixes. @@ -89,6 +86,9 @@ cdef class Tokenizer: infix_finditer: A function matching the signature of re.compile(string).finditer to find infixes. + rule_match: + A function matching the signature of re.compile(string).match + to match special cases for the tokenizer. ''' self.mem = Pool() self._cache = PreshMap() From 3a9be4d485f8f80201ddf8056e16bf39c18ceff3 Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Fri, 23 Dec 2016 23:49:34 +0100 Subject: [PATCH 03/81] Updated token exception handling mechanism to allow the usage of arbitrary functions as token exception matchers. --- spacy/language.py | 10 +++--- spacy/language_data/__init__.py | 2 +- spacy/language_data/special_cases.py | 5 --- spacy/language_data/tokenizer_exceptions.py | 11 +++++++ spacy/tokenizer.pxd | 3 +- spacy/tokenizer.pyx | 35 +++++++-------------- spacy/util.py | 5 --- 7 files changed, 29 insertions(+), 42 deletions(-) delete mode 100644 spacy/language_data/special_cases.py create mode 100644 spacy/language_data/tokenizer_exceptions.py diff --git a/spacy/language.py b/spacy/language.py index 16bffcd7b..bebdeab20 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -67,8 +67,8 @@ class BaseDefaults(object): @classmethod def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions - if cls.exception_patterns: - rule_match = util.compile_rule_regex(cls.exception_patterns).match + if cls.token_match: + token_match = cls.token_match if cls.prefixes: prefix_search = util.compile_prefix_regex(cls.prefixes).search else: @@ -82,9 +82,9 @@ class BaseDefaults(object): else: infix_finditer = None vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - return Tokenizer(vocab, rules=rules, rule_match=rule_match, + return Tokenizer(vocab, rules=rules, prefix_search=prefix_search, suffix_search=suffix_search, - infix_finditer=infix_finditer) + infix_finditer=infix_finditer, token_match=token_match) @classmethod def create_tagger(cls, nlp=None): @@ -144,7 +144,7 @@ class BaseDefaults(object): pipeline.append(nlp.entity) return pipeline - exception_patterns = tuple(language_data.EXCEPTION_PATTERNS) + token_match = language_data.TOKEN_MATCH prefixes = tuple(language_data.TOKENIZER_PREFIXES) diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index aa379d86d..028924796 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -3,4 +3,4 @@ from .punctuation import * from .tag_map import * from .entity_rules import * from .util import * -from .special_cases import * +from .tokenizer_exceptions import * diff --git a/spacy/language_data/special_cases.py b/spacy/language_data/special_cases.py deleted file mode 100644 index e7b2be5a5..000000000 --- 
a/spacy/language_data/special_cases.py +++ /dev/null @@ -1,5 +0,0 @@ -from __future__ import unicode_literals - -EXCEPTION_PATTERNS = r''' -((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?) -'''.strip().split() diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py new file mode 100644 index 000000000..6551440f2 --- /dev/null +++ b/spacy/language_data/tokenizer_exceptions.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals + +import re + +_URL_PATTERN = r''' +^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$ +'''.strip() + +TOKEN_MATCH = re.compile(_URL_PATTERN).match + +__all__ = ['TOKEN_MATCH'] diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 24c76f7ee..1a3e86b49 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -16,7 +16,7 @@ cdef class Tokenizer: cdef PreshMap _specials cpdef readonly Vocab vocab - cdef public object rule_match + cdef public object token_match cdef public object prefix_search cdef public object suffix_search cdef public object infix_finditer @@ -25,7 +25,6 @@ cdef class Tokenizer: cpdef Doc tokens_from_list(self, list strings) cdef int _try_cache(self, hash_t key, Doc tokens) except -1 - cdef int _match_rule(self, unicode string) cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4aabdb3db..63ac84482 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -29,7 +29,7 @@ cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, - infix_finditer=None, rule_match = None): + infix_finditer=None, token_match = None): '''Load a Tokenizer, reading unsupplied components from the path. Arguments: @@ -39,8 +39,8 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. - rule_match: - Special case matcher. Signature of re.compile(string).match + token_match: + A boolean function matching strings that becomes tokens. prefix_search: Signature of re.compile(string).search suffix_search: @@ -67,9 +67,9 @@ cdef class Tokenizer: with (path / 'tokenizer' / 'infix.txt').open() as file_: entries = file_.read().split('\n') infix_finditer = util.compile_infix_regex(entries).finditer - return cls(vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer) + return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match) - def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, rule_match=None): + def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): '''Create a Tokenizer, to create Doc objects given unicode text. Arguments: @@ -86,14 +86,13 @@ cdef class Tokenizer: infix_finditer: A function matching the signature of re.compile(string).finditer to find infixes. - rule_match: - A function matching the signature of re.compile(string).match - to match special cases for the tokenizer. + token_match: + A boolean function matching strings that becomes tokens. 
''' self.mem = Pool() self._cache = PreshMap() self._specials = PreshMap() - self.rule_match = rule_match + self.token_match = token_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -105,10 +104,10 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, self._rules, - self.rule_match, self._prefix_re, self._suffix_re, - self._infix_re) + self._infix_re, + self.token_match) return (self.__class__, args, None, None) @@ -208,7 +207,7 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - if self._match_rule(span): + if self.token_match and self.token_match(span): tokens.push_back(self.vocab.get(tokens.mem, span), False) else: span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) @@ -323,18 +322,6 @@ cdef class Tokenizer: cached.data.lexemes = lexemes self._cache.set(key, cached) - cdef int _match_rule(self, unicode string): - """Check whether the given string matches any of the patterns. - - string (unicode): The string to segment. - - Returns (int or None): The length of the prefix if present, otherwise None. - """ - if self.rule_match is None: - return 0 - match = self.rule_match(string) - return (match.end() - match.start()) if match is not None else 0 - def find_infix(self, unicode string): """Find internal split points of the string, such as hyphens. diff --git a/spacy/util.py b/spacy/util.py index 316e431ad..afed4142e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -108,11 +108,6 @@ def compile_infix_regex(entries): return re.compile(expression) -def compile_rule_regex(entries): - expression = '|'.join([piece for piece in entries if piece.strip()]) + '$' - return re.compile(expression) - - def normalize_slice(length, start, stop, step=None): if not (step is None or step == 1): raise ValueError("Stepped slices not supported in Span objects." From f112e7754e4f4368f0a82c3aae3a58f5300176f0 Mon Sep 17 00:00:00 2001 From: Petter Hohle Date: Wed, 28 Dec 2016 18:39:01 +0100 Subject: [PATCH 04/81] Add PART to tag map 16 of the 17 PoS tags in the UD tag set is added; PART is missing. --- spacy/language_data/tag_map.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py index f5b6b5040..966960721 100644 --- a/spacy/language_data/tag_map.py +++ b/spacy/language_data/tag_map.py @@ -20,5 +20,6 @@ TAG_MAP = { "X": {POS: X}, "CONJ": {POS: CONJ}, "ADJ": {POS: ADJ}, - "VERB": {POS: VERB} + "VERB": {POS: VERB}, + "PART": {POS: PART} } From 623d94e14f5d7fc2162529353dbcbde0e4c38564 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 31 Dec 2016 00:30:28 +1100 Subject: [PATCH 05/81] Whitespace --- spacy/tokenizer.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 63ac84482..4fe12fc3f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -29,7 +29,7 @@ cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, - infix_finditer=None, token_match = None): + infix_finditer=None, token_match=None): '''Load a Tokenizer, reading unsupplied components from the path. 
Arguments: From 3e8d9c772e381f7d4befb4035055c6a867561979 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 31 Dec 2016 00:52:17 +1100 Subject: [PATCH 06/81] Test interaction of token_match and punctuation Check that the new token_match function applies after punctuation is split off. --- spacy/tests/tokenizer/test_urls.py | 64 ++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 5d0654d50..1a964d5e5 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -2,8 +2,7 @@ from __future__ import unicode_literals import pytest - -@pytest.mark.parametrize("text", [ +URLS = [ u"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", u"www.google.com?q=google", u"google.com", @@ -13,7 +12,66 @@ import pytest u"http://www.example.com/wpstyle/?bar=baz&inga=42&quux", u"mailto:foo.bar@baz.com", u"mailto:foo-bar@baz-co.com" -]) +] + +# Punctuation we want to check is split away before the URL +PREFIXES = [ + "(", '"', "...", ":", "<", ">", ")" +] + +# Punctuation we want to check is split away after the URL +SUFFIXES = [ + "(", '"', "...", ":", "<", ">"] + +@pytest.mark.parametrize("text", URLS) def test_simple_url(en_tokenizer, text): tokens = en_tokenizer(text) assert tokens[0].orth_ == text + assert len(tokens) == 1 + + +@pytest.mark.parametrize("prefix", PREFIXES) +@pytest.mark.parametrize("url", URLS) +def test_prefixed_url(en_tokenizer, prefix, url): + tokens = en_tokenizer(prefix + url) + assert tokens[0].text == prefix + assert tokens[1].text == url + assert len(tokens) == 2 + +@pytest.mark.parametrize("suffix", SUFFIXES) +@pytest.mark.parametrize("url", URLS) +def test_prefixed_url(en_tokenizer, suffix, url): + tokens = en_tokenizer(url + suffix) + assert tokens[1].text == suffix + assert tokens[0].text == url + assert len(tokens) == 2 + +@pytest.mark.parametrize("prefix", PREFIXES) +@pytest.mark.parametrize("suffix", SUFFIXES) +@pytest.mark.parametrize("url", URLS) +def test_surround_url(en_tokenizer, prefix, suffix, url): + tokens = en_tokenizer(prefix + url + suffix) + assert tokens[0].text == prefix + assert tokens[1].text == url + assert tokens[2].text == suffix + assert len(tokens) == 3 + +@pytest.mark.parametrize("prefix1", PREFIXES) +@pytest.mark.parametrize("prefix2", PREFIXES) +@pytest.mark.parametrize("url", URLS) +def test_two_prefix_url(en_tokenizer, prefix1, prefix2, url): + tokens = en_tokenizer(prefix1 + prefix2 + url) + assert tokens[0].text == prefix1 + assert tokens[1].text == prefix2 + assert tokens[2].text == url + assert len(tokens) == 3 + +@pytest.mark.parametrize("suffix1", SUFFIXES) +@pytest.mark.parametrize("suffix2", SUFFIXES) +@pytest.mark.parametrize("url", URLS) +def test_two_prefix_url(en_tokenizer, suffix1, suffix2, url): + tokens = en_tokenizer(url + suffix1 + suffix2) + assert tokens[0].text == url + assert tokens[1].text == suffix1 + assert tokens[2].text == suffix2 + assert len(tokens) == 3 From 3ba7c167a8c1f51eab20b055540bcffd4c097ae9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Dec 2016 17:10:08 -0600 Subject: [PATCH 07/81] Fix URL tests --- spacy/tests/tokenizer/test_urls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 
1a964d5e5..9e0172834 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -16,12 +16,12 @@ URLS = [ # Punctuation we want to check is split away before the URL PREFIXES = [ - "(", '"', "...", ":", "<", ">", ")" + "(", '"', "...", ">" ] # Punctuation we want to check is split away after the URL SUFFIXES = [ - "(", '"', "...", ":", "<", ">"] + '"', ":", ">"] @pytest.mark.parametrize("text", URLS) def test_simple_url(en_tokenizer, text): @@ -40,10 +40,10 @@ def test_prefixed_url(en_tokenizer, prefix, url): @pytest.mark.parametrize("suffix", SUFFIXES) @pytest.mark.parametrize("url", URLS) -def test_prefixed_url(en_tokenizer, suffix, url): +def test_suffixed_url(en_tokenizer, url, suffix): tokens = en_tokenizer(url + suffix) - assert tokens[1].text == suffix assert tokens[0].text == url + assert tokens[1].text == suffix assert len(tokens) == 2 @pytest.mark.parametrize("prefix", PREFIXES) From fde53be3b40827ded28f33662c261770bbc2604a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Dec 2016 17:11:50 -0600 Subject: [PATCH 08/81] Move whole token mach inside _split_affixes. --- spacy/tokenizer.pyx | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4fe12fc3f..0e83c4a75 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -207,12 +207,9 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - if self.token_match and self.token_match(span): - tokens.push_back(self.vocab.get(tokens.mem, span), False) - else: - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) - self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) + self._attach_tokens(tokens, span, &prefixes, &suffixes) + self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, @@ -224,6 +221,8 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: + if self.token_match and self.token_match(string): + break last_size = len(string) pre_len = self.find_prefix(string) if pre_len != 0: @@ -234,6 +233,8 @@ cdef class Tokenizer: string = minus_pre prefixes.push_back(self.vocab.get(mem, prefix)) break + if self.token_match and self.token_match(string): + break suf_len = self.find_suffix(string) if suf_len != 0: suffix = string[-suf_len:] @@ -271,7 +272,11 @@ cdef class Tokenizer: tokens.push_back(prefixes[0][i], False) if string: cache_hit = self._try_cache(hash_string(string), tokens) - if not cache_hit: + if cache_hit: + pass + elif self.token_match and self.token_match(string): + tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size()) + else: matches = self.find_infix(string) if not matches: tokens.push_back(self.vocab.get(tokens.mem, string), False) From 505f31f2bfc28c38259354cae69e3260c043cec3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 31 Dec 2016 10:24:24 +0100 Subject: [PATCH 09/81] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index c48c3479b..9f65c24fb 100644 --- a/README.rst +++ b/README.rst @@ -24,7 +24,7 @@ open-source software, released under the MIT license. 
:target: https://pypi.python.org/pypi/spacy :alt: pypi Version -.. image:: https://badges.gitter.im/spaCy-users.png +.. image:: https://badges.gitter.im/explosion.png :target: https://gitter.im/explosion/spaCy :alt: spaCy on Gitter From acdd2fc9a61a2396b4c5331aaddc33c5f27780b3 Mon Sep 17 00:00:00 2001 From: Guy Rosin Date: Sat, 31 Dec 2016 14:53:05 +0200 Subject: [PATCH 10/81] Tiny code typo --- website/docs/usage/rule-based-matching.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index bedadb0d3..fde9ee4d7 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -141,7 +141,7 @@ p span.merge(label=label, tag='NNP' if label else span.root.tag_) matcher.add_entity('GoogleNow', on_match=merge_phrases) - matcher.add_pattern('GoogleNow', {ORTH: 'Google'}, {ORTH: 'Now'}]) + matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}]) doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded']) matcher(doc) print([w.text for w in doc]) From d845ab3d201cd4dbafe998042760aadb26fab257 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 1 Jan 2017 03:17:29 +0100 Subject: [PATCH 11/81] Add Gitter room to social meta --- website/_harp.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index bc8cf4d84..c9e8dd935 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -22,7 +22,8 @@ "twitter": "spacy_io", "github": "explosion", "reddit": "spacynlp", - "codepen": "explosion" + "codepen": "explosion", + "gitter": "explosion/spaCy" }, "NAVIGATION": { From 2afbf6b6c04074a96856e4df2107b9b252a189ac Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 1 Jan 2017 03:17:43 +0100 Subject: [PATCH 12/81] Add missing closing tag for symbol --- website/assets/img/graphics.svg | 1 + 1 file changed, 1 insertion(+) diff --git a/website/assets/img/graphics.svg b/website/assets/img/graphics.svg index 23036f4ca..dc69deda4 100644 --- a/website/assets/img/graphics.svg +++ b/website/assets/img/graphics.svg @@ -64,5 +64,6 @@ matt-signature + From 3ca8de4666c30616452c1374dacc727754d3d764 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 1 Jan 2017 03:18:08 +0100 Subject: [PATCH 13/81] Use rem value for top/bottom card padding Fix rendering / interpretation error in Firefox --- website/assets/css/_base/_objects.sass | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/assets/css/_base/_objects.sass b/website/assets/css/_base/_objects.sass index 2b037dca7..7aaaef787 100644 --- a/website/assets/css/_base/_objects.sass +++ b/website/assets/css/_base/_objects.sass @@ -60,7 +60,7 @@ background: $color-back border-radius: 2px border: 1px solid $color-subtle - padding: 3.5% 2.5% + padding: 3rem 2.5% //- Icons From cd0da315d59e34977f91288c0b708f48c8e9f43c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 1 Jan 2017 03:18:36 +0100 Subject: [PATCH 14/81] Bump version --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index c9e8dd935..dae05528a 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -54,7 +54,7 @@ } }, - "V_CSS": "1.10", + "V_CSS": "1.12", "V_JS": "1.0", "DEFAULT_SYNTAX" : "python", "ANALYTICS": "UA-58931649-1", From a9a7cddf5b92c8e83965d42b4a1b3c8868212a6e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 1 Jan 2017 03:18:51 +0100 Subject: [PATCH 
15/81] Update icons and remove unused SVG meta --- website/_includes/_mixins-base.jade | 2 +- website/assets/img/icons.svg | 38 +++++++++++++---------------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 27f195690..21f18db18 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -30,7 +30,7 @@ mixin svg(file, name, width, height) //- Icon mixin icon(name, size) - +svg("icons", "icon-" + name, size || 20).o-icon&attributes(attributes) + +svg("icons", name, size || 20).o-icon&attributes(attributes) //- Pro/Con/Neutral icon diff --git a/website/assets/img/icons.svg b/website/assets/img/icons.svg index 9237c9994..224224084 100644 --- a/website/assets/img/icons.svg +++ b/website/assets/img/icons.svg @@ -1,32 +1,28 @@ - - github - + + - - code - + + - - anchor - + + - - book - + + - - pro - + + - - con - + + - - neutral - + + + + + From e3d84572f2099d6e283333fa608f33329a933e1e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 1 Jan 2017 12:28:37 +0100 Subject: [PATCH 16/81] Fix ents input format example --- website/docs/usage/entity-recognition.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index 4b62a290b..a96df5694 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -57,7 +57,7 @@ p doc.ents = [Span(0, 1, label='GPE')] assert doc[0].ent_type_ == 'GPE' doc.ents = [] - doc.ents = [(u'LondonCity', 0, 1, u'GPE')] + doc.ents = [(u'LondonCity', u'GPE', 0, 1)] p | The value you assign should be a sequence, the values of which From 4acd026cb6b39e957d661bd7458d2f2c26c056a9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 1 Jan 2017 12:43:43 +0100 Subject: [PATCH 17/81] Add missing documentation to mixins --- website/_includes/_mixins-base.jade | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 21f18db18..6e9e71508 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -1,6 +1,7 @@ //- 💫 MIXINS > BASE //- Aside wrapper + label - [string] aside label mixin aside-wrapper(label) aside.c-aside @@ -21,6 +22,10 @@ mixin date(input) //- SVG from map + file - [string] SVG file name in /assets/img/ + name - [string] SVG symbol id + width - [integer] width in px + height - [integer] height in px (default: same as width) mixin svg(file, name, width, height) svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) @@ -28,12 +33,15 @@ mixin svg(file, name, width, height) //- Icon + name - [string] icon name, should be SVG symbol ID + size - [integer] icon width and height (default: 20) mixin icon(name, size) +svg("icons", name, size || 20).o-icon&attributes(attributes) //- Pro/Con/Neutral icon + icon - [string] "pro", "con" or "neutral" (default: "neutral") mixin procon(icon) - colors = { pro: "green", con: "red" } @@ -41,6 +49,7 @@ mixin procon(icon) //- Headlines Helper Mixin + level - [integer] 1, 2, 3, 4, or 5 mixin headline(level) if level == 1 @@ -65,6 +74,7 @@ mixin headline(level) //- Permalink rendering + id - [string] permalink ID used for link anchor mixin permalink(id) if id @@ -77,6 +87,7 @@ mixin permalink(id) //- Terminal-style code window + label - [string] title displayed in top bar of 
terminal window mixin terminal(label) .x-terminal From 134e115d9cafbaa519931b6462412897ae89b3ee Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 1 Jan 2017 12:45:17 +0100 Subject: [PATCH 18/81] Bump version --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index dae05528a..32e6874d7 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -54,7 +54,7 @@ } }, - "V_CSS": "1.12", + "V_CSS": "1.13", "V_JS": "1.0", "DEFAULT_SYNTAX" : "python", "ANALYTICS": "UA-58931649-1", From 78e54b375f414c652723723596eb6e397c5a87c6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 1 Jan 2017 12:45:37 +0100 Subject: [PATCH 19/81] Move scripts to own file --- website/_includes/_scripts.jade | 23 +++++++++++++++++++++++ website/_layout.jade | 11 +---------- 2 files changed, 24 insertions(+), 10 deletions(-) create mode 100644 website/_includes/_scripts.jade diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade new file mode 100644 index 000000000..544cf0977 --- /dev/null +++ b/website/_includes/_scripts.jade @@ -0,0 +1,23 @@ +//- 💫 INCLUDES > SCRIPTS + +script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript") +script(src="/assets/js/prism.js", type="text/javascript") + +if SECTION == "docs" + script. + ((window.gitter = {}).chat = {}).options = { + useStyles: false, + activationElement: '.js-gitter-button', + targetElement: '.js-gitter', + room: '!{SOCIAL.gitter}' + }; + + script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) + +if environment == "deploy" + script + | window.ga=window.ga||function(){ + | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; + | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); + + script(async src="https://www.google-analytics.com/analytics.js") diff --git a/website/_layout.jade b/website/_layout.jade index b04c4b5f3..d5c52df3f 100644 --- a/website/_layout.jade +++ b/website/_layout.jade @@ -52,13 +52,4 @@ html(lang="en") main!=yield include _includes/_footer - script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript") - script(src="/assets/js/prism.js", type="text/javascript") - - if environment == "deploy" - script - | window.ga=window.ga||function(){ - | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; - | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); - - script(async src="https://www.google-analytics.com/analytics.js") + include _includes/_scripts From a1a4b253a15a202591a1044a1e0776a79160661b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 1 Jan 2017 12:46:01 +0100 Subject: [PATCH 20/81] Add Gitter chat widget component to docs --- website/_includes/_mixins-base.jade | 12 +++ website/_includes/_page-docs.jade | 2 + website/assets/css/_components/_chat.sass | 100 ++++++++++++++++++++++ website/assets/css/style.sass | 1 + 4 files changed, 115 insertions(+) create mode 100644 website/assets/css/_components/_chat.sass diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 6e9e71508..ed0802a4f 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -98,6 +98,18 @@ mixin terminal(label) block +//- Gitter chat button and widget + button - [string] text shown on button + label - [string] title of chat window (default: same as button) + +mixin gitter(button, label) + aside.js-gitter.c-chat.is-collapsed(data-title=(label || button)) + + button.js-gitter-button.c-chat__button.u-text-small + +icon("chat").o-icon--inline + !=button 
+ + //- Logo mixin logo() diff --git a/website/_includes/_page-docs.jade b/website/_includes/_page-docs.jade index 09cbfa6a5..72db134cd 100644 --- a/website/_includes/_page-docs.jade +++ b/website/_includes/_page-docs.jade @@ -24,4 +24,6 @@ main.o-main.o-main--sidebar.o-main--aside .o-inline-list +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)] + +gitter("spaCy chat") + include _footer diff --git a/website/assets/css/_components/_chat.sass b/website/assets/css/_components/_chat.sass new file mode 100644 index 000000000..6ac1a67a7 --- /dev/null +++ b/website/assets/css/_components/_chat.sass @@ -0,0 +1,100 @@ +//- 💫 CSS > COMPONENTS > CHAT + +.c-chat + @include position(fixed, top, left, 0, 60%) + bottom: 0 + right: 0 + display: flex + flex-flow: column nowrap + background: $color-back + transition: transform 0.3s cubic-bezier(0.16, 0.22, 0.22, 1.7) + box-shadow: -0.25rem 0 1rem 0 rgba($color-front, 0.25) + z-index: 100 + + @include breakpoint(min, md) + left: calc(100% - #{$aside-width} - #{$aside-padding}) + + @include breakpoint(max, sm) + left: 50% + + @include breakpoint(max, xs) + left: 0 + + &.is-collapsed:not(.is-loading) + transform: translateX(110%) + + &:before + @include position(absolute, top, left, 1rem, 2rem) + content: attr(data-title) + font: bold 1.4rem $font-code + text-transform: uppercase + color: $color-back + + &:after + @include position(absolute, top, left, 0, 100%) + content: "" + z-index: -1 + bottom: 0 + right: -100% + background: $color-back + + & > iframe + width: 100% + flex: 1 1 calc(100% - #{$nav-height}) + border: 0 + + .gitter-chat-embed-loading-wrapper + @include position(absolute, top, left, 0, 0) + right: 0 + bottom: 0 + display: none + justify-content: center + align-items: center + + .is-loading & + display: flex + + .gitter-chat-embed-action-bar, + .gitter-chat-embed-action-bar-item + display: flex + + .gitter-chat-embed-action-bar + align-items: center + justify-content: flex-end + background: $color-theme + padding: 0 1rem 0 2rem + flex: 0 0 $nav-height + + .gitter-chat-embed-action-bar-item + @include size(40px) + padding: 0 + opacity: 0.65 + background-position: 50% + background-repeat: no-repeat + background-size: 22px 22px + border: 0 + cursor: pointer + transition: all 0.2s ease + + &:focus, + &:hover + opacity: 1 + + &.gitter-chat-embed-action-bar-item-pop-out + background-image: 
url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAyMDAgMTcxLjQyOSIgZmlsbD0iIzNhMzEzMyI+PHBhdGggZD0iTTE1Ny4xNDMsMTAzLjU3MXYzNS43MTRjMCw4Ljg1NC0zLjE0NCwxNi40MjYtOS40MzEsMjIuNzEzcy0xMy44NTgsOS40MzEtMjIuNzEyLDkuNDMxSDMyLjE0MyBjLTguODU0LDAtMTYuNDI1LTMuMTQ0LTIyLjcxMi05LjQzMVMwLDE0OC4xNCwwLDEzOS4yODVWNDYuNDI5YzAtOC44NTQsMy4xNDQtMTYuNDI1LDkuNDMxLTIyLjcxMiBjNi4yODctNi4yODcsMTMuODU4LTkuNDMxLDIyLjcxMi05LjQzMWg3OC41NzJjMS4wNDEsMCwxLjg5NiwwLjMzNSwyLjU2NiwxLjAwNGMwLjY3LDAuNjcsMS4wMDQsMS41MjUsMS4wMDQsMi41NjdWMjUgYzAsMS4wNDItMC4zMzQsMS44OTctMS4wMDQsMi41NjdjLTAuNjcsMC42Ny0xLjUyNSwxLjAwNC0yLjU2NiwxLjAwNEgzMi4xNDNjLTQuOTExLDAtOS4xMTUsMS43NDktMTIuNjEyLDUuMjQ2IHMtNS4yNDYsNy43MDEtNS4yNDYsMTIuNjEydjkyLjg1NmMwLDQuOTExLDEuNzQ5LDkuMTE1LDUuMjQ2LDEyLjYxMnM3LjcwMSw1LjI0NSwxMi42MTIsNS4yNDVIMTI1YzQuOTEsMCw5LjExNS0xLjc0OCwxMi42MTEtNS4yNDUgYzMuNDk3LTMuNDk3LDUuMjQ2LTcuNzAxLDUuMjQ2LTEyLjYxMnYtMzUuNzE0YzAtMS4wNDIsMC4zMzQtMS44OTcsMS4wMDQtMi41NjdjMC42Ny0wLjY2OSwxLjUyNS0xLjAwNCwyLjU2Ny0xLjAwNGg3LjE0MyBjMS4wNDIsMCwxLjg5NywwLjMzNSwyLjU2NywxLjAwNEMxNTYuODA5LDEwMS42NzQsMTU3LjE0MywxMDIuNTI5LDE1Ny4xNDMsMTAzLjU3MXogTTIwMCw3LjE0M3Y1Ny4xNDMgYzAsMS45MzUtMC43MDcsMy42MDktMi4xMjEsNS4wMjJjLTEuNDEzLDEuNDE0LTMuMDg4LDIuMTIxLTUuMDIxLDIuMTIxYy0xLjkzNSwwLTMuNjA5LTAuNzA3LTUuMDIyLTIuMTIxbC0xOS42NDQtMTkuNjQzIGwtNzIuNzY3LDcyLjc2OWMtMC43NDQsMC43NDQtMS42LDEuMTE1LTIuNTY3LDEuMTE1cy0xLjgyMy0wLjM3MS0yLjU2Ny0xLjExNUw3Ny41NjcsMTA5LjcxYy0wLjc0NC0wLjc0NC0xLjExNi0xLjYtMS4xMTYtMi41NjcgYzAtMC45NjcsMC4zNzItMS44MjIsMS4xMTYtMi41NjZsNzIuNzY4LTcyLjc2OGwtMTkuNjQ0LTE5LjY0M2MtMS40MTMtMS40MTQtMi4xMi0zLjA4OC0yLjEyLTUuMDIyYzAtMS45MzUsMC43MDctMy42MDksMi4xMi01LjAyMiBDMTMyLjEwNSwwLjcwNywxMzMuNzc5LDAsMTM1LjcxNSwwaDU3LjE0M2MxLjkzNCwwLDMuNjA4LDAuNzA3LDUuMDIxLDIuMTIxQzE5OS4yOTMsMy41MzQsMjAwLDUuMjA4LDIwMCw3LjE0M3oiLz48L3N2Zz4=) + margin-right: -4px + + &.gitter-chat-embed-action-bar-item-collapse-chat + background-image: 
url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAxNzEuNDI5IDE3MS40MjkiIGZpbGw9IiMzYTMxMzMiPjxwYXRoIGQ9Ik0xMjIuNDMzLDEwNi4xMzhsLTE2LjI5NSwxNi4yOTVjLTAuNzQ0LDAuNzQ0LTEuNiwxLjExNi0yLjU2NiwxLjExNmMtMC45NjgsMC0xLjgyMy0wLjM3Mi0yLjU2Ny0xLjExNmwtMTUuMjktMTUuMjkgbC0xNS4yOSwxNS4yOWMtMC43NDQsMC43NDQtMS42LDEuMTE2LTIuNTY3LDEuMTE2cy0xLjgyMy0wLjM3Mi0yLjU2Ny0xLjExNmwtMTYuMjk0LTE2LjI5NWMtMC43NDQtMC43NDQtMS4xMTYtMS42LTEuMTE2LTIuNTY2IGMwLTAuOTY4LDAuMzcyLTEuODIzLDEuMTE2LTIuNTY3bDE1LjI5LTE1LjI5bC0xNS4yOS0xNS4yOWMtMC43NDQtMC43NDQtMS4xMTYtMS42LTEuMTE2LTIuNTY3czAuMzcyLTEuODIzLDEuMTE2LTIuNTY3IEw2NS4yOSw0OC45OTZjMC43NDQtMC43NDQsMS42LTEuMTE2LDIuNTY3LTEuMTE2czEuODIzLDAuMzcyLDIuNTY3LDEuMTE2bDE1LjI5LDE1LjI5bDE1LjI5LTE1LjI5IGMwLjc0NC0wLjc0NCwxLjYtMS4xMTYsMi41NjctMS4xMTZjMC45NjcsMCwxLjgyMiwwLjM3MiwyLjU2NiwxLjExNmwxNi4yOTUsMTYuMjk0YzAuNzQ0LDAuNzQ0LDEuMTE2LDEuNiwxLjExNiwyLjU2NyBzLTAuMzcyLDEuODIzLTEuMTE2LDIuNTY3bC0xNS4yOSwxNS4yOWwxNS4yOSwxNS4yOWMwLjc0NCwwLjc0NCwxLjExNiwxLjYsMS4xMTYsMi41NjcgQzEyMy41NDksMTA0LjUzOSwxMjMuMTc3LDEwNS4zOTQsMTIyLjQzMywxMDYuMTM4eiBNMTQ2LjQyOSw4NS43MTRjMC0xMS4wMTItMi43MTctMjEuMTY4LTguMTQ4LTMwLjQ2OSBzLTEyLjc5Ny0xNi42NjctMjIuMDk4LTIyLjA5OFM5Ni43MjYsMjUsODUuNzE0LDI1cy0yMS4xNjgsMi43MTYtMzAuNDY5LDguMTQ3UzM4LjU3OSw0NS45NDUsMzMuMTQ3LDU1LjI0NlMyNSw3NC43MDMsMjUsODUuNzE0IHMyLjcxNiwyMS4xNjgsOC4xNDcsMzAuNDY5czEyLjc5NywxNi42NjYsMjIuMDk4LDIyLjA5OHMxOS40NTcsOC4xNDgsMzAuNDY5LDguMTQ4czIxLjE2OC0yLjcxNywzMC40NjktOC4xNDggczE2LjY2Ni0xMi43OTcsMjIuMDk4LTIyLjA5OFMxNDYuNDI5LDk2LjcyNiwxNDYuNDI5LDg1LjcxNHogTTE3MS40MjksODUuNzE0YzAsMTUuNTUxLTMuODMyLDI5Ljg5My0xMS40OTYsNDMuMDI0IGMtNy42NjQsMTMuMTMzLTE4LjA2MiwyMy41My0zMS4xOTQsMzEuMTk0Yy0xMy4xMzIsNy42NjQtMjcuNDc0LDExLjQ5Ni00My4wMjQsMTEuNDk2cy0yOS44OTItMy44MzItNDMuMDI0LTExLjQ5NiBjLTEzLjEzMy03LjY2NC0yMy41MzEtMTguMDYyLTMxLjE5NC0zMS4xOTRDMy44MzIsMTE1LjYwNywwLDEwMS4yNjUsMCw4NS43MTRTMy44MzIsNTUuODIyLDExLjQ5Niw0Mi42OSBjNy42NjQtMTMuMTMzLDE4LjA2Mi0yMy41MzEsMzEuMTk0LTMxLjE5NEM1NS44MjIsMy44MzIsNzAuMTY0LDAsODUuNzE0LDBzMjkuODkzLDMuODMyLDQzLjAyNCwxMS40OTYgYzEzLjEzMyw3LjY2NCwyMy41MywxOC4wNjIsMzEuMTk0LDMxLjE5NEMxNjcuNTk3LDU1LjgyMiwxNzEuNDI5LDcwLjE2NCwxNzEuNDI5LDg1LjcxNHoiLz48L3N2Zz4=) + +.c-chat__button + @include position(fixed, bottom, right, 0, 2rem) + padding: 1rem 1.5rem + background: $color-front + color: $color-back + border-top-left-radius: 4px + border-top-right-radius: 4px + z-index: 20 + border-color: $color-theme + border-style: solid + border-width: 1px 1px 0 1px diff --git a/website/assets/css/style.sass b/website/assets/css/style.sass index 5ab135ab9..a8d2edad4 100644 --- a/website/assets/css/style.sass +++ b/website/assets/css/style.sass @@ -24,6 +24,7 @@ $theme: blue !default @import _components/asides @import _components/buttons +@import _components/chat @import _components/code @import _components/landing @import _components/lists From 87c7496065be290dacfe840dc500a5d1d12984f8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 1 Jan 2017 13:25:28 +0100 Subject: [PATCH 21/81] Use better chat window icons with more compact markup --- website/_harp.json | 2 +- website/assets/css/_components/_chat.sass | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/website/_harp.json b/website/_harp.json index 32e6874d7..04a66f772 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -54,7 +54,7 @@ } }, - "V_CSS": "1.13", + "V_CSS": "1.14", "V_JS": "1.0", "DEFAULT_SYNTAX" : "python", "ANALYTICS": "UA-58931649-1", diff --git a/website/assets/css/_components/_chat.sass b/website/assets/css/_components/_chat.sass index 
6ac1a67a7..2a1e5cc3d 100644 --- a/website/assets/css/_components/_chat.sass +++ b/website/assets/css/_components/_chat.sass @@ -68,7 +68,7 @@ .gitter-chat-embed-action-bar-item @include size(40px) padding: 0 - opacity: 0.65 + opacity: 0.75 background-position: 50% background-repeat: no-repeat background-size: 22px 22px @@ -81,11 +81,11 @@ opacity: 1 &.gitter-chat-embed-action-bar-item-pop-out - background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAyMDAgMTcxLjQyOSIgZmlsbD0iIzNhMzEzMyI+PHBhdGggZD0iTTE1Ny4xNDMsMTAzLjU3MXYzNS43MTRjMCw4Ljg1NC0zLjE0NCwxNi40MjYtOS40MzEsMjIuNzEzcy0xMy44NTgsOS40MzEtMjIuNzEyLDkuNDMxSDMyLjE0MyBjLTguODU0LDAtMTYuNDI1LTMuMTQ0LTIyLjcxMi05LjQzMVMwLDE0OC4xNCwwLDEzOS4yODVWNDYuNDI5YzAtOC44NTQsMy4xNDQtMTYuNDI1LDkuNDMxLTIyLjcxMiBjNi4yODctNi4yODcsMTMuODU4LTkuNDMxLDIyLjcxMi05LjQzMWg3OC41NzJjMS4wNDEsMCwxLjg5NiwwLjMzNSwyLjU2NiwxLjAwNGMwLjY3LDAuNjcsMS4wMDQsMS41MjUsMS4wMDQsMi41NjdWMjUgYzAsMS4wNDItMC4zMzQsMS44OTctMS4wMDQsMi41NjdjLTAuNjcsMC42Ny0xLjUyNSwxLjAwNC0yLjU2NiwxLjAwNEgzMi4xNDNjLTQuOTExLDAtOS4xMTUsMS43NDktMTIuNjEyLDUuMjQ2IHMtNS4yNDYsNy43MDEtNS4yNDYsMTIuNjEydjkyLjg1NmMwLDQuOTExLDEuNzQ5LDkuMTE1LDUuMjQ2LDEyLjYxMnM3LjcwMSw1LjI0NSwxMi42MTIsNS4yNDVIMTI1YzQuOTEsMCw5LjExNS0xLjc0OCwxMi42MTEtNS4yNDUgYzMuNDk3LTMuNDk3LDUuMjQ2LTcuNzAxLDUuMjQ2LTEyLjYxMnYtMzUuNzE0YzAtMS4wNDIsMC4zMzQtMS44OTcsMS4wMDQtMi41NjdjMC42Ny0wLjY2OSwxLjUyNS0xLjAwNCwyLjU2Ny0xLjAwNGg3LjE0MyBjMS4wNDIsMCwxLjg5NywwLjMzNSwyLjU2NywxLjAwNEMxNTYuODA5LDEwMS42NzQsMTU3LjE0MywxMDIuNTI5LDE1Ny4xNDMsMTAzLjU3MXogTTIwMCw3LjE0M3Y1Ny4xNDMgYzAsMS45MzUtMC43MDcsMy42MDktMi4xMjEsNS4wMjJjLTEuNDEzLDEuNDE0LTMuMDg4LDIuMTIxLTUuMDIxLDIuMTIxYy0xLjkzNSwwLTMuNjA5LTAuNzA3LTUuMDIyLTIuMTIxbC0xOS42NDQtMTkuNjQzIGwtNzIuNzY3LDcyLjc2OWMtMC43NDQsMC43NDQtMS42LDEuMTE1LTIuNTY3LDEuMTE1cy0xLjgyMy0wLjM3MS0yLjU2Ny0xLjExNUw3Ny41NjcsMTA5LjcxYy0wLjc0NC0wLjc0NC0xLjExNi0xLjYtMS4xMTYtMi41NjcgYzAtMC45NjcsMC4zNzItMS44MjIsMS4xMTYtMi41NjZsNzIuNzY4LTcyLjc2OGwtMTkuNjQ0LTE5LjY0M2MtMS40MTMtMS40MTQtMi4xMi0zLjA4OC0yLjEyLTUuMDIyYzAtMS45MzUsMC43MDctMy42MDksMi4xMi01LjAyMiBDMTMyLjEwNSwwLjcwNywxMzMuNzc5LDAsMTM1LjcxNSwwaDU3LjE0M2MxLjkzNCwwLDMuNjA4LDAuNzA3LDUuMDIxLDIuMTIxQzE5OS4yOTMsMy41MzQsMjAwLDUuMjA4LDIwMCw3LjE0M3oiLz48L3N2Zz4=) + background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyMCIgaGVpZ2h0PSIyMCIgdmlld0JveD0iMCAwIDIwIDIwIj48cGF0aCBmaWxsPSIjZmZmIiBkPSJNMTYgMmgtOC4wMjFjLTEuMDk5IDAtMS45NzkgMC44OC0xLjk3OSAxLjk4djguMDIwYzAgMS4xIDAuOSAyIDIgMmg4YzEuMSAwIDItMC45IDItMnYtOGMwLTEuMS0wLjktMi0yLTJ6TTE2IDEyaC04di04aDh2OHpNNCAxMGgtMnY2YzAgMS4xIDAuOSAyIDIgMmg2di0yaC02di02eiI+PC9wYXRoPjwvc3ZnPg==) margin-right: -4px &.gitter-chat-embed-action-bar-item-collapse-chat - background-image: 
url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAxNzEuNDI5IDE3MS40MjkiIGZpbGw9IiMzYTMxMzMiPjxwYXRoIGQ9Ik0xMjIuNDMzLDEwNi4xMzhsLTE2LjI5NSwxNi4yOTVjLTAuNzQ0LDAuNzQ0LTEuNiwxLjExNi0yLjU2NiwxLjExNmMtMC45NjgsMC0xLjgyMy0wLjM3Mi0yLjU2Ny0xLjExNmwtMTUuMjktMTUuMjkgbC0xNS4yOSwxNS4yOWMtMC43NDQsMC43NDQtMS42LDEuMTE2LTIuNTY3LDEuMTE2cy0xLjgyMy0wLjM3Mi0yLjU2Ny0xLjExNmwtMTYuMjk0LTE2LjI5NWMtMC43NDQtMC43NDQtMS4xMTYtMS42LTEuMTE2LTIuNTY2IGMwLTAuOTY4LDAuMzcyLTEuODIzLDEuMTE2LTIuNTY3bDE1LjI5LTE1LjI5bC0xNS4yOS0xNS4yOWMtMC43NDQtMC43NDQtMS4xMTYtMS42LTEuMTE2LTIuNTY3czAuMzcyLTEuODIzLDEuMTE2LTIuNTY3IEw2NS4yOSw0OC45OTZjMC43NDQtMC43NDQsMS42LTEuMTE2LDIuNTY3LTEuMTE2czEuODIzLDAuMzcyLDIuNTY3LDEuMTE2bDE1LjI5LDE1LjI5bDE1LjI5LTE1LjI5IGMwLjc0NC0wLjc0NCwxLjYtMS4xMTYsMi41NjctMS4xMTZjMC45NjcsMCwxLjgyMiwwLjM3MiwyLjU2NiwxLjExNmwxNi4yOTUsMTYuMjk0YzAuNzQ0LDAuNzQ0LDEuMTE2LDEuNiwxLjExNiwyLjU2NyBzLTAuMzcyLDEuODIzLTEuMTE2LDIuNTY3bC0xNS4yOSwxNS4yOWwxNS4yOSwxNS4yOWMwLjc0NCwwLjc0NCwxLjExNiwxLjYsMS4xMTYsMi41NjcgQzEyMy41NDksMTA0LjUzOSwxMjMuMTc3LDEwNS4zOTQsMTIyLjQzMywxMDYuMTM4eiBNMTQ2LjQyOSw4NS43MTRjMC0xMS4wMTItMi43MTctMjEuMTY4LTguMTQ4LTMwLjQ2OSBzLTEyLjc5Ny0xNi42NjctMjIuMDk4LTIyLjA5OFM5Ni43MjYsMjUsODUuNzE0LDI1cy0yMS4xNjgsMi43MTYtMzAuNDY5LDguMTQ3UzM4LjU3OSw0NS45NDUsMzMuMTQ3LDU1LjI0NlMyNSw3NC43MDMsMjUsODUuNzE0IHMyLjcxNiwyMS4xNjgsOC4xNDcsMzAuNDY5czEyLjc5NywxNi42NjYsMjIuMDk4LDIyLjA5OHMxOS40NTcsOC4xNDgsMzAuNDY5LDguMTQ4czIxLjE2OC0yLjcxNywzMC40NjktOC4xNDggczE2LjY2Ni0xMi43OTcsMjIuMDk4LTIyLjA5OFMxNDYuNDI5LDk2LjcyNiwxNDYuNDI5LDg1LjcxNHogTTE3MS40MjksODUuNzE0YzAsMTUuNTUxLTMuODMyLDI5Ljg5My0xMS40OTYsNDMuMDI0IGMtNy42NjQsMTMuMTMzLTE4LjA2MiwyMy41My0zMS4xOTQsMzEuMTk0Yy0xMy4xMzIsNy42NjQtMjcuNDc0LDExLjQ5Ni00My4wMjQsMTEuNDk2cy0yOS44OTItMy44MzItNDMuMDI0LTExLjQ5NiBjLTEzLjEzMy03LjY2NC0yMy41MzEtMTguMDYyLTMxLjE5NC0zMS4xOTRDMy44MzIsMTE1LjYwNywwLDEwMS4yNjUsMCw4NS43MTRTMy44MzIsNTUuODIyLDExLjQ5Niw0Mi42OSBjNy42NjQtMTMuMTMzLDE4LjA2Mi0yMy41MzEsMzEuMTk0LTMxLjE5NEM1NS44MjIsMy44MzIsNzAuMTY0LDAsODUuNzE0LDBzMjkuODkzLDMuODMyLDQzLjAyNCwxMS40OTYgYzEzLjEzMyw3LjY2NCwyMy41MywxOC4wNjIsMzEuMTk0LDMxLjE5NEMxNjcuNTk3LDU1LjgyMiwxNzEuNDI5LDcwLjE2NCwxNzEuNDI5LDg1LjcxNHoiLz48L3N2Zz4=) + background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0Ij48cGF0aCBmaWxsPSIjZmZmIiBkPSJNMTguOTg0IDYuNDIybC01LjU3OCA1LjU3OCA1LjU3OCA1LjU3OC0xLjQwNiAxLjQwNi01LjU3OC01LjU3OC01LjU3OCA1LjU3OC0xLjQwNi0xLjQwNiA1LjU3OC01LjU3OC01LjU3OC01LjU3OCAxLjQwNi0xLjQwNiA1LjU3OCA1LjU3OCA1LjU3OC01LjU3OHoiPjwvcGF0aD48L3N2Zz4=) .c-chat__button @include position(fixed, bottom, right, 0, 2rem) From 614f95f3bf1f0f48a6d75bc3caed7376b26f87ba Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 2 Jan 2017 00:29:08 +0100 Subject: [PATCH 22/81] Remove help cursor from API links --- website/_includes/_mixins.jade | 2 +- website/assets/css/_base/_utilities.sass | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 8fe24b11b..8a42024c1 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -44,7 +44,7 @@ mixin api(path) +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block block - | #[+icon("book", 18).o-icon--inline.u-help.u-color-subtle] + | #[+icon("book", 18).o-icon--inline.u-color-subtle] //- Aside for text diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass index 95be81bcd..2c40858a8 100644 --- 
a/website/assets/css/_base/_utilities.sass +++ b/website/assets/css/_base/_utilities.sass @@ -141,12 +141,6 @@ background: $pattern -//- Cursors - -.u-help - cursor: help - - //- Hidden elements .u-hidden From 1b82756cc74d136c7329cd6027c857adf823d1e8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 2 Jan 2017 00:29:24 +0100 Subject: [PATCH 23/81] Tidy up and fix formatting and consistency --- website/assets/css/_base/_fonts.sass | 26 +++++++++++++------------- website/docs/index.jade | 2 -- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/website/assets/css/_base/_fonts.sass b/website/assets/css/_base/_fonts.sass index 72aaf97f8..be113798c 100644 --- a/website/assets/css/_base/_fonts.sass +++ b/website/assets/css/_base/_fonts.sass @@ -6,36 +6,36 @@ font-family: "Source Sans Pro" font-style: normal font-weight: 400 - src: url("../fonts/sourcesanspro-regular.eot") - src: url("../fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-regular.woff2") format("woff2"), url("../fonts/sourcesanspro-regular.woff") format("woff"), url("../fonts/sourcesanspro-regular.ttf") format("truetype"), url("../fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg") + src: url("/assets/fonts/sourcesanspro-regular.eot") + src: url("/assets/fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-regular.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-regular.woff") format("woff"), url("/assets/fonts/sourcesanspro-regular.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg") @font-face font-family: "Source Sans Pro" font-style: italic font-weight: 400 - src: url("../fonts/sourcesanspro-italic.eot") - src: url("../fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-italic.woff2") format("woff2"), url("../fonts/sourcesanspro-italic.woff") format("woff"), url("../fonts/sourcesanspro-italic.ttf") format("truetype"), url("../fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg") + src: url("/assets/fonts/sourcesanspro-italic.eot") + src: url("/assets/fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-italic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-italic.woff") format("woff"), url("/assets/fonts/sourcesanspro-italic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg") @font-face font-family: "Source Sans Pro" font-style: normal font-weight: 700 - src: url("../fonts/sourcesanspro-bold.eot") - src: url("../fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bold.woff2") format("woff2"), url("../fonts/sourcesanspro-bold.woff") format("woff"), url("../fonts/sourcesanspro-bold.ttf") format("truetype"), url("../fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg") + src: url("/assets/fonts/sourcesanspro-bold.eot") + src: url("/assets/fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bold.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bold.woff") format("woff"), url("/assets/fonts/sourcesanspro-bold.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg") @font-face font-family: "Source Sans Pro" font-style: italic font-weight: 700 - src: url("../fonts/sourcesanspro-bolditalic.eot") - src: 
url("../fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("../fonts/sourcesanspro-bolditalic.woff") format("woff"), url("../fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("../fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg") + src: url("/assets/fonts/sourcesanspro-bolditalic.eot") + src: url("/assets/fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bolditalic.woff") format("woff"), url("/assets/fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg") // Source Code Pro @font-face - font-family: "Source Code Pro" - font-style: normal - font-weight: 600 - src: url("../fonts/sourcecodepro-semibold.eot") - src: url("../fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcecodepro-semibold.woff") format("woff"), url("../fonts/sourcecodepro-semibold.ttf") format("truetype"), url("../fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg") + font-family: "Source Code Pro" + font-style: normal + font-weight: 600 + src: url("/assets/fonts/sourcecodepro-semibold.eot") + src: url("/assets/fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcecodepro-semibold.woff") format("woff"), url("/assets/fonts/sourcecodepro-semibold.ttf") format("truetype"), url("/assets/fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg") diff --git a/website/docs/index.jade b/website/docs/index.jade index d2949b8c4..c19602002 100644 --- a/website/docs/index.jade +++ b/website/docs/index.jade @@ -2,8 +2,6 @@ include ../_includes/_mixins -p=lorem_short - +aside("Help us improve the docs") | Did you spot a mistake or come across explanations that | are unclear? 
You can find a "Suggest edits" button at the From 1bd53bbf89fe134e20ccb3e6000798c02ef03468 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 11:26:21 +0100 Subject: [PATCH 24/81] Fix typos (resolves #718) --- spacy/en/tokenizer_exceptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 398ae486b..ee6f4675f 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -599,7 +599,7 @@ TOKENIZER_EXCEPTIONS = { ], "She's": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: "'s"} ], @@ -741,7 +741,7 @@ TOKENIZER_EXCEPTIONS = { ], "Shedve": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "ve", LEMMA: "have", TAG: "VB"} ], From 4fc4d3d0e39054cb5aac0daa1e6417234a00c33b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 15:41:16 +0100 Subject: [PATCH 25/81] Update PULL_REQUEST_TEMPLATE.md --- .github/PULL_REQUEST_TEMPLATE.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e99d6dadc..a55f98646 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -12,8 +12,6 @@ -## Screenshots (if appropriate): - ## Types of changes - [ ] Bug fix (non-breaking change fixing an issue) @@ -27,4 +25,4 @@ - [ ] My change requires a change to spaCy's documentation. - [ ] I have updated the documentation accordingly. - [ ] I have added tests to cover my changes. -- [ ] All new and existing tests passed. \ No newline at end of file +- [ ] All new and existing tests passed. From b19cfcc14482ccabc63da01811927b0e96650c24 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 18:17:57 +0100 Subject: [PATCH 26/81] Reorganise English tokenizer exceptions (as discussed in #718) Add logic to generate exceptions that follow a consistent pattern (like verbs and pronouns) and allow certain tokens to be excluded explicitly. 
--- spacy/en/tokenizer_exceptions.py | 2197 ++++++------------------------ 1 file changed, 416 insertions(+), 1781 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 398ae486b..4df3fe535 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -5,1791 +5,279 @@ from ..symbols import * from ..language_data import PRON_LEMMA -TOKENIZER_EXCEPTIONS = { - "and/or": [ - {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} - ], - - "Theydve": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "shouldn't've": [ - {ORTH: "should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "There'll": [ - {ORTH: "There"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "howll": [ - {ORTH: "how"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "Hadn't've": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "who'll": [ - {ORTH: "who"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "aint": [ - {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - " ": [ - {TAG: "SP", ORTH: " "} - ], - - "Shouldnt": [ - {ORTH: "Should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "when's": [ - {ORTH: "when"}, - {ORTH: "'s", LEMMA: "be"} - ], - - "Didnt": [ - {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "itll": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "Who're": [ - {ORTH: "Who"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Ain't": [ - {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Can't": [ - {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Whyre": [ - {ORTH: "Why"}, - {ORTH: "re"} - ], - - "Aren't": [ - {ORTH: "Are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Neednt": [ - {ORTH: "Need"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "should've": [ - {ORTH: "should"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "shouldn't": [ - {ORTH: "should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Idve": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "weve": [ - {ORTH: "we"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Ive": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "they'd": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Youdve": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theyve": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Weren't": [ - {ORTH: "Were"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "werent": [ - {ORTH: "were"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whyre": [ - {ORTH: "why"}, - {ORTH: "re"} - ], - - "I'm": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "She'd've": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - 
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "not've": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "we'll": [ - {ORTH: "we"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Don't": [ - {ORTH: "Do", LEMMA: "do"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Whyll": [ - {ORTH: "Why"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "they've": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "wasn't": [ - {ORTH: "was"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "could've": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "what've": [ - {ORTH: "what"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "havent": [ - {ORTH: "have", TAG: "VB"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Who've": [ - {ORTH: "Who"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Shan't": [ - {ORTH: "Sha", LEMMA: "shall"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "i'll": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "you'd": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "whens": [ - {ORTH: "when"}, - {ORTH: "s", LEMMA: "be"} - ], - - "whys": [ - {ORTH: "why"}, - {ORTH: "s"} - ], - - "Whereve": [ - {ORTH: "Where"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "\u00a0": [ - {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} - ], - - "there'd": [ - {ORTH: "there"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "hadn't've": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "whatll": [ - {ORTH: "what"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldn't've": [ - {ORTH: "would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "there's": [ - {ORTH: "there"}, - {ORTH: "'s"} - ], - - "Who'll": [ - {ORTH: "Who"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "youll": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldve": [ - {ORTH: "would"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldnt": [ - {ORTH: "Would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Thered": [ - {ORTH: "There"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Youre": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be"} - ], - - "Couldn't've": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "who're": [ - {ORTH: "who"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Whys": [ - {ORTH: "Why"}, - {ORTH: "s"} - ], - - "mightn't've": [ - {ORTH: "might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wholl": [ - {ORTH: "Who"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "hadn't": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Havent": [ - {ORTH: "Have", TAG: "VB"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Whatve": [ - {ORTH: "What"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Thats": [ - {ORTH: "That"}, - {ORTH: "s"} - ], - - "Howll": [ - {ORTH: "How"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldn't": [ - {ORTH: "would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "You'll": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - 
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Cant": [ - {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "i'd": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "weren't": [ - {ORTH: "were"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "would've": [ - {ORTH: "would"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "i'm": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "why'll": [ - {ORTH: "why"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "we'd've": [ - {ORTH: "we"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Shouldve": [ - {ORTH: "Should"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "can't": [ - {ORTH: "ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "thats": [ - {ORTH: "that"}, - {ORTH: "s"} - ], - - "Hes": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "Needn't": [ - {ORTH: "Need"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "It's": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "Why're": [ - {ORTH: "Why"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Hed": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - +EXC = {} + +EXCLUDE_EXC = ["Ill", "ill", "Hell", "hell", "Well", "well", "Whore", "whore"] + + +# Pronouns + +for pron in ["i"]: + for orth in [pron, pron.title()]: + EXC[orth + "'m"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1} + ] + + EXC[pron + "m"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 } + ] + + EXC[pron + "'ma"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", LEMMA: "be", NORM: "am"}, + {ORTH: "a", LEMMA: "going to", NORM: "gonna"} + ] + + EXC[pron + "ma"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", LEMMA: "be", NORM: "am"}, + {ORTH: "a", LEMMA: "going to", NORM: "gonna"} + ] + + +for pron in ["i", "you", "he", "she", "it", "we", "they"]: + for orth in [pron, pron.title()]: + EXC[orth + "'ll"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "ll"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "'ve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "ve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'ll've"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "llve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'d"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ] + + EXC[orth + "d"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ] + + EXC[orth + "'d've"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "dve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + 
{ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for pron in ["you", "we", "they"]: + for orth in [pron, pron.title()]: + EXC[orth + "'re"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "re"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ] + + +# W-words, relative pronouns, prepositions etc. + +for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: + for orth in [word, word.title()]: + EXC[orth + "'s"] = [ + {ORTH: orth}, + {ORTH: "'s"} + ] + + EXC[orth + "s"] = [ + {ORTH: orth}, + {ORTH: "s"} + ] + + EXC[orth + "'ll"] = [ + {ORTH: orth}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "ll"] = [ + {ORTH: orth}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "'ll've"] = [ + {ORTH: orth}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "llve"] = [ + {ORTH: orth}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'re"] = [ + {ORTH: orth}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "re"] = [ + {ORTH: orth}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "'ve"] = [ + {ORTH: orth}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "ve"] = [ + {ORTH: orth}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'d"] = [ + {ORTH: orth}, + {ORTH: "'d"} + ] + + EXC[orth + "d"] = [ + {ORTH: orth}, + {ORTH: "d"} + ] + + EXC[orth + "'d've"] = [ + {ORTH: orth}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "dve"] = [ + {ORTH: orth}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +# Verbs + +for verb_data in [ + {ORTH: "ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "could", TAG: "MD"}, + {ORTH: "do", LEMMA: "do"}, + {ORTH: "does", LEMMA: "do"}, + {ORTH: "did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "might"}, + {ORTH: "must"}, + {ORTH: "need"}, + {ORTH: "sha", LEMMA: "shall"}, + {ORTH: "should"}, + {ORTH: "wo", LEMMA: "will"}, + {ORTH: "would"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "n't"] = [ + dict(data), + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ] + + EXC[data[ORTH] + "nt"] = [ + dict(data), + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ] + + EXC[data[ORTH] + "n't've"] = [ + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[data[ORTH] + "ntve"] = [ + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for verb_data in [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "might"}, + {ORTH: "must"}, + {ORTH: "should"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "'ve"] = [ + dict(data), + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[data[ORTH] + "ve"] = [ + dict(data), + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for verb_data in [ + {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, + {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "was", LEMMA: "be"}, + {ORTH: "were", LEMMA: "be"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for 
data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "n't"] = [ + dict(data), + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ] + + EXC[data[ORTH] + "nt"] = [ + dict(data), + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ] + + +# Abbreviations + +ABBREVIATIONS = { "Mt.": [ {ORTH: "Mt.", LEMMA: "Mount"} ], - "couldn't": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "What've": [ - {ORTH: "What"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "It'd": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "theydve": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "aren't": [ - {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Mightn't": [ - {ORTH: "Might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "'S": [ - {ORTH: "'S", LEMMA: "'s"} - ], - - "I've": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whered": [ - {ORTH: "Where"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Itdve": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "I'ma": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ma"} - ], - - "whos": [ - {ORTH: "who"}, - {ORTH: "s"} - ], - - "They'd": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "What'll": [ - {ORTH: "What"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "You've": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mustve": [ - {ORTH: "Must"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "whod": [ - {ORTH: "who"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "mightntve": [ - {ORTH: "might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "I'd've": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Must've": [ - {ORTH: "Must"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "it'd": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "what're": [ - {ORTH: "what"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "Wasn't": [ - {ORTH: "Was"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "what's": [ - {ORTH: "what"}, - {ORTH: "'s"} - ], - - "he'd've": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "She'd": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "shedve": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "ain't": [ - {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "She's": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "i'd've": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "We'd've": [ - {ORTH: "We"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "must've": [ - {ORTH: "must"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} 
- ], - - "That's": [ - {ORTH: "That"}, - {ORTH: "'s"} - ], - - "whatre": [ - {ORTH: "what"}, - {ORTH: "re"} - ], - - "you'd've": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Dont": [ - {ORTH: "Do", LEMMA: "do"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "thered": [ - {ORTH: "there"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Youd": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "couldn't've": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whens": [ - {ORTH: "When"}, - {ORTH: "s"} - ], - - "Isnt": [ - {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "mightve": [ - {ORTH: "might"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "didnt": [ - {ORTH: "did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "ive": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "It'd've": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "\t": [ - {ORTH: "\t", TAG: "SP"} - ], - - "Itll": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "didn't": [ - {ORTH: "did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "cant": [ - {ORTH: "ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "im": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "they'd've": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Hadntve": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Weve": [ - {ORTH: "We"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Mightnt": [ - {ORTH: "Might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "youdve": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Shedve": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theyd": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Cannot": [ - {ORTH: "Can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "Hadn't": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "What're": [ - {ORTH: "What"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "He'll": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "wholl": [ - {ORTH: "who"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "They're": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "shouldnt": [ - {ORTH: "should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "\n": [ - {ORTH: "\n", TAG: "SP"} - ], - - "whered": [ - {ORTH: "where"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "youve": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - 
"notve": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldve": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "mustve": [ - {ORTH: "must"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Youve": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "therell": [ - {ORTH: "there"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "might've": [ - {ORTH: "might"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mustn't": [ - {ORTH: "Must"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "wheres": [ - {ORTH: "where"}, - {ORTH: "s"} - ], - - "they're": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "idve": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "hows": [ - {ORTH: "how"}, - {ORTH: "s"} - ], - - "youre": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "Didn't": [ - {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Couldve": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "cannot": [ - {ORTH: "can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "Im": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be", NORM: "am"} - ], - - "howd": [ - {ORTH: "how"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "you've": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "You're": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "she'll": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Theyll": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "don't": [ - {ORTH: "do", LEMMA: "do"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "itd": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Hedve": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "isnt": [ - {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "won't": [ - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "We're": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "\u2018S": [ - {ORTH: "\u2018S", LEMMA: "'s"} - ], - - "\u2018s": [ - {ORTH: "\u2018s", LEMMA: "'s"} - ], - - "dont": [ - {ORTH: "do", LEMMA: "do"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "ima": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ma"} - ], - - "Let's": [ - {ORTH: "Let"}, - {ORTH: "'s", LEMMA: "us"} - ], - - "he's": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "we've": [ - {ORTH: "we"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "What's": [ - {ORTH: "What"}, - {ORTH: "'s"} - ], - - "Who's": [ - {ORTH: "Who"}, - {ORTH: "'s"} - ], - - "hedve": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "he'd": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: 
"would", TAG: "MD"} - ], - - "When's": [ - {ORTH: "When"}, - {ORTH: "'s"} - ], - - "Mightn't've": [ - {ORTH: "Might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "We've": [ - {ORTH: "We"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Couldntve": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Who'd": [ - {ORTH: "Who"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "haven't": [ - {ORTH: "have", TAG: "VB"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "arent": [ - {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "You'd've": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldn't": [ - {ORTH: "Would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "who's": [ - {ORTH: "who"}, - {ORTH: "'s"} - ], - - "Mightve": [ - {ORTH: "Might"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Theredve": [ - {ORTH: "There"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theredve": [ - {ORTH: "there"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "who'd": [ - {ORTH: "who"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Where's": [ - {ORTH: "Where"}, - {ORTH: "'s"} - ], - - "wont": [ - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "she'd've": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Should've": [ - {ORTH: "Should"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "theyre": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re"} - ], - - "Wouldntve": [ - {ORTH: "Would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Where've": [ - {ORTH: "Where"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "mustn't": [ - {ORTH: "must"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "isn't": [ - {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Aint": [ - {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "why's": [ - {ORTH: "why"}, - {ORTH: "'s"} - ], - - "There'd": [ - {ORTH: "There"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "They'll": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "how'll": [ - {ORTH: "how"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Wedve": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldntve": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "There's": [ - {ORTH: "There"}, - {ORTH: "'s"} - ], - - "we'd": [ - {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Whod": [ - {ORTH: "Who"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "whatve": [ - {ORTH: "what"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldve": [ - {ORTH: "Would"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "there'll": [ - {ORTH: "there"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "needn't": [ - {ORTH: "need"}, - {ORTH: "n't", LEMMA: "not", 
TAG: "RB"} - ], - - "shouldntve": [ - {ORTH: "should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "why're": [ - {ORTH: "why"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "Doesnt": [ - {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whereve": [ - {ORTH: "where"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "they'll": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "I'd": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Might've": [ - {ORTH: "Might"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "mightnt": [ - {ORTH: "might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Not've": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "mightn't": [ - {ORTH: "might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "you're": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "They've": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "what'll": [ - {ORTH: "what"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Could've": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Would've": [ - {ORTH: "Would"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Isn't": [ - {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "let's": [ - {ORTH: "let"}, - {ORTH: "'s", LEMMA: "us"} - ], - - "She'll": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "You'd": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "wouldnt": [ - {ORTH: "would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Why'll": [ - {ORTH: "Why"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Where'd": [ - {ORTH: "Where"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Theyre": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "Won't": [ - {ORTH: "Wo", LEMMA: "will"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Couldn't": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "it's": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "it'll": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "They'd've": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Ima": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ma"} - ], - - "gonna": [ - {ORTH: "gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"} - ], - - "Gonna": [ - {ORTH: "Gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"} - ], - - "whats": [ - {ORTH: "what"}, - {ORTH: "s"} - ], - - "How's": [ - {ORTH: "How"}, - {ORTH: "'s"} - ], - - "Shouldntve": [ - {ORTH: "Should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "youd": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Whatll": [ - {ORTH: "What"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "Wouldn't've": [ - {ORTH: "Would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - 
], - - "How'd": [ - {ORTH: "How"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "doesnt": [ - {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Shouldn't": [ - {ORTH: "Should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "He'd've": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mightntve": [ - {ORTH: "Might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldnt": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Haven't": [ - {ORTH: "Have", TAG: "VB"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "doesn't": [ - {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Hasn't": [ - {ORTH: "Has"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "how's": [ - {ORTH: "how"}, - {ORTH: "'s"} - ], - - "hes": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "he'll": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "hed": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "how'd": [ - {ORTH: "how"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "we're": [ - {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM :"are"} - ], - - "Hadnt": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Shant": [ - {ORTH: "Sha", LEMMA: "shall"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Theyve": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Hows": [ - {ORTH: "How"}, - {ORTH: "s"} - ], - - "We'll": [ - {ORTH: "We"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "i've": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whove": [ - {ORTH: "Who"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "i'ma": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ma"} - ], - - "Howd": [ - {ORTH: "How"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "hadnt": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "shant": [ - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "There'd've": [ - {ORTH: "There"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "I'll": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Why's": [ - {ORTH: "Why"}, - {ORTH: "'s"} - ], - - "Shouldn't've": [ - {ORTH: "Should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wasnt": [ - {ORTH: "Was"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whove": [ - {ORTH: "who"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "hasn't": [ - {ORTH: "has"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "wouldntve": [ - {ORTH: "would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wheres": [ - {ORTH: "Where"}, - {ORTH: "s"} - ], - - "How'll": [ - {ORTH: "How"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "there'd've": [ - {ORTH: "there"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whos": [ - {ORTH: "Who"}, - {ORTH: "s"} - ], - - "shes": [ - {ORTH: "she", 
LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "Doesn't": [ - {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Arent": [ - {ORTH: "Are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Hasnt": [ - {ORTH: "Has", LEMMA: "have"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "He's": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "wasnt": [ - {ORTH: "was"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whyll": [ - {ORTH: "why"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "mustnt": [ - {ORTH: "must"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "He'd": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Shes": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "where've": [ - {ORTH: "where"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Youll": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "hasnt": [ - {ORTH: "has"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "theyll": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "it'd've": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "itdve": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "wedve": [ - {ORTH: "we"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Werent": [ - {ORTH: "Were"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Therell": [ - {ORTH: "There"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "shan't": [ - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Wont": [ - {ORTH: "Wo", LEMMA: "will"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "hadntve": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "who've": [ - {ORTH: "who"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whatre": [ - {ORTH: "What"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "'s": [ - {ORTH: "'s", LEMMA: "'s"} - ], - - "where'd": [ - {ORTH: "where"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "shouldve": [ - {ORTH: "should"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "where's": [ - {ORTH: "where"}, - {ORTH: "'s"} - ], - - "neednt": [ - {ORTH: "need"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "It'll": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "We'd": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Whats": [ - {ORTH: "What"}, - {ORTH: "s"} - ], - - "\u2014": [ - {ORTH: "\u2014", TAG: ":", LEMMA: "--"} - ], - - "Itd": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "she'd": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Mustnt": [ - {ORTH: "Must"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Notve": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "you'll": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Theyd": [ - {ORTH: "They", LEMMA: 
PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "she's": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "Couldnt": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "that's": [ - {ORTH: "that"}, - {ORTH: "'s"} - ], - - "'em": [ - {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"} - ], - - "ol'": [ - {ORTH: "ol'", LEMMA: "old"} - ], - "Ak.": [ {ORTH: "Ak.", LEMMA: "Alaska"} ], @@ -2000,6 +488,153 @@ TOKENIZER_EXCEPTIONS = { } +# Other exceptions + +OTHER = { + " ": [ + {ORTH: " ", TAG: "SP"} + ], + + "\u00a0": [ + {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} + ], + + "and/or": [ + {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} + ], + + "'cause": [ + {ORTH: "'cause", LEMMA: "because"} + ], + + "y'all": [ + {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, + {ORTH: "all"} + ], + + "yall": [ + {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, + {ORTH: "all"} + ], + + "'em": [ + {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"} + ], + + "em": [ + {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"} + ], + + "nothin'": [ + {ORTH: "nothin'", LEMMA: "nothing"} + ], + + "nuthin'": [ + {ORTH: "nuthin'", LEMMA: "nothing"} + ], + + "'nuff": [ + {ORTH: "'nuff", LEMMA: "enough"} + ], + + "ol'": [ + {ORTH: "ol'", LEMMA: "old"} + ], + + "not've": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "notve": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Not've": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Notve": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "cannot": [ + {ORTH: "can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} + ], + + "Cannot": [ + {ORTH: "Can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} + ], + + "gonna": [ + {ORTH: "gon", LEMMA: "go", NORM: "going"}, + {ORTH: "na", LEMMA: "to"} + ], + + "Gonna": [ + {ORTH: "Gon", LEMMA: "go", NORM: "going"}, + {ORTH: "na", LEMMA: "to"} + ], + + "let's": [ + {ORTH: "let"}, + {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} + ], + + "Let's": [ + {ORTH: "Let"}, + {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} + ], + + "'S": [ + {ORTH: "'S", LEMMA: "'s"} + ], + + "'s": [ + {ORTH: "'s", LEMMA: "'s"} + ], + + "\u2018S": [ + {ORTH: "\u2018S", LEMMA: "'s"} + ], + + "\u2018s": [ + {ORTH: "\u2018s", LEMMA: "'s"} + ], + + "\u2014": [ + {ORTH: "\u2014", TAG: ":", LEMMA: "--"} + ], + + "\n": [ + {ORTH: "\n", TAG: "SP"} + ], + + "\t": [ + {ORTH: "\t", TAG: "SP"} + ] +} + + +TOKENIZER_EXCEPTIONS = dict(EXC) +TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) +TOKENIZER_EXCEPTIONS.update(OTHER) + + +# Remove EXCLUDE_EXC if in exceptions + +for string in EXCLUDE_EXC: + if string in TOKENIZER_EXCEPTIONS: + TOKENIZER_EXCEPTIONS.pop(string) + + +# Abbreviations with only one ORTH token + ORTH_ONLY = [ "''", "\")", From 461cbb99d8c522f601da9663e3c9fa65be6f317f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 18:21:29 +0100 Subject: [PATCH 27/81] Revert "Reorganise English tokenizer exceptions (as discussed in #718)" This reverts commit b19cfcc14482ccabc63da01811927b0e96650c24. 
--- spacy/en/tokenizer_exceptions.py | 2087 ++++++++++++++++++++++++------ 1 file changed, 1726 insertions(+), 361 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 4df3fe535..398ae486b 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -5,279 +5,1791 @@ from ..symbols import * from ..language_data import PRON_LEMMA -EXC = {} +TOKENIZER_EXCEPTIONS = { + "and/or": [ + {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} + ], -EXCLUDE_EXC = ["Ill", "ill", "Hell", "hell", "Well", "well", "Whore", "whore"] + "Theydve": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + "shouldn't've": [ + {ORTH: "should"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], -# Pronouns + "There'll": [ + {ORTH: "There"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], -for pron in ["i"]: - for orth in [pron, pron.title()]: - EXC[orth + "'m"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1} - ] + "howll": [ + {ORTH: "how"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], - EXC[pron + "m"] = [ - {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 } - ] + "Hadn't've": [ + {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], - EXC[pron + "'ma"] = [ - {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", LEMMA: "be", NORM: "am"}, - {ORTH: "a", LEMMA: "going to", NORM: "gonna"} - ] + "who'll": [ + {ORTH: "who"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], - EXC[pron + "ma"] = [ - {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", LEMMA: "be", NORM: "am"}, - {ORTH: "a", LEMMA: "going to", NORM: "gonna"} - ] + "aint": [ + {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + " ": [ + {TAG: "SP", ORTH: " "} + ], -for pron in ["i", "you", "he", "she", "it", "we", "they"]: - for orth in [pron, pron.title()]: - EXC[orth + "'ll"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ] + "Shouldnt": [ + {ORTH: "Should"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], - EXC[orth + "ll"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ] + "when's": [ + {ORTH: "when"}, + {ORTH: "'s", LEMMA: "be"} + ], - EXC[orth + "'ve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] + "Didnt": [ + {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], - EXC[orth + "ve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] + "itll": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], - EXC[orth + "'ll've"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] + "Who're": [ + {ORTH: "Who"}, + {ORTH: "'re", LEMMA: "be"} + ], - EXC[orth + "llve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] + "Ain't": [ + {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], - EXC[orth + "'d"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: 
"PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ] + "Can't": [ + {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], - EXC[orth + "d"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ] + "Whyre": [ + {ORTH: "Why"}, + {ORTH: "re"} + ], - EXC[orth + "'d've"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] + "Aren't": [ + {ORTH: "Are", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], - EXC[orth + "dve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] + "Neednt": [ + {ORTH: "Need"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + "should've": [ + {ORTH: "should"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], -for pron in ["you", "we", "they"]: - for orth in [pron, pron.title()]: - EXC[orth + "'re"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ] + "shouldn't": [ + {ORTH: "should"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], - EXC[orth + "re"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ] + "Idve": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + "weve": [ + {ORTH: "we"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], -# W-words, relative pronouns, prepositions etc. + "Ive": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], -for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: - for orth in [word, word.title()]: - EXC[orth + "'s"] = [ - {ORTH: orth}, - {ORTH: "'s"} - ] + "they'd": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], - EXC[orth + "s"] = [ - {ORTH: orth}, - {ORTH: "s"} - ] + "Youdve": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], - EXC[orth + "'ll"] = [ - {ORTH: orth}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ] + "theyve": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], - EXC[orth + "ll"] = [ - {ORTH: orth}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ] + "Weren't": [ + {ORTH: "Were"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], - EXC[orth + "'ll've"] = [ - {ORTH: orth}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] + "werent": [ + {ORTH: "were"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], - EXC[orth + "llve"] = [ - {ORTH: orth}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] + "whyre": [ + {ORTH: "why"}, + {ORTH: "re"} + ], - EXC[orth + "'re"] = [ - {ORTH: orth}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ] + "I'm": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} + ], - EXC[orth + "re"] = [ - {ORTH: orth}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ] + "She'd've": [ + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], - EXC[orth + "'ve"] = [ - {ORTH: orth}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] + "not've": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], - 
EXC[orth + "ve"] = [ - {ORTH: orth}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] + "we'll": [ + {ORTH: "we"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], - EXC[orth + "'d"] = [ - {ORTH: orth}, - {ORTH: "'d"} - ] + "Don't": [ + {ORTH: "Do", LEMMA: "do"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], - EXC[orth + "d"] = [ - {ORTH: orth}, - {ORTH: "d"} - ] + "Whyll": [ + {ORTH: "Why"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], - EXC[orth + "'d've"] = [ - {ORTH: orth}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] + "they've": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], - EXC[orth + "dve"] = [ - {ORTH: orth}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] + "wasn't": [ + {ORTH: "was"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + "could've": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], -# Verbs + "what've": [ + {ORTH: "what"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], -for verb_data in [ - {ORTH: "ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "could", TAG: "MD"}, - {ORTH: "do", LEMMA: "do"}, - {ORTH: "does", LEMMA: "do"}, - {ORTH: "did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "might"}, - {ORTH: "must"}, - {ORTH: "need"}, - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "should"}, - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "would"} -]: - verb_data_tc = dict(verb_data) - verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + "havent": [ + {ORTH: "have", TAG: "VB"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], - for data in [verb_data, verb_data_tc]: - EXC[data[ORTH] + "n't"] = [ - dict(data), - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ] + "Who've": [ + {ORTH: "Who"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], - EXC[data[ORTH] + "nt"] = [ - dict(data), - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ] + "Shan't": [ + {ORTH: "Sha", LEMMA: "shall"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], - EXC[data[ORTH] + "n't've"] = [ - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] + "i'll": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], - EXC[data[ORTH] + "ntve"] = [ - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] + "you'd": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + "whens": [ + {ORTH: "when"}, + {ORTH: "s", LEMMA: "be"} + ], -for verb_data in [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "might"}, - {ORTH: "must"}, - {ORTH: "should"} -]: - verb_data_tc = dict(verb_data) - verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + "whys": [ + {ORTH: "why"}, + {ORTH: "s"} + ], - for data in [verb_data, verb_data_tc]: - EXC[data[ORTH] + "'ve"] = [ - dict(data), - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] + "Whereve": [ + {ORTH: "Where"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], - EXC[data[ORTH] + "ve"] = [ - dict(data), - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] + "\u00a0": [ + {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} + ], + "there'd": [ + {ORTH: "there"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], -for verb_data in [ - {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, - {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "was", LEMMA: "be"}, - {ORTH: "were", LEMMA: "be"} -]: - verb_data_tc = dict(verb_data) - verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + "hadn't've": [ + 
{ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], - for data in [verb_data, verb_data_tc]: - EXC[data[ORTH] + "n't"] = [ - dict(data), - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ] + "whatll": [ + {ORTH: "what"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], - EXC[data[ORTH] + "nt"] = [ - dict(data), - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ] + "wouldn't've": [ + {ORTH: "would"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + "there's": [ + {ORTH: "there"}, + {ORTH: "'s"} + ], -# Abbreviations + "Who'll": [ + {ORTH: "Who"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "youll": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "wouldve": [ + {ORTH: "would"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Wouldnt": [ + {ORTH: "Would"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Thered": [ + {ORTH: "There"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Youre": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re", LEMMA: "be"} + ], + + "Couldn't've": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "who're": [ + {ORTH: "who"}, + {ORTH: "'re", LEMMA: "be"} + ], + + "Whys": [ + {ORTH: "Why"}, + {ORTH: "s"} + ], + + "mightn't've": [ + {ORTH: "might"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Wholl": [ + {ORTH: "Who"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "hadn't": [ + {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Havent": [ + {ORTH: "Have", TAG: "VB"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Whatve": [ + {ORTH: "What"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Thats": [ + {ORTH: "That"}, + {ORTH: "s"} + ], + + "Howll": [ + {ORTH: "How"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "wouldn't": [ + {ORTH: "would"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "You'll": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Cant": [ + {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "i'd": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "weren't": [ + {ORTH: "were"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "would've": [ + {ORTH: "would"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "i'm": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} + ], + + "why'll": [ + {ORTH: "why"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "we'd've": [ + {ORTH: "we"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Shouldve": [ + {ORTH: "Should"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "can't": [ + {ORTH: "ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "thats": [ + {ORTH: "that"}, + {ORTH: "s"} + ], + + "Hes": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "s"} + ], + + "Needn't": [ + {ORTH: "Need"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "It's": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "Why're": [ + {ORTH: "Why"}, + {ORTH: "'re", LEMMA: "be"} + ], + + "Hed": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: 
"would", TAG: "MD"} + ], -ABBREVIATIONS = { "Mt.": [ {ORTH: "Mt.", LEMMA: "Mount"} ], + "couldn't": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "What've": [ + {ORTH: "What"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "It'd": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "theydve": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "aren't": [ + {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Mightn't": [ + {ORTH: "Might"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "'S": [ + {ORTH: "'S", LEMMA: "'s"} + ], + + "I've": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Whered": [ + {ORTH: "Where"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Itdve": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "I'ma": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ma"} + ], + + "whos": [ + {ORTH: "who"}, + {ORTH: "s"} + ], + + "They'd": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "What'll": [ + {ORTH: "What"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "You've": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Mustve": [ + {ORTH: "Must"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "whod": [ + {ORTH: "who"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "mightntve": [ + {ORTH: "might"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "I'd've": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Must've": [ + {ORTH: "Must"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "it'd": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "what're": [ + {ORTH: "what"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "Wasn't": [ + {ORTH: "Was"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "what's": [ + {ORTH: "what"}, + {ORTH: "'s"} + ], + + "he'd've": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "She'd": [ + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "shedve": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "ain't": [ + {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "She's": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "i'd've": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "We'd've": [ + {ORTH: "We"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "must've": [ + {ORTH: "must"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "That's": [ + {ORTH: "That"}, + {ORTH: "'s"} + ], + + "whatre": [ + {ORTH: "what"}, + {ORTH: "re"} + ], + + "you'd've": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", 
TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Dont": [ + {ORTH: "Do", LEMMA: "do"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "thered": [ + {ORTH: "there"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Youd": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "couldn't've": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Whens": [ + {ORTH: "When"}, + {ORTH: "s"} + ], + + "Isnt": [ + {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "mightve": [ + {ORTH: "might"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "didnt": [ + {ORTH: "did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "ive": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "It'd've": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "\t": [ + {ORTH: "\t", TAG: "SP"} + ], + + "Itll": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "didn't": [ + {ORTH: "did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "cant": [ + {ORTH: "ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "im": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} + ], + + "they'd've": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Hadntve": [ + {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Weve": [ + {ORTH: "We"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Mightnt": [ + {ORTH: "Might"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "youdve": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Shedve": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "theyd": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Cannot": [ + {ORTH: "Can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} + ], + + "Hadn't": [ + {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "What're": [ + {ORTH: "What"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "He'll": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "wholl": [ + {ORTH: "who"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "They're": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "shouldnt": [ + {ORTH: "should"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "\n": [ + {ORTH: "\n", TAG: "SP"} + ], + + "whered": [ + {ORTH: "where"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "youve": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "notve": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "couldve": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "mustve": [ + 
{ORTH: "must"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Youve": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "therell": [ + {ORTH: "there"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "might've": [ + {ORTH: "might"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Mustn't": [ + {ORTH: "Must"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "wheres": [ + {ORTH: "where"}, + {ORTH: "s"} + ], + + "they're": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "idve": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "hows": [ + {ORTH: "how"}, + {ORTH: "s"} + ], + + "youre": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ], + + "Didn't": [ + {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Couldve": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "cannot": [ + {ORTH: "can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} + ], + + "Im": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be", NORM: "am"} + ], + + "howd": [ + {ORTH: "how"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "you've": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "You're": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "she'll": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Theyll": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "don't": [ + {ORTH: "do", LEMMA: "do"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "itd": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Hedve": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "isnt": [ + {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "won't": [ + {ORTH: "wo", LEMMA: "will"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "We're": [ + {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "\u2018S": [ + {ORTH: "\u2018S", LEMMA: "'s"} + ], + + "\u2018s": [ + {ORTH: "\u2018s", LEMMA: "'s"} + ], + + "dont": [ + {ORTH: "do", LEMMA: "do"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "ima": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ma"} + ], + + "Let's": [ + {ORTH: "Let"}, + {ORTH: "'s", LEMMA: "us"} + ], + + "he's": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "we've": [ + {ORTH: "we"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "What's": [ + {ORTH: "What"}, + {ORTH: "'s"} + ], + + "Who's": [ + {ORTH: "Who"}, + {ORTH: "'s"} + ], + + "hedve": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "he'd": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "When's": [ + {ORTH: "When"}, + {ORTH: "'s"} + ], + + "Mightn't've": [ + {ORTH: "Might"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + 
"We've": [ + {ORTH: "We"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Couldntve": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Who'd": [ + {ORTH: "Who"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "haven't": [ + {ORTH: "have", TAG: "VB"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "arent": [ + {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "You'd've": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Wouldn't": [ + {ORTH: "Would"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "who's": [ + {ORTH: "who"}, + {ORTH: "'s"} + ], + + "Mightve": [ + {ORTH: "Might"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Theredve": [ + {ORTH: "There"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "theredve": [ + {ORTH: "there"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "who'd": [ + {ORTH: "who"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Where's": [ + {ORTH: "Where"}, + {ORTH: "'s"} + ], + + "wont": [ + {ORTH: "wo", LEMMA: "will"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "she'd've": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Should've": [ + {ORTH: "Should"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "theyre": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re"} + ], + + "Wouldntve": [ + {ORTH: "Would"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Where've": [ + {ORTH: "Where"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "mustn't": [ + {ORTH: "must"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "isn't": [ + {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Aint": [ + {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "why's": [ + {ORTH: "why"}, + {ORTH: "'s"} + ], + + "There'd": [ + {ORTH: "There"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "They'll": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "how'll": [ + {ORTH: "how"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Wedve": [ + {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "couldntve": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "There's": [ + {ORTH: "There"}, + {ORTH: "'s"} + ], + + "we'd": [ + {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Whod": [ + {ORTH: "Who"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "whatve": [ + {ORTH: "what"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Wouldve": [ + {ORTH: "Would"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "there'll": [ + {ORTH: "there"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "needn't": [ + {ORTH: "need"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "shouldntve": [ + {ORTH: "should"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "why're": [ + {ORTH: "why"}, + {ORTH: "'re", LEMMA: "be", NORM: 
"are"} + ], + + "Doesnt": [ + {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "whereve": [ + {ORTH: "where"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "they'll": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "I'd": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Might've": [ + {ORTH: "Might"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "mightnt": [ + {ORTH: "might"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Not've": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "mightn't": [ + {ORTH: "might"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "you're": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "They've": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "what'll": [ + {ORTH: "what"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Could've": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Would've": [ + {ORTH: "Would"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Isn't": [ + {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "let's": [ + {ORTH: "let"}, + {ORTH: "'s", LEMMA: "us"} + ], + + "She'll": [ + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "You'd": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "wouldnt": [ + {ORTH: "would"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Why'll": [ + {ORTH: "Why"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Where'd": [ + {ORTH: "Where"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Theyre": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ], + + "Won't": [ + {ORTH: "Wo", LEMMA: "will"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Couldn't": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "it's": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "it'll": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "They'd've": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Ima": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ma"} + ], + + "gonna": [ + {ORTH: "gon", LEMMA: "go", NORM: "going"}, + {ORTH: "na", LEMMA: "to"} + ], + + "Gonna": [ + {ORTH: "Gon", LEMMA: "go", NORM: "going"}, + {ORTH: "na", LEMMA: "to"} + ], + + "whats": [ + {ORTH: "what"}, + {ORTH: "s"} + ], + + "How's": [ + {ORTH: "How"}, + {ORTH: "'s"} + ], + + "Shouldntve": [ + {ORTH: "Should"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "youd": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Whatll": [ + {ORTH: "What"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "Wouldn't've": [ + {ORTH: "Would"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "How'd": [ + {ORTH: "How"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "doesnt": [ + {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Shouldn't": [ + 
{ORTH: "Should"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "He'd've": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Mightntve": [ + {ORTH: "Might"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "couldnt": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Haven't": [ + {ORTH: "Have", TAG: "VB"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "doesn't": [ + {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Hasn't": [ + {ORTH: "Has"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "how's": [ + {ORTH: "how"}, + {ORTH: "'s"} + ], + + "hes": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "s"} + ], + + "he'll": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "hed": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "how'd": [ + {ORTH: "how"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "we're": [ + {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM :"are"} + ], + + "Hadnt": [ + {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Shant": [ + {ORTH: "Sha", LEMMA: "shall"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Theyve": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Hows": [ + {ORTH: "How"}, + {ORTH: "s"} + ], + + "We'll": [ + {ORTH: "We"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "i've": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Whove": [ + {ORTH: "Who"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "i'ma": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ma"} + ], + + "Howd": [ + {ORTH: "How"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "hadnt": [ + {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "shant": [ + {ORTH: "sha", LEMMA: "shall"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "There'd've": [ + {ORTH: "There"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "I'll": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Why's": [ + {ORTH: "Why"}, + {ORTH: "'s"} + ], + + "Shouldn't've": [ + {ORTH: "Should"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Wasnt": [ + {ORTH: "Was"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "whove": [ + {ORTH: "who"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "hasn't": [ + {ORTH: "has"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "wouldntve": [ + {ORTH: "would"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Wheres": [ + {ORTH: "Where"}, + {ORTH: "s"} + ], + + "How'll": [ + {ORTH: "How"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "there'd've": [ + {ORTH: "there"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Whos": [ + {ORTH: "Who"}, + {ORTH: "s"} + ], + + "shes": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "s"} + ], + + "Doesn't": [ + {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Arent": [ + {ORTH: "Are", TAG: "VBP", "number": 
2, LEMMA: "be"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Hasnt": [ + {ORTH: "Has", LEMMA: "have"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "He's": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "wasnt": [ + {ORTH: "was"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "whyll": [ + {ORTH: "why"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "mustnt": [ + {ORTH: "must"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "He'd": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Shes": [ + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "s"} + ], + + "where've": [ + {ORTH: "where"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Youll": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "hasnt": [ + {ORTH: "has"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "theyll": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "it'd've": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "itdve": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "wedve": [ + {ORTH: "we"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Werent": [ + {ORTH: "Were"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Therell": [ + {ORTH: "There"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "shan't": [ + {ORTH: "sha", LEMMA: "shall"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Wont": [ + {ORTH: "Wo", LEMMA: "will"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "hadntve": [ + {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "who've": [ + {ORTH: "who"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Whatre": [ + {ORTH: "What"}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ], + + "'s": [ + {ORTH: "'s", LEMMA: "'s"} + ], + + "where'd": [ + {ORTH: "where"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "shouldve": [ + {ORTH: "should"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "where's": [ + {ORTH: "where"}, + {ORTH: "'s"} + ], + + "neednt": [ + {ORTH: "need"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "It'll": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "We'd": [ + {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Whats": [ + {ORTH: "What"}, + {ORTH: "s"} + ], + + "\u2014": [ + {ORTH: "\u2014", TAG: ":", LEMMA: "--"} + ], + + "Itd": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "she'd": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Mustnt": [ + {ORTH: "Must"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Notve": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "you'll": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Theyd": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "she's": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "Couldnt": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: 
"nt", LEMMA: "not", TAG: "RB"} + ], + + "that's": [ + {ORTH: "that"}, + {ORTH: "'s"} + ], + + "'em": [ + {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"} + ], + + "ol'": [ + {ORTH: "ol'", LEMMA: "old"} + ], + "Ak.": [ {ORTH: "Ak.", LEMMA: "Alaska"} ], @@ -488,153 +2000,6 @@ ABBREVIATIONS = { } -# Other exceptions - -OTHER = { - " ": [ - {ORTH: " ", TAG: "SP"} - ], - - "\u00a0": [ - {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} - ], - - "and/or": [ - {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} - ], - - "'cause": [ - {ORTH: "'cause", LEMMA: "because"} - ], - - "y'all": [ - {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, - {ORTH: "all"} - ], - - "yall": [ - {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, - {ORTH: "all"} - ], - - "'em": [ - {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"} - ], - - "em": [ - {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"} - ], - - "nothin'": [ - {ORTH: "nothin'", LEMMA: "nothing"} - ], - - "nuthin'": [ - {ORTH: "nuthin'", LEMMA: "nothing"} - ], - - "'nuff": [ - {ORTH: "'nuff", LEMMA: "enough"} - ], - - "ol'": [ - {ORTH: "ol'", LEMMA: "old"} - ], - - "not've": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "notve": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Not've": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Notve": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "cannot": [ - {ORTH: "can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "Cannot": [ - {ORTH: "Can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "gonna": [ - {ORTH: "gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"} - ], - - "Gonna": [ - {ORTH: "Gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"} - ], - - "let's": [ - {ORTH: "let"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} - ], - - "Let's": [ - {ORTH: "Let"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} - ], - - "'S": [ - {ORTH: "'S", LEMMA: "'s"} - ], - - "'s": [ - {ORTH: "'s", LEMMA: "'s"} - ], - - "\u2018S": [ - {ORTH: "\u2018S", LEMMA: "'s"} - ], - - "\u2018s": [ - {ORTH: "\u2018s", LEMMA: "'s"} - ], - - "\u2014": [ - {ORTH: "\u2014", TAG: ":", LEMMA: "--"} - ], - - "\n": [ - {ORTH: "\n", TAG: "SP"} - ], - - "\t": [ - {ORTH: "\t", TAG: "SP"} - ] -} - - -TOKENIZER_EXCEPTIONS = dict(EXC) -TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) -TOKENIZER_EXCEPTIONS.update(OTHER) - - -# Remove EXCLUDE_EXC if in exceptions - -for string in EXCLUDE_EXC: - if string in TOKENIZER_EXCEPTIONS: - TOKENIZER_EXCEPTIONS.pop(string) - - -# Abbreviations with only one ORTH token - ORTH_ONLY = [ "''", "\")", From fb9d3bb022e89f2cd63f2dd61efcac2eeb65cff9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 18:21:36 +0100 Subject: [PATCH 28/81] Revert "Merge remote-tracking branch 'origin/master'" This reverts commit d3b181cdf1139e970b65bd58d08aaa0a6b76a5a7, reversing changes made to b19cfcc14482ccabc63da01811927b0e96650c24. 
--- .github/PULL_REQUEST_TEMPLATE.md | 4 +- spacy/language.py | 6 +- spacy/language_data/__init__.py | 1 - spacy/language_data/tokenizer_exceptions.py | 11 --- spacy/tests/tokenizer/test_urls.py | 77 --------------------- spacy/tokenizer.pxd | 1 - spacy/tokenizer.pyx | 29 +++----- 7 files changed, 12 insertions(+), 117 deletions(-) delete mode 100644 spacy/language_data/tokenizer_exceptions.py delete mode 100644 spacy/tests/tokenizer/test_urls.py diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index a55f98646..e99d6dadc 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -12,6 +12,8 @@ +## Screenshots (if appropriate): + ## Types of changes - [ ] Bug fix (non-breaking change fixing an issue) @@ -25,4 +27,4 @@ - [ ] My change requires a change to spaCy's documentation. - [ ] I have updated the documentation accordingly. - [ ] I have added tests to cover my changes. -- [ ] All new and existing tests passed. +- [ ] All new and existing tests passed. \ No newline at end of file diff --git a/spacy/language.py b/spacy/language.py index bebdeab20..c6f1376a4 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -67,8 +67,6 @@ class BaseDefaults(object): @classmethod def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions - if cls.token_match: - token_match = cls.token_match if cls.prefixes: prefix_search = util.compile_prefix_regex(cls.prefixes).search else: @@ -84,7 +82,7 @@ class BaseDefaults(object): vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) return Tokenizer(vocab, rules=rules, prefix_search=prefix_search, suffix_search=suffix_search, - infix_finditer=infix_finditer, token_match=token_match) + infix_finditer=infix_finditer) @classmethod def create_tagger(cls, nlp=None): @@ -144,8 +142,6 @@ class BaseDefaults(object): pipeline.append(nlp.entity) return pipeline - token_match = language_data.TOKEN_MATCH - prefixes = tuple(language_data.TOKENIZER_PREFIXES) suffixes = tuple(language_data.TOKENIZER_SUFFIXES) diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index 028924796..f6aa4317c 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -3,4 +3,3 @@ from .punctuation import * from .tag_map import * from .entity_rules import * from .util import * -from .tokenizer_exceptions import * diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py deleted file mode 100644 index 6551440f2..000000000 --- a/spacy/language_data/tokenizer_exceptions.py +++ /dev/null @@ -1,11 +0,0 @@ -from __future__ import unicode_literals - -import re - -_URL_PATTERN = r''' -^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$ -'''.strip() - -TOKEN_MATCH = re.compile(_URL_PATTERN).match - -__all__ = ['TOKEN_MATCH'] diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py deleted file mode 100644 index 9e0172834..000000000 --- a/spacy/tests/tokenizer/test_urls.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import unicode_literals - -import pytest - -URLS = [ - u"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", - u"www.google.com?q=google", - u"google.com", - u"www.red-stars.com", - pytest.mark.xfail(u"red-stars.com"), - 
u"http://foo.com/blah_(wikipedia)#cite-1", - u"http://www.example.com/wpstyle/?bar=baz&inga=42&quux", - u"mailto:foo.bar@baz.com", - u"mailto:foo-bar@baz-co.com" -] - -# Punctuation we want to check is split away before the URL -PREFIXES = [ - "(", '"', "...", ">" -] - -# Punctuation we want to check is split away after the URL -SUFFIXES = [ - '"', ":", ">"] - -@pytest.mark.parametrize("text", URLS) -def test_simple_url(en_tokenizer, text): - tokens = en_tokenizer(text) - assert tokens[0].orth_ == text - assert len(tokens) == 1 - - -@pytest.mark.parametrize("prefix", PREFIXES) -@pytest.mark.parametrize("url", URLS) -def test_prefixed_url(en_tokenizer, prefix, url): - tokens = en_tokenizer(prefix + url) - assert tokens[0].text == prefix - assert tokens[1].text == url - assert len(tokens) == 2 - -@pytest.mark.parametrize("suffix", SUFFIXES) -@pytest.mark.parametrize("url", URLS) -def test_suffixed_url(en_tokenizer, url, suffix): - tokens = en_tokenizer(url + suffix) - assert tokens[0].text == url - assert tokens[1].text == suffix - assert len(tokens) == 2 - -@pytest.mark.parametrize("prefix", PREFIXES) -@pytest.mark.parametrize("suffix", SUFFIXES) -@pytest.mark.parametrize("url", URLS) -def test_surround_url(en_tokenizer, prefix, suffix, url): - tokens = en_tokenizer(prefix + url + suffix) - assert tokens[0].text == prefix - assert tokens[1].text == url - assert tokens[2].text == suffix - assert len(tokens) == 3 - -@pytest.mark.parametrize("prefix1", PREFIXES) -@pytest.mark.parametrize("prefix2", PREFIXES) -@pytest.mark.parametrize("url", URLS) -def test_two_prefix_url(en_tokenizer, prefix1, prefix2, url): - tokens = en_tokenizer(prefix1 + prefix2 + url) - assert tokens[0].text == prefix1 - assert tokens[1].text == prefix2 - assert tokens[2].text == url - assert len(tokens) == 3 - -@pytest.mark.parametrize("suffix1", SUFFIXES) -@pytest.mark.parametrize("suffix2", SUFFIXES) -@pytest.mark.parametrize("url", URLS) -def test_two_prefix_url(en_tokenizer, suffix1, suffix2, url): - tokens = en_tokenizer(url + suffix1 + suffix2) - assert tokens[0].text == url - assert tokens[1].text == suffix1 - assert tokens[2].text == suffix2 - assert len(tokens) == 3 diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 1a3e86b49..e53b7dbd1 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -16,7 +16,6 @@ cdef class Tokenizer: cdef PreshMap _specials cpdef readonly Vocab vocab - cdef public object token_match cdef public object prefix_search cdef public object suffix_search cdef public object infix_finditer diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 0e83c4a75..66c93528b 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -29,7 +29,7 @@ cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, - infix_finditer=None, token_match=None): + infix_finditer=None): '''Load a Tokenizer, reading unsupplied components from the path. Arguments: @@ -39,8 +39,6 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. - token_match: - A boolean function matching strings that becomes tokens. 
prefix_search: Signature of re.compile(string).search suffix_search: @@ -67,9 +65,10 @@ cdef class Tokenizer: with (path / 'tokenizer' / 'infix.txt').open() as file_: entries = file_.read().split('\n') infix_finditer = util.compile_infix_regex(entries).finditer - return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match) + return cls(vocab, rules, prefix_search, suffix_search, infix_finditer) - def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): + + def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer): '''Create a Tokenizer, to create Doc objects given unicode text. Arguments: @@ -86,13 +85,10 @@ cdef class Tokenizer: infix_finditer: A function matching the signature of re.compile(string).finditer to find infixes. - token_match: - A boolean function matching strings that becomes tokens. ''' self.mem = Pool() self._cache = PreshMap() self._specials = PreshMap() - self.token_match = token_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -104,10 +100,9 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, self._rules, - self._prefix_re, - self._suffix_re, - self._infix_re, - self.token_match) + self._prefix_re, + self._suffix_re, + self._infix_re) return (self.__class__, args, None, None) @@ -221,8 +216,6 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: - if self.token_match and self.token_match(string): - break last_size = len(string) pre_len = self.find_prefix(string) if pre_len != 0: @@ -233,8 +226,6 @@ cdef class Tokenizer: string = minus_pre prefixes.push_back(self.vocab.get(mem, prefix)) break - if self.token_match and self.token_match(string): - break suf_len = self.find_suffix(string) if suf_len != 0: suffix = string[-suf_len:] @@ -272,11 +263,7 @@ cdef class Tokenizer: tokens.push_back(prefixes[0][i], False) if string: cache_hit = self._try_cache(hash_string(string), tokens) - if cache_hit: - pass - elif self.token_match and self.token_match(string): - tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size()) - else: + if not cache_hit: matches = self.find_infix(string) if not matches: tokens.push_back(self.vocab.get(tokens.mem, string), False) From 35b39f53c32d2c228e275b4f8f11147055c0db03 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 18:26:09 +0100 Subject: [PATCH 29/81] Reorganise English tokenizer exceptions (as discussed in #718) Add logic to generate exceptions that follow a consistent pattern (like verbs and pronouns) and allow certain tokens to be excluded explicitly. 
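To make that generation logic concrete, here is a minimal, self-contained sketch of the pattern the diff below introduces: loop over a class of base forms, emit one sub-token spec per piece of the contraction, then drop any explicitly excluded surface forms afterwards. This is illustrative only — it uses plain lowercase dict keys instead of spaCy's ORTH/LEMMA/TAG symbols, the word list is a small assumed subset, and the "-PRON-" lemma string stands in for PRON_LEMMA; the real symbols and data are in the diff that follows.

# Minimal sketch (not the actual spacy/en/tokenizer_exceptions.py code):
# build contraction exceptions from a consistent pattern, then remove
# strings that must be excluded because they are ordinary words.
EXC = {}
EXCLUDE_EXC = ["Ill", "ill", "Whore", "whore"]  # assumed example exclusions

for word in ["i", "you", "we", "they"]:
    for orth in [word, word.title()]:
        # "-'ll" and "-ll" forms: pronoun + modal "will"
        EXC[orth + "'ll"] = [
            {"orth": orth, "lemma": "-PRON-", "tag": "PRP"},
            {"orth": "'ll", "lemma": "will", "tag": "MD"},
        ]
        EXC[orth + "ll"] = [
            {"orth": orth, "lemma": "-PRON-", "tag": "PRP"},
            {"orth": "ll", "lemma": "will", "tag": "MD"},
        ]

# Drop excluded forms if the loops generated them (e.g. "Ill"/"ill").
for string in EXCLUDE_EXC:
    EXC.pop(string, None)

print(EXC["They'll"])  # two sub-token specs: "They" + "'ll"

The same pattern scales to the verb and w-word classes in the diff, which is why the module shrinks from thousands of hand-written entries to a few generation loops plus the abbreviation tables.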
--- spacy/en/tokenizer_exceptions.py | 2197 ++++++------------------------ 1 file changed, 416 insertions(+), 1781 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 398ae486b..4df3fe535 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -5,1791 +5,279 @@ from ..symbols import * from ..language_data import PRON_LEMMA -TOKENIZER_EXCEPTIONS = { - "and/or": [ - {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} - ], - - "Theydve": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "shouldn't've": [ - {ORTH: "should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "There'll": [ - {ORTH: "There"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "howll": [ - {ORTH: "how"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "Hadn't've": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "who'll": [ - {ORTH: "who"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "aint": [ - {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - " ": [ - {TAG: "SP", ORTH: " "} - ], - - "Shouldnt": [ - {ORTH: "Should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "when's": [ - {ORTH: "when"}, - {ORTH: "'s", LEMMA: "be"} - ], - - "Didnt": [ - {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "itll": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "Who're": [ - {ORTH: "Who"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Ain't": [ - {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Can't": [ - {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Whyre": [ - {ORTH: "Why"}, - {ORTH: "re"} - ], - - "Aren't": [ - {ORTH: "Are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Neednt": [ - {ORTH: "Need"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "should've": [ - {ORTH: "should"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "shouldn't": [ - {ORTH: "should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Idve": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "weve": [ - {ORTH: "we"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Ive": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "they'd": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Youdve": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theyve": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Weren't": [ - {ORTH: "Were"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "werent": [ - {ORTH: "were"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whyre": [ - {ORTH: "why"}, - {ORTH: "re"} - ], - - "I'm": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "She'd've": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - 
{ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "not've": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "we'll": [ - {ORTH: "we"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Don't": [ - {ORTH: "Do", LEMMA: "do"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Whyll": [ - {ORTH: "Why"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "they've": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "wasn't": [ - {ORTH: "was"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "could've": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "what've": [ - {ORTH: "what"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "havent": [ - {ORTH: "have", TAG: "VB"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Who've": [ - {ORTH: "Who"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Shan't": [ - {ORTH: "Sha", LEMMA: "shall"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "i'll": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "you'd": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "whens": [ - {ORTH: "when"}, - {ORTH: "s", LEMMA: "be"} - ], - - "whys": [ - {ORTH: "why"}, - {ORTH: "s"} - ], - - "Whereve": [ - {ORTH: "Where"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "\u00a0": [ - {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} - ], - - "there'd": [ - {ORTH: "there"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "hadn't've": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "whatll": [ - {ORTH: "what"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldn't've": [ - {ORTH: "would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "there's": [ - {ORTH: "there"}, - {ORTH: "'s"} - ], - - "Who'll": [ - {ORTH: "Who"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "youll": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldve": [ - {ORTH: "would"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldnt": [ - {ORTH: "Would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Thered": [ - {ORTH: "There"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Youre": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be"} - ], - - "Couldn't've": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "who're": [ - {ORTH: "who"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Whys": [ - {ORTH: "Why"}, - {ORTH: "s"} - ], - - "mightn't've": [ - {ORTH: "might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wholl": [ - {ORTH: "Who"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "hadn't": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Havent": [ - {ORTH: "Have", TAG: "VB"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Whatve": [ - {ORTH: "What"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Thats": [ - {ORTH: "That"}, - {ORTH: "s"} - ], - - "Howll": [ - {ORTH: "How"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldn't": [ - {ORTH: "would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "You'll": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - 
{ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Cant": [ - {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "i'd": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "weren't": [ - {ORTH: "were"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "would've": [ - {ORTH: "would"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "i'm": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "why'll": [ - {ORTH: "why"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "we'd've": [ - {ORTH: "we"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Shouldve": [ - {ORTH: "Should"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "can't": [ - {ORTH: "ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "thats": [ - {ORTH: "that"}, - {ORTH: "s"} - ], - - "Hes": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "Needn't": [ - {ORTH: "Need"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "It's": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "Why're": [ - {ORTH: "Why"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Hed": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - +EXC = {} + +EXCLUDE_EXC = ["Ill", "ill", "Hell", "hell", "Well", "well", "Whore", "whore"] + + +# Pronouns + +for pron in ["i"]: + for orth in [pron, pron.title()]: + EXC[orth + "'m"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1} + ] + + EXC[pron + "m"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 } + ] + + EXC[pron + "'ma"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", LEMMA: "be", NORM: "am"}, + {ORTH: "a", LEMMA: "going to", NORM: "gonna"} + ] + + EXC[pron + "ma"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", LEMMA: "be", NORM: "am"}, + {ORTH: "a", LEMMA: "going to", NORM: "gonna"} + ] + + +for pron in ["i", "you", "he", "she", "it", "we", "they"]: + for orth in [pron, pron.title()]: + EXC[orth + "'ll"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "ll"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "'ve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "ve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'ll've"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "llve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'d"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ] + + EXC[orth + "d"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ] + + EXC[orth + "'d've"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "dve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + 
{ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for pron in ["you", "we", "they"]: + for orth in [pron, pron.title()]: + EXC[orth + "'re"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "re"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ] + + +# W-words, relative pronouns, prepositions etc. + +for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: + for orth in [word, word.title()]: + EXC[orth + "'s"] = [ + {ORTH: orth}, + {ORTH: "'s"} + ] + + EXC[orth + "s"] = [ + {ORTH: orth}, + {ORTH: "s"} + ] + + EXC[orth + "'ll"] = [ + {ORTH: orth}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "ll"] = [ + {ORTH: orth}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "'ll've"] = [ + {ORTH: orth}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "llve"] = [ + {ORTH: orth}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'re"] = [ + {ORTH: orth}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "re"] = [ + {ORTH: orth}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "'ve"] = [ + {ORTH: orth}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "ve"] = [ + {ORTH: orth}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'d"] = [ + {ORTH: orth}, + {ORTH: "'d"} + ] + + EXC[orth + "d"] = [ + {ORTH: orth}, + {ORTH: "d"} + ] + + EXC[orth + "'d've"] = [ + {ORTH: orth}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "dve"] = [ + {ORTH: orth}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +# Verbs + +for verb_data in [ + {ORTH: "ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "could", TAG: "MD"}, + {ORTH: "do", LEMMA: "do"}, + {ORTH: "does", LEMMA: "do"}, + {ORTH: "did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "might"}, + {ORTH: "must"}, + {ORTH: "need"}, + {ORTH: "sha", LEMMA: "shall"}, + {ORTH: "should"}, + {ORTH: "wo", LEMMA: "will"}, + {ORTH: "would"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "n't"] = [ + dict(data), + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ] + + EXC[data[ORTH] + "nt"] = [ + dict(data), + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ] + + EXC[data[ORTH] + "n't've"] = [ + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[data[ORTH] + "ntve"] = [ + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for verb_data in [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "might"}, + {ORTH: "must"}, + {ORTH: "should"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "'ve"] = [ + dict(data), + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[data[ORTH] + "ve"] = [ + dict(data), + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for verb_data in [ + {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, + {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "was", LEMMA: "be"}, + {ORTH: "were", LEMMA: "be"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for 
data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "n't"] = [ + dict(data), + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ] + + EXC[data[ORTH] + "nt"] = [ + dict(data), + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ] + + +# Abbreviations + +ABBREVIATIONS = { "Mt.": [ {ORTH: "Mt.", LEMMA: "Mount"} ], - "couldn't": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "What've": [ - {ORTH: "What"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "It'd": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "theydve": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "aren't": [ - {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Mightn't": [ - {ORTH: "Might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "'S": [ - {ORTH: "'S", LEMMA: "'s"} - ], - - "I've": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whered": [ - {ORTH: "Where"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Itdve": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "I'ma": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ma"} - ], - - "whos": [ - {ORTH: "who"}, - {ORTH: "s"} - ], - - "They'd": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "What'll": [ - {ORTH: "What"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "You've": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mustve": [ - {ORTH: "Must"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "whod": [ - {ORTH: "who"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "mightntve": [ - {ORTH: "might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "I'd've": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Must've": [ - {ORTH: "Must"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "it'd": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "what're": [ - {ORTH: "what"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "Wasn't": [ - {ORTH: "Was"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "what's": [ - {ORTH: "what"}, - {ORTH: "'s"} - ], - - "he'd've": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "She'd": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "shedve": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "ain't": [ - {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "She's": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "i'd've": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "We'd've": [ - {ORTH: "We"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "must've": [ - {ORTH: "must"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} 
- ], - - "That's": [ - {ORTH: "That"}, - {ORTH: "'s"} - ], - - "whatre": [ - {ORTH: "what"}, - {ORTH: "re"} - ], - - "you'd've": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Dont": [ - {ORTH: "Do", LEMMA: "do"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "thered": [ - {ORTH: "there"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Youd": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "couldn't've": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whens": [ - {ORTH: "When"}, - {ORTH: "s"} - ], - - "Isnt": [ - {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "mightve": [ - {ORTH: "might"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "didnt": [ - {ORTH: "did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "ive": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "It'd've": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "\t": [ - {ORTH: "\t", TAG: "SP"} - ], - - "Itll": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "didn't": [ - {ORTH: "did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "cant": [ - {ORTH: "ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "im": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "they'd've": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Hadntve": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Weve": [ - {ORTH: "We"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Mightnt": [ - {ORTH: "Might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "youdve": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Shedve": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theyd": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Cannot": [ - {ORTH: "Can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "Hadn't": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "What're": [ - {ORTH: "What"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "He'll": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "wholl": [ - {ORTH: "who"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "They're": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "shouldnt": [ - {ORTH: "should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "\n": [ - {ORTH: "\n", TAG: "SP"} - ], - - "whered": [ - {ORTH: "where"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "youve": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - 
"notve": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldve": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "mustve": [ - {ORTH: "must"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Youve": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "therell": [ - {ORTH: "there"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "might've": [ - {ORTH: "might"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mustn't": [ - {ORTH: "Must"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "wheres": [ - {ORTH: "where"}, - {ORTH: "s"} - ], - - "they're": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "idve": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "hows": [ - {ORTH: "how"}, - {ORTH: "s"} - ], - - "youre": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "Didn't": [ - {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Couldve": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "cannot": [ - {ORTH: "can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "Im": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be", NORM: "am"} - ], - - "howd": [ - {ORTH: "how"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "you've": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "You're": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "she'll": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Theyll": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "don't": [ - {ORTH: "do", LEMMA: "do"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "itd": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Hedve": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "isnt": [ - {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "won't": [ - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "We're": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "\u2018S": [ - {ORTH: "\u2018S", LEMMA: "'s"} - ], - - "\u2018s": [ - {ORTH: "\u2018s", LEMMA: "'s"} - ], - - "dont": [ - {ORTH: "do", LEMMA: "do"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "ima": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ma"} - ], - - "Let's": [ - {ORTH: "Let"}, - {ORTH: "'s", LEMMA: "us"} - ], - - "he's": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "we've": [ - {ORTH: "we"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "What's": [ - {ORTH: "What"}, - {ORTH: "'s"} - ], - - "Who's": [ - {ORTH: "Who"}, - {ORTH: "'s"} - ], - - "hedve": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "he'd": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: 
"would", TAG: "MD"} - ], - - "When's": [ - {ORTH: "When"}, - {ORTH: "'s"} - ], - - "Mightn't've": [ - {ORTH: "Might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "We've": [ - {ORTH: "We"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Couldntve": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Who'd": [ - {ORTH: "Who"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "haven't": [ - {ORTH: "have", TAG: "VB"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "arent": [ - {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "You'd've": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldn't": [ - {ORTH: "Would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "who's": [ - {ORTH: "who"}, - {ORTH: "'s"} - ], - - "Mightve": [ - {ORTH: "Might"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Theredve": [ - {ORTH: "There"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theredve": [ - {ORTH: "there"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "who'd": [ - {ORTH: "who"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Where's": [ - {ORTH: "Where"}, - {ORTH: "'s"} - ], - - "wont": [ - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "she'd've": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Should've": [ - {ORTH: "Should"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "theyre": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re"} - ], - - "Wouldntve": [ - {ORTH: "Would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Where've": [ - {ORTH: "Where"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "mustn't": [ - {ORTH: "must"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "isn't": [ - {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Aint": [ - {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "why's": [ - {ORTH: "why"}, - {ORTH: "'s"} - ], - - "There'd": [ - {ORTH: "There"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "They'll": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "how'll": [ - {ORTH: "how"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Wedve": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldntve": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "There's": [ - {ORTH: "There"}, - {ORTH: "'s"} - ], - - "we'd": [ - {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Whod": [ - {ORTH: "Who"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "whatve": [ - {ORTH: "what"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldve": [ - {ORTH: "Would"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "there'll": [ - {ORTH: "there"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "needn't": [ - {ORTH: "need"}, - {ORTH: "n't", LEMMA: "not", 
TAG: "RB"} - ], - - "shouldntve": [ - {ORTH: "should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "why're": [ - {ORTH: "why"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "Doesnt": [ - {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whereve": [ - {ORTH: "where"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "they'll": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "I'd": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Might've": [ - {ORTH: "Might"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "mightnt": [ - {ORTH: "might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Not've": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "mightn't": [ - {ORTH: "might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "you're": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "They've": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "what'll": [ - {ORTH: "what"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Could've": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Would've": [ - {ORTH: "Would"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Isn't": [ - {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "let's": [ - {ORTH: "let"}, - {ORTH: "'s", LEMMA: "us"} - ], - - "She'll": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "You'd": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "wouldnt": [ - {ORTH: "would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Why'll": [ - {ORTH: "Why"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Where'd": [ - {ORTH: "Where"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Theyre": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "Won't": [ - {ORTH: "Wo", LEMMA: "will"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Couldn't": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "it's": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "it'll": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "They'd've": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Ima": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ma"} - ], - - "gonna": [ - {ORTH: "gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"} - ], - - "Gonna": [ - {ORTH: "Gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"} - ], - - "whats": [ - {ORTH: "what"}, - {ORTH: "s"} - ], - - "How's": [ - {ORTH: "How"}, - {ORTH: "'s"} - ], - - "Shouldntve": [ - {ORTH: "Should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "youd": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Whatll": [ - {ORTH: "What"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "Wouldn't've": [ - {ORTH: "Would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - 
], - - "How'd": [ - {ORTH: "How"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "doesnt": [ - {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Shouldn't": [ - {ORTH: "Should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "He'd've": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mightntve": [ - {ORTH: "Might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldnt": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Haven't": [ - {ORTH: "Have", TAG: "VB"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "doesn't": [ - {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Hasn't": [ - {ORTH: "Has"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "how's": [ - {ORTH: "how"}, - {ORTH: "'s"} - ], - - "hes": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "he'll": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "hed": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "how'd": [ - {ORTH: "how"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "we're": [ - {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM :"are"} - ], - - "Hadnt": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Shant": [ - {ORTH: "Sha", LEMMA: "shall"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Theyve": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Hows": [ - {ORTH: "How"}, - {ORTH: "s"} - ], - - "We'll": [ - {ORTH: "We"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "i've": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whove": [ - {ORTH: "Who"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "i'ma": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ma"} - ], - - "Howd": [ - {ORTH: "How"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "hadnt": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "shant": [ - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "There'd've": [ - {ORTH: "There"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "I'll": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Why's": [ - {ORTH: "Why"}, - {ORTH: "'s"} - ], - - "Shouldn't've": [ - {ORTH: "Should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wasnt": [ - {ORTH: "Was"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whove": [ - {ORTH: "who"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "hasn't": [ - {ORTH: "has"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "wouldntve": [ - {ORTH: "would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wheres": [ - {ORTH: "Where"}, - {ORTH: "s"} - ], - - "How'll": [ - {ORTH: "How"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "there'd've": [ - {ORTH: "there"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whos": [ - {ORTH: "Who"}, - {ORTH: "s"} - ], - - "shes": [ - {ORTH: "she", 
LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "Doesn't": [ - {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Arent": [ - {ORTH: "Are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Hasnt": [ - {ORTH: "Has", LEMMA: "have"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "He's": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "wasnt": [ - {ORTH: "was"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whyll": [ - {ORTH: "why"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "mustnt": [ - {ORTH: "must"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "He'd": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Shes": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "where've": [ - {ORTH: "where"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Youll": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "hasnt": [ - {ORTH: "has"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "theyll": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "it'd've": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "itdve": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "wedve": [ - {ORTH: "we"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Werent": [ - {ORTH: "Were"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Therell": [ - {ORTH: "There"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "shan't": [ - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Wont": [ - {ORTH: "Wo", LEMMA: "will"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "hadntve": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "who've": [ - {ORTH: "who"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whatre": [ - {ORTH: "What"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "'s": [ - {ORTH: "'s", LEMMA: "'s"} - ], - - "where'd": [ - {ORTH: "where"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "shouldve": [ - {ORTH: "should"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "where's": [ - {ORTH: "where"}, - {ORTH: "'s"} - ], - - "neednt": [ - {ORTH: "need"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "It'll": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "We'd": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Whats": [ - {ORTH: "What"}, - {ORTH: "s"} - ], - - "\u2014": [ - {ORTH: "\u2014", TAG: ":", LEMMA: "--"} - ], - - "Itd": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "she'd": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Mustnt": [ - {ORTH: "Must"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Notve": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "you'll": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Theyd": [ - {ORTH: "They", LEMMA: 
PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "she's": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "Couldnt": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "that's": [ - {ORTH: "that"}, - {ORTH: "'s"} - ], - - "'em": [ - {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"} - ], - - "ol'": [ - {ORTH: "ol'", LEMMA: "old"} - ], - "Ak.": [ {ORTH: "Ak.", LEMMA: "Alaska"} ], @@ -2000,6 +488,153 @@ TOKENIZER_EXCEPTIONS = { } +# Other exceptions + +OTHER = { + " ": [ + {ORTH: " ", TAG: "SP"} + ], + + "\u00a0": [ + {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} + ], + + "and/or": [ + {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} + ], + + "'cause": [ + {ORTH: "'cause", LEMMA: "because"} + ], + + "y'all": [ + {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, + {ORTH: "all"} + ], + + "yall": [ + {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, + {ORTH: "all"} + ], + + "'em": [ + {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"} + ], + + "em": [ + {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"} + ], + + "nothin'": [ + {ORTH: "nothin'", LEMMA: "nothing"} + ], + + "nuthin'": [ + {ORTH: "nuthin'", LEMMA: "nothing"} + ], + + "'nuff": [ + {ORTH: "'nuff", LEMMA: "enough"} + ], + + "ol'": [ + {ORTH: "ol'", LEMMA: "old"} + ], + + "not've": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "notve": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Not've": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Notve": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "cannot": [ + {ORTH: "can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} + ], + + "Cannot": [ + {ORTH: "Can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} + ], + + "gonna": [ + {ORTH: "gon", LEMMA: "go", NORM: "going"}, + {ORTH: "na", LEMMA: "to"} + ], + + "Gonna": [ + {ORTH: "Gon", LEMMA: "go", NORM: "going"}, + {ORTH: "na", LEMMA: "to"} + ], + + "let's": [ + {ORTH: "let"}, + {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} + ], + + "Let's": [ + {ORTH: "Let"}, + {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} + ], + + "'S": [ + {ORTH: "'S", LEMMA: "'s"} + ], + + "'s": [ + {ORTH: "'s", LEMMA: "'s"} + ], + + "\u2018S": [ + {ORTH: "\u2018S", LEMMA: "'s"} + ], + + "\u2018s": [ + {ORTH: "\u2018s", LEMMA: "'s"} + ], + + "\u2014": [ + {ORTH: "\u2014", TAG: ":", LEMMA: "--"} + ], + + "\n": [ + {ORTH: "\n", TAG: "SP"} + ], + + "\t": [ + {ORTH: "\t", TAG: "SP"} + ] +} + + +TOKENIZER_EXCEPTIONS = dict(EXC) +TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) +TOKENIZER_EXCEPTIONS.update(OTHER) + + +# Remove EXCLUDE_EXC if in exceptions + +for string in EXCLUDE_EXC: + if string in TOKENIZER_EXCEPTIONS: + TOKENIZER_EXCEPTIONS.pop(string) + + +# Abbreviations with only one ORTH token + ORTH_ONLY = [ "''", "\")", From 84a87951eb8f97eff9992a3fa79a8ae5c8a6e2f3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 18:27:43 +0100 Subject: [PATCH 30/81] Fix typos --- spacy/en/tokenizer_exceptions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 4df3fe535..9537761b3 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -19,18 +19,18 @@ for pron in ["i"]: {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1} ] - EXC[pron + "m"] = [ + EXC[orth + "m"] = 
[ {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 } ] - EXC[pron + "'ma"] = [ + EXC[orth + "'ma"] = [ {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: "'m", LEMMA: "be", NORM: "am"}, {ORTH: "a", LEMMA: "going to", NORM: "gonna"} ] - EXC[pron + "ma"] = [ + EXC[orth + "ma"] = [ {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: "m", LEMMA: "be", NORM: "am"}, {ORTH: "a", LEMMA: "going to", NORM: "gonna"} From 6f51609b5e417fee1ca89e3d99593ac26116a1ce Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 21:24:14 +0100 Subject: [PATCH 31/81] Use yellow color for neutral pro/con icon --- website/_includes/_mixins-base.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index ed0802a4f..bc8b85557 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -44,7 +44,7 @@ mixin icon(name, size) icon - [string] "pro", "con" or "neutral" (default: "neutral") mixin procon(icon) - - colors = { pro: "green", con: "red" } + - colors = { pro: "green", con: "red", neutral: "yellow" } +icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) From d677db6277ec554c567c7349ceb16b1bff9bfc3a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 21:24:35 +0100 Subject: [PATCH 32/81] Change "Multi-language support" to amber for spaCy --- website/docs/api/index.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/index.jade b/website/docs/api/index.jade index 20995df2e..24f3d4458 100644 --- a/website/docs/api/index.jade +++ b/website/docs/api/index.jade @@ -23,7 +23,7 @@ p +row +cell Multi-language support - each icon in [ "con", "pro", "pro", "pro" ] + each icon in [ "neutral", "pro", "pro", "pro" ] +cell.u-text-center #[+procon(icon)] +row From dd7cd44ba515a9ac5273a1ce1445a9d8c65649ab Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 21:27:25 +0100 Subject: [PATCH 33/81] Update README.rst --- README.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 9f65c24fb..6e03aa0c4 100644 --- a/README.rst +++ b/README.rst @@ -3,8 +3,10 @@ spaCy: Industrial-strength NLP spaCy is a library for advanced natural language processing in Python and Cython. spaCy is built on the very latest research, but it isn't researchware. -It was designed from day 1 to be used in real products. It's commercial -open-source software, released under the MIT license. +It was designed from day one to be used in real products. spaCy currently supports +English and German, as well as tokenization for Chinese, Spanish, Italian, French, +Portuguese, Dutch, Swedish and Hungarian. It's commercial open-source software, +released under the MIT license. 💫 **Version 1.5 out now!** `Read the release notes here. 
`_ From 1d237664af28dd1685acdd11e94c9d1aa8cd1714 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 23:01:31 +0100 Subject: [PATCH 34/81] Add lowercase lemma to tokenizer exceptions --- spacy/en/tokenizer_exceptions.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 9537761b3..cdd543adf 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -112,44 +112,44 @@ for pron in ["you", "we", "they"]: for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: for orth in [word, word.title()]: EXC[orth + "'s"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "'s"} ] EXC[orth + "s"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "s"} ] EXC[orth + "'ll"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "'ll", LEMMA: "will", TAG: "MD"} ] EXC[orth + "ll"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "ll", LEMMA: "will", TAG: "MD"} ] EXC[orth + "'ll've"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "ll", LEMMA: "will", TAG: "MD"}, {ORTH: "ve", LEMMA: "have", TAG: "VB"} ] EXC[orth + "llve"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "ll", LEMMA: "will", TAG: "MD"}, {ORTH: "ve", LEMMA: "have", TAG: "VB"} ] EXC[orth + "'re"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "'re", LEMMA: "be", NORM: "are"} ] EXC[orth + "re"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "re", LEMMA: "be", NORM: "are"} ] @@ -159,28 +159,28 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: ] EXC[orth + "ve"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "'ve", LEMMA: "have", TAG: "VB"} ] EXC[orth + "'d"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "'d"} ] EXC[orth + "d"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "d"} ] EXC[orth + "'d've"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'ve", LEMMA: "have", TAG: "VB"} ] EXC[orth + "dve"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word}, {ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "ve", LEMMA: "have", TAG: "VB"} ] From aafc894285f1a3bb5fd6c2bb3ce8e65da5bb408e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 23:02:16 +0100 Subject: [PATCH 35/81] Modernize tokenizer tests for contractions Use @pytest.mark.parametrize. 
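
For readers skimming the diff below, this is the pattern the rewritten tests follow. The snippet is only an illustrative sketch, not part of the patch: the fixture mirrors the one the series adds to conftest.py, and the sample contractions are borrowed from cases the tests themselves exercise.

    import pytest

    from spacy.en import English


    @pytest.fixture
    def en_tokenizer():
        # Same tokenizer fixture the test suite defines in its conftest.py
        return English.Defaults.create_tokenizer()


    # Each value in the list becomes its own collected test case, so a single
    # function body replaces several near-identical test functions.
    @pytest.mark.parametrize('text', ["can't", "Can't", "ain't"])
    def test_tokenizer_splits_contraction(en_tokenizer, text):
        tokens = en_tokenizer(text)
        assert len(tokens) == 2

With this layout pytest reports each text value as a separate test, which keeps failures easy to localize while the assertions are written once.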
--- spacy/tests/tokenizer/test_contractions.py | 95 +++++++++++++--------- 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py index 568e34704..6e8fb7518 100644 --- a/spacy/tests/tokenizer/test_contractions.py +++ b/spacy/tests/tokenizer/test_contractions.py @@ -1,58 +1,73 @@ from __future__ import unicode_literals +from ...en import English import pytest +@pytest.fixture +def en_tokenizer(): + return English.Defaults.create_tokenizer() -def test_possess(en_tokenizer): - tokens = en_tokenizer("Mike's") - assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike" - assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s" + +@pytest.mark.parametrize('inputs', [("Robin's", "Robin"), ("Alexis's", "Alexis")]) +def test_tokenizer_handles_poss_contraction(en_tokenizer, inputs): + text_poss, text = inputs + tokens = en_tokenizer(text_poss) assert len(tokens) == 2 + assert tokens[0].text == text + assert tokens[1].text == "'s" -def test_apostrophe(en_tokenizer): - tokens = en_tokenizer("schools'") +@pytest.mark.parametrize('text', ["schools'", "Alexis'"]) +def test_tokenizer_splits_trailing_apos(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 2 - assert tokens[1].orth_ == "'" - assert tokens[0].orth_ == "schools" + assert tokens[0].text == text.split("'")[0] + assert tokens[1].text == "'" -def test_LL(en_tokenizer): - tokens = en_tokenizer("we'll") +@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"]) +def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].text == text + + +@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"]) +def test_tokenizer_handles_ll_contraction(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 2 - assert tokens[1].orth_ == "'ll" + assert tokens[0].text == text.split("'")[0] + assert tokens[1].text == "'ll" assert tokens[1].lemma_ == "will" - assert tokens[0].orth_ == "we" -def test_aint(en_tokenizer): - tokens = en_tokenizer("ain't") - assert len(tokens) == 2 - assert tokens[0].orth_ == "ai" - assert tokens[0].lemma_ == "be" - assert tokens[1].orth_ == "n't" - assert tokens[1].lemma_ == "not" - -def test_capitalized(en_tokenizer): - tokens = en_tokenizer("can't") - assert len(tokens) == 2 - tokens = en_tokenizer("Can't") - assert len(tokens) == 2 - tokens = en_tokenizer("Ain't") - assert len(tokens) == 2 - assert tokens[0].orth_ == "Ai" - assert tokens[0].lemma_ == "be" +@pytest.mark.parametrize('inputs', [("can't", "Can't"), ("ain't", "Ain't")]) +def test_tokenizer_handles_capitalization(en_tokenizer, inputs): + text_lower, text_title = inputs + tokens_lower = en_tokenizer(text_lower) + tokens_title = en_tokenizer(text_title) + assert tokens_title[0].text == tokens_lower[0].text.title() + assert tokens_lower[0].text == tokens_title[0].text.lower() + assert tokens_lower[1].text == tokens_title[1].text -def test_punct(en_tokenizer): - tokens = en_tokenizer("We've") +@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"]) +def test_tokenizer_keeps_title_case(en_tokenizer, pron): + for contraction in ["'ll", "'d"]: + tokens = en_tokenizer(pron + contraction) + assert tokens[0].text == pron + assert tokens[1].text == contraction + + +@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"]) +def test_tokenizer_excludes_ambiguous(en_tokenizer, exc): + tokens = en_tokenizer(exc) + assert 
len(tokens) == 1 + + +@pytest.mark.parametrize('inputs', [("We've", "``We've"), ("couldn't", "couldn't)")]) +def test_tokenizer_splits_defined_punct(en_tokenizer, inputs): + wo_punct, w_punct = inputs + tokens = en_tokenizer(wo_punct) assert len(tokens) == 2 - tokens = en_tokenizer("``We've") + tokens = en_tokenizer(w_punct) assert len(tokens) == 3 - - -@pytest.mark.xfail -def test_therell(en_tokenizer): - tokens = en_tokenizer("there'll") - assert len(tokens) == 2 - assert tokens[0].text == "there" - assert tokens[1].text == "there" From 667051375d4a0f1e88652c1b34980f6722c11730 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 4 Jan 2017 00:46:35 +0100 Subject: [PATCH 36/81] Modernize tokenizer tests for whitespace --- spacy/tests/tokenizer/test_whitespace.py | 33 +++++++++++++++--------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py index ad34c8791..906ad310c 100644 --- a/spacy/tests/tokenizer/test_whitespace.py +++ b/spacy/tests/tokenizer/test_whitespace.py @@ -1,37 +1,46 @@ """Test that tokens are created correctly for whitespace.""" + + from __future__ import unicode_literals import pytest -def test_single_space(en_tokenizer): - tokens = en_tokenizer('hello possums') +@pytest.mark.parametrize('text', ["hello possums"]) +def test_tokenizer_splits_single_space(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 2 -def test_double_space(en_tokenizer): - tokens = en_tokenizer('hello possums') +@pytest.mark.parametrize('text', ["hello possums"]) +def test_tokenizer_splits_double_space(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 3 - assert tokens[1].orth_ == ' ' + assert tokens[1].text == " " -def test_newline(en_tokenizer): - tokens = en_tokenizer('hello\npossums') +@pytest.mark.parametrize('text', ["hello\npossums"]) +def test_tokenizer_splits_newline(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 3 + assert tokens[1].text == "\n" -def test_newline_space(en_tokenizer): +@pytest.mark.parametrize('text', ["hello \npossums"]) +def test_tokenizer_splits_newline_space(en_tokenizer, text): tokens = en_tokenizer('hello \npossums') assert len(tokens) == 3 -def test_newline_double_space(en_tokenizer): - tokens = en_tokenizer('hello \npossums') +@pytest.mark.parametrize('text', ["hello \npossums"]) +def test_tokenizer_splits_newline_double_space(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 3 -def test_newline_space_wrap(en_tokenizer): - tokens = en_tokenizer('hello \n possums') +@pytest.mark.parametrize('text', ["hello \n possums"]) +def test_tokenizer_splits_newline_space_wrap(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 3 From 59059fed27979abd24c412d0248b2dcdb0055906 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 4 Jan 2017 00:47:11 +0100 Subject: [PATCH 37/81] Move regression test for #351 to own file --- spacy/tests/regression/test_issue351.py | 16 ++++++++++++ spacy/tests/tokenizer/test_whitespace.py | 32 ------------------------ 2 files changed, 16 insertions(+), 32 deletions(-) create mode 100644 spacy/tests/regression/test_issue351.py diff --git a/spacy/tests/regression/test_issue351.py b/spacy/tests/regression/test_issue351.py new file mode 100644 index 000000000..84d4398c5 --- /dev/null +++ b/spacy/tests/regression/test_issue351.py @@ -0,0 +1,16 @@ +from __future__ import unicode_literals +from ...en import English + +import pytest + + 
+@pytest.fixture +def en_tokenizer(): + return English.Defaults.create_tokenizer() + + +def test_issue351(en_tokenizer): + doc = en_tokenizer(" This is a cat.") + assert doc[0].idx == 0 + assert len(doc[0]) == 3 + assert doc[1].idx == 3 diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py index 906ad310c..8ba138b0c 100644 --- a/spacy/tests/tokenizer/test_whitespace.py +++ b/spacy/tests/tokenizer/test_whitespace.py @@ -42,35 +42,3 @@ def test_tokenizer_splits_newline_double_space(en_tokenizer, text): def test_tokenizer_splits_newline_space_wrap(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 3 - - -def test_leading_space_offsets(en_tokenizer): - '''Issue #351 - # this works - - text1 = u"This is a cat." - a = english_spacy(text1) - - tok0 = list(a.sents)[0][0] - print tok0, tok0.idx, text1[tok0.idx] - - tok1 = list(a.sents)[0][1] - print tok1, tok1.idx, text1[tok1.idx] - - print "==" - - # this does not work - - text2 = u" This is a cat." - b = english_spacy(text2) - - tok0 = list(b.sents)[0][0] -print tok0, tok0.idx, text2[tok0.idx] - - tok1 = list(b.sents)[0][1] - print tok1, tok1.idx, text2[tok1.idx] - ''' - doc = en_tokenizer(u" This is a cat.") - assert doc[0].idx == 0 - assert len(doc[0]) == 3 - assert doc[1].idx == 3 From f09b5a5dfd058f7a13e5a2d94b6799f20fb7fa87 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 4 Jan 2017 00:47:42 +0100 Subject: [PATCH 38/81] Modernize tokenizer tests for infixes --- spacy/tests/tokenizer/test_infix.py | 92 ++++++++++++++--------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py index 1b7cbaa7b..d197e79ea 100644 --- a/spacy/tests/tokenizer/test_infix.py +++ b/spacy/tests/tokenizer/test_infix.py @@ -2,61 +2,61 @@ from __future__ import unicode_literals import pytest -def test_hyphen(en_tokenizer): - tokens = en_tokenizer('best-known') + +@pytest.mark.parametrize('text', ["best-known"]) +def test_tokenizer_splits_hyphens(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 3 -def test_numeric_range(en_tokenizer): - tokens = en_tokenizer('0.1-13.5') +@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) +def test_tokenizer_splits_numeric_range(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 3 -def test_period(en_tokenizer): - tokens = en_tokenizer('best.Known') + +@pytest.mark.parametrize('text', ["best.Known", "Hello.World"]) +def test_tokenizer_splits_period(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 3 - tokens = en_tokenizer('zombo.com') + + +@pytest.mark.parametrize('text', ["Hello,world", "one,two"]) +def test_tokenizer_splits_comma(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[0].text == text.split(",")[0] + assert tokens[1].text == "," + assert tokens[2].text == text.split(",")[1] + + +@pytest.mark.parametrize('text', ["best...Known", "best...known"]) +def test_tokenizer_splits_ellipsis(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"]) +def test_tokenizer_keep_urls(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 1 -def test_ellipsis(en_tokenizer): - tokens = en_tokenizer('best...Known') - assert len(tokens) == 3 - tokens = en_tokenizer('best...known') - assert len(tokens) == 3 - -def 
test_big_ellipsis(en_tokenizer): - '''Test regression identified in Issue #360''' - tokens = en_tokenizer(u'$45...............Asking') - assert len(tokens) > 2 - - - -def test_email(en_tokenizer): - tokens = en_tokenizer('hello@example.com') - assert len(tokens) == 1 - tokens = en_tokenizer('hi+there@gmail.it') +@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"]) +def test_tokenizer_keeps_email(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 1 -def test_double_hyphen(en_tokenizer): - tokens = en_tokenizer(u'No decent--let alone well-bred--people.') - assert tokens[0].text == u'No' - assert tokens[1].text == u'decent' - assert tokens[2].text == u'--' - assert tokens[3].text == u'let' - assert tokens[4].text == u'alone' - assert tokens[5].text == u'well' - assert tokens[6].text == u'-' - # TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter - # on infixes. - assert tokens[7].text == u'bred' - assert tokens[8].text == u'--' - assert tokens[9].text == u'people' - - -def test_infix_comma(en_tokenizer): - # Re issue #326 - tokens = en_tokenizer(u'Hello,world') - assert tokens[0].text == u'Hello' - assert tokens[1].text == u',' - assert tokens[2].text == u'world' +def test_tokenizer_splits_double_hyphen(en_tokenizer): + tokens = en_tokenizer("No decent--let alone well-bred--people.") + assert tokens[0].text == "No" + assert tokens[1].text == "decent" + assert tokens[2].text == "--" + assert tokens[3].text == "let" + assert tokens[4].text == "alone" + assert tokens[5].text == "well" + assert tokens[6].text == "-" + assert tokens[7].text == "bred" + assert tokens[8].text == "--" + assert tokens[9].text == "people" From ee6b49b293279d14466744debc3392081f6da4ec Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 4 Jan 2017 00:47:59 +0100 Subject: [PATCH 39/81] Modernize tokenizer tests for emoticons --- spacy/tests/tokenizer/test_emoticons.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/spacy/tests/tokenizer/test_emoticons.py b/spacy/tests/tokenizer/test_emoticons.py index e0022dbbd..3f5c4bc04 100644 --- a/spacy/tests/tokenizer/test_emoticons.py +++ b/spacy/tests/tokenizer/test_emoticons.py @@ -1,8 +1,10 @@ from __future__ import unicode_literals + import pytest -def test_tweebo_challenge(en_tokenizer): +def test_tokenizer_handles_emoticons(en_tokenizer): + # Tweebo challenge (CMU) text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" tokens = en_tokenizer(text) assert tokens[0].orth_ == ":o" @@ -29,7 +31,7 @@ def test_tweebo_challenge(en_tokenizer): assert tokens[21].orth_ == '....' 
-def test_false_positive(en_tokenizer): - text = "example:)" +@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)]) +def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length): tokens = en_tokenizer(text) - assert len(tokens) == 3 + assert len(tokens) == length From 109f202e8ff154cefef50b6b1f53da11c2e68392 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 4 Jan 2017 00:48:21 +0100 Subject: [PATCH 40/81] Update conftest fixture --- spacy/tests/tokenizer/conftest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/tokenizer/conftest.py b/spacy/tests/tokenizer/conftest.py index 06ccde7b3..2f1d65e99 100644 --- a/spacy/tests/tokenizer/conftest.py +++ b/spacy/tests/tokenizer/conftest.py @@ -1,7 +1,7 @@ import pytest -from spacy.en import English +from ...en import English -@pytest.fixture(scope="module") -def en_tokenizer(EN): - return EN.tokenizer +@pytest.fixture +def en_tokenizer(): + return English.Defaults.create_tokenizer() From 550630df733d84ba54c3510873f9c046fe66a328 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 4 Jan 2017 00:48:42 +0100 Subject: [PATCH 41/81] Update tokenizer tests for contractions --- spacy/tests/tokenizer/test_contractions.py | 31 +++++++++------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py index 6e8fb7518..d8c0b0c79 100644 --- a/spacy/tests/tokenizer/test_contractions.py +++ b/spacy/tests/tokenizer/test_contractions.py @@ -1,15 +1,10 @@ from __future__ import unicode_literals -from ...en import English + import pytest -@pytest.fixture -def en_tokenizer(): - return English.Defaults.create_tokenizer() - -@pytest.mark.parametrize('inputs', [("Robin's", "Robin"), ("Alexis's", "Alexis")]) -def test_tokenizer_handles_poss_contraction(en_tokenizer, inputs): - text_poss, text = inputs +@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")]) +def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text): tokens = en_tokenizer(text_poss) assert len(tokens) == 2 assert tokens[0].text == text @@ -40,9 +35,8 @@ def test_tokenizer_handles_ll_contraction(en_tokenizer, text): assert tokens[1].lemma_ == "will" -@pytest.mark.parametrize('inputs', [("can't", "Can't"), ("ain't", "Ain't")]) -def test_tokenizer_handles_capitalization(en_tokenizer, inputs): - text_lower, text_title = inputs +@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")]) +def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title): tokens_lower = en_tokenizer(text_lower) tokens_title = en_tokenizer(text_title) assert tokens_title[0].text == tokens_lower[0].text.title() @@ -51,11 +45,11 @@ def test_tokenizer_handles_capitalization(en_tokenizer, inputs): @pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"]) -def test_tokenizer_keeps_title_case(en_tokenizer, pron): - for contraction in ["'ll", "'d"]: - tokens = en_tokenizer(pron + contraction) - assert tokens[0].text == pron - assert tokens[1].text == contraction +@pytest.mark.parametrize('contraction', ["'ll", "'d"]) +def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction): + tokens = en_tokenizer(pron + contraction) + assert tokens[0].text == pron + assert tokens[1].text == contraction @pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"]) @@ -64,9 +58,8 @@ def 
test_tokenizer_excludes_ambiguous(en_tokenizer, exc): assert len(tokens) == 1 -@pytest.mark.parametrize('inputs', [("We've", "``We've"), ("couldn't", "couldn't)")]) -def test_tokenizer_splits_defined_punct(en_tokenizer, inputs): - wo_punct, w_punct = inputs +@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")]) +def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct): tokens = en_tokenizer(wo_punct) assert len(tokens) == 2 tokens = en_tokenizer(w_punct) From 8279993a6f24704cdf5f65e15ee0b1e4bb2efa61 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 4 Jan 2017 00:49:20 +0100 Subject: [PATCH 42/81] Modernize and merge tokenizer tests for punctuation --- spacy/tests/tokenizer/test_only_punct.py | 9 -- spacy/tests/tokenizer/test_post_punct.py | 43 -------- spacy/tests/tokenizer/test_pre_punct.py | 46 -------- spacy/tests/tokenizer/test_punct.py | 109 +++++++++++++++++++ spacy/tests/tokenizer/test_surround_punct.py | 32 ------ 5 files changed, 109 insertions(+), 130 deletions(-) delete mode 100644 spacy/tests/tokenizer/test_only_punct.py delete mode 100644 spacy/tests/tokenizer/test_post_punct.py delete mode 100644 spacy/tests/tokenizer/test_pre_punct.py create mode 100644 spacy/tests/tokenizer/test_punct.py delete mode 100644 spacy/tests/tokenizer/test_surround_punct.py diff --git a/spacy/tests/tokenizer/test_only_punct.py b/spacy/tests/tokenizer/test_only_punct.py deleted file mode 100644 index 12c958088..000000000 --- a/spacy/tests/tokenizer/test_only_punct.py +++ /dev/null @@ -1,9 +0,0 @@ -from __future__ import unicode_literals - - -def test_only_pre1(en_tokenizer): - assert len(en_tokenizer("(")) == 1 - - -def test_only_pre2(en_tokenizer): - assert len(en_tokenizer("((")) == 2 diff --git a/spacy/tests/tokenizer/test_post_punct.py b/spacy/tests/tokenizer/test_post_punct.py deleted file mode 100644 index ff1120c63..000000000 --- a/spacy/tests/tokenizer/test_post_punct.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals -import pytest - - -@pytest.fixture -def close_puncts(): - return [')', ']', '}', '*'] - - -def test_close(close_puncts, en_tokenizer): - word_str = 'Hello' - for p in close_puncts: - string = word_str + p - tokens = en_tokenizer(string) - assert len(tokens) == 2 - assert tokens[1].string == p - assert tokens[0].string == word_str - - -def test_two_different_close(close_puncts, en_tokenizer): - word_str = 'Hello' - for p in close_puncts: - string = word_str + p + "'" - tokens = en_tokenizer(string) - assert len(tokens) == 3 - assert tokens[0].string == word_str - assert tokens[1].string == p - assert tokens[2].string == "'" - - -def test_three_same_close(close_puncts, en_tokenizer): - word_str = 'Hello' - for p in close_puncts: - string = word_str + p + p + p - tokens = en_tokenizer(string) - assert len(tokens) == 4 - assert tokens[0].string == word_str - assert tokens[1].string == p - - -def test_double_end_quote(en_tokenizer): - assert len(en_tokenizer("Hello''")) == 2 - assert len(en_tokenizer("''")) == 1 diff --git a/spacy/tests/tokenizer/test_pre_punct.py b/spacy/tests/tokenizer/test_pre_punct.py deleted file mode 100644 index 9aec1dc7b..000000000 --- a/spacy/tests/tokenizer/test_pre_punct.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import unicode_literals - -import pytest - - -@pytest.fixture -def open_puncts(): - return ['(', '[', '{', '*'] - - -def test_open(open_puncts, en_tokenizer): - word_str = 'Hello' - for p in open_puncts: - string = p + word_str - tokens = 
en_tokenizer(string) - assert len(tokens) == 2 - assert tokens[0].orth_ == p - assert tokens[1].orth_ == word_str - - -def test_two_different_open(open_puncts, en_tokenizer): - word_str = 'Hello' - for p in open_puncts: - string = p + "`" + word_str - tokens = en_tokenizer(string) - assert len(tokens) == 3 - assert tokens[0].orth_ == p - assert tokens[1].orth_ == "`" - assert tokens[2].orth_ == word_str - - -def test_three_same_open(open_puncts, en_tokenizer): - word_str = 'Hello' - for p in open_puncts: - string = p + p + p + word_str - tokens = en_tokenizer(string) - assert len(tokens) == 4 - assert tokens[0].orth_ == p - assert tokens[3].orth_ == word_str - - -def test_open_appostrophe(en_tokenizer): - string = "'The" - tokens = en_tokenizer(string) - assert len(tokens) == 2 - assert tokens[0].orth_ == "'" diff --git a/spacy/tests/tokenizer/test_punct.py b/spacy/tests/tokenizer/test_punct.py new file mode 100644 index 000000000..d238e593e --- /dev/null +++ b/spacy/tests/tokenizer/test_punct.py @@ -0,0 +1,109 @@ +from __future__ import unicode_literals + +import pytest + +PUNCT_OPEN = ['(', '[', '{', '*'] +PUNCT_CLOSE = [')', ']', '}', '*'] +PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] + + +@pytest.mark.parametrize('text', ["(", "((", "<"]) +def test_tokenizer_only_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == len(text) + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_open_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(punct + text) + assert len(tokens) == 2 + assert tokens[0].text == punct + assert tokens[1].text == text + + +@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_close_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(text + punct) + assert len(tokens) == 2 + assert tokens[0].text == text + assert tokens[1].text == punct + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('punct_add', ["`"]) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text): + tokens = en_tokenizer(punct + punct_add + text) + assert len(tokens) == 3 + assert tokens[0].text == punct + assert tokens[1].text == punct_add + assert tokens[2].text == text + + +@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('punct_add', ["'"]) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text): + tokens = en_tokenizer(text + punct + punct_add) + assert len(tokens) == 3 + assert tokens[0].text == text + assert tokens[1].text == punct + assert tokens[2].text == punct_add + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(punct + punct + punct + text) + assert len(tokens) == 4 + assert tokens[0].text == punct + assert tokens[3].text == text + + +@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(text + punct + punct + punct) + assert len(tokens) == 4 + assert tokens[0].text == text + assert tokens[1].text == punct + + +@pytest.mark.parametrize('text', ["'The"]) +def test_tokenizer_splits_open_appostrophe(en_tokenizer, text): + tokens = 
en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == "'" + + +@pytest.mark.parametrize('text', ["Hello''"]) +def test_tokenizer_splits_double_end_quote(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + tokens_punct = en_tokenizer("''") + assert len(tokens_punct) == 1 + + +@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text): + tokens = en_tokenizer(punct_open + text + punct_close) + assert len(tokens) == 3 + assert tokens[0].text == punct_open + assert tokens[1].text == text + assert tokens[2].text == punct_close + + +@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) +@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")]) +@pytest.mark.parametrize('text', ["Hello"]) +def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text): + tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add) + assert len(tokens) == 5 + assert tokens[0].text == punct_open_add + assert tokens[1].text == punct_open + assert tokens[2].text == text + assert tokens[3].text == punct_close + assert tokens[4].text == punct_close_add diff --git a/spacy/tests/tokenizer/test_surround_punct.py b/spacy/tests/tokenizer/test_surround_punct.py deleted file mode 100644 index 7c7a50904..000000000 --- a/spacy/tests/tokenizer/test_surround_punct.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import unicode_literals -import pytest - - -@pytest.fixture -def paired_puncts(): - return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] - - -def test_token(paired_puncts, en_tokenizer): - word_str = 'Hello' - for open_, close_ in paired_puncts: - string = open_ + word_str + close_ - tokens = en_tokenizer(string) - assert len(tokens) == 3 - assert tokens[0].orth_ == open_ - assert tokens[1].orth_ == word_str - assert tokens[2].orth_ == close_ - - -def test_two_different(paired_puncts, en_tokenizer): - word_str = 'Hello' - for open_, close_ in paired_puncts: - string = "`" + open_ + word_str + close_ + "'" - tokens = en_tokenizer(string) - assert len(tokens) == 5 - assert tokens[0].orth_ == "`" - assert tokens[1].orth_ == open_ - assert tokens[2].orth_ == word_str - assert tokens[2].orth_ == word_str - assert tokens[3].orth_ == close_ - assert tokens[4].orth_ == "'" From c6e5a5349dae8aedb306840891863853a927c1a7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 4 Jan 2017 00:49:31 +0100 Subject: [PATCH 43/81] Move regression test for #360 into own file --- spacy/tests/regression/test_issue360.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 spacy/tests/regression/test_issue360.py diff --git a/spacy/tests/regression/test_issue360.py b/spacy/tests/regression/test_issue360.py new file mode 100644 index 000000000..018289030 --- /dev/null +++ b/spacy/tests/regression/test_issue360.py @@ -0,0 +1,14 @@ +from __future__ import unicode_literals +from ...en import English + +import pytest + + +@pytest.fixture +def en_tokenizer(): + return English.Defaults.create_tokenizer() + + +def test_big_ellipsis(en_tokenizer): + tokens = en_tokenizer(u'$45...............Asking') + assert len(tokens) > 2 From 58adae877425333ed283b5c2bcc35d8b9effff12 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 13:09:22 +0100 Subject: [PATCH 44/81] Remove unused file --- spacy/tests/sun.tokens | 4 ---- 1 file changed, 4 deletions(-) 
delete mode 100644 spacy/tests/sun.tokens diff --git a/spacy/tests/sun.tokens b/spacy/tests/sun.tokens deleted file mode 100644 index 4b912e18e..000000000 --- a/spacy/tests/sun.tokens +++ /dev/null @@ -1,4 +0,0 @@ -The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ] - -The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ] -Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. 
[ 24 ] [ 25 ] From da10a049a68799674559e774f67bd5f35a6caef2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 13:09:48 +0100 Subject: [PATCH 45/81] Add unicode declarations --- spacy/tests/tokenizer/conftest.py | 3 +++ spacy/tests/tokenizer/test_contractions.py | 1 + spacy/tests/tokenizer/test_indices.py | 1 + spacy/tests/tokenizer/test_punct.py | 1 + spacy/tests/tokenizer/test_whitespace.py | 1 + 5 files changed, 7 insertions(+) diff --git a/spacy/tests/tokenizer/conftest.py b/spacy/tests/tokenizer/conftest.py index 2f1d65e99..8d842cd6d 100644 --- a/spacy/tests/tokenizer/conftest.py +++ b/spacy/tests/tokenizer/conftest.py @@ -1,3 +1,6 @@ +# coding: utf-8 +from __future__ import unicode_literals + import pytest from ...en import English diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py index d8c0b0c79..f2c6ee37a 100644 --- a/spacy/tests/tokenizer/test_contractions.py +++ b/spacy/tests/tokenizer/test_contractions.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import pytest diff --git a/spacy/tests/tokenizer/test_indices.py b/spacy/tests/tokenizer/test_indices.py index 5df7bcc59..5a0dea627 100644 --- a/spacy/tests/tokenizer/test_indices.py +++ b/spacy/tests/tokenizer/test_indices.py @@ -1,3 +1,4 @@ +# coding: utf-8 """Test that token.idx correctly computes index into the original string.""" from __future__ import unicode_literals diff --git a/spacy/tests/tokenizer/test_punct.py b/spacy/tests/tokenizer/test_punct.py index d238e593e..06ff0cc8b 100644 --- a/spacy/tests/tokenizer/test_punct.py +++ b/spacy/tests/tokenizer/test_punct.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import pytest diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py index 8ba138b0c..9dd3a19a1 100644 --- a/spacy/tests/tokenizer/test_whitespace.py +++ b/spacy/tests/tokenizer/test_whitespace.py @@ -1,3 +1,4 @@ +# coding: utf-8 """Test that tokens are created correctly for whitespace.""" From 2e72683baac03a103d99f8a6ceb8e460291c142e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 13:10:21 +0100 Subject: [PATCH 46/81] Add missing docstrings --- spacy/tests/tokenizer/test_contractions.py | 3 +++ spacy/tests/tokenizer/test_punct.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py index f2c6ee37a..995a405fb 100644 --- a/spacy/tests/tokenizer/test_contractions.py +++ b/spacy/tests/tokenizer/test_contractions.py @@ -1,4 +1,7 @@ # coding: utf-8 +"""Test that tokens are created correctly for contractions.""" + + from __future__ import unicode_literals import pytest diff --git a/spacy/tests/tokenizer/test_punct.py b/spacy/tests/tokenizer/test_punct.py index 06ff0cc8b..4428670d0 100644 --- a/spacy/tests/tokenizer/test_punct.py +++ b/spacy/tests/tokenizer/test_punct.py @@ -1,4 +1,7 @@ # coding: utf-8 +"""Test that open, closed and paired punctuation is split off correctly.""" + + from __future__ import unicode_literals import pytest From 34c47bb20d81226082b0cac546fc7508278492ee Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 13:10:51 +0100 Subject: [PATCH 47/81] Fix formatting --- spacy/tests/tokenizer/conftest.py | 1 + spacy/tests/tokenizer/test_indices.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/tests/tokenizer/conftest.py b/spacy/tests/tokenizer/conftest.py index 8d842cd6d..3a3516c41 100644 --- 
a/spacy/tests/tokenizer/conftest.py +++ b/spacy/tests/tokenizer/conftest.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest + from ...en import English diff --git a/spacy/tests/tokenizer/test_indices.py b/spacy/tests/tokenizer/test_indices.py index 5a0dea627..0ed6ca4dc 100644 --- a/spacy/tests/tokenizer/test_indices.py +++ b/spacy/tests/tokenizer/test_indices.py @@ -1,13 +1,14 @@ # coding: utf-8 """Test that token.idx correctly computes index into the original string.""" + from __future__ import unicode_literals import pytest def test_simple_punct(en_tokenizer): - text = 'to walk, do foo' + text = "to walk, do foo" tokens = en_tokenizer(text) assert tokens[0].idx == 0 assert tokens[1].idx == 3 @@ -17,7 +18,7 @@ def test_simple_punct(en_tokenizer): def test_complex_punct(en_tokenizer): - text = 'Tom (D., Ill.)!' + text = "Tom (D., Ill.)!" tokens = en_tokenizer(text) assert tokens[0].idx == 0 assert len(tokens[0]) == 3 From 0e65dca9a54e25c83f2fc82bf6c0d68022367bba Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 13:11:31 +0100 Subject: [PATCH 48/81] Modernize and merge tokenizer tests for exception and emoticons --- spacy/tests/tokenizer/test_emoticons.py | 37 ---------------- spacy/tests/tokenizer/test_exceptions.py | 54 ++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 37 deletions(-) delete mode 100644 spacy/tests/tokenizer/test_emoticons.py create mode 100644 spacy/tests/tokenizer/test_exceptions.py diff --git a/spacy/tests/tokenizer/test_emoticons.py b/spacy/tests/tokenizer/test_emoticons.py deleted file mode 100644 index 3f5c4bc04..000000000 --- a/spacy/tests/tokenizer/test_emoticons.py +++ /dev/null @@ -1,37 +0,0 @@ -from __future__ import unicode_literals - -import pytest - - -def test_tokenizer_handles_emoticons(en_tokenizer): - # Tweebo challenge (CMU) - text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" - tokens = en_tokenizer(text) - assert tokens[0].orth_ == ":o" - assert tokens[1].orth_ == ":/" - assert tokens[2].orth_ == ":'(" - assert tokens[3].orth_ == ">:o" - assert tokens[4].orth_ == "(:" - assert tokens[5].orth_ == ":)" - assert tokens[6].orth_ == ">.<" - assert tokens[7].orth_ == "XD" - assert tokens[8].orth_ == "-__-" - assert tokens[9].orth_ == "o.O" - assert tokens[10].orth_ == ";D" - assert tokens[11].orth_ == ":-)" - assert tokens[12].orth_ == "@_@" - assert tokens[13].orth_ == ":P" - assert tokens[14].orth_ == "8D" - assert tokens[15].orth_ == ":1" - assert tokens[16].orth_ == ">:(" - assert tokens[17].orth_ == ":D" - assert tokens[18].orth_ == "=|" - assert tokens[19].orth_ == '")' - assert tokens[20].orth_ == ':>' - assert tokens[21].orth_ == '....' 
- - -@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)]) -def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length): - tokens = en_tokenizer(text) - assert len(tokens) == length diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py new file mode 100644 index 000000000..c194dce21 --- /dev/null +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -0,0 +1,54 @@ +# coding: utf-8 +"""Test that tokenizer exceptions and emoticons are handles correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."]) +def test_tokenizer_handles_abbr(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 1 + + +def test_tokenizer_handles_exc_in_text(en_tokenizer): + text = "It's mediocre i.e. bad." + tokens = en_tokenizer(text) + assert len(tokens) == 6 + assert tokens[3].text == "i.e." + + +def test_tokenizer_handles_emoticons(en_tokenizer): + # Tweebo challenge (CMU) + text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" + tokens = en_tokenizer(text) + assert tokens[0].text == ":o" + assert tokens[1].text == ":/" + assert tokens[2].text == ":'(" + assert tokens[3].text == ">:o" + assert tokens[4].text == "(:" + assert tokens[5].text == ":)" + assert tokens[6].text == ">.<" + assert tokens[7].text == "XD" + assert tokens[8].text == "-__-" + assert tokens[9].text == "o.O" + assert tokens[10].text == ";D" + assert tokens[11].text == ":-)" + assert tokens[12].text == "@_@" + assert tokens[13].text == ":P" + assert tokens[14].text == "8D" + assert tokens[15].text == ":1" + assert tokens[16].text == ">:(" + assert tokens[17].text == ":D" + assert tokens[18].text == "=|" + assert tokens[19].text == '")' + assert tokens[20].text == ':>' + assert tokens[21].text == '....' 
+ + +@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)]) +def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length): + tokens = en_tokenizer(text) + assert len(tokens) == length From 8a74129cdf09b63759d1786c0c40914e9f95af6d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 13:13:12 +0100 Subject: [PATCH 49/81] Modernize and merge tokenizer tests for prefixes/suffixes/infixes --- spacy/tests/tokenizer/test_infix.py | 62 -------- .../tokenizer/test_prefix_suffix_infix.py | 148 ++++++++++++++++++ spacy/tests/tokenizer/test_special_affix.py | 46 ------ spacy/tests/tokenizer/test_tokenizer.py | 8 - 4 files changed, 148 insertions(+), 116 deletions(-) delete mode 100644 spacy/tests/tokenizer/test_infix.py create mode 100644 spacy/tests/tokenizer/test_prefix_suffix_infix.py delete mode 100644 spacy/tests/tokenizer/test_special_affix.py diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py deleted file mode 100644 index d197e79ea..000000000 --- a/spacy/tests/tokenizer/test_infix.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import unicode_literals - -import pytest - - -@pytest.mark.parametrize('text', ["best-known"]) -def test_tokenizer_splits_hyphens(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 3 - - -@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) -def test_tokenizer_splits_numeric_range(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 3 - - -@pytest.mark.parametrize('text', ["best.Known", "Hello.World"]) -def test_tokenizer_splits_period(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 3 - - -@pytest.mark.parametrize('text', ["Hello,world", "one,two"]) -def test_tokenizer_splits_comma(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 3 - assert tokens[0].text == text.split(",")[0] - assert tokens[1].text == "," - assert tokens[2].text == text.split(",")[1] - - -@pytest.mark.parametrize('text', ["best...Known", "best...known"]) -def test_tokenizer_splits_ellipsis(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 3 - - -@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"]) -def test_tokenizer_keep_urls(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 1 - - -@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"]) -def test_tokenizer_keeps_email(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 1 - - -def test_tokenizer_splits_double_hyphen(en_tokenizer): - tokens = en_tokenizer("No decent--let alone well-bred--people.") - assert tokens[0].text == "No" - assert tokens[1].text == "decent" - assert tokens[2].text == "--" - assert tokens[3].text == "let" - assert tokens[4].text == "alone" - assert tokens[5].text == "well" - assert tokens[6].text == "-" - assert tokens[7].text == "bred" - assert tokens[8].text == "--" - assert tokens[9].text == "people" diff --git a/spacy/tests/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/tokenizer/test_prefix_suffix_infix.py new file mode 100644 index 000000000..d6963ada1 --- /dev/null +++ b/spacy/tests/tokenizer/test_prefix_suffix_infix.py @@ -0,0 +1,148 @@ +# coding: utf-8 +"""Test that tokenizer prefixes, suffixes and infixes are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["(can)"]) +def 
test_tokenizer_splits_no_special(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["can't"]) +def test_tokenizer_splits_no_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(can't"]) +def test_tokenizer_splits_prefix_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["can't)"]) +def test_tokenizer_splits_suffix_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["(can't)"]) +def test_tokenizer_splits_even_wrap(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["(can't?)"]) +def test_tokenizer_splits_uneven_wrap(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)]) +def test_tokenizer_splits_prefix_interact(en_tokenizer, text, length): + tokens = en_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize('text', ["U.S.)"]) +def test_tokenizer_splits_suffix_interact(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(U.S.)"]) +def test_tokenizer_splits_even_wrap_interact(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["(U.S.?)"]) +def test_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["best-known"]) +def test_tokenizer_splits_hyphens(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) +def test_tokenizer_splits_numeric_range(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["best.Known", "Hello.World"]) +def test_tokenizer_splits_period_infix(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["Hello,world", "one,two"]) +def test_tokenizer_splits_comma_infix(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[0].text == text.split(",")[0] + assert tokens[1].text == "," + assert tokens[2].text == text.split(",")[1] + + +@pytest.mark.parametrize('text', ["best...Known", "best...known"]) +def test_tokenizer_splits_ellipsis_infix(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"]) +def test_tokenizer_keep_urls(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 1 + + +@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"]) +def test_tokenizer_keeps_email(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 1 + + +def test_tokenizer_splits_double_hyphen_infix(en_tokenizer): + tokens = en_tokenizer("No decent--let alone well-bred--people.") + assert tokens[0].text == "No" + assert tokens[1].text == "decent" + assert tokens[2].text == "--" + assert tokens[3].text == "let" + assert tokens[4].text == "alone" + assert tokens[5].text == "well" + assert tokens[6].text == "-" + assert tokens[7].text == "bred" + assert 
tokens[8].text == "--" + assert tokens[9].text == "people" + + +@pytest.mark.xfail +def test_tokenizer_splits_period_abbr(en_tokenizer): + text = "Today is Tuesday.Mr." + tokens = en_tokenizer(text) + assert len(tokens) == 5 + assert tokens[0].text == "Today" + assert tokens[1].text == "is" + assert tokens[2].text == "Tuesday" + assert tokens[3].text == "." + assert tokens[4].text == "Mr." + + +@pytest.mark.xfail +def test_tokenizer_splits_em_dash_infix(en_tokenizer): + # Re Issue #225 + tokens = en_tokenizer("""Will this road take me to Puddleton?\u2014No, """ + """you'll have to walk there.\u2014Ariel.""") + assert tokens[6].text == "Puddleton" + assert tokens[7].text == "?" + assert tokens[8].text == "\u2014" diff --git a/spacy/tests/tokenizer/test_special_affix.py b/spacy/tests/tokenizer/test_special_affix.py deleted file mode 100644 index 62cf114f1..000000000 --- a/spacy/tests/tokenizer/test_special_affix.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Test entries in the tokenization special-case interacting with prefix -and suffix punctuation.""" -from __future__ import unicode_literals -import pytest - - -def test_no_special(en_tokenizer): - assert len(en_tokenizer("(can)")) == 3 - - -def test_no_punct(en_tokenizer): - assert len(en_tokenizer("can't")) == 2 - - -def test_prefix(en_tokenizer): - assert len(en_tokenizer("(can't")) == 3 - - -def test_suffix(en_tokenizer): - assert len(en_tokenizer("can't)")) == 3 - - -def test_wrap(en_tokenizer): - assert len(en_tokenizer("(can't)")) == 4 - - -def test_uneven_wrap(en_tokenizer): - assert len(en_tokenizer("(can't?)")) == 5 - - -def test_prefix_interact(en_tokenizer): - assert len(en_tokenizer("U.S.")) == 1 - assert len(en_tokenizer("us.")) == 2 - assert len(en_tokenizer("(U.S.")) == 2 - - -def test_suffix_interact(en_tokenizer): - assert len(en_tokenizer("U.S.)")) == 2 - - -def test_even_wrap_interact(en_tokenizer): - assert len(en_tokenizer("(U.S.)")) == 3 - - -def test_uneven_wrap_interact(en_tokenizer): - assert len(en_tokenizer("(U.S.?)")) == 4 diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 091561ae3..45e8cf70e 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -157,14 +157,6 @@ def test_two_whitespace(en_tokenizer): assert repr(tokens.text_with_ws) == repr(orig_str) -@pytest.mark.xfail -def test_em_dash_infix(en_tokenizer): - # Re Issue #225 - tokens = en_tokenizer('''Will this road take me to Puddleton?\u2014No, ''' - '''you'll have to walk there.\u2014Ariel.''') - assert tokens[6].text == 'Puddleton' - assert tokens[7].text == '?' - assert tokens[8].text == '\u2014' #def test_cnts7(): # text = 'But then the 6,000-year ice age came...' From 2c2e8786537447027a13532b401163376002d574 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 13:14:16 +0100 Subject: [PATCH 50/81] Modernize and merge tokenizer tests for punctuation --- spacy/tests/tokenizer/test_punct.py | 20 +++++++++++++++++++- spacy/tests/tokenizer/test_tokenizer.py | 15 --------------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/spacy/tests/tokenizer/test_punct.py b/spacy/tests/tokenizer/test_punct.py index 4428670d0..f6e8a0293 100644 --- a/spacy/tests/tokenizer/test_punct.py +++ b/spacy/tests/tokenizer/test_punct.py @@ -6,13 +6,19 @@ from __future__ import unicode_literals import pytest +from ... 
import util +from ...language_data import TOKENIZER_PREFIXES + +en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search + + PUNCT_OPEN = ['(', '[', '{', '*'] PUNCT_CLOSE = [')', ']', '}', '*'] PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] @pytest.mark.parametrize('text', ["(", "((", "<"]) -def test_tokenizer_only_punct(en_tokenizer, text): +def test_tokenizer_handles_only_punct(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == len(text) @@ -111,3 +117,15 @@ def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, pu assert tokens[2].text == text assert tokens[3].text == punct_close assert tokens[4].text == punct_close_add + + +@pytest.mark.parametrize('text,punct', [("(can't", "(")]) +def test_tokenizer_splits_pre_punct_regex(text, punct): + match = en_search_prefixes(text) + assert match.group() == punct + + +def test_tokenizer_splits_bracket_period(en_tokenizer): + text = "(And a 6a.m. run through Washington Park)." + tokens = en_tokenizer(text) + assert tokens[len(tokens) - 1].text == "." diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 45e8cf70e..fab7d49d8 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -20,13 +20,6 @@ en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search # loaded = pickle.load(file_) # assert loaded is not None -def test_pre_punct_regex(): - string = "(can't" - match = en_search_prefixes(string) - assert match.group() == "(" - -def test_no_word(en_tokenizer): - tokens = en_tokenizer(u'') assert len(tokens) == 0 @@ -65,14 +58,6 @@ def test_contraction(en_tokenizer): assert len(tokens) == 5 assert tokens[4].orth == en_tokenizer.vocab['!'].orth -def test_contraction_punct(en_tokenizer): - tokens = [w.text for w in en_tokenizer("(can't")] - assert tokens == ['(', 'ca', "n't"] - tokens = en_tokenizer("`ain't") - assert len(tokens) == 3 - tokens = en_tokenizer('''"isn't''') - assert len(tokens) == 3 - tokens = en_tokenizer("can't!") assert len(tokens) == 3 From 8b284fc6f192a0c832e1edaf4e0d860f0d4706cd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 13:15:52 +0100 Subject: [PATCH 51/81] Modernize and merge tokenizer tests for text from file --- spacy/tests/{ => tokenizer}/sun.txt | 0 spacy/tests/tokenizer/test_tokenizer.py | 7 +++++++ spacy/tests/tokenizer/test_wiki_sun.py | 21 --------------------- 3 files changed, 7 insertions(+), 21 deletions(-) rename spacy/tests/{ => tokenizer}/sun.txt (100%) delete mode 100644 spacy/tests/tokenizer/test_wiki_sun.py diff --git a/spacy/tests/sun.txt b/spacy/tests/tokenizer/sun.txt similarity index 100% rename from spacy/tests/sun.txt rename to spacy/tests/tokenizer/sun.txt diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index fab7d49d8..92e610fe0 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -1,5 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals +from os import path import pytest import io @@ -9,6 +10,7 @@ import tempfile from ... import util from ...language_data import TOKENIZER_PREFIXES +from spacy.util import utf8open en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search @@ -79,6 +81,11 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian. def test_cnts1(en_tokenizer): text = u"""The U.S. 
Army likes Shock and Awe.""" +@pytest.mark.parametrize('file_name', ["sun.txt"]) +def test_tokenizer_handle_text_from_file(en_tokenizer, file_name): + loc = path.join(path.dirname(__file__), file_name) + text = utf8open(loc).read() + assert len(text) != 0 tokens = en_tokenizer(text) assert len(tokens) == 8 diff --git a/spacy/tests/tokenizer/test_wiki_sun.py b/spacy/tests/tokenizer/test_wiki_sun.py deleted file mode 100644 index 8d2a6682e..000000000 --- a/spacy/tests/tokenizer/test_wiki_sun.py +++ /dev/null @@ -1,21 +0,0 @@ -from __future__ import unicode_literals - -from spacy.util import utf8open - -import pytest -from os import path - - -HERE = path.dirname(__file__) - - -@pytest.fixture -def sun_txt(): - loc = path.join(HERE, '..', 'sun.txt') - return utf8open(loc).read() - - -def test_tokenize(sun_txt, en_tokenizer): - assert len(sun_txt) != 0 - tokens = en_tokenizer(sun_txt) - assert len(tokens) > 100 From a11f684822ef28b8029718a43040380625f492d2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 13:16:33 +0100 Subject: [PATCH 52/81] Modernize and merge tokenizer tests for whitespace --- spacy/tests/tokenizer/test_tokenizer.py | 5 ----- spacy/tests/tokenizer/test_whitespace.py | 6 ++++++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 92e610fe0..f41969b4f 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -143,11 +143,6 @@ def test_ie(en_tokenizer): assert tokens[3].orth_ == "i.e." -def test_two_whitespace(en_tokenizer): - orig_str = u'there are 2 spaces after this ' - tokens = en_tokenizer(orig_str) - assert repr(tokens.text_with_ws) == repr(orig_str) - #def test_cnts7(): diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py index 9dd3a19a1..90dc80615 100644 --- a/spacy/tests/tokenizer/test_whitespace.py +++ b/spacy/tests/tokenizer/test_whitespace.py @@ -20,6 +20,12 @@ def test_tokenizer_splits_double_space(en_tokenizer, text): assert tokens[1].text == " " +@pytest.mark.parametrize('text', ["two spaces after this "]) +def test_tokenizer_handles_double_trainling_ws(en_tokenizer, text): + tokens = en_tokenizer(text) + assert repr(tokens.text_with_ws) == repr(text) + + @pytest.mark.parametrize('text', ["hello\npossums"]) def test_tokenizer_splits_newline(en_tokenizer, text): tokens = en_tokenizer(text) From 02cfda48c95b10acf59d06d6c769c04c60736b49 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 13:16:55 +0100 Subject: [PATCH 53/81] Modernize and merge tokenizer tests for string loading --- spacy/tests/tokenizer/test_string_loading.py | 9 --------- spacy/tests/tokenizer/test_tokenizer.py | 7 +++++++ 2 files changed, 7 insertions(+), 9 deletions(-) delete mode 100644 spacy/tests/tokenizer/test_string_loading.py diff --git a/spacy/tests/tokenizer/test_string_loading.py b/spacy/tests/tokenizer/test_string_loading.py deleted file mode 100644 index 1bc5539bc..000000000 --- a/spacy/tests/tokenizer/test_string_loading.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Test suspected freeing of strings""" -from __future__ import unicode_literals - - -def test_one(en_tokenizer): - tokens = en_tokenizer('Betty Botter bought a pound of butter.') - assert tokens[0].orth_ == 'Betty' - tokens2 = en_tokenizer('Betty also bought a pound of butter.') - assert tokens2[0].orth_ == 'Betty' diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 
f41969b4f..3bdf9095c 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -149,3 +149,10 @@ def test_ie(en_tokenizer): # text = 'But then the 6,000-year ice age came...' # tokens = EN.tokenize(text) # assert len(tokens) == 10 +def test_tokenizer_suspected_freeing_strings(en_tokenizer): + text1 = "Betty Botter bought a pound of butter." + text2 = "Betty also bought a pound of butter." + tokens1 = en_tokenizer(text1) + tokens2 = en_tokenizer(text2) + assert tokens1[0].text == "Betty" + assert tokens2[0].text == "Betty" From 8b45363b4d3a28f65af3fc1a8097854d5f7f008d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 13:17:05 +0100 Subject: [PATCH 54/81] Modernize and merge general tokenizer tests --- spacy/tests/tokenizer/test_tokenizer.py | 142 ++++++++---------------- 1 file changed, 46 insertions(+), 96 deletions(-) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 3bdf9095c..31bff9b14 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -3,67 +3,64 @@ from __future__ import unicode_literals from os import path import pytest -import io -import pickle -import cloudpickle -import tempfile -from ... import util -from ...language_data import TOKENIZER_PREFIXES from spacy.util import utf8open -en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search - -# @pytest.mark.xfail -# def test_pickle(en_tokenizer): -# file_ = io.BytesIO() -# cloudpickle.dump(en_tokenizer, file_) -# file_.seek(0) -# loaded = pickle.load(file_) -# assert loaded is not None +def test_tokenizer_handles_no_word(en_tokenizer): + tokens = en_tokenizer("") assert len(tokens) == 0 -def test_single_word(en_tokenizer): - tokens = en_tokenizer(u'hello') - assert tokens[0].orth_ == 'hello' +@pytest.mark.parametrize('text', ["hello"]) +def test_tokenizer_handles_single_word(en_tokenizer, text): + tokens = en_tokenizer(text) + assert tokens[0].text == text -def test_two_words(en_tokenizer): - tokens = en_tokenizer('hello possums') +@pytest.mark.parametrize('text', ["hello possums"]) +def test_tokenizer_handles_two_words(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 2 - assert tokens[0].orth_ != tokens[1].orth_ + assert tokens[0].text != tokens[1].text -def test_punct(en_tokenizer): - tokens = en_tokenizer('hello, possums.') +def test_tokenizer_handles_punct(en_tokenizer): + text = "hello, possums." + tokens = en_tokenizer(text) assert len(tokens) == 4 - assert tokens[0].orth_ == 'hello' - assert tokens[1].orth_ == ',' - assert tokens[2].orth_ == 'possums' - assert tokens[1].orth_ != 'hello' + assert tokens[0].text == "hello" + assert tokens[1].text == "," + assert tokens[2].text == "possums" + assert tokens[1].text != "hello" -def test_digits(en_tokenizer): - tokens = en_tokenizer('The year: 1984.') +def test_tokenizer_handles_digits(en_tokenizer): + text = "The year: 1984." 
+ tokens = en_tokenizer(text) assert len(tokens) == 5 - assert tokens[0].orth == en_tokenizer.vocab['The'].orth - assert tokens[3].orth == en_tokenizer.vocab['1984'].orth + assert tokens[0].text == "The" + assert tokens[3].text == "1984" -def test_contraction(en_tokenizer): - tokens = en_tokenizer("don't giggle") +def test_tokenizer_handles_basic_contraction(en_tokenizer): + text = "don't giggle" + tokens = en_tokenizer(text) assert len(tokens) == 3 - assert tokens[1].orth == en_tokenizer.vocab["n't"].orth - tokens = en_tokenizer("i said don't!") + assert tokens[1].text == "n't" + text = "i said don't!" + tokens = en_tokenizer(text) assert len(tokens) == 5 - assert tokens[4].orth == en_tokenizer.vocab['!'].orth + assert tokens[4].text == "!" + +@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"]) +def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text): + tokens = en_tokenizer(text) assert len(tokens) == 3 -def test_sample(en_tokenizer): +def test_tokenizer_handles_long_text(en_tokenizer): text = """Tributes pour in for late British Labour Party leader Tributes poured in from around the world Thursday @@ -79,76 +76,29 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian. assert len(tokens) > 5 -def test_cnts1(en_tokenizer): - text = u"""The U.S. Army likes Shock and Awe.""" @pytest.mark.parametrize('file_name', ["sun.txt"]) def test_tokenizer_handle_text_from_file(en_tokenizer, file_name): loc = path.join(path.dirname(__file__), file_name) text = utf8open(loc).read() assert len(text) != 0 tokens = en_tokenizer(text) - assert len(tokens) == 8 + assert len(tokens) > 100 -def test_cnts2(en_tokenizer): - text = u"""U.N. regulations are not a part of their concern.""" +@pytest.mark.parametrize('text,length', [ + ("The U.S. Army likes Shock and Awe.", 8), + ("U.N. regulations are not a part of their concern.", 10), + ("“Isn't it?”", 6), + ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15), + ("""'Me too!', Mr. P. Delaware cried. """, 11), + ("They ran about 10km.", 6), + # ("But then the 6,000-year ice age came...", 10) + ]) +def test_tokenizer_handles_cnts(en_tokenizer, text, length): tokens = en_tokenizer(text) - assert len(tokens) == 10 + assert len(tokens) == length -def test_cnts3(en_tokenizer): - text = u"“Isn't it?”" - tokens = en_tokenizer(text) - words = [t.orth_ for t in tokens] - assert len(words) == 6 - - -def test_cnts4(en_tokenizer): - text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """ - tokens = en_tokenizer(text) - words = [t.orth_ for t in tokens] - assert len(words) == 15 - - -def test_cnts5(en_tokenizer): - text = """'Me too!', Mr. P. Delaware cried. """ - tokens = en_tokenizer(text) - assert len(tokens) == 11 - - -@pytest.mark.xfail -def test_mr(en_tokenizer): - text = """Today is Tuesday.Mr.""" - tokens = en_tokenizer(text) - assert len(tokens) == 5 - assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.'] - - -def test_cnts6(en_tokenizer): - text = u'They ran about 10km.' - tokens = en_tokenizer(text) - words = [t.orth_ for t in tokens] - assert len(words) == 6 - -def test_bracket_period(en_tokenizer): - text = u'(And a 6a.m. run through Washington Park).' - tokens = en_tokenizer(text) - assert tokens[len(tokens) - 1].orth_ == u'.' - - -def test_ie(en_tokenizer): - text = u"It's mediocre i.e. bad." - tokens = en_tokenizer(text) - assert len(tokens) == 6 - assert tokens[3].orth_ == "i.e." - - - - -#def test_cnts7(): -# text = 'But then the 6,000-year ice age came...' 
-# tokens = EN.tokenize(text) -# assert len(tokens) == 10 def test_tokenizer_suspected_freeing_strings(en_tokenizer): text1 = "Betty Botter bought a pound of butter." text2 = "Betty also bought a pound of butter." From c5f2dc15de228957de5f3543e84e968f63a13d78 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 16:25:04 +0100 Subject: [PATCH 55/81] Move English tokenizer tests to directory /en --- spacy/tests/en/__init__.py | 0 spacy/tests/{tokenizer => en}/conftest.py | 0 spacy/tests/en/tokenizer/__init__.py | 0 spacy/tests/{ => en}/tokenizer/sun.txt | 0 spacy/tests/{ => en}/tokenizer/test_contractions.py | 0 spacy/tests/{ => en}/tokenizer/test_exceptions.py | 0 spacy/tests/{ => en}/tokenizer/test_indices.py | 0 spacy/tests/{ => en}/tokenizer/test_prefix_suffix_infix.py | 0 spacy/tests/{ => en}/tokenizer/test_punct.py | 7 ++++--- spacy/tests/{ => en}/tokenizer/test_tokenizer.py | 2 +- spacy/tests/{ => en}/tokenizer/test_whitespace.py | 0 11 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 spacy/tests/en/__init__.py rename spacy/tests/{tokenizer => en}/conftest.py (100%) create mode 100644 spacy/tests/en/tokenizer/__init__.py rename spacy/tests/{ => en}/tokenizer/sun.txt (100%) rename spacy/tests/{ => en}/tokenizer/test_contractions.py (100%) rename spacy/tests/{ => en}/tokenizer/test_exceptions.py (100%) rename spacy/tests/{ => en}/tokenizer/test_indices.py (100%) rename spacy/tests/{ => en}/tokenizer/test_prefix_suffix_infix.py (100%) rename spacy/tests/{ => en}/tokenizer/test_punct.py (96%) rename spacy/tests/{ => en}/tokenizer/test_tokenizer.py (99%) rename spacy/tests/{ => en}/tokenizer/test_whitespace.py (100%) diff --git a/spacy/tests/en/__init__.py b/spacy/tests/en/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/tokenizer/conftest.py b/spacy/tests/en/conftest.py similarity index 100% rename from spacy/tests/tokenizer/conftest.py rename to spacy/tests/en/conftest.py diff --git a/spacy/tests/en/tokenizer/__init__.py b/spacy/tests/en/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/tokenizer/sun.txt b/spacy/tests/en/tokenizer/sun.txt similarity index 100% rename from spacy/tests/tokenizer/sun.txt rename to spacy/tests/en/tokenizer/sun.txt diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/en/tokenizer/test_contractions.py similarity index 100% rename from spacy/tests/tokenizer/test_contractions.py rename to spacy/tests/en/tokenizer/test_contractions.py diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/en/tokenizer/test_exceptions.py similarity index 100% rename from spacy/tests/tokenizer/test_exceptions.py rename to spacy/tests/en/tokenizer/test_exceptions.py diff --git a/spacy/tests/tokenizer/test_indices.py b/spacy/tests/en/tokenizer/test_indices.py similarity index 100% rename from spacy/tests/tokenizer/test_indices.py rename to spacy/tests/en/tokenizer/test_indices.py diff --git a/spacy/tests/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py similarity index 100% rename from spacy/tests/tokenizer/test_prefix_suffix_infix.py rename to spacy/tests/en/tokenizer/test_prefix_suffix_infix.py diff --git a/spacy/tests/tokenizer/test_punct.py b/spacy/tests/en/tokenizer/test_punct.py similarity index 96% rename from spacy/tests/tokenizer/test_punct.py rename to spacy/tests/en/tokenizer/test_punct.py index f6e8a0293..b6ae9224d 100644 --- a/spacy/tests/tokenizer/test_punct.py +++ 
b/spacy/tests/en/tokenizer/test_punct.py @@ -6,12 +6,13 @@ from __future__ import unicode_literals import pytest -from ... import util -from ...language_data import TOKENIZER_PREFIXES +from ....util import compile_prefix_regex +from ....language_data import TOKENIZER_PREFIXES -en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search +en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search + PUNCT_OPEN = ['(', '[', '{', '*'] PUNCT_CLOSE = [')', ']', '}', '*'] PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/en/tokenizer/test_tokenizer.py similarity index 99% rename from spacy/tests/tokenizer/test_tokenizer.py rename to spacy/tests/en/tokenizer/test_tokenizer.py index 31bff9b14..8b34c5ec2 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/en/tokenizer/test_tokenizer.py @@ -4,7 +4,7 @@ from os import path import pytest -from spacy.util import utf8open +from ....util import utf8open def test_tokenizer_handles_no_word(en_tokenizer): diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/en/tokenizer/test_whitespace.py similarity index 100% rename from spacy/tests/tokenizer/test_whitespace.py rename to spacy/tests/en/tokenizer/test_whitespace.py From 637f78503666f3551c199ad5b264b03d7d61bcd0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 16:25:38 +0100 Subject: [PATCH 56/81] Add general sanity tests for all tokenizers --- spacy/tests/tokenizer/conftest.py | 23 +++++++ spacy/tests/tokenizer/test_tokenizer.py | 80 +++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 spacy/tests/tokenizer/conftest.py create mode 100644 spacy/tests/tokenizer/test_tokenizer.py diff --git a/spacy/tests/tokenizer/conftest.py b/spacy/tests/tokenizer/conftest.py new file mode 100644 index 000000000..c8e340208 --- /dev/null +++ b/spacy/tests/tokenizer/conftest.py @@ -0,0 +1,23 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ...en import English +from ...de import German +from ...es import Spanish +from ...it import Italian +from ...fr import French +from ...pt import Portuguese +from ...nl import Dutch +from ...sv import Swedish +from ...hu import Hungarian + + +LANGUAGES = [English, German, Spanish, Italian, French, Dutch, Swedish, Hungarian] + + +@pytest.fixture(params=LANGUAGES) +def tokenizer(request): + lang = request.param + return lang.Defaults.create_tokenizer() diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py new file mode 100644 index 000000000..49bfdcb26 --- /dev/null +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_no_word(tokenizer): + tokens = tokenizer("") + assert len(tokens) == 0 + + +@pytest.mark.parametrize('text', ["lorem"]) +def test_tokenizer_handles_single_word(tokenizer, text): + tokens = tokenizer(text) + assert tokens[0].text == text + + +@pytest.mark.parametrize('text', ["lorem ipsum"]) +def test_tokenizer_handles_two_words(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text != tokens[1].text + + +@pytest.mark.parametrize('text', ["lorem ipsum"]) +def test_tokenizer_splits_double_space(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == " " + + +@pytest.mark.parametrize('text', ["lorem\nipsum"]) +def 
test_tokenizer_splits_newline(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == "\n" + + +def test_tokenizer_handles_punct(tokenizer): + text = "Lorem, ipsum." + tokens = tokenizer(text) + assert len(tokens) == 4 + assert tokens[0].text == "Lorem" + assert tokens[1].text == "," + assert tokens[2].text == "ipsum" + assert tokens[1].text != "Lorem" + + +def test_tokenizer_handles_digits(tokenizer): + exceptions = ["hu"] + text = "Lorem ipsum: 1984." + tokens = tokenizer(text) + + if tokens[0].lang_ not in exceptions: + assert len(tokens) == 5 + assert tokens[0].text == "Lorem" + assert tokens[3].text == "1984" + + +def test_tokenizer_handles_long_text(tokenizer): + text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit + +Cras egestas orci non porttitor maximus. +Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate. + +Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris. + +"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo.""" + + tokens = tokenizer(text) + assert len(tokens) > 5 + + +def test_tokenizer_suspected_freeing_strings(tokenizer): + text1 = "Lorem dolor sit amet, consectetur adipiscing elit." + text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit." + tokens1 = tokenizer(text1) + tokens2 = tokenizer(text2) + assert tokens1[0].text == "Lorem" + assert tokens2[0].text == "Lorem" From bc911322b3a74b85210447d8c3639c5d54d117e3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 18:05:38 +0100 Subject: [PATCH 57/81] Move ") to emoticons (see Tweebo challenge test) --- spacy/en/tokenizer_exceptions.py | 1 - spacy/language_data/emoticons.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index cdd543adf..7d0d266db 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -637,7 +637,6 @@ for string in EXCLUDE_EXC: ORTH_ONLY = [ "''", - "\")", "a.", "a.m.", "Adm.", diff --git a/spacy/language_data/emoticons.py b/spacy/language_data/emoticons.py index 3fa44368d..bc951a007 100644 --- a/spacy/language_data/emoticons.py +++ b/spacy/language_data/emoticons.py @@ -13,6 +13,7 @@ EMOTICONS = set(""" (-: =) (= +") :] :-] [: From 038002d6164030cd75ba8e7802230c56461c345b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 18:06:44 +0100 Subject: [PATCH 58/81] Reformat HU tokenizer tests and adapt to general style Improve readability of test cases and add conftest.py with fixture --- spacy/tests/hu/conftest.py | 11 + spacy/tests/hu/tokenizer/test_tokenizer.py | 403 +++++++++++---------- 2 files changed, 213 insertions(+), 201 deletions(-) create mode 100644 spacy/tests/hu/conftest.py diff --git a/spacy/tests/hu/conftest.py b/spacy/tests/hu/conftest.py new file mode 100644 index 000000000..222bd1b00 --- /dev/null +++ b/spacy/tests/hu/conftest.py @@ -0,0 +1,11 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ...hu import Hungarian + + +@pytest.fixture +def hu_tokenizer(): + return Hungarian.Defaults.create_tokenizer() diff --git a/spacy/tests/hu/tokenizer/test_tokenizer.py b/spacy/tests/hu/tokenizer/test_tokenizer.py index 2bfbfdf36..aea9873ee 100644 --- a/spacy/tests/hu/tokenizer/test_tokenizer.py +++ b/spacy/tests/hu/tokenizer/test_tokenizer.py @@ 
-2,25 +2,27 @@ from __future__ import unicode_literals import pytest -from spacy.hu import Hungarian -_DEFAULT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), - ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), - ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), - ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), - ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), - ('A .hu.', ['A', '.hu', '.']), - ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), - ('A pl.', ['A', 'pl.']), - ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), - ('Egy..ket.', ['Egy', '..', 'ket', '.']), - ('Valami... van.', ['Valami', '...', 'van', '.']), - ('Valami ...van...', ['Valami', '...', 'van', '...']), - ('Valami...', ['Valami', '...']), - ('Valami ...', ['Valami', '...']), - ('Valami ... más.', ['Valami', '...', 'más', '.'])] -_HYPHEN_TESTS = [ +DEFAULT_TESTS = [ + ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), + ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), + ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), + ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), + ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), + ('A .hu.', ['A', '.hu', '.']), + ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), + ('A pl.', ['A', 'pl.']), + ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), + ('Egy..ket.', ['Egy', '..', 'ket', '.']), + ('Valami... van.', ['Valami', '...', 'van', '.']), + ('Valami ...van...', ['Valami', '...', 'van', '...']), + ('Valami...', ['Valami', '...']), + ('Valami ...', ['Valami', '...']), + ('Valami ... más.', ['Valami', '...', 'más', '.']) +] + +HYPHEN_TESTS = [ ('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']), ('Egy -nak.', ['Egy', '-nak', '.']), ('Egy bel-.', ['Egy', 'bel-', '.']), @@ -39,195 +41,194 @@ _HYPHEN_TESTS = [ ('A 7-es.', ['A', '7-es', '.']), ('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']), ('A %-sal.', ['A', '%-sal', '.']), - ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])] + ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.']) +] -_NUMBER_TESTS = [('A 2b van.', ['A', '2b', 'van', '.']), - ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']), - ('A 2b.', ['A', '2b', '.']), - ('A 2b-ben.', ['A', '2b-ben', '.']), - ('A 3.b van.', ['A', '3.b', 'van', '.']), - ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']), - ('A 3.b.', ['A', '3.b', '.']), - ('A 3.b-ben.', ['A', '3.b-ben', '.']), - ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']), - ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']), - ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']), - ('A 1:35 van.', ['A', '1:35', 'van', '.']), - ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']), - ('A 1:35-ben.', ['A', '1:35-ben', '.']), - ('A 1.35 van.', ['A', '1.35', 'van', '.']), - ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']), - ('A 1.35-ben.', ['A', '1.35-ben', '.']), - ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']), - ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']), - ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']), - ('A 10--12 van.', ['A', '10--12', 'van', '.']), - ('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']), - ('A 10--12-ben.', ['A', '10--12-ben', '.']), - ('A 10‐12 van.', ['A', '10‐12', 'van', '.']), - ('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']), - ('A 10‐12-ben.', ['A', '10‐12-ben', '.']), - ('A 10‑12 van.', ['A', '10‑12', 'van', '.']), - ('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']), - ('A 10‑12-ben.', ['A', '10‑12-ben', '.']), - ('A 10‒12 van.', 
['A', '10‒12', 'van', '.']), - ('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']), - ('A 10‒12-ben.', ['A', '10‒12-ben', '.']), - ('A 10–12 van.', ['A', '10–12', 'van', '.']), - ('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']), - ('A 10–12-ben.', ['A', '10–12-ben', '.']), - ('A 10—12 van.', ['A', '10—12', 'van', '.']), - ('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']), - ('A 10—12-ben.', ['A', '10—12-ben', '.']), - ('A 10―12 van.', ['A', '10―12', 'van', '.']), - ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']), - ('A 10―12-ben.', ['A', '10―12-ben', '.']), - ('A -23,12 van.', ['A', '-23,12', 'van', '.']), - ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']), - ('A -23,12-ben.', ['A', '-23,12-ben', '.']), - ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A C++ van.', ['A', 'C++', 'van', '.']), - ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']), - ('A C++.', ['A', 'C++', '.']), - ('A C++-ben.', ['A', 'C++-ben', '.']), - ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']), - ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']), - ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']), - ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']), - ('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']), - ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']), - ('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']), - ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']), - ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']), - ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']), - ('A IV. 12.', ['A', 'IV.', '12.']), - ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']), - ('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']), - ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']), - ('A 2003.01.06.', ['A', '2003.01.06.']), - ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']), - ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']), - ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']), - ('A IV.12.', ['A', 'IV.12.']), - ('A IV.12-ben.', ['A', 'IV.12-ben', '.']), - ('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']), - ('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']), - ('A 1.1.2.', ['A', '1.1.2.']), - ('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']), - ('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']), - ('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']), - ('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']), - ('A 3,14 van.', ['A', '3,14', 'van', '.']), - ('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']), - ('A 3,14-ben.', ['A', '3,14-ben', '.']), - ('A 3.14 van.', ['A', '3.14', 'van', '.']), - ('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']), - ('A 3.14-ben.', ['A', '3.14-ben', '.']), - ('A 15. van.', ['A', '15.', 'van', '.']), - ('A 15-ben van.', ['A', '15-ben', 'van', '.']), - ('A 15-ben.', ['A', '15-ben', '.']), - ('A 15.-ben van.', ['A', '15.-ben', 'van', '.']), - ('A 15.-ben.', ['A', '15.-ben', '.']), - ('A 2002--2003. 
van.', ['A', '2002--2003.', 'van', '.']), - ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']), - ('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']), - ('A -0,99% van.', ['A', '-0,99%', 'van', '.']), - ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']), - ('A -0,99%.', ['A', '-0,99%', '.']), - ('A -0,99%-ben.', ['A', '-0,99%-ben', '.']), - ('A 10--20% van.', ['A', '10--20%', 'van', '.']), - ('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']), - ('A 10--20%.', ['A', '10--20%', '.']), - ('A 10--20%-ben.', ['A', '10--20%-ben', '.']), - ('A 99§ van.', ['A', '99§', 'van', '.']), - ('A 99§-ben van.', ['A', '99§-ben', 'van', '.']), - ('A 99§-ben.', ['A', '99§-ben', '.']), - ('A 10--20§ van.', ['A', '10--20§', 'van', '.']), - ('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']), - ('A 10--20§-ben.', ['A', '10--20§-ben', '.']), - ('A 99° van.', ['A', '99°', 'van', '.']), - ('A 99°-ben van.', ['A', '99°-ben', 'van', '.']), - ('A 99°-ben.', ['A', '99°-ben', '.']), - ('A 10--20° van.', ['A', '10--20°', 'van', '.']), - ('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']), - ('A 10--20°-ben.', ['A', '10--20°-ben', '.']), - ('A °C van.', ['A', '°C', 'van', '.']), - ('A °C-ben van.', ['A', '°C-ben', 'van', '.']), - ('A °C.', ['A', '°C', '.']), - ('A °C-ben.', ['A', '°C-ben', '.']), - ('A 100°C van.', ['A', '100°C', 'van', '.']), - ('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']), - ('A 100°C.', ['A', '100°C', '.']), - ('A 100°C-ben.', ['A', '100°C-ben', '.']), - ('A 800x600 van.', ['A', '800x600', 'van', '.']), - ('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']), - ('A 800x600-ben.', ['A', '800x600-ben', '.']), - ('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']), - ('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']), - ('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']), - ('A 5/J van.', ['A', '5/J', 'van', '.']), - ('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']), - ('A 5/J-ben.', ['A', '5/J-ben', '.']), - ('A 5/J. van.', ['A', '5/J.', 'van', '.']), - ('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']), - ('A 5/J.-ben.', ['A', '5/J.-ben', '.']), - ('A III/1 van.', ['A', 'III/1', 'van', '.']), - ('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']), - ('A III/1-ben.', ['A', 'III/1-ben', '.']), - ('A III/1. 
van.', ['A', 'III/1.', 'van', '.']), - ('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']), - ('A III/1.-ben.', ['A', 'III/1.-ben', '.']), - ('A III/c van.', ['A', 'III/c', 'van', '.']), - ('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']), - ('A III/c.', ['A', 'III/c', '.']), - ('A III/c-ben.', ['A', 'III/c-ben', '.']), - ('A TU–154 van.', ['A', 'TU–154', 'van', '.']), - ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']), - ('A TU–154-ben.', ['A', 'TU–154-ben', '.'])] +NUMBER_TESTS = [ + ('A 2b van.', ['A', '2b', 'van', '.']), + ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']), + ('A 2b.', ['A', '2b', '.']), + ('A 2b-ben.', ['A', '2b-ben', '.']), + ('A 3.b van.', ['A', '3.b', 'van', '.']), + ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']), + ('A 3.b.', ['A', '3.b', '.']), + ('A 3.b-ben.', ['A', '3.b-ben', '.']), + ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']), + ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']), + ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']), + ('A 1:35 van.', ['A', '1:35', 'van', '.']), + ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']), + ('A 1:35-ben.', ['A', '1:35-ben', '.']), + ('A 1.35 van.', ['A', '1.35', 'van', '.']), + ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']), + ('A 1.35-ben.', ['A', '1.35-ben', '.']), + ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']), + ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']), + ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']), + ('A 10--12 van.', ['A', '10--12', 'van', '.']), + ('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']), + ('A 10--12-ben.', ['A', '10--12-ben', '.']), + ('A 10‐12 van.', ['A', '10‐12', 'van', '.']), + ('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']), + ('A 10‐12-ben.', ['A', '10‐12-ben', '.']), + ('A 10‑12 van.', ['A', '10‑12', 'van', '.']), + ('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']), + ('A 10‑12-ben.', ['A', '10‑12-ben', '.']), + ('A 10‒12 van.', ['A', '10‒12', 'van', '.']), + ('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']), + ('A 10‒12-ben.', ['A', '10‒12-ben', '.']), + ('A 10–12 van.', ['A', '10–12', 'van', '.']), + ('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']), + ('A 10–12-ben.', ['A', '10–12-ben', '.']), + ('A 10—12 van.', ['A', '10—12', 'van', '.']), + ('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']), + ('A 10—12-ben.', ['A', '10—12-ben', '.']), + ('A 10―12 van.', ['A', '10―12', 'van', '.']), + ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']), + ('A 10―12-ben.', ['A', '10―12-ben', '.']), + ('A -23,12 van.', ['A', '-23,12', 'van', '.']), + ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']), + ('A -23,12-ben.', ['A', '-23,12-ben', '.']), + ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A C++ van.', ['A', 'C++', 'van', '.']), + ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']), + ('A C++.', ['A', 'C++', '.']), + ('A C++-ben.', ['A', 'C++-ben', '.']), + ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']), + ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']), + ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']), + ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']), + ('A 2003. 01. 06. 
van.', ['A', '2003.', '01.', '06.', 'van', '.']), + ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']), + ('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']), + ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']), + ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']), + ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']), + ('A IV. 12.', ['A', 'IV.', '12.']), + ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']), + ('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']), + ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']), + ('A 2003.01.06.', ['A', '2003.01.06.']), + ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']), + ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']), + ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']), + ('A IV.12.', ['A', 'IV.12.']), + ('A IV.12-ben.', ['A', 'IV.12-ben', '.']), + ('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']), + ('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']), + ('A 1.1.2.', ['A', '1.1.2.']), + ('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']), + ('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']), + ('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']), + ('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']), + ('A 3,14 van.', ['A', '3,14', 'van', '.']), + ('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']), + ('A 3,14-ben.', ['A', '3,14-ben', '.']), + ('A 3.14 van.', ['A', '3.14', 'van', '.']), + ('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']), + ('A 3.14-ben.', ['A', '3.14-ben', '.']), + ('A 15. van.', ['A', '15.', 'van', '.']), + ('A 15-ben van.', ['A', '15-ben', 'van', '.']), + ('A 15-ben.', ['A', '15-ben', '.']), + ('A 15.-ben van.', ['A', '15.-ben', 'van', '.']), + ('A 15.-ben.', ['A', '15.-ben', '.']), + ('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']), + ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']), + ('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']), + ('A -0,99% van.', ['A', '-0,99%', 'van', '.']), + ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']), + ('A -0,99%.', ['A', '-0,99%', '.']), + ('A -0,99%-ben.', ['A', '-0,99%-ben', '.']), + ('A 10--20% van.', ['A', '10--20%', 'van', '.']), + ('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']), + ('A 10--20%.', ['A', '10--20%', '.']), + ('A 10--20%-ben.', ['A', '10--20%-ben', '.']), + ('A 99§ van.', ['A', '99§', 'van', '.']), + ('A 99§-ben van.', ['A', '99§-ben', 'van', '.']), + ('A 99§-ben.', ['A', '99§-ben', '.']), + ('A 10--20§ van.', ['A', '10--20§', 'van', '.']), + ('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']), + ('A 10--20§-ben.', ['A', '10--20§-ben', '.']), + ('A 99° van.', ['A', '99°', 'van', '.']), + ('A 99°-ben van.', ['A', '99°-ben', 'van', '.']), + ('A 99°-ben.', ['A', '99°-ben', '.']), + ('A 10--20° van.', ['A', '10--20°', 'van', '.']), + ('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']), + ('A 10--20°-ben.', ['A', '10--20°-ben', '.']), + ('A °C van.', ['A', '°C', 'van', '.']), + ('A °C-ben van.', ['A', '°C-ben', 'van', '.']), + ('A °C.', ['A', '°C', '.']), + ('A °C-ben.', ['A', '°C-ben', '.']), + ('A 100°C van.', ['A', '100°C', 'van', '.']), + ('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']), + ('A 100°C.', ['A', '100°C', '.']), + ('A 100°C-ben.', ['A', '100°C-ben', '.']), + ('A 800x600 van.', ['A', '800x600', 'van', '.']), + ('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']), + ('A 800x600-ben.', ['A', '800x600-ben', '.']), + ('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']), + ('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']), + ('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', 
'.']), + ('A 5/J van.', ['A', '5/J', 'van', '.']), + ('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']), + ('A 5/J-ben.', ['A', '5/J-ben', '.']), + ('A 5/J. van.', ['A', '5/J.', 'van', '.']), + ('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']), + ('A 5/J.-ben.', ['A', '5/J.-ben', '.']), + ('A III/1 van.', ['A', 'III/1', 'van', '.']), + ('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']), + ('A III/1-ben.', ['A', 'III/1-ben', '.']), + ('A III/1. van.', ['A', 'III/1.', 'van', '.']), + ('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']), + ('A III/1.-ben.', ['A', 'III/1.-ben', '.']), + ('A III/c van.', ['A', 'III/c', 'van', '.']), + ('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']), + ('A III/c.', ['A', 'III/c', '.']), + ('A III/c-ben.', ['A', 'III/c-ben', '.']), + ('A TU–154 van.', ['A', 'TU–154', 'van', '.']), + ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']), + ('A TU–154-ben.', ['A', 'TU–154-ben', '.']) +] -_QUOTE_TESTS = [('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), - ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), - ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), - ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']), - ("A don't van.", ['A', "don't", 'van', '.'])] +QUOTE_TESTS = [ + ('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), + ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), + ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), + ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']), + ("A don't van.", ['A', "don't", 'van', '.']) +] -_DOT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), - ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), - ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), - ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), - ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), - ('A .hu.', ['A', '.hu', '.']), - ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), - ('A pl.', ['A', 'pl.']), - ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), - ('Egy..ket.', ['Egy', '..', 'ket', '.']), - ('Valami... van.', ['Valami', '...', 'van', '.']), - ('Valami ...van...', ['Valami', '...', 'van', '...']), - ('Valami...', ['Valami', '...']), - ('Valami ...', ['Valami', '...']), - ('Valami ... más.', ['Valami', '...', 'más', '.'])] +DOT_TESTS = [ + ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), + ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), + ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), + ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), + ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), + ('A .hu.', ['A', '.hu', '.']), + ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), + ('A pl.', ['A', 'pl.']), + ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), + ('Egy..ket.', ['Egy', '..', 'ket', '.']), + ('Valami... van.', ['Valami', '...', 'van', '.']), + ('Valami ...van...', ['Valami', '...', 'van', '...']), + ('Valami...', ['Valami', '...']), + ('Valami ...', ['Valami', '...']), + ('Valami ... 
más.', ['Valami', '...', 'más', '.']) +] -@pytest.fixture(scope="session") -def HU(): - return Hungarian() +TESTCASES = DEFAULT_TESTS + HYPHEN_TESTS + NUMBER_TESTS + DOT_TESTS + QUOTE_TESTS -@pytest.fixture(scope="module") -def hu_tokenizer(HU): - return HU.tokenizer - - -@pytest.mark.parametrize(("input", "expected_tokens"), - _DEFAULT_TESTS + _HYPHEN_TESTS + _NUMBER_TESTS + _DOT_TESTS + _QUOTE_TESTS) -def test_testcases(hu_tokenizer, input, expected_tokens): - tokens = hu_tokenizer(input) - token_list = [token.orth_ for token in tokens if not token.is_space] +@pytest.mark.parametrize('text,expected_tokens', TESTCASES) +def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens): + tokens = hu_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list From bbe7cab3a145fb9c3e849fbbeede5957a16589d6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 18:09:29 +0100 Subject: [PATCH 59/81] Move non-English-specific tests back to general tokenizer tests --- spacy/tests/en/tokenizer/test_exceptions.py | 36 +------------ .../en/tokenizer/test_prefix_suffix_infix.py | 12 ----- spacy/tests/en/tokenizer/test_whitespace.py | 51 ------------------- spacy/tests/{en => }/tokenizer/sun.txt | 0 spacy/tests/tokenizer/test_exceptions.py | 41 +++++++++++++++ spacy/tests/tokenizer/test_tokenizer.py | 45 ++++++++-------- spacy/tests/tokenizer/test_whitespace.py | 51 +++++++++++++++++++ 7 files changed, 117 insertions(+), 119 deletions(-) delete mode 100644 spacy/tests/en/tokenizer/test_whitespace.py rename spacy/tests/{en => }/tokenizer/sun.txt (100%) create mode 100644 spacy/tests/tokenizer/test_exceptions.py create mode 100644 spacy/tests/tokenizer/test_whitespace.py diff --git a/spacy/tests/en/tokenizer/test_exceptions.py b/spacy/tests/en/tokenizer/test_exceptions.py index c194dce21..ac7ed452f 100644 --- a/spacy/tests/en/tokenizer/test_exceptions.py +++ b/spacy/tests/en/tokenizer/test_exceptions.py @@ -1,5 +1,5 @@ # coding: utf-8 -"""Test that tokenizer exceptions and emoticons are handles correctly.""" +"""Test that tokenizer exceptions are handled correctly.""" from __future__ import unicode_literals @@ -18,37 +18,3 @@ def test_tokenizer_handles_exc_in_text(en_tokenizer): tokens = en_tokenizer(text) assert len(tokens) == 6 assert tokens[3].text == "i.e." - - -def test_tokenizer_handles_emoticons(en_tokenizer): - # Tweebo challenge (CMU) - text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" - tokens = en_tokenizer(text) - assert tokens[0].text == ":o" - assert tokens[1].text == ":/" - assert tokens[2].text == ":'(" - assert tokens[3].text == ">:o" - assert tokens[4].text == "(:" - assert tokens[5].text == ":)" - assert tokens[6].text == ">.<" - assert tokens[7].text == "XD" - assert tokens[8].text == "-__-" - assert tokens[9].text == "o.O" - assert tokens[10].text == ";D" - assert tokens[11].text == ":-)" - assert tokens[12].text == "@_@" - assert tokens[13].text == ":P" - assert tokens[14].text == "8D" - assert tokens[15].text == ":1" - assert tokens[16].text == ">:(" - assert tokens[17].text == ":D" - assert tokens[18].text == "=|" - assert tokens[19].text == '")' - assert tokens[20].text == ':>' - assert tokens[21].text == '....' 
- - -@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)]) -def test_tokenizer_excludes_false_pos_emoticons(en_tokenizer, text, length): - tokens = en_tokenizer(text) - assert len(tokens) == length diff --git a/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py index d6963ada1..042934d4e 100644 --- a/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py +++ b/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py @@ -100,18 +100,6 @@ def test_tokenizer_splits_ellipsis_infix(en_tokenizer, text): assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"]) -def test_tokenizer_keep_urls(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 1 - - -@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"]) -def test_tokenizer_keeps_email(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 1 - - def test_tokenizer_splits_double_hyphen_infix(en_tokenizer): tokens = en_tokenizer("No decent--let alone well-bred--people.") assert tokens[0].text == "No" diff --git a/spacy/tests/en/tokenizer/test_whitespace.py b/spacy/tests/en/tokenizer/test_whitespace.py deleted file mode 100644 index 90dc80615..000000000 --- a/spacy/tests/en/tokenizer/test_whitespace.py +++ /dev/null @@ -1,51 +0,0 @@ -# coding: utf-8 -"""Test that tokens are created correctly for whitespace.""" - - -from __future__ import unicode_literals - -import pytest - - -@pytest.mark.parametrize('text', ["hello possums"]) -def test_tokenizer_splits_single_space(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 2 - - -@pytest.mark.parametrize('text', ["hello possums"]) -def test_tokenizer_splits_double_space(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 3 - assert tokens[1].text == " " - - -@pytest.mark.parametrize('text', ["two spaces after this "]) -def test_tokenizer_handles_double_trainling_ws(en_tokenizer, text): - tokens = en_tokenizer(text) - assert repr(tokens.text_with_ws) == repr(text) - - -@pytest.mark.parametrize('text', ["hello\npossums"]) -def test_tokenizer_splits_newline(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 3 - assert tokens[1].text == "\n" - - -@pytest.mark.parametrize('text', ["hello \npossums"]) -def test_tokenizer_splits_newline_space(en_tokenizer, text): - tokens = en_tokenizer('hello \npossums') - assert len(tokens) == 3 - - -@pytest.mark.parametrize('text', ["hello \npossums"]) -def test_tokenizer_splits_newline_double_space(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 3 - - -@pytest.mark.parametrize('text', ["hello \n possums"]) -def test_tokenizer_splits_newline_space_wrap(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 3 diff --git a/spacy/tests/en/tokenizer/sun.txt b/spacy/tests/tokenizer/sun.txt similarity index 100% rename from spacy/tests/en/tokenizer/sun.txt rename to spacy/tests/tokenizer/sun.txt diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py new file mode 100644 index 000000000..aab27714e --- /dev/null +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -0,0 +1,41 @@ +# coding: utf-8 +"""Test that tokenizer exceptions and emoticons are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_emoticons(tokenizer): + # Tweebo challenge (CMU) 
+ text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" + tokens = tokenizer(text) + assert tokens[0].text == ":o" + assert tokens[1].text == ":/" + assert tokens[2].text == ":'(" + assert tokens[3].text == ">:o" + assert tokens[4].text == "(:" + assert tokens[5].text == ":)" + assert tokens[6].text == ">.<" + assert tokens[7].text == "XD" + assert tokens[8].text == "-__-" + assert tokens[9].text == "o.O" + assert tokens[10].text == ";D" + assert tokens[11].text == ":-)" + assert tokens[12].text == "@_@" + assert tokens[13].text == ":P" + assert tokens[14].text == "8D" + assert tokens[15].text == ":1" + assert tokens[16].text == ">:(" + assert tokens[17].text == ":D" + assert tokens[18].text == "=|" + assert tokens[19].text == '")' + assert tokens[20].text == ':>' + assert tokens[21].text == '....' + + +@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)]) +def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): + tokens = tokenizer(text) + assert len(tokens) == length diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 49bfdcb26..cd0043a10 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals +from os import path import pytest +from ...util import utf8open + def test_tokenizer_handles_no_word(tokenizer): tokens = tokenizer("") @@ -15,27 +18,6 @@ def test_tokenizer_handles_single_word(tokenizer, text): assert tokens[0].text == text -@pytest.mark.parametrize('text', ["lorem ipsum"]) -def test_tokenizer_handles_two_words(tokenizer, text): - tokens = tokenizer(text) - assert len(tokens) == 2 - assert tokens[0].text != tokens[1].text - - -@pytest.mark.parametrize('text', ["lorem ipsum"]) -def test_tokenizer_splits_double_space(tokenizer, text): - tokens = tokenizer(text) - assert len(tokens) == 3 - assert tokens[1].text == " " - - -@pytest.mark.parametrize('text', ["lorem\nipsum"]) -def test_tokenizer_splits_newline(tokenizer, text): - tokens = tokenizer(text) - assert len(tokens) == 3 - assert tokens[1].text == "\n" - - def test_tokenizer_handles_punct(tokenizer): text = "Lorem, ipsum." tokens = tokenizer(text) @@ -57,6 +39,18 @@ def test_tokenizer_handles_digits(tokenizer): assert tokens[3].text == "1984" +@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"]) +def test_tokenizer_keep_urls(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 1 + + +@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"]) +def test_tokenizer_keeps_email(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 1 + + def test_tokenizer_handles_long_text(tokenizer): text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit @@ -71,6 +65,15 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n assert len(tokens) > 5 +@pytest.mark.parametrize('file_name', ["sun.txt"]) +def test_tokenizer_handle_text_from_file(tokenizer, file_name): + loc = path.join(path.dirname(__file__), file_name) + text = utf8open(loc).read() + assert len(text) != 0 + tokens = tokenizer(text) + assert len(tokens) > 100 + + def test_tokenizer_suspected_freeing_strings(tokenizer): text1 = "Lorem dolor sit amet, consectetur adipiscing elit." text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit." 
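Note that the general tests relocated in this patch take a language-agnostic `tokenizer` fixture instead of `en_tokenizer`. That fixture lives in the shared `spacy/tests/conftest.py`, which is not shown in these patches; as a rough sketch only (assuming it mirrors the per-language `<Language>.Defaults.create_tokenizer()` pattern that a later patch uses for `de_tokenizer`, and picking English defaults purely for illustration), it could look like this:

# coding: utf-8
# Hypothetical sketch of spacy/tests/conftest.py -- not part of these patches.
# Assumes the shared fixture follows the same pattern as the per-language
# conftest files added later in this series.
from __future__ import unicode_literals

import pytest

from ..en import English


@pytest.fixture
def tokenizer():
    # Any language whose tokenizer handles plain whitespace and punctuation
    # would do here; English defaults are used only as an example.
    return English.Defaults.create_tokenizer()

In practice the shared fixture could also be parametrized across all available languages, so that the relocated whitespace and punctuation tests run against every tokenizer rather than just one.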
diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py new file mode 100644 index 000000000..7ff3106a8 --- /dev/null +++ b/spacy/tests/tokenizer/test_whitespace.py @@ -0,0 +1,51 @@ +# coding: utf-8 +"""Test that tokens are created correctly for whitespace.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["lorem ipsum"]) +def test_tokenizer_splits_single_space(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["lorem ipsum"]) +def test_tokenizer_splits_double_space(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == " " + + +@pytest.mark.parametrize('text', ["lorem ipsum "]) +def test_tokenizer_handles_double_trainling_ws(tokenizer, text): + tokens = tokenizer(text) + assert repr(tokens.text_with_ws) == repr(text) + + +@pytest.mark.parametrize('text', ["lorem\nipsum"]) +def test_tokenizer_splits_newline(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == "\n" + + +@pytest.mark.parametrize('text', ["lorem \nipsum"]) +def test_tokenizer_splits_newline_space(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["lorem \nipsum"]) +def test_tokenizer_splits_newline_double_space(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["lorem \n ipsum"]) +def test_tokenizer_splits_newline_space_wrap(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 3 From 65f937d5c6d4426d96da1a0030a9a01978a30658 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 18:09:53 +0100 Subject: [PATCH 60/81] Move basic contraction tests to test_contractions.py --- spacy/tests/en/tokenizer/test_contractions.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/tests/en/tokenizer/test_contractions.py b/spacy/tests/en/tokenizer/test_contractions.py index 995a405fb..a97b8f5ba 100644 --- a/spacy/tests/en/tokenizer/test_contractions.py +++ b/spacy/tests/en/tokenizer/test_contractions.py @@ -7,6 +7,23 @@ from __future__ import unicode_literals import pytest +def test_tokenizer_handles_basic_contraction(en_tokenizer): + text = "don't giggle" + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == "n't" + text = "i said don't!" + tokens = en_tokenizer(text) + assert len(tokens) == 5 + assert tokens[4].text == "!" 
+ + +@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"]) +def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + @pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")]) def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text): tokens = en_tokenizer(text_poss) From 8216ba599b6c33207f413381b755d8db25c01440 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 18:11:04 +0100 Subject: [PATCH 61/81] Add tests for longer and mixed English texts --- spacy/tests/en/tokenizer/test_text.py | 36 +++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 spacy/tests/en/tokenizer/test_text.py diff --git a/spacy/tests/en/tokenizer/test_text.py b/spacy/tests/en/tokenizer/test_text.py new file mode 100644 index 000000000..c7178fbf9 --- /dev/null +++ b/spacy/tests/en/tokenizer/test_text.py @@ -0,0 +1,36 @@ +# coding: utf-8 +"""Test that longer and mixed texts are tokenized correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_long_text(en_tokenizer): + text = """Tributes pour in for late British Labour Party leader + +Tributes poured in from around the world Thursday +to the late Labour Party leader John Smith, who died earlier from a massive +heart attack aged 55. + +In Washington, the US State Department issued a statement regretting "the +untimely death" of the rapier-tongued Scottish barrister and parliamentarian. + +"Mr. Smith, throughout his distinguished""" + tokens = en_tokenizer(text) + assert len(tokens) == 76 + + +@pytest.mark.parametrize('text,length', [ + ("The U.S. Army likes Shock and Awe.", 8), + ("U.N. regulations are not a part of their concern.", 10), + ("“Isn't it?”", 6), + ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15), + ("""'Me too!', Mr. P. Delaware cried. """, 11), + ("They ran about 10km.", 6), + # ("But then the 6,000-year ice age came...", 10) + ]) +def test_tokenizer_handles_cnts(en_tokenizer, text, length): + tokens = en_tokenizer(text) + assert len(tokens) == length From 5bb4081f526055dc28d7a26a3323bcf2faf0ec2b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 18:11:11 +0100 Subject: [PATCH 62/81] Remove redundant test_tokenizer.py for English --- spacy/tests/en/tokenizer/test_tokenizer.py | 108 --------------------- 1 file changed, 108 deletions(-) delete mode 100644 spacy/tests/en/tokenizer/test_tokenizer.py diff --git a/spacy/tests/en/tokenizer/test_tokenizer.py b/spacy/tests/en/tokenizer/test_tokenizer.py deleted file mode 100644 index 8b34c5ec2..000000000 --- a/spacy/tests/en/tokenizer/test_tokenizer.py +++ /dev/null @@ -1,108 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals -from os import path - -import pytest - -from ....util import utf8open - - -def test_tokenizer_handles_no_word(en_tokenizer): - tokens = en_tokenizer("") - assert len(tokens) == 0 - - -@pytest.mark.parametrize('text', ["hello"]) -def test_tokenizer_handles_single_word(en_tokenizer, text): - tokens = en_tokenizer(text) - assert tokens[0].text == text - - -@pytest.mark.parametrize('text', ["hello possums"]) -def test_tokenizer_handles_two_words(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 2 - assert tokens[0].text != tokens[1].text - - -def test_tokenizer_handles_punct(en_tokenizer): - text = "hello, possums." 
- tokens = en_tokenizer(text) - assert len(tokens) == 4 - assert tokens[0].text == "hello" - assert tokens[1].text == "," - assert tokens[2].text == "possums" - assert tokens[1].text != "hello" - - -def test_tokenizer_handles_digits(en_tokenizer): - text = "The year: 1984." - tokens = en_tokenizer(text) - assert len(tokens) == 5 - assert tokens[0].text == "The" - assert tokens[3].text == "1984" - - -def test_tokenizer_handles_basic_contraction(en_tokenizer): - text = "don't giggle" - tokens = en_tokenizer(text) - assert len(tokens) == 3 - assert tokens[1].text == "n't" - text = "i said don't!" - tokens = en_tokenizer(text) - assert len(tokens) == 5 - assert tokens[4].text == "!" - - -@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"]) -def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 3 - - -def test_tokenizer_handles_long_text(en_tokenizer): - text = """Tributes pour in for late British Labour Party leader - -Tributes poured in from around the world Thursday -to the late Labour Party leader John Smith, who died earlier from a massive -heart attack aged 55. - -In Washington, the US State Department issued a statement regretting "the -untimely death" of the rapier-tongued Scottish barrister and parliamentarian. - -"Mr. Smith, throughout his distinguished""" - - tokens = en_tokenizer(text) - assert len(tokens) > 5 - - -@pytest.mark.parametrize('file_name', ["sun.txt"]) -def test_tokenizer_handle_text_from_file(en_tokenizer, file_name): - loc = path.join(path.dirname(__file__), file_name) - text = utf8open(loc).read() - assert len(text) != 0 - tokens = en_tokenizer(text) - assert len(tokens) > 100 - - -@pytest.mark.parametrize('text,length', [ - ("The U.S. Army likes Shock and Awe.", 8), - ("U.N. regulations are not a part of their concern.", 10), - ("“Isn't it?”", 6), - ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15), - ("""'Me too!', Mr. P. Delaware cried. """, 11), - ("They ran about 10km.", 6), - # ("But then the 6,000-year ice age came...", 10) - ]) -def test_tokenizer_handles_cnts(en_tokenizer, text, length): - tokens = en_tokenizer(text) - assert len(tokens) == length - - -def test_tokenizer_suspected_freeing_strings(en_tokenizer): - text1 = "Betty Botter bought a pound of butter." - text2 = "Betty also bought a pound of butter." 
- tokens1 = en_tokenizer(text1) - tokens2 = en_tokenizer(text2) - assert tokens1[0].text == "Betty" - assert tokens2[0].text == "Betty" From 55b46d7cf64dcb9b0206c8b5aab5468de1236280 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 18:11:25 +0100 Subject: [PATCH 63/81] Add tokenizer tests for German --- spacy/tests/de/__init__.py | 0 spacy/tests/de/conftest.py | 11 ++ spacy/tests/de/tokenizer/__init__.py | 0 spacy/tests/de/tokenizer/test_exceptions.py | 27 ++++ .../de/tokenizer/test_prefix_suffix_infix.py | 116 ++++++++++++++++++ spacy/tests/de/tokenizer/test_text.py | 40 ++++++ 6 files changed, 194 insertions(+) create mode 100644 spacy/tests/de/__init__.py create mode 100644 spacy/tests/de/conftest.py create mode 100644 spacy/tests/de/tokenizer/__init__.py create mode 100644 spacy/tests/de/tokenizer/test_exceptions.py create mode 100644 spacy/tests/de/tokenizer/test_prefix_suffix_infix.py create mode 100644 spacy/tests/de/tokenizer/test_text.py diff --git a/spacy/tests/de/__init__.py b/spacy/tests/de/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/de/conftest.py b/spacy/tests/de/conftest.py new file mode 100644 index 000000000..c6b8be26e --- /dev/null +++ b/spacy/tests/de/conftest.py @@ -0,0 +1,11 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ...de import German + + +@pytest.fixture +def de_tokenizer(): + return German.Defaults.create_tokenizer() diff --git a/spacy/tests/de/tokenizer/__init__.py b/spacy/tests/de/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/de/tokenizer/test_exceptions.py b/spacy/tests/de/tokenizer/test_exceptions.py new file mode 100644 index 000000000..13da3dc33 --- /dev/null +++ b/spacy/tests/de/tokenizer/test_exceptions.py @@ -0,0 +1,27 @@ +# coding: utf-8 +"""Test that tokenizer exceptions and emoticons are handles correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"]) +def test_tokenizer_splits_contractions(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]) +def test_tokenizer_handles_abbr(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 1 + + +def test_tokenizer_handles_exc_in_text(de_tokenizer): + text = "Ich bin z.Zt. im Urlaub." + tokens = de_tokenizer(text) + assert len(tokens) == 6 + assert tokens[2].text == "z.Zt." 
+ assert tokens[2].lemma_ == "zur Zeit" diff --git a/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py new file mode 100644 index 000000000..dcf4f4ef0 --- /dev/null +++ b/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py @@ -0,0 +1,116 @@ +# coding: utf-8 +"""Test that tokenizer prefixes, suffixes and infixes are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["(unter)"]) +def test_tokenizer_splits_no_special(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["unter'm"]) +def test_tokenizer_splits_no_punct(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(unter'm"]) +def test_tokenizer_splits_prefix_punct(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["unter'm)"]) +def test_tokenizer_splits_suffix_punct(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["(unter'm)"]) +def test_tokenizer_splits_even_wrap(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["(unter'm?)"]) +def test_tokenizer_splits_uneven_wrap(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)]) +def test_tokenizer_splits_prefix_interact(de_tokenizer, text, length): + tokens = de_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize('text', ["z.B.)"]) +def test_tokenizer_splits_suffix_interact(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(z.B.)"]) +def test_tokenizer_splits_even_wrap_interact(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["(z.B.?)"]) +def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["blau-rot"]) +def test_tokenizer_splits_hyphens(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) +def test_tokenizer_splits_numeric_range(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"]) +def test_tokenizer_splits_period_infix(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"]) +def test_tokenizer_splits_comma_infix(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + assert tokens[0].text == text.split(",")[0] + assert tokens[1].text == "," + assert tokens[2].text == text.split(",")[1] + + +@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"]) +def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +def test_tokenizer_splits_double_hyphen_infix(de_tokenizer): + tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.") + assert len(tokens) == 12 + assert tokens[0].text == "Viele" + assert tokens[1].text == "Regeln" + assert tokens[2].text == "--" + assert tokens[3].text == "wie" + 
assert tokens[4].text == "die" + assert tokens[5].text == "Bindestrich" + assert tokens[6].text == "-" + assert tokens[7].text == "Regeln" + assert tokens[8].text == "--" + assert tokens[9].text == "sind" + assert tokens[10].text == "kompliziert" diff --git a/spacy/tests/de/tokenizer/test_text.py b/spacy/tests/de/tokenizer/test_text.py new file mode 100644 index 000000000..a5cbd5383 --- /dev/null +++ b/spacy/tests/de/tokenizer/test_text.py @@ -0,0 +1,40 @@ +# coding: utf-8 +"""Test that longer and mixed texts are tokenized correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_long_text(de_tokenizer): + text = """Die Verwandlung + +Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte, fand er sich in seinem Bett zu einem ungeheueren Ungeziefer verwandelt. + +Er lag auf seinem panzerartig harten Rücken und sah, wenn er den Kopf ein wenig hob, seinen gewölbten, braunen, von bogenförmigen Versteifungen geteilten Bauch, auf dessen Höhe sich die Bettdecke, zum gänzlichen Niedergleiten bereit, kaum noch erhalten konnte. Seine vielen, im Vergleich zu seinem sonstigen Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen. + +»Was ist mit mir geschehen?«, dachte er.""" + + tokens = de_tokenizer(text) + assert len(tokens) == 104 + + +@pytest.mark.parametrize('text,length', [ + ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1), + ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1), + ("Kraftfahrzeug-Haftpflichtversicherung", 3), + ("Vakuum-Mittelfrequenz-Induktionsofen", 5) + ]) +def test_tokenizer_handles_long_words(de_tokenizer, text, length): + tokens = de_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize('text,length', [ + ("»Was ist mit mir geschehen?«, dachte er.", 12), + ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15) + ]) +def test_tokenizer_handles_examples(de_tokenizer, text, length): + tokens = de_tokenizer(text) + assert len(tokens) == length From 8328925e1f5a976808175aa95ece0c2027cff62c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 18:13:30 +0100 Subject: [PATCH 64/81] Add newlines to long German text --- spacy/tests/de/tokenizer/test_text.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/spacy/tests/de/tokenizer/test_text.py b/spacy/tests/de/tokenizer/test_text.py index a5cbd5383..84fa6f2a5 100644 --- a/spacy/tests/de/tokenizer/test_text.py +++ b/spacy/tests/de/tokenizer/test_text.py @@ -10,14 +10,19 @@ import pytest def test_tokenizer_handles_long_text(de_tokenizer): text = """Die Verwandlung -Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte, fand er sich in seinem Bett zu einem ungeheueren Ungeziefer verwandelt. +Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte, fand er sich in +seinem Bett zu einem ungeheueren Ungeziefer verwandelt. -Er lag auf seinem panzerartig harten Rücken und sah, wenn er den Kopf ein wenig hob, seinen gewölbten, braunen, von bogenförmigen Versteifungen geteilten Bauch, auf dessen Höhe sich die Bettdecke, zum gänzlichen Niedergleiten bereit, kaum noch erhalten konnte. Seine vielen, im Vergleich zu seinem sonstigen Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen. 
+Er lag auf seinem panzerartig harten Rücken und sah, wenn er den Kopf ein wenig +hob, seinen gewölbten, braunen, von bogenförmigen Versteifungen geteilten +Bauch, auf dessen Höhe sich die Bettdecke, zum gänzlichen Niedergleiten bereit, +kaum noch erhalten konnte. Seine vielen, im Vergleich zu seinem sonstigen +Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen. »Was ist mit mir geschehen?«, dachte er.""" tokens = de_tokenizer(text) - assert len(tokens) == 104 + assert len(tokens) == 109 @pytest.mark.parametrize('text,length', [ From 7d2cf934b924109bbdeb1127a43a54b2adedc278 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 19:57:00 +0100 Subject: [PATCH 65/81] Generate he/she/it correctly with 's instead of 've --- spacy/en/tokenizer_exceptions.py | 39 +++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 7d0d266db..2c046c157 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -7,7 +7,7 @@ from ..language_data import PRON_LEMMA EXC = {} -EXCLUDE_EXC = ["Ill", "ill", "Hell", "hell", "Well", "well", "Whore", "whore"] +EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Well", "well", "Whore", "whore"] # Pronouns @@ -49,16 +49,6 @@ for pron in ["i", "you", "he", "she", "it", "we", "they"]: {ORTH: "ll", LEMMA: "will", TAG: "MD"} ] - EXC[orth + "'ve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[orth + "ve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] - EXC[orth + "'ll've"] = [ {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: "'ll", LEMMA: "will", TAG: "MD"}, @@ -94,6 +84,19 @@ for pron in ["i", "you", "he", "she", "it", "we", "they"]: ] +for pron in ["i", "you", "we", "they"]: + for orth in [pron, pron.title()]: + EXC[orth + "'ve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "ve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + for pron in ["you", "we", "they"]: for orth in [pron, pron.title()]: EXC[orth + "'re"] = [ @@ -107,6 +110,20 @@ for pron in ["you", "we", "they"]: ] +for pron in ["he", "she", "it"]: + for orth in [pron, pron.title()]: + EXC[orth + "'s"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ] + + EXC[orth + "s"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "s"} + ] + + + # W-words, relative pronouns, prepositions etc. 
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: From a23504fe07c5d3d55b247e4aa0b185dd0a338ee7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 19:58:07 +0100 Subject: [PATCH 66/81] Move abbreviations below other exceptions --- spacy/en/tokenizer_exceptions.py | 135 +------------------------------ 1 file changed, 1 insertion(+), 134 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 2c046c157..49b612d73 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -505,142 +505,9 @@ ABBREVIATIONS = { } -# Other exceptions - -OTHER = { - " ": [ - {ORTH: " ", TAG: "SP"} - ], - - "\u00a0": [ - {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} - ], - - "and/or": [ - {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} - ], - - "'cause": [ - {ORTH: "'cause", LEMMA: "because"} - ], - - "y'all": [ - {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, - {ORTH: "all"} - ], - - "yall": [ - {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, - {ORTH: "all"} - ], - - "'em": [ - {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"} - ], - - "em": [ - {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"} - ], - - "nothin'": [ - {ORTH: "nothin'", LEMMA: "nothing"} - ], - - "nuthin'": [ - {ORTH: "nuthin'", LEMMA: "nothing"} - ], - - "'nuff": [ - {ORTH: "'nuff", LEMMA: "enough"} - ], - - "ol'": [ - {ORTH: "ol'", LEMMA: "old"} - ], - - "not've": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "notve": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Not've": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Notve": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "cannot": [ - {ORTH: "can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "Cannot": [ - {ORTH: "Can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "gonna": [ - {ORTH: "gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"} - ], - - "Gonna": [ - {ORTH: "Gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"} - ], - - "let's": [ - {ORTH: "let"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} - ], - - "Let's": [ - {ORTH: "Let"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} - ], - - "'S": [ - {ORTH: "'S", LEMMA: "'s"} - ], - - "'s": [ - {ORTH: "'s", LEMMA: "'s"} - ], - - "\u2018S": [ - {ORTH: "\u2018S", LEMMA: "'s"} - ], - - "\u2018s": [ - {ORTH: "\u2018s", LEMMA: "'s"} - ], - - "\u2014": [ - {ORTH: "\u2014", TAG: ":", LEMMA: "--"} - ], - - "\n": [ - {ORTH: "\n", TAG: "SP"} - ], - - "\t": [ - {ORTH: "\t", TAG: "SP"} - ] -} - - TOKENIZER_EXCEPTIONS = dict(EXC) -TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) TOKENIZER_EXCEPTIONS.update(OTHER) +TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) # Remove EXCLUDE_EXC if in exceptions From cab39c59c5207990b63b181378d855716723fbca Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 19:59:06 +0100 Subject: [PATCH 67/81] Add missing contractions to English tokenizer exceptions Inspired by https://github.com/kootenpv/contractions/blob/master/contractions/__init __.py --- spacy/en/tokenizer_exceptions.py | 198 +++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 49b612d73..44ad605a4 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -212,9 +212,11 
@@ for verb_data in [ {ORTH: "does", LEMMA: "do"}, {ORTH: "did", LEMMA: "do", TAG: "VBD"}, {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "may"}, {ORTH: "might"}, {ORTH: "must"}, {ORTH: "need"}, + {ORTH: "ought"}, {ORTH: "sha", LEMMA: "shall"}, {ORTH: "should"}, {ORTH: "wo", LEMMA: "will"}, @@ -288,6 +290,201 @@ for verb_data in [ ] + +# Other contractions with trailing apostrophe + +for exc_data in [ + {ORTH: "doin", LEMMA: "do", NORM: "doing"}, + {ORTH: "goin", LEMMA: "go", NORM: "going"}, + {ORTH: "nothin", LEMMA: "nothing"}, + {ORTH: "nuthin", LEMMA: "nothing"}, + {ORTH: "ol", LEMMA: "old"}, + {ORTH: "somethin", LEMMA: "something"} +]: + exc_data_tc = dict(exc_data) + exc_data_tc[ORTH] = exc_data_tc[ORTH].title() + + for data in [exc_data, exc_data_tc]: + data_apos = dict(data) + data_apos[ORTH] = data_apos[ORTH] + "'" + + EXC[data[ORTH]] = [ + dict(data) + ] + + EXC[data_apos[ORTH]] = [ + dict(data_apos) + ] + + +# Other contractions with leading apostrophe + +for exc_data in [ + {ORTH: "cause", LEMMA: "because"}, + {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, + {ORTH: "ll", LEMMA: "will"}, + {ORTH: "nuff", LEMMA: "enough"} +]: + exc_data_apos = dict(exc_data) + exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] + + for data in [exc_data, exc_data_apos]: + EXC[data[ORTH]] = [ + dict(data) + ] + + +# Rest + +OTHER = { + " ": [ + {ORTH: " ", TAG: "SP"} + ], + + "\u00a0": [ + {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} + ], + + "'S": [ + {ORTH: "'S", LEMMA: "'s"} + ], + + "'s": [ + {ORTH: "'s", LEMMA: "'s"} + ], + + "'re": [ + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "\u2018S": [ + {ORTH: "\u2018S", LEMMA: "'s"} + ], + + "\u2018s": [ + {ORTH: "\u2018s", LEMMA: "'s"} + ], + + "and/or": [ + {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} + ], + + "'Cause": [ + {ORTH: "'Cause", LEMMA: "because"} + ], + + "y'all": [ + {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, + {ORTH: "all"} + ], + + "yall": [ + {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, + {ORTH: "all"} + ], + + "ma'am": [ + {ORTH: "ma'am", LEMMA: "madam"} + ], + + "Ma'am": [ + {ORTH: "Ma'am", LEMMA: "madam"} + ], + + "o'clock": [ + {ORTH: "o'clock", LEMMA: "o'clock"} + ], + + "O'clock": [ + {ORTH: "O'clock", LEMMA: "o'clock"} + ], + + "how'd'y": [ + {ORTH: "how", LEMMA: "how"}, + {ORTH: "'d", LEMMA: "do"}, + {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"} + ], + + "How'd'y": [ + {ORTH: "How", LEMMA: "how"}, + {ORTH: "'d", LEMMA: "do"}, + {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"} + ], + + "not've": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "notve": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Not've": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Notve": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "cannot": [ + {ORTH: "can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} + ], + + "Cannot": [ + {ORTH: "Can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} + ], + + "gonna": [ + {ORTH: "gon", LEMMA: "go", NORM: "going"}, + {ORTH: "na", LEMMA: "to"} + ], + + "Gonna": [ + {ORTH: "Gon", LEMMA: "go", NORM: "going"}, + {ORTH: "na", LEMMA: "to"} + ], + + "gotta": [ + {ORTH: "got"}, + {ORTH: "ta", LEMMA: "to"} + ], + + "Gotta": [ + {ORTH: "Got"}, + {ORTH: "ta", LEMMA: "to"} + ], + + "let's": [ + {ORTH: "let"}, + {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} + ], + + "Let's": [ + {ORTH: "Let", LEMMA: 
"let"}, + {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} + ], + + "\u2014": [ + {ORTH: "\u2014", TAG: ":", LEMMA: "--"} + ], + + "\n": [ + {ORTH: "\n", TAG: "SP"} + ], + + "\t": [ + {ORTH: "\t", TAG: "SP"} + ] +} + + # Abbreviations ABBREVIATIONS = { @@ -520,6 +717,7 @@ for string in EXCLUDE_EXC: # Abbreviations with only one ORTH token ORTH_ONLY = [ + "'d", "''", "a.", "a.m.", From 57919566b8860137d643e1eaa15d015ffe2194df Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Jan 2017 20:50:08 +0100 Subject: [PATCH 68/81] Add Jupyter notebooks repo to resources list --- website/docs/usage/resources.jade | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/website/docs/usage/resources.jade b/website/docs/usage/resources.jade index a09c7358d..2b80ebe48 100644 --- a/website/docs/usage/resources.jade +++ b/website/docs/usage/resources.jade @@ -30,6 +30,13 @@ p Many of the associated tools and resources that we're developing alongside spa +cell | REST microservices for spaCy demos and visualisers. + +row + +cell + +src(gh("spacy-notebooks")) spaCy Notebooks + + +cell + | Jupyter notebooks for spaCy examples and tutorials. + +h(2, "libraries") Libraries and projects +table(["Name", "Description"]) +row From abb09782f95f6bea71212e565459cef5516c6b53 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Jan 2017 20:32:54 +0100 Subject: [PATCH 69/81] Move sun.txt to original location and fix path to not break parser tests --- spacy/tests/{tokenizer => }/sun.txt | 0 spacy/tests/tokenizer/test_tokenizer.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename spacy/tests/{tokenizer => }/sun.txt (100%) diff --git a/spacy/tests/tokenizer/sun.txt b/spacy/tests/sun.txt similarity index 100% rename from spacy/tests/tokenizer/sun.txt rename to spacy/tests/sun.txt diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index cd0043a10..9db007d7e 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -67,7 +67,7 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n @pytest.mark.parametrize('file_name', ["sun.txt"]) def test_tokenizer_handle_text_from_file(tokenizer, file_name): - loc = path.join(path.dirname(__file__), file_name) + loc = path.join(path.dirname(__file__), '..', file_name) text = utf8open(loc).read() assert len(text) != 0 tokens = tokenizer(text) From de5aa92bc2bedf415c468b49c4bb3c15cf00a970 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Jan 2017 20:33:28 +0100 Subject: [PATCH 70/81] Handle deprecated tokenizer prefix data --- spacy/util.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index afed4142e..457534302 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -94,8 +94,13 @@ def read_regex(path): def compile_prefix_regex(entries): - expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) - return re.compile(expression) + if '(' in entries: + # Handle deprecated data + expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) + return re.compile(expression) + else: + expression = '|'.join(['^' + piece for piece in entries if piece.strip()]) + return re.compile(expression) def compile_suffix_regex(entries): From 7c3cb2a6521d19cba965d8cb86fcf3c70dced720 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Jan 2017 20:34:03 +0100 Subject: [PATCH 71/81] Add global abbreviations data --- spacy/language_data/__init__.py | 1 + 
spacy/language_data/abbreviations.py | 43 ++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 spacy/language_data/abbreviations.py diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index f6aa4317c..43a4ef0be 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -1,3 +1,4 @@ +from .abbreviations import * from .emoticons import * from .punctuation import * from .tag_map import * diff --git a/spacy/language_data/abbreviations.py b/spacy/language_data/abbreviations.py new file mode 100644 index 000000000..b49daa0ad --- /dev/null +++ b/spacy/language_data/abbreviations.py @@ -0,0 +1,43 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +ABBREVIATIONS = [ + "'", + "\\\")", + "", + "''", + "C++", + "a.", + "b.", + "c.", + "d.", + "e.", + "f.", + "g.", + "h.", + "i.", + "j.", + "k.", + "l.", + "m.", + "n.", + "o.", + "p.", + "q.", + "r.", + "s.", + "t.", + "u.", + "v.", + "w.", + "x.", + "y.", + "z.", + "ä.", + "ö.", + "ü." +] + + +__all__ = [ "ABBREVIATIONS" ] From 0dec90e9f77cfe124d0fde8e994f61de4222064d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Jan 2017 20:36:00 +0100 Subject: [PATCH 72/81] Use global abbreviation data languages and remove duplicates --- spacy/de/language_data.py | 3 ++- spacy/de/tokenizer_exceptions.py | 33 +------------------------------- spacy/en/language_data.py | 4 +++- spacy/en/tokenizer_exceptions.py | 29 +--------------------------- spacy/es/language_data.py | 5 ++++- spacy/es/tokenizer_exceptions.py | 28 +-------------------------- spacy/fr/language_data.py | 7 +++++-- spacy/hu/language_data.py | 3 ++- spacy/hu/tokenizer_exceptions.py | 27 -------------------------- spacy/it/language_data.py | 5 ++++- spacy/nl/language_data.py | 5 ++++- spacy/pt/language_data.py | 5 ++++- spacy/sv/language_data.py | 5 ++++- 13 files changed, 35 insertions(+), 124 deletions(-) diff --git a/spacy/de/language_data.py b/spacy/de/language_data.py index f64c915f6..5e09c0eb3 100644 --- a/spacy/de/language_data.py +++ b/spacy/de/language_data.py @@ -9,12 +9,13 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY -TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) TAG_MAP = dict(TAG_MAP) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) diff --git a/spacy/de/tokenizer_exceptions.py b/spacy/de/tokenizer_exceptions.py index b0561a223..0d8dc54e8 100644 --- a/spacy/de/tokenizer_exceptions.py +++ b/spacy/de/tokenizer_exceptions.py @@ -516,11 +516,6 @@ TOKENIZER_EXCEPTIONS = { ORTH_ONLY = [ - "'", - "\\\")", - "", - "a.", - "ä.", "A.C.", "a.D.", "A.D.", @@ -530,24 +525,20 @@ ORTH_ONLY = [ "Abs.", "adv.", "al.", - "b.", "B.A.", "B.Sc.", "betr.", "biol.", "Biol.", - "c.", "ca.", "Chr.", "Cie.", "co.", "Co.", - "d.", "D.C.", "Dipl.-Ing.", "Dipl.", "Dr.", - "e.", "e.g.", "e.V.", "ehem.", @@ -555,79 +546,57 @@ ORTH_ONLY = [ "erm.", "etc.", "ev.", - "f.", - "g.", "G.m.b.H.", "geb.", "Gebr.", "gem.", - "h.", "h.c.", "Hg.", "hrsg.", "Hrsg.", - "i.", "i.A.", "i.e.", "i.G.", "i.Tr.", "i.V.", "Ing.", - "j.", "jr.", "Jr.", "jun.", "jur.", - "k.", "K.O.", - "l.", "L.A.", "lat.", - "m.", "M.A.", "m.E.", "m.M.", "M.Sc.", "Mr.", - "n.", "N.Y.", "N.Y.C.", "nat.", "ö." 
- "o.", "o.a.", "o.ä.", "o.g.", "o.k.", "O.K.", - "p.", "p.a.", "p.s.", "P.S.", "pers.", "phil.", - "q.", "q.e.d.", - "r.", "R.I.P.", "rer.", - "s.", "sen.", "St.", "std.", - "t.", - "u.", - "ü.", "u.a.", "U.S.", "U.S.A.", "U.S.S.", - "v.", "Vol.", "vs.", - "w.", - "wiss.", - "x.", - "y.", - "z." + "wiss." ] diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py index a75f2b9d5..1fcbf277e 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/language_data.py @@ -37,14 +37,16 @@ def get_time_exc(hours): return exc -TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) TAG_MAP = dict(TAG_MAP) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1))) update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’")) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + __all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"] diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 44ad605a4..38fc33cfb 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -718,39 +718,25 @@ for string in EXCLUDE_EXC: ORTH_ONLY = [ "'d", - "''", - "a.", "a.m.", "Adm.", - "b.", "Bros.", - "c.", "co.", "Co.", "Corp.", - "d.", "D.C.", "Dr.", - "e.", "e.g.", "E.g.", "E.G.", - "f.", - "g.", "Gen.", "Gov.", - "h.", - "i.", "i.e.", "I.e.", "I.E.", "Inc.", - "j.", "Jr.", - "k.", - "l.", "Ltd.", - "m.", "Md.", "Messrs.", "Mo.", @@ -758,24 +744,11 @@ ORTH_ONLY = [ "Mr.", "Mrs.", "Ms.", - "n.", - "o.", - "p.", "p.m.", "Ph.D.", - "q.", - "r.", "Rep.", "Rev.", - "s.", "Sen.", "St.", - "t.", - "u.", - "v.", - "vs.", - "w.", - "x.", - "y.", - "z." + "vs." ] diff --git a/spacy/es/language_data.py b/spacy/es/language_data.py index 3357c9ac8..7c44752cb 100644 --- a/spacy/es/language_data.py +++ b/spacy/es/language_data.py @@ -40,11 +40,14 @@ def get_time_exc(hours): return exc -TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) STOP_WORDS = set(STOP_WORDS) + +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1))) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py index f9259ce93..93bc74642 100644 --- a/spacy/es/tokenizer_exceptions.py +++ b/spacy/es/tokenizer_exceptions.py @@ -85,55 +85,29 @@ TOKENIZER_EXCEPTIONS = { ORTH_ONLY = [ - "a.", "a.C.", "a.J.C.", "apdo.", "Av.", "Avda.", - "b.", - "c.", "Cía.", - "d.", - "e.", "etc.", - "f.", - "g.", "Gob.", "Gral.", - "h.", - "i.", "Ing.", - "j.", "J.C.", - "k.", - "l.", "Lic.", - "m.", "m.n.", - "n.", "no.", "núm.", - "o.", - "p.", "P.D.", "Prof.", "Profa.", - "q.", "q.e.p.d." - "r.", - "s.", "S.A.", "S.L.", "s.s.s.", "Sr.", "Sra.", - "Srta.", - "t.", - "u.", - "v.", - "w.", - "x.", - "y.", - "z." + "Srta." ] diff --git a/spacy/fr/language_data.py b/spacy/fr/language_data.py index e612fe064..bbbeb1535 100644 --- a/spacy/fr/language_data.py +++ b/spacy/fr/language_data.py @@ -2,13 +2,16 @@ from __future__ import unicode_literals from .. 
import language_data as base -from ..language_data import strings_to_exc +from ..language_data import strings_to_exc, update_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py index 94eeb6f4d..0cb4ffd38 100644 --- a/spacy/hu/language_data.py +++ b/spacy/hu/language_data.py @@ -11,13 +11,14 @@ from .tokenizer_exceptions import OTHER_EXC from .. import language_data as base STOP_WORDS = set(STOP_WORDS) -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES TOKENIZER_INFIXES = TOKENIZER_INFIXES # HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]] +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS)) diff --git a/spacy/hu/tokenizer_exceptions.py b/spacy/hu/tokenizer_exceptions.py index 627035bb8..46122564c 100644 --- a/spacy/hu/tokenizer_exceptions.py +++ b/spacy/hu/tokenizer_exceptions.py @@ -111,7 +111,6 @@ Vcs. Vhr. X.Y. Zs. -a. a.C. ac. adj. @@ -126,7 +125,6 @@ ang. arch. at. aug. -b. b.a. b.s. b.sc. @@ -141,7 +139,6 @@ br. bsc. bt. btk. -c. ca. cc. cca. @@ -155,7 +152,6 @@ csc. csüt. cső. ctv. -d. dbj. dd. ddr. @@ -170,7 +166,6 @@ dolg. dr. du. dzs. -e. ea. ed. eff. @@ -186,7 +181,6 @@ etc. ev. ezr. eü. -f. f.h. f.é. fam. @@ -213,7 +207,6 @@ főig. főisk. főtörm. főv. -g. gazd. gimn. gk. @@ -225,7 +218,6 @@ gy. gyak. gyártm. gör. -h. hads. hallg. hdm. @@ -266,7 +258,6 @@ isk. ism. izr. iá. -j. jan. jav. jegyz. @@ -278,7 +269,6 @@ jr. jvb. júl. jún. -k. karb. kat. kb. @@ -313,7 +303,6 @@ közl. közp. közt. kü. -l. lat. ld. legs. @@ -324,7 +313,6 @@ lt. ltd. ltp. luth. -m. m.a. m.s. m.sc. @@ -359,7 +347,6 @@ műh. műsz. műv. művez. -n. nagyker. nagys. nat. @@ -372,7 +359,6 @@ ny. nyilv. nyrt. nyug. -o. obj. okl. okt. @@ -381,7 +367,6 @@ orsz. ort. ov. ovh. -p. pf. pg. ph.d @@ -404,8 +389,6 @@ pság. ptk. pu. pü. -q. -r. r.k. rac. rad. @@ -420,7 +403,6 @@ rkt. rt. rtg. röv. -s. s.b. s.k. sa. @@ -450,7 +432,6 @@ szt. szubj. szöv. szül. -t. tanm. tb. tbk. @@ -476,13 +457,11 @@ tvr. ty. törv. tü. -u. ua. ui. unit. uo. uv. -v. vas. vb. vegy. @@ -501,9 +480,6 @@ vv. vál. vízv. vö. -w. -y. -z. zrt. zs. Ész. @@ -520,7 +496,6 @@ zs. évf. í. ó. -ö. össz. ötk. özv. @@ -528,7 +503,6 @@ zs. úm. ún. út. -ü. üag. üd. üdv. @@ -544,6 +518,5 @@ zs. 
""".strip().split() OTHER_EXC = """ -'' -e """.strip().split() diff --git a/spacy/it/language_data.py b/spacy/it/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/it/language_data.py +++ b/spacy/it/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/nl/language_data.py b/spacy/nl/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/nl/language_data.py +++ b/spacy/nl/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/pt/language_data.py +++ b/spacy/pt/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/sv/language_data.py b/spacy/sv/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/sv/language_data.py +++ b/spacy/sv/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] From 347c4a2d06cd08c0edd65c87cd558dae4f5a63c7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Jan 2017 20:37:39 +0100 Subject: [PATCH 73/81] Reorganise and reformat global tokenizer prefixes, suffixes and infixes --- spacy/language_data/punctuation.py | 227 +++++++++++++---------------- 1 file changed, 104 insertions(+), 123 deletions(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index fb784271e..575027857 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -1,133 +1,114 @@ # encoding: utf8 from __future__ import unicode_literals - -TOKENIZER_PREFIXES = r''' -, -" -( -[ -{ -* -< -> -$ -£ -¡ -¿ -„ -“ -' -`` -` -# -‘ -.... -... -… -‚ -» -§ -US$ -C$ -A$ -a- -'''.strip().split('\n') +import re -TOKENIZER_SUFFIXES = r''' -, -\" -\) -\] -\} -\* -\! -\? -% -\$ -> -: -; -' -” -“ -« -_ -'' -'s -'S -’s -’S -’ -‘ -° -€ -… -\.\. -\.\.\. -\.\.\.\. -(?<=[a-z0-9)\]”"'%\)])\. -(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\. 
-\-\- -´ -(?<=[0-9])km² -(?<=[0-9])m² -(?<=[0-9])cm² -(?<=[0-9])mm² -(?<=[0-9])km³ -(?<=[0-9])m³ -(?<=[0-9])cm³ -(?<=[0-9])mm³ -(?<=[0-9])ha -(?<=[0-9])km -(?<=[0-9])m -(?<=[0-9])cm -(?<=[0-9])mm -(?<=[0-9])µm -(?<=[0-9])nm -(?<=[0-9])yd -(?<=[0-9])in -(?<=[0-9])ft -(?<=[0-9])kg -(?<=[0-9])g -(?<=[0-9])mg -(?<=[0-9])µg -(?<=[0-9])t -(?<=[0-9])lb -(?<=[0-9])oz -(?<=[0-9])m/s -(?<=[0-9])km/h -(?<=[0-9])mph -(?<=[0-9])°C -(?<=[0-9])°K -(?<=[0-9])°F -(?<=[0-9])hPa -(?<=[0-9])Pa -(?<=[0-9])mbar -(?<=[0-9])mb -(?<=[0-9])T -(?<=[0-9])G -(?<=[0-9])M -(?<=[0-9])K -(?<=[0-9])kb -'''.strip().split('\n') +_ALPHA_LOWER = """ +a ä à á â ǎ æ ã å ā ă ą b c ç ć č ĉ ċ c̄ d ð ď e é è ê ë ė ȅ ȩ ẽ ę f g ĝ ğ h i ı +î ï í ī ì ȉ ǐ į ĩ j k ķ l ł ļ m n ñ ń ň ņ o ö ó ò ő ô õ œ ø ō ő ǒ ơ p q r ř ŗ s +ß ś š ş ŝ t ť u ú û ù ú ū ű ǔ ů ų ư v w ŵ x y ÿ ý ỳ ŷ ỹ z ź ž ż þ +""" -TOKENIZER_INFIXES = r''' -… -\.\.\.+ -(?<=[a-z])\.(?=[A-Z]) -(?<=[a-z])\.(?=[A-Z]) -(?<=[a-zA-Z])-(?=[a-zA-z]) -(?<=[a-zA-Z])--(?=[a-zA-z]) -(?<=[0-9])-(?=[0-9]) -(?<=[A-Za-z]),(?=[A-Za-z]) -(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ]) -'''.strip().split('\n') +_ALPHA_UPPER = """ +A Ä À Á  Ǎ Æ Ã Å Ā Ă Ą B C Ç Ć Č Ĉ Ċ C̄ D Ð Ď E É È Ê Ë Ė Ȅ Ȩ Ẽ Ę F G Ĝ Ğ H I İ +Î Ï Í Ī Ì Ȉ Ǐ Į Ĩ J K Ķ L Ł Ļ M N Ñ Ń Ň Ņ O Ö Ó Ò Ő Ô Õ Œ Ø Ō Ő Ǒ Ơ P Q R Ř Ŗ S +Ś Š Ş Ŝ T Ť U Ú Û Ù Ú Ū Ű Ǔ Ů Ų Ư V W Ŵ X Y Ÿ Ý Ỳ Ŷ Ỹ Z Ź Ž Ż Þ +""" + + +_UNITS = """ +km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg +µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb +TB T G M K +""" + + +_CURRENCY = r""" +\$ £ € ¥ ฿ US\$ C\$ A\$ +""" + + +_QUOTES = r""" +' '' " ” “ `` ` ‘ ´ ‚ , „ » « +""" + + +_PUNCT = r""" +… , : ; \! \? 
¿ ¡ \( \) \[ \] \{ \} < > _ # \* & +""" + + +_HYPHENS = r""" +- – — -- --- +""" + + +LIST_ELLIPSES = [ + r'\.\.+', + "…" +] + + +LIST_CURRENCY = list(_CURRENCY.strip().split()) +LIST_QUOTES = list(_QUOTES.strip().split()) +LIST_PUNCT = list(_PUNCT.strip().split()) +LIST_HYPHENS = list(_HYPHENS.strip().split()) + + +ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '') +ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '') +ALPHA = ALPHA_LOWER + ALPHA_UPPER + + +QUOTES = _QUOTES.strip().replace(' ', '|') +CURRENCY = _CURRENCY.strip().replace(' ', '|') +UNITS = _UNITS.strip().replace(' ', '|') +HYPHENS = _HYPHENS.strip().replace(' ', '|') + + + +# Prefixes + +TOKENIZER_PREFIXES = ( + ['§', '%', r'\+'] + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_CURRENCY +) + + +# Suffixes + +TOKENIZER_SUFFIXES = ( + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + [ + r'(?<=[0-9])\+', + r'(?<=°[FfCcKk])\.', + r'(?<=[0-9])(?:{c})'.format(c=CURRENCY), + r'(?<=[0-9])(?:{u})'.format(u=UNITS), + r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES), + "'s", "'S", "’s", "’S" + ] +) + + +# Infixes + +TOKENIZER_INFIXES = ( + LIST_ELLIPSES + + [ + r'(?<=[0-9])[+\-\*/^](?=[0-9])', + r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS), + r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA) + ] +) __all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] From 53362b6b93ed1aa1e428350cc9afeac9f0742d89 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Jan 2017 20:40:33 +0100 Subject: [PATCH 74/81] Reorganise Hungarian prefixes/suffixes/infixes Use global prefixes and suffixes for non-language-specific rules, import list of alpha unicode characters and adjust regexes. --- spacy/hu/language_data.py | 15 ++++--- spacy/hu/punctuation.py | 25 +++++++++++ spacy/hu/punctuations.py | 89 --------------------------------------- 3 files changed, 34 insertions(+), 95 deletions(-) create mode 100644 spacy/hu/punctuation.py delete mode 100644 spacy/hu/punctuations.py diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py index 0cb4ffd38..49652c5ac 100644 --- a/spacy/hu/language_data.py +++ b/spacy/hu/language_data.py @@ -4,22 +4,25 @@ from __future__ import unicode_literals import six from spacy.language_data import strings_to_exc, update_exc -from .punctuations import * +from .punctuation import * from .stop_words import STOP_WORDS from .tokenizer_exceptions import ABBREVIATIONS from .tokenizer_exceptions import OTHER_EXC from .. 
import language_data as base -STOP_WORDS = set(STOP_WORDS) -TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES -TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES -TOKENIZER_INFIXES = TOKENIZER_INFIXES -# HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]] +STOP_WORDS = set(STOP_WORDS) + TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS)) + +TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES +TOKENIZER_SUFFIXES = base.TOKENIZER_SUFFIXES + TOKENIZER_SUFFIXES +TOKENIZER_INFIXES = TOKENIZER_INFIXES + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/punctuation.py b/spacy/hu/punctuation.py new file mode 100644 index 000000000..e28052fd3 --- /dev/null +++ b/spacy/hu/punctuation.py @@ -0,0 +1,25 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..language_data.punctuation import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES + + +TOKENIZER_SUFFIXES = [ + r'(?<=[{al})])-e'.format(al=ALPHA_LOWER) +] + +TOKENIZER_INFIXES = [ + r'(?<=[0-9])-(?=[0-9])', + r'(?<=[0-9])[+\-\*/^](?=[0-9])', + r'(?<=[{a}])--(?=[{a}])', + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), + r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA), + r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA) +] + + +TOKENIZER_INFIXES += LIST_ELLIPSES + + +__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/punctuations.py b/spacy/hu/punctuations.py deleted file mode 100644 index 3681a2fbe..000000000 --- a/spacy/hu/punctuations.py +++ /dev/null @@ -1,89 +0,0 @@ -# encoding: utf8 -from __future__ import unicode_literals - -TOKENIZER_PREFIXES = r''' -+ -'''.strip().split('\n') - -TOKENIZER_SUFFIXES = r''' -, -\" -\) -\] -\} -\* -\! -\? -\$ -> -: -; -' -” -“ -« -_ -'' -’ -‘ -€ -\.\. -\.\.\. -\.\.\.\. -(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”+-])\. -(?<=[a-züóőúéáűí)])-e -\-\- -´ -(?<=[0-9])\+ -(?<=[a-z0-9üóőúéáűí][\)\]”"'%\)§/])\. -(?<=[0-9])km² -(?<=[0-9])m² -(?<=[0-9])cm² -(?<=[0-9])mm² -(?<=[0-9])km³ -(?<=[0-9])m³ -(?<=[0-9])cm³ -(?<=[0-9])mm³ -(?<=[0-9])ha -(?<=[0-9])km -(?<=[0-9])m -(?<=[0-9])cm -(?<=[0-9])mm -(?<=[0-9])µm -(?<=[0-9])nm -(?<=[0-9])yd -(?<=[0-9])in -(?<=[0-9])ft -(?<=[0-9])kg -(?<=[0-9])g -(?<=[0-9])mg -(?<=[0-9])µg -(?<=[0-9])t -(?<=[0-9])lb -(?<=[0-9])oz -(?<=[0-9])m/s -(?<=[0-9])km/h -(?<=[0-9])mph -(?<=°[FCK])\. -(?<=[0-9])hPa -(?<=[0-9])Pa -(?<=[0-9])mbar -(?<=[0-9])mb -(?<=[0-9])T -(?<=[0-9])G -(?<=[0-9])M -(?<=[0-9])K -(?<=[0-9])kb -'''.strip().split('\n') - -TOKENIZER_INFIXES = r''' -… -\.\.+ -(?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ]) -(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) -(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) -(?<=[0-9])[+\-\*/^](?=[0-9]) -(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) -'''.strip().split('\n') - -__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] From 5d28664fc560bd43e6ccd46be09e49269a9a4dbd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Jan 2017 20:45:40 +0100 Subject: [PATCH 75/81] Don't test Hungarian for numbers and hyphens for now Reinvestigate behaviour of case affixes given reorganised tokenizer patterns. 
--- spacy/tests/hu/tokenizer/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/hu/tokenizer/test_tokenizer.py b/spacy/tests/hu/tokenizer/test_tokenizer.py index aea9873ee..0b76da0c6 100644 --- a/spacy/tests/hu/tokenizer/test_tokenizer.py +++ b/spacy/tests/hu/tokenizer/test_tokenizer.py @@ -224,7 +224,7 @@ DOT_TESTS = [ ] -TESTCASES = DEFAULT_TESTS + HYPHEN_TESTS + NUMBER_TESTS + DOT_TESTS + QUOTE_TESTS +TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS # + NUMBER_TESTS + HYPHEN_TESTS @pytest.mark.parametrize('text,expected_tokens', TESTCASES) From a89a6000e5b4170d3f7c871088885159c98912ae Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Jan 2017 22:17:37 +0100 Subject: [PATCH 76/81] Remove unused import --- spacy/it/__init__.py | 2 -- spacy/nl/__init__.py | 2 -- spacy/pt/__init__.py | 2 -- spacy/sv/__init__.py | 2 -- 4 files changed, 8 deletions(-) diff --git a/spacy/it/__init__.py b/spacy/it/__init__.py index 2ef60fd94..bc0d13cab 100644 --- a/spacy/it/__init__.py +++ b/spacy/it/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG diff --git a/spacy/nl/__init__.py b/spacy/nl/__init__.py index d958783ea..d4aa39506 100644 --- a/spacy/nl/__init__.py +++ b/spacy/nl/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG from .language_data import * diff --git a/spacy/pt/__init__.py b/spacy/pt/__init__.py index 06c6417dc..ed26fb0b3 100644 --- a/spacy/pt/__init__.py +++ b/spacy/pt/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG diff --git a/spacy/sv/__init__.py b/spacy/sv/__init__.py index 25930386a..e03c9a56f 100644 --- a/spacy/sv/__init__.py +++ b/spacy/sv/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG from .language_data import * From eef94e3ee2b73b5ccc9fe8bff466eeb0117658e0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Jan 2017 22:28:25 +0100 Subject: [PATCH 77/81] Split off period after two or more uppercase letters (fixes #483) --- spacy/language_data/punctuation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index 575027857..d8ed19ca1 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -92,6 +92,7 @@ TOKENIZER_SUFFIXES = ( r'(?<=[0-9])(?:{c})'.format(c=CURRENCY), r'(?<=[0-9])(?:{u})'.format(u=UNITS), r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES), + r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER), "'s", "'S", "’s", "’S" ] ) From d5c72c40eb5d4fc429dd290b5f793e2c1cab6da6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Jan 2017 22:28:53 +0100 Subject: [PATCH 78/81] Remove old tests for old website example code --- spacy/tests/website/__init__.py | 0 spacy/tests/website/conftest.py | 20 ---- spacy/tests/website/test_api.py | 172 ----------------------------- spacy/tests/website/test_home.py | 180 ------------------------------- 4 files changed, 372 deletions(-) delete mode 100644 spacy/tests/website/__init__.py delete mode 100644 spacy/tests/website/conftest.py 
delete mode 100644 spacy/tests/website/test_api.py delete mode 100644 spacy/tests/website/test_home.py diff --git a/spacy/tests/website/__init__.py b/spacy/tests/website/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/tests/website/conftest.py b/spacy/tests/website/conftest.py deleted file mode 100644 index 4f533ae76..000000000 --- a/spacy/tests/website/conftest.py +++ /dev/null @@ -1,20 +0,0 @@ -from __future__ import unicode_literals -import pytest -import os - - -@pytest.fixture(scope='session') -def nlp(): - from spacy.en import English - if os.environ.get('SPACY_DATA'): - data_dir = os.environ.get('SPACY_DATA') - else: - data_dir = True - return English(path=data_dir) - - -@pytest.fixture() -def doc(nlp): - for word in ['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']: - _ = nlp.vocab[word] - return nlp('Hello, world. Here are two sentences.') diff --git a/spacy/tests/website/test_api.py b/spacy/tests/website/test_api.py deleted file mode 100644 index 6a7379d87..000000000 --- a/spacy/tests/website/test_api.py +++ /dev/null @@ -1,172 +0,0 @@ -from __future__ import unicode_literals -import pytest -from spacy.attrs import HEAD -import numpy - - -@pytest.mark.xfail -def test_example_war_and_peace(nlp): - # from spacy.en import English - from spacy._doc_examples import download_war_and_peace - - unprocessed_unicode = download_war_and_peace() - - # nlp = English() - # TODO: ImportError: No module named _doc_examples - doc = nlp(unprocessed_unicode) - - -def test_main_entry_point(nlp): - # from spacy.en import English - # nlp = English() - doc = nlp('Some text.') # Applies tagger, parser, entity - doc = nlp('Some text.', parse=False) # Applies tagger and entity, not parser - doc = nlp('Some text.', entity=False) # Applies tagger and parser, not entity - doc = nlp('Some text.', tag=False) # Does not apply tagger, entity or parser - doc = nlp('') # Zero-length tokens, not an error - # doc = nlp(b'Some text') <-- Error: need unicode - doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. - - -@pytest.mark.models -def test_sentence_spans(nlp): - # from spacy.en import English - # nlp = English() - doc = nlp("This is a sentence. Here's another...") - assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] - - -@pytest.mark.models -def test_entity_spans(nlp): - # from spacy.en import English - # nlp = English() - tokens = nlp('Mr. 
Best flew to New York on Saturday morning.') - ents = list(tokens.ents) - assert ents[0].label == 346 - assert ents[0].label_ == 'PERSON' - assert ents[0].orth_ == 'Best' - assert ents[0].string == ents[0].string - - -@pytest.mark.models -def test_noun_chunk_spans(nlp): - # from spacy.en import English - # nlp = English() - doc = nlp('The sentence in this example has three noun chunks.') - for chunk in doc.noun_chunks: - print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_) - - # NP The sentence <-- has - # NP this example <-- in - # NP three noun chunks <-- has - - -@pytest.mark.models -def test_count_by(nlp): - # from spacy.en import English, attrs - # nlp = English() - import numpy - from spacy import attrs - tokens = nlp('apple apple orange banana') - assert tokens.count_by(attrs.ORTH) == {3699: 2, 3750: 1, 5965: 1} - assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[3699], - [3699], - [3750], - [5965]], dtype=numpy.int32)) - -@pytest.mark.models -def test_read_bytes(nlp): - from spacy.tokens.doc import Doc - loc = 'test_serialize.bin' - with open(loc, 'wb') as file_: - file_.write(nlp(u'This is a document.').to_bytes()) - file_.write(nlp(u'This is another.').to_bytes()) - docs = [] - with open(loc, 'rb') as file_: - for byte_string in Doc.read_bytes(file_): - docs.append(Doc(nlp.vocab).from_bytes(byte_string)) - assert len(docs) == 2 - - -def test_token_span(doc): - span = doc[4:6] - token = span[0] - assert token.i == 4 - - -@pytest.mark.models -def test_example_i_like_new_york1(nlp): - toks = nlp('I like New York in Autumn.') - - -@pytest.fixture -def toks(nlp): - doc = nlp('I like New York in Autumn.') - doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T) - return doc - - -def test_example_i_like_new_york2(toks): - i, like, new, york, in_, autumn, dot = range(len(toks)) - - -@pytest.fixture -def tok(toks, tok): - i, like, new, york, in_, autumn, dot = range(len(toks)) - return locals()[tok] - - -@pytest.fixture -def new(toks): - return tok(toks, "new") - - -@pytest.fixture -def york(toks): - return tok(toks, "york") - - -@pytest.fixture -def autumn(toks): - return tok(toks, "autumn") - - -@pytest.fixture -def dot(toks): - return tok(toks, "dot") - - -def test_example_i_like_new_york3(toks, new, york): - assert toks[new].head.orth_ == 'York' - assert toks[york].head.orth_ == 'like' - - -def test_example_i_like_new_york4(toks, new, york): - new_york = toks[new:york+1] - assert new_york.root.orth_ == 'York' - - -def test_example_i_like_new_york5(toks, autumn, dot): - assert toks[autumn].head.orth_ == 'in' - assert toks[dot].head.orth_ == 'like' - autumn_dot = toks[autumn:] - assert autumn_dot.root.orth_ == 'Autumn' - - -def test_navigating_the_parse_tree_lefts(doc): - # TODO: where does the span object come from? 
- span = doc[:2] - lefts = [span.doc[i] for i in range(0, span.start) - if span.doc[i].head in span] - - -def test_navigating_the_parse_tree_rights(doc): - span = doc[:2] - rights = [span.doc[i] for i in range(span.end, len(span.doc)) - if span.doc[i].head in span] - - -def test_string_store(doc): - string_store = doc.vocab.strings - for i, string in enumerate(string_store): - assert i == string_store[string] diff --git a/spacy/tests/website/test_home.py b/spacy/tests/website/test_home.py deleted file mode 100644 index 95c0ec3bb..000000000 --- a/spacy/tests/website/test_home.py +++ /dev/null @@ -1,180 +0,0 @@ -from __future__ import unicode_literals -import pytest -import spacy -import os - - -try: - xrange -except NameError: - xrange = range - - -@pytest.fixture() -def token(doc): - return doc[0] - - -@pytest.mark.models -def test_load_resources_and_process_text(): - from spacy.en import English - nlp = English() - doc = nlp(u'Hello, world. Here are two sentences.') - - -@pytest.mark.models -def test_get_tokens_and_sentences(doc): - token = doc[0] - sentence = next(doc.sents) - assert token is sentence[0] - assert sentence.text == 'Hello, world.' - - -@pytest.mark.models -def test_use_integer_ids_for_any_strings(nlp, token): - hello_id = nlp.vocab.strings['Hello'] - hello_str = nlp.vocab.strings[hello_id] - - assert token.orth == hello_id == 3125 - assert token.orth_ == hello_str == 'Hello' - - -def test_get_and_set_string_views_and_flags(nlp, token): - assert token.shape_ == 'Xxxxx' - for lexeme in nlp.vocab: - if lexeme.is_alpha: - lexeme.shape_ = 'W' - elif lexeme.is_digit: - lexeme.shape_ = 'D' - elif lexeme.is_punct: - lexeme.shape_ = 'P' - else: - lexeme.shape_ = 'M' - assert token.shape_ == 'W' - - -def test_export_to_numpy_arrays(nlp, doc): - from spacy.attrs import ORTH, LIKE_URL, IS_OOV - - attr_ids = [ORTH, LIKE_URL, IS_OOV] - doc_array = doc.to_array(attr_ids) - assert doc_array.shape == (len(doc), len(attr_ids)) - assert doc[0].orth == doc_array[0, 0] - assert doc[1].orth == doc_array[1, 0] - assert doc[0].like_url == doc_array[0, 1] - assert list(doc_array[:, 1]) == [t.like_url for t in doc] - - -@pytest.mark.models -def test_word_vectors(nlp): - doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") - - apples = doc[0] - oranges = doc[2] - boots = doc[6] - hippos = doc[8] - - assert apples.similarity(oranges) > boots.similarity(hippos) - - -@pytest.mark.models -def test_part_of_speech_tags(nlp): - from spacy.parts_of_speech import ADV - - def is_adverb(token): - return token.pos == spacy.parts_of_speech.ADV - - # These are data-specific, so no constants are provided. You have to look - # up the IDs from the StringStore. 
- NNS = nlp.vocab.strings['NNS'] - NNPS = nlp.vocab.strings['NNPS'] - def is_plural_noun(token): - return token.tag == NNS or token.tag == NNPS - - def print_coarse_pos(token): - print(token.pos_) - - def print_fine_pos(token): - print(token.tag_) - - -@pytest.mark.models -def test_syntactic_dependencies(): - def dependency_labels_to_root(token): - '''Walk up the syntactic tree, collecting the arc labels.''' - dep_labels = [] - while token.head is not token: - dep_labels.append(token.dep) - token = token.head - return dep_labels - - -@pytest.mark.models -def test_named_entities(): - def iter_products(docs): - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PRODUCT': - yield ent - - def word_is_in_entity(word): - return word.ent_type != 0 - - def count_parent_verb_by_person(docs): - counts = defaultdict(defaultdict(int)) - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: - counts[ent.orth_][ent.root.head.lemma_] += 1 - return counts - - -def test_calculate_inline_mark_up_on_original_string(): - def put_spans_around_tokens(doc, get_classes): - '''Given some function to compute class names, put each token in a - span element, with the appropriate classes computed. - - All whitespace is preserved, outside of the spans. (Yes, I know HTML - won't display it. But the point is no information is lost, so you can - calculate what you need, e.g.
<br /> tags, <p>
tags, etc.) - ''' - output = [] - template = '{word}{space}' - for token in doc: - if token.is_space: - output.append(token.orth_) - else: - output.append( - template.format( - classes=' '.join(get_classes(token)), - word=token.orth_, - space=token.whitespace_)) - string = ''.join(output) - string = string.replace('\n', '') - string = string.replace('\t', ' ') - return string - - -@pytest.mark.models -def test_efficient_binary_serialization(doc): - from spacy.tokens.doc import Doc - - byte_string = doc.to_bytes() - open('moby_dick.bin', 'wb').write(byte_string) - - nlp = spacy.en.English() - for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')): - doc = Doc(nlp.vocab) - doc.from_bytes(byte_string) - - -@pytest.mark.models -def test_multithreading(nlp): - texts = [u'One document.', u'...', u'Lots of documents'] - # .pipe streams input, and produces streaming output - iter_texts = (texts[i % 3] for i in xrange(100000000)) - for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)): - assert doc.is_parsed - if i == 100: - break - From d87ca840283ea9f1d847a7b5f498a7db993cca25 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Jan 2017 22:42:54 +0100 Subject: [PATCH 79/81] Remove old website example tests from setup.py --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 2a1d56a5e..4ba997a0c 100644 --- a/setup.py +++ b/setup.py @@ -47,8 +47,7 @@ PACKAGES = [ 'spacy.tests.tokenizer', 'spacy.tests.tokens', 'spacy.tests.vectors', - 'spacy.tests.vocab', - 'spacy.tests.website'] + 'spacy.tests.vocab'] MOD_NAMES = [ From 57fab43a3a6a0e00fb50b8aa3209d0ad430c4b95 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 9 Jan 2017 00:42:25 +0100 Subject: [PATCH 80/81] Add info on language-specific issue labels --- CONTRIBUTING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 06e0d5f72..2f699ecd2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,6 +33,7 @@ We use the following system to tag our issues: | [`install`](https://github.com/explosion/spaCy/labels/install) | Installation problems | | [`performance`](https://github.com/explosion/spaCy/labels/performance) | Accuracy, speed and memory use problems | | [`tests`](https://github.com/explosion/spaCy/labels/tests) | Missing or incorrect [tests](spacy/tests) | +| [`english`](https://github.com/explosion/spaCy/labels/english), [`german`](https://github.com/explosion/spaCy/labels/german) | Issues related to the specific languages, models and data | | [`linux`](https://github.com/explosion/spaCy/labels/linux), [`osx`](https://github.com/explosion/spaCy/labels/osx), [`windows`](https://github.com/explosion/spaCy/labels/windows) | Issues related to the specific operating systems | | [`pip`](https://github.com/explosion/spaCy/labels/pip), [`conda`](https://github.com/explosion/spaCy/labels/conda) | Issues related to the specific package managers | | [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before | From c1ef07788cc780ef64af924f31b996fceb1ae9c0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Jan 2017 10:55:44 +1100 Subject: [PATCH 81/81] Update train_ud.py Create deps folder if it doesn't exist. 
--- bin/parser/train_ud.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py index 62256cc14..565eab37f 100644 --- a/bin/parser/train_ud.py +++ b/bin/parser/train_ud.py @@ -71,6 +71,8 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc): features = get_templates('basic') model_dir = pathlib.Path(model_dir) + if not (model_dir / 'deps').exists(): + (model_dir / 'deps').mkdir() with (model_dir / 'deps' / 'config.json').open('w') as file_: json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)
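As a rough check on the reorganised suffix rules above — in particular the unit suffixes split off numbers and the new rule that splits a period after two or more uppercase letters (fixes #483) — here is a minimal, self-contained sketch that compiles two of the patterns with plain re and peels suffixes off the end of a token string. The reduced ALPHA_UPPER stand-in, the split_suffixes helper and the sample strings are illustrative assumptions, not spaCy's actual compilation or tokenization code.

# Minimal sketch, assuming a reduced ALPHA_UPPER; not spaCy's actual code.
from __future__ import unicode_literals
import re

ALPHA_UPPER = 'A-Z'  # stand-in for the full uppercase character set in punctuation.py

SUFFIX_RULES = [
    r'(?<=[0-9])km',                                # unit split off a number
    r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),  # period after two or more uppercase letters
]
suffix_re = re.compile('(?:' + '|'.join(SUFFIX_RULES) + ')$')

def split_suffixes(string):
    # Repeatedly strip a matching suffix from the end of the string.
    suffixes = []
    while True:
        match = suffix_re.search(string)
        if not match:
            break
        string, suffix = string[:match.start()], string[match.start():]
        suffixes.insert(0, suffix)
    return [string] + suffixes

print(split_suffixes(u'100km'))  # ['100', 'km']
print(split_suffixes(u'EU.'))    # ['EU', '.']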