diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 06e0d5f72..2f699ecd2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,6 +33,7 @@ We use the following system to tag our issues: | [`install`](https://github.com/explosion/spaCy/labels/install) | Installation problems | | [`performance`](https://github.com/explosion/spaCy/labels/performance) | Accuracy, speed and memory use problems | | [`tests`](https://github.com/explosion/spaCy/labels/tests) | Missing or incorrect [tests](spacy/tests) | +| [`english`](https://github.com/explosion/spaCy/labels/english), [`german`](https://github.com/explosion/spaCy/labels/german) | Issues related to the specific languages, models and data | | [`linux`](https://github.com/explosion/spaCy/labels/linux), [`osx`](https://github.com/explosion/spaCy/labels/osx), [`windows`](https://github.com/explosion/spaCy/labels/windows) | Issues related to the specific operating systems | | [`pip`](https://github.com/explosion/spaCy/labels/pip), [`conda`](https://github.com/explosion/spaCy/labels/conda) | Issues related to the specific package managers | | [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before | diff --git a/README.rst b/README.rst index c48c3479b..6e03aa0c4 100644 --- a/README.rst +++ b/README.rst @@ -3,8 +3,10 @@ spaCy: Industrial-strength NLP spaCy is a library for advanced natural language processing in Python and Cython. spaCy is built on the very latest research, but it isn't researchware. -It was designed from day 1 to be used in real products. It's commercial -open-source software, released under the MIT license. +It was designed from day one to be used in real products. spaCy currently supports +English and German, as well as tokenization for Chinese, Spanish, Italian, French, +Portuguese, Dutch, Swedish and Hungarian. It's commercial open-source software, +released under the MIT license. 💫 **Version 1.5 out now!** `Read the release notes here. 
`_ @@ -24,7 +26,7 @@ open-source software, released under the MIT license. :target: https://pypi.python.org/pypi/spacy :alt: pypi Version -.. image:: https://badges.gitter.im/spaCy-users.png +.. image:: https://badges.gitter.im/explosion.png :target: https://gitter.im/explosion/spaCy :alt: spaCy on Gitter diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py index 62256cc14..565eab37f 100644 --- a/bin/parser/train_ud.py +++ b/bin/parser/train_ud.py @@ -71,6 +71,8 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc): features = get_templates('basic') model_dir = pathlib.Path(model_dir) + if not (model_dir / 'deps').exists(): + (model_dir / 'deps').mkdir() with (model_dir / 'deps' / 'config.json').open('w') as file_: json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_) diff --git a/setup.py b/setup.py index 2a1d56a5e..4ba997a0c 100644 --- a/setup.py +++ b/setup.py @@ -47,8 +47,7 @@ PACKAGES = [ 'spacy.tests.tokenizer', 'spacy.tests.tokens', 'spacy.tests.vectors', - 'spacy.tests.vocab', - 'spacy.tests.website'] + 'spacy.tests.vocab'] MOD_NAMES = [ diff --git a/spacy/de/language_data.py b/spacy/de/language_data.py index f64c915f6..5e09c0eb3 100644 --- a/spacy/de/language_data.py +++ b/spacy/de/language_data.py @@ -9,12 +9,13 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY -TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) TAG_MAP = dict(TAG_MAP) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) diff --git a/spacy/de/tokenizer_exceptions.py b/spacy/de/tokenizer_exceptions.py index b0561a223..0d8dc54e8 100644 --- a/spacy/de/tokenizer_exceptions.py +++ b/spacy/de/tokenizer_exceptions.py @@ -516,11 +516,6 @@ TOKENIZER_EXCEPTIONS = { ORTH_ONLY = [ - "'", - 
"\\\")", - "", - "a.", - "ä.", "A.C.", "a.D.", "A.D.", @@ -530,24 +525,20 @@ ORTH_ONLY = [ "Abs.", "adv.", "al.", - "b.", "B.A.", "B.Sc.", "betr.", "biol.", "Biol.", - "c.", "ca.", "Chr.", "Cie.", "co.", "Co.", - "d.", "D.C.", "Dipl.-Ing.", "Dipl.", "Dr.", - "e.", "e.g.", "e.V.", "ehem.", @@ -555,79 +546,57 @@ ORTH_ONLY = [ "erm.", "etc.", "ev.", - "f.", - "g.", "G.m.b.H.", "geb.", "Gebr.", "gem.", - "h.", "h.c.", "Hg.", "hrsg.", "Hrsg.", - "i.", "i.A.", "i.e.", "i.G.", "i.Tr.", "i.V.", "Ing.", - "j.", "jr.", "Jr.", "jun.", "jur.", - "k.", "K.O.", - "l.", "L.A.", "lat.", - "m.", "M.A.", "m.E.", "m.M.", "M.Sc.", "Mr.", - "n.", "N.Y.", "N.Y.C.", "nat.", "ö." - "o.", "o.a.", "o.ä.", "o.g.", "o.k.", "O.K.", - "p.", "p.a.", "p.s.", "P.S.", "pers.", "phil.", - "q.", "q.e.d.", - "r.", "R.I.P.", "rer.", - "s.", "sen.", "St.", "std.", - "t.", - "u.", - "ü.", "u.a.", "U.S.", "U.S.A.", "U.S.S.", - "v.", "Vol.", "vs.", - "w.", - "wiss.", - "x.", - "y.", - "z." + "wiss." ] diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py index a75f2b9d5..1fcbf277e 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/language_data.py @@ -37,14 +37,16 @@ def get_time_exc(hours): return exc -TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) TAG_MAP = dict(TAG_MAP) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1))) update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’")) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + __all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"] diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 398ae486b..38fc33cfb 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -5,935 +5,355 @@ from 
..symbols import * from ..language_data import PRON_LEMMA -TOKENIZER_EXCEPTIONS = { - "and/or": [ - {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} - ], +EXC = {} - "Theydve": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], +EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Well", "well", "Whore", "whore"] - "shouldn't've": [ - {ORTH: "should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - "There'll": [ - {ORTH: "There"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], +# Pronouns - "howll": [ - {ORTH: "how"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], +for pron in ["i"]: + for orth in [pron, pron.title()]: + EXC[orth + "'m"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1} + ] - "Hadn't've": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], + EXC[orth + "m"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 } + ] - "who'll": [ - {ORTH: "who"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], + EXC[orth + "'ma"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", LEMMA: "be", NORM: "am"}, + {ORTH: "a", LEMMA: "going to", NORM: "gonna"} + ] - "aint": [ - {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], + EXC[orth + "ma"] = [ + {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", LEMMA: "be", NORM: "am"}, + {ORTH: "a", LEMMA: "going to", NORM: "gonna"} + ] + +for pron in ["i", "you", "he", "she", "it", "we", "they"]: + for orth in [pron, pron.title()]: + EXC[orth + "'ll"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "ll"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + 
{ORTH: "ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "'ll've"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "llve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'d"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ] + + EXC[orth + "d"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ] + + EXC[orth + "'d've"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "dve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for pron in ["i", "you", "we", "they"]: + for orth in [pron, pron.title()]: + EXC[orth + "'ve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "ve"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for pron in ["you", "we", "they"]: + for orth in [pron, pron.title()]: + EXC[orth + "'re"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "re"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ] + + +for pron in ["he", "she", "it"]: + for orth in [pron, pron.title()]: + EXC[orth + "'s"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ] + + EXC[orth + "s"] = [ + {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "s"} + ] + + + +# W-words, relative pronouns, prepositions etc. 
+ +for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: + for orth in [word, word.title()]: + EXC[orth + "'s"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'s"} + ] + + EXC[orth + "s"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "s"} + ] + + EXC[orth + "'ll"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "ll"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ] + + EXC[orth + "'ll've"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "llve"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'re"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "re"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ] + + EXC[orth + "'ve"] = [ + {ORTH: orth}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "ve"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "'d"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'d"} + ] + + EXC[orth + "d"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "d"} + ] + + EXC[orth + "'d've"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[orth + "dve"] = [ + {ORTH: orth, LEMMA: word}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +# Verbs + +for verb_data in [ + {ORTH: "ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "could", TAG: "MD"}, + {ORTH: "do", LEMMA: "do"}, + {ORTH: "does", LEMMA: "do"}, + {ORTH: "did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "may"}, + {ORTH: "might"}, + {ORTH: "must"}, + {ORTH: "need"}, + {ORTH: "ought"}, + {ORTH: "sha", LEMMA: "shall"}, + {ORTH: "should"}, + {ORTH: "wo", LEMMA: "will"}, + 
{ORTH: "would"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "n't"] = [ + dict(data), + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ] + + EXC[data[ORTH] + "nt"] = [ + dict(data), + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ] + + EXC[data[ORTH] + "n't've"] = [ + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[data[ORTH] + "ntve"] = [ + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for verb_data in [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "might"}, + {ORTH: "must"}, + {ORTH: "should"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "'ve"] = [ + dict(data), + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ] + + EXC[data[ORTH] + "ve"] = [ + dict(data), + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ] + + +for verb_data in [ + {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, + {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "was", LEMMA: "be"}, + {ORTH: "were", LEMMA: "be"} +]: + verb_data_tc = dict(verb_data) + verb_data_tc[ORTH] = verb_data_tc[ORTH].title() + + for data in [verb_data, verb_data_tc]: + EXC[data[ORTH] + "n't"] = [ + dict(data), + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ] + + EXC[data[ORTH] + "nt"] = [ + dict(data), + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ] + + + +# Other contractions with trailing apostrophe + +for exc_data in [ + {ORTH: "doin", LEMMA: "do", NORM: "doing"}, + {ORTH: "goin", LEMMA: "go", NORM: "going"}, + {ORTH: "nothin", LEMMA: "nothing"}, + {ORTH: "nuthin", LEMMA: "nothing"}, + {ORTH: "ol", LEMMA: "old"}, + {ORTH: "somethin", LEMMA: "something"} +]: + exc_data_tc = dict(exc_data) + exc_data_tc[ORTH] = exc_data_tc[ORTH].title() + + for data in [exc_data, exc_data_tc]: + data_apos = dict(data) + 
data_apos[ORTH] = data_apos[ORTH] + "'" + + EXC[data[ORTH]] = [ + dict(data) + ] + + EXC[data_apos[ORTH]] = [ + dict(data_apos) + ] + + +# Other contractions with leading apostrophe + +for exc_data in [ + {ORTH: "cause", LEMMA: "because"}, + {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, + {ORTH: "ll", LEMMA: "will"}, + {ORTH: "nuff", LEMMA: "enough"} +]: + exc_data_apos = dict(exc_data) + exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] + + for data in [exc_data, exc_data_apos]: + EXC[data[ORTH]] = [ + dict(data) + ] + + +# Rest + +OTHER = { " ": [ - {TAG: "SP", ORTH: " "} - ], - - "Shouldnt": [ - {ORTH: "Should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "when's": [ - {ORTH: "when"}, - {ORTH: "'s", LEMMA: "be"} - ], - - "Didnt": [ - {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "itll": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "Who're": [ - {ORTH: "Who"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Ain't": [ - {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Can't": [ - {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Whyre": [ - {ORTH: "Why"}, - {ORTH: "re"} - ], - - "Aren't": [ - {ORTH: "Are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Neednt": [ - {ORTH: "Need"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "should've": [ - {ORTH: "should"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "shouldn't": [ - {ORTH: "should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Idve": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "weve": [ - {ORTH: "we"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Ive": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "they'd": [ - 
{ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Youdve": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theyve": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Weren't": [ - {ORTH: "Were"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "werent": [ - {ORTH: "were"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whyre": [ - {ORTH: "why"}, - {ORTH: "re"} - ], - - "I'm": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "She'd've": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "not've": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "we'll": [ - {ORTH: "we"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Don't": [ - {ORTH: "Do", LEMMA: "do"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Whyll": [ - {ORTH: "Why"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "they've": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "wasn't": [ - {ORTH: "was"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "could've": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "what've": [ - {ORTH: "what"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "havent": [ - {ORTH: "have", TAG: "VB"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Who've": [ - {ORTH: "Who"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Shan't": [ - {ORTH: "Sha", LEMMA: "shall"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "i'll": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "you'd": [ - {ORTH: "you", LEMMA: PRON_LEMMA, 
TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "whens": [ - {ORTH: "when"}, - {ORTH: "s", LEMMA: "be"} - ], - - "whys": [ - {ORTH: "why"}, - {ORTH: "s"} - ], - - "Whereve": [ - {ORTH: "Where"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} + {ORTH: " ", TAG: "SP"} ], "\u00a0": [ {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} ], - "there'd": [ - {ORTH: "there"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "hadn't've": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "whatll": [ - {ORTH: "what"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldn't've": [ - {ORTH: "would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "there's": [ - {ORTH: "there"}, - {ORTH: "'s"} - ], - - "Who'll": [ - {ORTH: "Who"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "youll": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldve": [ - {ORTH: "would"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldnt": [ - {ORTH: "Would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Thered": [ - {ORTH: "There"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Youre": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be"} - ], - - "Couldn't've": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "who're": [ - {ORTH: "who"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Whys": [ - {ORTH: "Why"}, - {ORTH: "s"} - ], - - "mightn't've": [ - {ORTH: "might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wholl": [ - {ORTH: "Who"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "hadn't": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Havent": [ - {ORTH: "Have", TAG: "VB"}, - {ORTH: "nt", LEMMA: 
"not", TAG: "RB"} - ], - - "Whatve": [ - {ORTH: "What"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Thats": [ - {ORTH: "That"}, - {ORTH: "s"} - ], - - "Howll": [ - {ORTH: "How"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "wouldn't": [ - {ORTH: "would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "You'll": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Cant": [ - {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "i'd": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "weren't": [ - {ORTH: "were"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "would've": [ - {ORTH: "would"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "i'm": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "why'll": [ - {ORTH: "why"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "we'd've": [ - {ORTH: "we"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Shouldve": [ - {ORTH: "Should"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "can't": [ - {ORTH: "ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "thats": [ - {ORTH: "that"}, - {ORTH: "s"} - ], - - "Hes": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "Needn't": [ - {ORTH: "Need"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "It's": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "Why're": [ - {ORTH: "Why"}, - {ORTH: "'re", LEMMA: "be"} - ], - - "Hed": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Mt.": [ - {ORTH: "Mt.", LEMMA: "Mount"} - ], - - "couldn't": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "What've": [ - {ORTH: "What"}, - {ORTH: "'ve", LEMMA: "have", TAG: 
"VB"} - ], - - "It'd": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "theydve": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "aren't": [ - {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Mightn't": [ - {ORTH: "Might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - "'S": [ {ORTH: "'S", LEMMA: "'s"} ], - "I've": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + "'s": [ + {ORTH: "'s", LEMMA: "'s"} ], - "Whered": [ - {ORTH: "Where"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Itdve": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "I'ma": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ma"} - ], - - "whos": [ - {ORTH: "who"}, - {ORTH: "s"} - ], - - "They'd": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "What'll": [ - {ORTH: "What"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "You've": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mustve": [ - {ORTH: "Must"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "whod": [ - {ORTH: "who"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "mightntve": [ - {ORTH: "might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "I'd've": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Must've": [ - {ORTH: "Must"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "it'd": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "what're": [ - {ORTH: "what"}, - {ORTH: "'re", LEMMA: 
"be", NORM: "are"} - ], - - "Wasn't": [ - {ORTH: "Was"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "what's": [ - {ORTH: "what"}, - {ORTH: "'s"} - ], - - "he'd've": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "She'd": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "shedve": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "ain't": [ - {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "She's": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "i'd've": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "We'd've": [ - {ORTH: "We"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "must've": [ - {ORTH: "must"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "That's": [ - {ORTH: "That"}, - {ORTH: "'s"} - ], - - "whatre": [ - {ORTH: "what"}, - {ORTH: "re"} - ], - - "you'd've": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Dont": [ - {ORTH: "Do", LEMMA: "do"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "thered": [ - {ORTH: "there"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Youd": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "couldn't've": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whens": [ - {ORTH: "When"}, - {ORTH: "s"} - ], - - "Isnt": [ - {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "mightve": [ - {ORTH: "might"}, - 
{ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "didnt": [ - {ORTH: "did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "ive": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "It'd've": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "\t": [ - {ORTH: "\t", TAG: "SP"} - ], - - "Itll": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "didn't": [ - {ORTH: "did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "cant": [ - {ORTH: "ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "im": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} - ], - - "they'd've": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Hadntve": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Weve": [ - {ORTH: "We"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Mightnt": [ - {ORTH: "Might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "youdve": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Shedve": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theyd": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Cannot": [ - {ORTH: "Can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "Hadn't": [ - {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "What're": [ - {ORTH: 
"What"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "He'll": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "wholl": [ - {ORTH: "who"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "They're": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "shouldnt": [ - {ORTH: "should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "\n": [ - {ORTH: "\n", TAG: "SP"} - ], - - "whered": [ - {ORTH: "where"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "youve": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "notve": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldve": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "mustve": [ - {ORTH: "must"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Youve": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "therell": [ - {ORTH: "there"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "might've": [ - {ORTH: "might"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mustn't": [ - {ORTH: "Must"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "wheres": [ - {ORTH: "where"}, - {ORTH: "s"} - ], - - "they're": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "idve": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "hows": [ - {ORTH: "how"}, - {ORTH: "s"} - ], - - "youre": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "Didn't": [ - {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Couldve": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "cannot": [ - {ORTH: 
"can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "Im": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be", NORM: "am"} - ], - - "howd": [ - {ORTH: "how"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "you've": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "You're": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "she'll": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Theyll": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "don't": [ - {ORTH: "do", LEMMA: "do"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "itd": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Hedve": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "isnt": [ - {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "won't": [ - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "We're": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, + "'re": [ {ORTH: "'re", LEMMA: "be", NORM: "are"} ], @@ -945,391 +365,80 @@ TOKENIZER_EXCEPTIONS = { {ORTH: "\u2018s", LEMMA: "'s"} ], - "dont": [ - {ORTH: "do", LEMMA: "do"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} + "and/or": [ + {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} ], - "ima": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ma"} + "'Cause": [ + {ORTH: "'Cause", LEMMA: "because"} ], - "Let's": [ - {ORTH: "Let"}, - {ORTH: "'s", LEMMA: "us"} + "y'all": [ + {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, + {ORTH: "all"} ], - "he's": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} + "yall": [ + {ORTH: "y", LEMMA: 
PRON_LEMMA, NORM: "you"}, + {ORTH: "all"} ], - "we've": [ - {ORTH: "we"}, + "ma'am": [ + {ORTH: "ma'am", LEMMA: "madam"} + ], + + "Ma'am": [ + {ORTH: "Ma'am", LEMMA: "madam"} + ], + + "o'clock": [ + {ORTH: "o'clock", LEMMA: "o'clock"} + ], + + "O'clock": [ + {ORTH: "O'clock", LEMMA: "o'clock"} + ], + + "how'd'y": [ + {ORTH: "how", LEMMA: "how"}, + {ORTH: "'d", LEMMA: "do"}, + {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"} + ], + + "How'd'y": [ + {ORTH: "How", LEMMA: "how"}, + {ORTH: "'d", LEMMA: "do"}, + {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"} + ], + + "not've": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "'ve", LEMMA: "have", TAG: "VB"} ], - "What's": [ - {ORTH: "What"}, - {ORTH: "'s"} - ], - - "Who's": [ - {ORTH: "Who"}, - {ORTH: "'s"} - ], - - "hedve": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, + "notve": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "ve", LEMMA: "have", TAG: "VB"} ], - "he'd": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "When's": [ - {ORTH: "When"}, - {ORTH: "'s"} - ], - - "Mightn't've": [ - {ORTH: "Might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "We've": [ - {ORTH: "We"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Couldntve": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Who'd": [ - {ORTH: "Who"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "haven't": [ - {ORTH: "have", TAG: "VB"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "arent": [ - {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "You'd've": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldn't": [ - {ORTH: "Would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - 
"who's": [ - {ORTH: "who"}, - {ORTH: "'s"} - ], - - "Mightve": [ - {ORTH: "Might"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Theredve": [ - {ORTH: "There"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "theredve": [ - {ORTH: "there"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "who'd": [ - {ORTH: "who"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Where's": [ - {ORTH: "Where"}, - {ORTH: "'s"} - ], - - "wont": [ - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "she'd've": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Should've": [ - {ORTH: "Should"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "theyre": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re"} - ], - - "Wouldntve": [ - {ORTH: "Would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Where've": [ - {ORTH: "Where"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "mustn't": [ - {ORTH: "must"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "isn't": [ - {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Aint": [ - {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "why's": [ - {ORTH: "why"}, - {ORTH: "'s"} - ], - - "There'd": [ - {ORTH: "There"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "They'll": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "how'll": [ - {ORTH: "how"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Wedve": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldntve": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: 
"RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "There's": [ - {ORTH: "There"}, - {ORTH: "'s"} - ], - - "we'd": [ - {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Whod": [ - {ORTH: "Who"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "whatve": [ - {ORTH: "what"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wouldve": [ - {ORTH: "Would"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "there'll": [ - {ORTH: "there"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "needn't": [ - {ORTH: "need"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "shouldntve": [ - {ORTH: "should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "why're": [ - {ORTH: "why"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ], - - "Doesnt": [ - {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whereve": [ - {ORTH: "where"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "they'll": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "I'd": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Might've": [ - {ORTH: "Might"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "mightnt": [ - {ORTH: "might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - "Not've": [ {ORTH: "Not", LEMMA: "not", TAG: "RB"}, {ORTH: "'ve", LEMMA: "have", TAG: "VB"} ], - "mightn't": [ - {ORTH: "might"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} + "Notve": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} ], - "you're": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} + "cannot": [ + {ORTH: "can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} ], - "They've": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "what'll": 
[ - {ORTH: "what"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Could've": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Would've": [ - {ORTH: "Would"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Isn't": [ - {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "let's": [ - {ORTH: "let"}, - {ORTH: "'s", LEMMA: "us"} - ], - - "She'll": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "You'd": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "wouldnt": [ - {ORTH: "would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Why'll": [ - {ORTH: "Why"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Where'd": [ - {ORTH: "Where"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Theyre": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "Won't": [ - {ORTH: "Wo", LEMMA: "will"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Couldn't": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "it's": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "it'll": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "They'd've": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Ima": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ma"} + "Cannot": [ + {ORTH: "Can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} ], "gonna": [ @@ -1342,452 +451,45 @@ TOKENIZER_EXCEPTIONS = { {ORTH: "na", LEMMA: "to"} ], - "whats": [ - {ORTH: "what"}, - {ORTH: "s"} + "gotta": [ + {ORTH: "got"}, + {ORTH: "ta", LEMMA: "to"} ], - "How's": [ - {ORTH: "How"}, - {ORTH: "'s"} + "Gotta": [ + {ORTH: "Got"}, + {ORTH: "ta", LEMMA: "to"} ], - 
"Shouldntve": [ - {ORTH: "Should"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} + "let's": [ + {ORTH: "let"}, + {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} ], - "youd": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "Whatll": [ - {ORTH: "What"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "Wouldn't've": [ - {ORTH: "Would"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "How'd": [ - {ORTH: "How"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "doesnt": [ - {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Shouldn't": [ - {ORTH: "Should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "He'd've": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Mightntve": [ - {ORTH: "Might"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "couldnt": [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Haven't": [ - {ORTH: "Have", TAG: "VB"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "doesn't": [ - {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Hasn't": [ - {ORTH: "Has"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "how's": [ - {ORTH: "how"}, - {ORTH: "'s"} - ], - - "hes": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "he'll": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "hed": [ - {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "how'd": [ - {ORTH: "how"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "we're": [ - {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM :"are"} - ], - - "Hadnt": [ - {ORTH: "Had", LEMMA: 
"have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Shant": [ - {ORTH: "Sha", LEMMA: "shall"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Theyve": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Hows": [ - {ORTH: "How"}, - {ORTH: "s"} - ], - - "We'll": [ - {ORTH: "We"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "i've": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whove": [ - {ORTH: "Who"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "i'ma": [ - {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ma"} - ], - - "Howd": [ - {ORTH: "How"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "hadnt": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "shant": [ - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "There'd've": [ - {ORTH: "There"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "I'll": [ - {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Why's": [ - {ORTH: "Why"}, - {ORTH: "'s"} - ], - - "Shouldn't've": [ - {ORTH: "Should"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Wasnt": [ - {ORTH: "Was"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whove": [ - {ORTH: "who"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "hasn't": [ - {ORTH: "has"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "wouldntve": [ - {ORTH: "would"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Wheres": [ - {ORTH: "Where"}, - {ORTH: "s"} - ], - - "How'll": [ - {ORTH: "How"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "there'd've": [ - {ORTH: "there"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whos": [ - 
{ORTH: "Who"}, - {ORTH: "s"} - ], - - "shes": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "Doesn't": [ - {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - "Arent": [ - {ORTH: "Are", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Hasnt": [ - {ORTH: "Has", LEMMA: "have"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "He's": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "wasnt": [ - {ORTH: "was"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "whyll": [ - {ORTH: "why"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "mustnt": [ - {ORTH: "must"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "He'd": [ - {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Shes": [ - {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "s"} - ], - - "where've": [ - {ORTH: "where"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Youll": [ - {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "hasnt": [ - {ORTH: "has"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "theyll": [ - {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "it'd've": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "itdve": [ - {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "wedve": [ - {ORTH: "we"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Werent": [ - {ORTH: "Were"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "Therell": [ - {ORTH: "There"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ], - - "shan't": [ - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ], - - 
"Wont": [ - {ORTH: "Wo", LEMMA: "will"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "hadntve": [ - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "who've": [ - {ORTH: "who"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Whatre": [ - {ORTH: "What"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ], - - "'s": [ - {ORTH: "'s", LEMMA: "'s"} - ], - - "where'd": [ - {ORTH: "where"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "shouldve": [ - {ORTH: "should"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "where's": [ - {ORTH: "where"}, - {ORTH: "'s"} - ], - - "neednt": [ - {ORTH: "need"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - - "It'll": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "We'd": [ - {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], - - "Whats": [ - {ORTH: "What"}, - {ORTH: "s"} + "Let's": [ + {ORTH: "Let", LEMMA: "let"}, + {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} ], "\u2014": [ {ORTH: "\u2014", TAG: ":", LEMMA: "--"} ], - "Itd": [ - {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} + "\n": [ + {ORTH: "\n", TAG: "SP"} ], - "she'd": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ], + "\t": [ + {ORTH: "\t", TAG: "SP"} + ] +} - "Mustnt": [ - {ORTH: "Must"}, - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ], - "Notve": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], +# Abbreviations - "you'll": [ - {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ], - - "Theyd": [ - {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ], - - "she's": [ - {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"} - ], - - "Couldnt": [ - {ORTH: "Could", TAG: "MD"}, - {ORTH: 
"nt", LEMMA: "not", TAG: "RB"} - ], - - "that's": [ - {ORTH: "that"}, - {ORTH: "'s"} - ], - - "'em": [ - {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"} - ], - - "ol'": [ - {ORTH: "ol'", LEMMA: "old"} +ABBREVIATIONS = { + "Mt.": [ + {ORTH: "Mt.", LEMMA: "Mount"} ], "Ak.": [ @@ -2000,41 +702,41 @@ TOKENIZER_EXCEPTIONS = { } +TOKENIZER_EXCEPTIONS = dict(EXC) +TOKENIZER_EXCEPTIONS.update(OTHER) +TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) + + +# Remove EXCLUDE_EXC if in exceptions + +for string in EXCLUDE_EXC: + if string in TOKENIZER_EXCEPTIONS: + TOKENIZER_EXCEPTIONS.pop(string) + + +# Abbreviations with only one ORTH token + ORTH_ONLY = [ - "''", - "\")", - "a.", + "'d", "a.m.", "Adm.", - "b.", "Bros.", - "c.", "co.", "Co.", "Corp.", - "d.", "D.C.", "Dr.", - "e.", "e.g.", "E.g.", "E.G.", - "f.", - "g.", "Gen.", "Gov.", - "h.", - "i.", "i.e.", "I.e.", "I.E.", "Inc.", - "j.", "Jr.", - "k.", - "l.", "Ltd.", - "m.", "Md.", "Messrs.", "Mo.", @@ -2042,24 +744,11 @@ ORTH_ONLY = [ "Mr.", "Mrs.", "Ms.", - "n.", - "o.", - "p.", "p.m.", "Ph.D.", - "q.", - "r.", "Rep.", "Rev.", - "s.", "Sen.", "St.", - "t.", - "u.", - "v.", - "vs.", - "w.", - "x.", - "y.", - "z." + "vs." 
] diff --git a/spacy/es/language_data.py b/spacy/es/language_data.py index 3357c9ac8..7c44752cb 100644 --- a/spacy/es/language_data.py +++ b/spacy/es/language_data.py @@ -40,11 +40,14 @@ def get_time_exc(hours): return exc -TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) STOP_WORDS = set(STOP_WORDS) + +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1))) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py index f9259ce93..93bc74642 100644 --- a/spacy/es/tokenizer_exceptions.py +++ b/spacy/es/tokenizer_exceptions.py @@ -85,55 +85,29 @@ TOKENIZER_EXCEPTIONS = { ORTH_ONLY = [ - "a.", "a.C.", "a.J.C.", "apdo.", "Av.", "Avda.", - "b.", - "c.", "Cía.", - "d.", - "e.", "etc.", - "f.", - "g.", "Gob.", "Gral.", - "h.", - "i.", "Ing.", - "j.", "J.C.", - "k.", - "l.", "Lic.", - "m.", "m.n.", - "n.", "no.", "núm.", - "o.", - "p.", "P.D.", "Prof.", "Profa.", - "q.", "q.e.p.d." - "r.", - "s.", "S.A.", "S.L.", "s.s.s.", "Sr.", "Sra.", - "Srta.", - "t.", - "u.", - "v.", - "w.", - "x.", - "y.", - "z." + "Srta." ] diff --git a/spacy/fr/language_data.py b/spacy/fr/language_data.py index e612fe064..bbbeb1535 100644 --- a/spacy/fr/language_data.py +++ b/spacy/fr/language_data.py @@ -2,13 +2,16 @@ from __future__ import unicode_literals from .. 
import language_data as base -from ..language_data import strings_to_exc +from ..language_data import strings_to_exc, update_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py index 94eeb6f4d..49652c5ac 100644 --- a/spacy/hu/language_data.py +++ b/spacy/hu/language_data.py @@ -4,21 +4,25 @@ from __future__ import unicode_literals import six from spacy.language_data import strings_to_exc, update_exc -from .punctuations import * +from .punctuation import * from .stop_words import STOP_WORDS from .tokenizer_exceptions import ABBREVIATIONS from .tokenizer_exceptions import OTHER_EXC from .. import language_data as base + STOP_WORDS = set(STOP_WORDS) + + TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) -TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES -TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES -TOKENIZER_INFIXES = TOKENIZER_INFIXES - -# HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]] - +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS)) + +TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES +TOKENIZER_SUFFIXES = base.TOKENIZER_SUFFIXES + TOKENIZER_SUFFIXES +TOKENIZER_INFIXES = TOKENIZER_INFIXES + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/punctuation.py b/spacy/hu/punctuation.py new file mode 100644 index 000000000..e28052fd3 --- /dev/null +++ b/spacy/hu/punctuation.py @@ -0,0 +1,25 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..language_data.punctuation import 
ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES + + +TOKENIZER_SUFFIXES = [ + r'(?<=[{al})])-e'.format(al=ALPHA_LOWER) +] + +TOKENIZER_INFIXES = [ + r'(?<=[0-9])-(?=[0-9])', + r'(?<=[0-9])[+\-\*/^](?=[0-9])', + r'(?<=[{a}])--(?=[{a}])', + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), + r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA), + r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA) +] + + +TOKENIZER_INFIXES += LIST_ELLIPSES + + +__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/punctuations.py b/spacy/hu/punctuations.py deleted file mode 100644 index 3681a2fbe..000000000 --- a/spacy/hu/punctuations.py +++ /dev/null @@ -1,89 +0,0 @@ -# encoding: utf8 -from __future__ import unicode_literals - -TOKENIZER_PREFIXES = r''' -+ -'''.strip().split('\n') - -TOKENIZER_SUFFIXES = r''' -, -\" -\) -\] -\} -\* -\! -\? -\$ -> -: -; -' -” -“ -« -_ -'' -’ -‘ -€ -\.\. -\.\.\. -\.\.\.\. -(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”+-])\. -(?<=[a-züóőúéáűí)])-e -\-\- -´ -(?<=[0-9])\+ -(?<=[a-z0-9üóőúéáűí][\)\]”"'%\)§/])\. -(?<=[0-9])km² -(?<=[0-9])m² -(?<=[0-9])cm² -(?<=[0-9])mm² -(?<=[0-9])km³ -(?<=[0-9])m³ -(?<=[0-9])cm³ -(?<=[0-9])mm³ -(?<=[0-9])ha -(?<=[0-9])km -(?<=[0-9])m -(?<=[0-9])cm -(?<=[0-9])mm -(?<=[0-9])µm -(?<=[0-9])nm -(?<=[0-9])yd -(?<=[0-9])in -(?<=[0-9])ft -(?<=[0-9])kg -(?<=[0-9])g -(?<=[0-9])mg -(?<=[0-9])µg -(?<=[0-9])t -(?<=[0-9])lb -(?<=[0-9])oz -(?<=[0-9])m/s -(?<=[0-9])km/h -(?<=[0-9])mph -(?<=°[FCK])\. 
-(?<=[0-9])hPa -(?<=[0-9])Pa -(?<=[0-9])mbar -(?<=[0-9])mb -(?<=[0-9])T -(?<=[0-9])G -(?<=[0-9])M -(?<=[0-9])K -(?<=[0-9])kb -'''.strip().split('\n') - -TOKENIZER_INFIXES = r''' -… -\.\.+ -(?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ]) -(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) -(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) -(?<=[0-9])[+\-\*/^](?=[0-9]) -(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) -'''.strip().split('\n') - -__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/tokenizer_exceptions.py b/spacy/hu/tokenizer_exceptions.py index 627035bb8..46122564c 100644 --- a/spacy/hu/tokenizer_exceptions.py +++ b/spacy/hu/tokenizer_exceptions.py @@ -111,7 +111,6 @@ Vcs. Vhr. X.Y. Zs. -a. a.C. ac. adj. @@ -126,7 +125,6 @@ ang. arch. at. aug. -b. b.a. b.s. b.sc. @@ -141,7 +139,6 @@ br. bsc. bt. btk. -c. ca. cc. cca. @@ -155,7 +152,6 @@ csc. csüt. cső. ctv. -d. dbj. dd. ddr. @@ -170,7 +166,6 @@ dolg. dr. du. dzs. -e. ea. ed. eff. @@ -186,7 +181,6 @@ etc. ev. ezr. eü. -f. f.h. f.é. fam. @@ -213,7 +207,6 @@ főig. főisk. főtörm. főv. -g. gazd. gimn. gk. @@ -225,7 +218,6 @@ gy. gyak. gyártm. gör. -h. hads. hallg. hdm. @@ -266,7 +258,6 @@ isk. ism. izr. iá. -j. jan. jav. jegyz. @@ -278,7 +269,6 @@ jr. jvb. júl. jún. -k. karb. kat. kb. @@ -313,7 +303,6 @@ közl. közp. közt. kü. -l. lat. ld. legs. @@ -324,7 +313,6 @@ lt. ltd. ltp. luth. -m. m.a. m.s. m.sc. @@ -359,7 +347,6 @@ műh. műsz. műv. művez. -n. nagyker. nagys. nat. @@ -372,7 +359,6 @@ ny. nyilv. nyrt. nyug. -o. obj. okl. okt. @@ -381,7 +367,6 @@ orsz. ort. ov. ovh. -p. pf. pg. ph.d @@ -404,8 +389,6 @@ pság. ptk. pu. pü. -q. -r. r.k. rac. rad. @@ -420,7 +403,6 @@ rkt. rt. rtg. röv. -s. s.b. s.k. sa. @@ -450,7 +432,6 @@ szt. szubj. szöv. szül. -t. tanm. tb. tbk. @@ -476,13 +457,11 @@ tvr. ty. törv. tü. -u. ua. ui. unit. uo. uv. -v. vas. vb. vegy. @@ -501,9 +480,6 @@ vv. vál. vízv. vö. -w. -y. -z. zrt. zs. Ész. @@ -520,7 +496,6 @@ zs. 
évf. í. ó. -ö. össz. ötk. özv. @@ -528,7 +503,6 @@ zs. úm. ún. út. -ü. üag. üd. üdv. @@ -544,6 +518,5 @@ zs. """.strip().split() OTHER_EXC = """ -'' -e """.strip().split() diff --git a/spacy/it/__init__.py b/spacy/it/__init__.py index 2ef60fd94..bc0d13cab 100644 --- a/spacy/it/__init__.py +++ b/spacy/it/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG diff --git a/spacy/it/language_data.py b/spacy/it/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/it/language_data.py +++ b/spacy/it/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index f6aa4317c..43a4ef0be 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -1,3 +1,4 @@ +from .abbreviations import * from .emoticons import * from .punctuation import * from .tag_map import * diff --git a/spacy/language_data/abbreviations.py b/spacy/language_data/abbreviations.py new file mode 100644 index 000000000..b49daa0ad --- /dev/null +++ b/spacy/language_data/abbreviations.py @@ -0,0 +1,43 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +ABBREVIATIONS = [ + "'", + "\\\")", + "", + "''", + "C++", + "a.", + "b.", + "c.", + "d.", + "e.", + "f.", + "g.", + "h.", + "i.", + "j.", + "k.", + "l.", + "m.", + "n.", + "o.", + "p.", + "q.", + "r.", + "s.", + "t.", + "u.", + "v.", + "w.", + "x.", + "y.", + "z.", + "ä.", + "ö.", + "ü." 
+] + + +__all__ = [ "ABBREVIATIONS" ] diff --git a/spacy/language_data/emoticons.py b/spacy/language_data/emoticons.py index 3fa44368d..bc951a007 100644 --- a/spacy/language_data/emoticons.py +++ b/spacy/language_data/emoticons.py @@ -13,6 +13,7 @@ EMOTICONS = set(""" (-: =) (= +") :] :-] [: diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index fb784271e..d8ed19ca1 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -1,133 +1,115 @@ # encoding: utf8 from __future__ import unicode_literals - -TOKENIZER_PREFIXES = r''' -, -" -( -[ -{ -* -< -> -$ -£ -¡ -¿ -„ -“ -' -`` -` -# -‘ -.... -... -… -‚ -» -§ -US$ -C$ -A$ -a- -'''.strip().split('\n') +import re -TOKENIZER_SUFFIXES = r''' -, -\" -\) -\] -\} -\* -\! -\? -% -\$ -> -: -; -' -” -“ -« -_ -'' -'s -'S -’s -’S -’ -‘ -° -€ -… -\.\. -\.\.\. -\.\.\.\. -(?<=[a-z0-9)\]”"'%\)])\. -(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\. -\-\- -´ -(?<=[0-9])km² -(?<=[0-9])m² -(?<=[0-9])cm² -(?<=[0-9])mm² -(?<=[0-9])km³ -(?<=[0-9])m³ -(?<=[0-9])cm³ -(?<=[0-9])mm³ -(?<=[0-9])ha -(?<=[0-9])km -(?<=[0-9])m -(?<=[0-9])cm -(?<=[0-9])mm -(?<=[0-9])µm -(?<=[0-9])nm -(?<=[0-9])yd -(?<=[0-9])in -(?<=[0-9])ft -(?<=[0-9])kg -(?<=[0-9])g -(?<=[0-9])mg -(?<=[0-9])µg -(?<=[0-9])t -(?<=[0-9])lb -(?<=[0-9])oz -(?<=[0-9])m/s -(?<=[0-9])km/h -(?<=[0-9])mph -(?<=[0-9])°C -(?<=[0-9])°K -(?<=[0-9])°F -(?<=[0-9])hPa -(?<=[0-9])Pa -(?<=[0-9])mbar -(?<=[0-9])mb -(?<=[0-9])T -(?<=[0-9])G -(?<=[0-9])M -(?<=[0-9])K -(?<=[0-9])kb -'''.strip().split('\n') +_ALPHA_LOWER = """ +a ä à á â ǎ æ ã å ā ă ą b c ç ć č ĉ ċ c̄ d ð ď e é è ê ë ė ȅ ȩ ẽ ę f g ĝ ğ h i ı +î ï í ī ì ȉ ǐ į ĩ j k ķ l ł ļ m n ñ ń ň ņ o ö ó ò ő ô õ œ ø ō ő ǒ ơ p q r ř ŗ s +ß ś š ş ŝ t ť u ú û ù ú ū ű ǔ ů ų ư v w ŵ x y ÿ ý ỳ ŷ ỹ z ź ž ż þ +""" -TOKENIZER_INFIXES = r''' -… -\.\.\.+ -(?<=[a-z])\.(?=[A-Z]) -(?<=[a-z])\.(?=[A-Z]) -(?<=[a-zA-Z])-(?=[a-zA-z]) -(?<=[a-zA-Z])--(?=[a-zA-z]) -(?<=[0-9])-(?=[0-9]) -(?<=[A-Za-z]),(?=[A-Za-z]) 
-(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ]) -(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ]) -'''.strip().split('\n') +_ALPHA_UPPER = """ +A Ä À Á  Ǎ Æ Ã Å Ā Ă Ą B C Ç Ć Č Ĉ Ċ C̄ D Ð Ď E É È Ê Ë Ė Ȅ Ȩ Ẽ Ę F G Ĝ Ğ H I İ +Î Ï Í Ī Ì Ȉ Ǐ Į Ĩ J K Ķ L Ł Ļ M N Ñ Ń Ň Ņ O Ö Ó Ò Ő Ô Õ Œ Ø Ō Ő Ǒ Ơ P Q R Ř Ŗ S +Ś Š Ş Ŝ T Ť U Ú Û Ù Ú Ū Ű Ǔ Ů Ų Ư V W Ŵ X Y Ÿ Ý Ỳ Ŷ Ỹ Z Ź Ž Ż Þ +""" + + +_UNITS = """ +km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg +µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb +TB T G M K +""" + + +_CURRENCY = r""" +\$ £ € ¥ ฿ US\$ C\$ A\$ +""" + + +_QUOTES = r""" +' '' " ” “ `` ` ‘ ´ ‚ , „ » « +""" + + +_PUNCT = r""" +… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & +""" + + +_HYPHENS = r""" +- – — -- --- +""" + + +LIST_ELLIPSES = [ + r'\.\.+', + "…" +] + + +LIST_CURRENCY = list(_CURRENCY.strip().split()) +LIST_QUOTES = list(_QUOTES.strip().split()) +LIST_PUNCT = list(_PUNCT.strip().split()) +LIST_HYPHENS = list(_HYPHENS.strip().split()) + + +ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '') +ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '') +ALPHA = ALPHA_LOWER + ALPHA_UPPER + + +QUOTES = _QUOTES.strip().replace(' ', '|') +CURRENCY = _CURRENCY.strip().replace(' ', '|') +UNITS = _UNITS.strip().replace(' ', '|') +HYPHENS = _HYPHENS.strip().replace(' ', '|') + + + +# Prefixes + +TOKENIZER_PREFIXES = ( + ['§', '%', r'\+'] + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_CURRENCY +) + + +# Suffixes + +TOKENIZER_SUFFIXES = ( + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + [ + r'(?<=[0-9])\+', + r'(?<=°[FfCcKk])\.', + r'(?<=[0-9])(?:{c})'.format(c=CURRENCY), + r'(?<=[0-9])(?:{u})'.format(u=UNITS), + r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES), + r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER), + "'s", "'S", "’s", "’S" + ] +) + + +# Infixes + +TOKENIZER_INFIXES = ( + LIST_ELLIPSES + + [ + 
r'(?<=[0-9])[+\-\*/^](?=[0-9])', + r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS), + r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA) + ] +) __all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py index f5b6b5040..966960721 100644 --- a/spacy/language_data/tag_map.py +++ b/spacy/language_data/tag_map.py @@ -20,5 +20,6 @@ TAG_MAP = { "X": {POS: X}, "CONJ": {POS: CONJ}, "ADJ": {POS: ADJ}, - "VERB": {POS: VERB} + "VERB": {POS: VERB}, + "PART": {POS: PART} } diff --git a/spacy/nl/__init__.py b/spacy/nl/__init__.py index d958783ea..d4aa39506 100644 --- a/spacy/nl/__init__.py +++ b/spacy/nl/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG from .language_data import * diff --git a/spacy/nl/language_data.py b/spacy/nl/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/nl/language_data.py +++ b/spacy/nl/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/pt/__init__.py b/spacy/pt/__init__.py index 06c6417dc..ed26fb0b3 100644 --- a/spacy/pt/__init__.py +++ b/spacy/pt/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/pt/language_data.py 
+++ b/spacy/pt/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/sv/__init__.py b/spacy/sv/__init__.py index 25930386a..e03c9a56f 100644 --- a/spacy/sv/__init__.py +++ b/spacy/sv/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG from .language_data import * diff --git a/spacy/sv/language_data.py b/spacy/sv/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/sv/language_data.py +++ b/spacy/sv/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/tests/website/__init__.py b/spacy/tests/de/__init__.py similarity index 100% rename from spacy/tests/website/__init__.py rename to spacy/tests/de/__init__.py diff --git a/spacy/tests/de/conftest.py b/spacy/tests/de/conftest.py new file mode 100644 index 000000000..c6b8be26e --- /dev/null +++ b/spacy/tests/de/conftest.py @@ -0,0 +1,11 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ...de import German + + +@pytest.fixture +def de_tokenizer(): + return German.Defaults.create_tokenizer() diff --git a/spacy/tests/de/tokenizer/__init__.py b/spacy/tests/de/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/spacy/tests/de/tokenizer/test_exceptions.py b/spacy/tests/de/tokenizer/test_exceptions.py new file mode 100644 index 000000000..13da3dc33 --- /dev/null +++ b/spacy/tests/de/tokenizer/test_exceptions.py @@ -0,0 +1,27 @@ +# coding: utf-8 +"""Test that tokenizer exceptions and emoticons are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"]) +def test_tokenizer_splits_contractions(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]) +def test_tokenizer_handles_abbr(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 1 + + +def test_tokenizer_handles_exc_in_text(de_tokenizer): + text = "Ich bin z.Zt. im Urlaub." + tokens = de_tokenizer(text) + assert len(tokens) == 6 + assert tokens[2].text == "z.Zt." + assert tokens[2].lemma_ == "zur Zeit" diff --git a/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py new file mode 100644 index 000000000..dcf4f4ef0 --- /dev/null +++ b/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py @@ -0,0 +1,116 @@ +# coding: utf-8 +"""Test that tokenizer prefixes, suffixes and infixes are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["(unter)"]) +def test_tokenizer_splits_no_special(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["unter'm"]) +def test_tokenizer_splits_no_punct(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(unter'm"]) +def test_tokenizer_splits_prefix_punct(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["unter'm)"]) +def 
test_tokenizer_splits_suffix_punct(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["(unter'm)"]) +def test_tokenizer_splits_even_wrap(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["(unter'm?)"]) +def test_tokenizer_splits_uneven_wrap(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)]) +def test_tokenizer_splits_prefix_interact(de_tokenizer, text, length): + tokens = de_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize('text', ["z.B.)"]) +def test_tokenizer_splits_suffix_interact(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(z.B.)"]) +def test_tokenizer_splits_even_wrap_interact(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["(z.B.?)"]) +def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["blau-rot"]) +def test_tokenizer_splits_hyphens(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) +def test_tokenizer_splits_numeric_range(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"]) +def test_tokenizer_splits_period_infix(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"]) +def test_tokenizer_splits_comma_infix(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + assert tokens[0].text == text.split(",")[0] + assert tokens[1].text == "," + assert 
tokens[2].text == text.split(",")[1] + + +@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"]) +def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text): + tokens = de_tokenizer(text) + assert len(tokens) == 3 + + +def test_tokenizer_splits_double_hyphen_infix(de_tokenizer): + tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.") + assert len(tokens) == 12 + assert tokens[0].text == "Viele" + assert tokens[1].text == "Regeln" + assert tokens[2].text == "--" + assert tokens[3].text == "wie" + assert tokens[4].text == "die" + assert tokens[5].text == "Bindestrich" + assert tokens[6].text == "-" + assert tokens[7].text == "Regeln" + assert tokens[8].text == "--" + assert tokens[9].text == "sind" + assert tokens[10].text == "kompliziert" diff --git a/spacy/tests/de/tokenizer/test_text.py b/spacy/tests/de/tokenizer/test_text.py new file mode 100644 index 000000000..84fa6f2a5 --- /dev/null +++ b/spacy/tests/de/tokenizer/test_text.py @@ -0,0 +1,45 @@ +# coding: utf-8 +"""Test that longer and mixed texts are tokenized correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_long_text(de_tokenizer): + text = """Die Verwandlung + +Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte, fand er sich in +seinem Bett zu einem ungeheueren Ungeziefer verwandelt. + +Er lag auf seinem panzerartig harten Rücken und sah, wenn er den Kopf ein wenig +hob, seinen gewölbten, braunen, von bogenförmigen Versteifungen geteilten +Bauch, auf dessen Höhe sich die Bettdecke, zum gänzlichen Niedergleiten bereit, +kaum noch erhalten konnte. Seine vielen, im Vergleich zu seinem sonstigen +Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen. 
+ +»Was ist mit mir geschehen?«, dachte er.""" + + tokens = de_tokenizer(text) + assert len(tokens) == 109 + + +@pytest.mark.parametrize('text,length', [ + ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1), + ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1), + ("Kraftfahrzeug-Haftpflichtversicherung", 3), + ("Vakuum-Mittelfrequenz-Induktionsofen", 5) + ]) +def test_tokenizer_handles_long_words(de_tokenizer, text, length): + tokens = de_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize('text,length', [ + ("»Was ist mit mir geschehen?«, dachte er.", 12), + ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15) + ]) +def test_tokenizer_handles_examples(de_tokenizer, text, length): + tokens = de_tokenizer(text) + assert len(tokens) == length diff --git a/spacy/tests/en/__init__.py b/spacy/tests/en/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/en/conftest.py b/spacy/tests/en/conftest.py new file mode 100644 index 000000000..3a3516c41 --- /dev/null +++ b/spacy/tests/en/conftest.py @@ -0,0 +1,11 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ...en import English + + +@pytest.fixture +def en_tokenizer(): + return English.Defaults.create_tokenizer() diff --git a/spacy/tests/en/tokenizer/__init__.py b/spacy/tests/en/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/en/tokenizer/test_contractions.py b/spacy/tests/en/tokenizer/test_contractions.py new file mode 100644 index 000000000..a97b8f5ba --- /dev/null +++ b/spacy/tests/en/tokenizer/test_contractions.py @@ -0,0 +1,87 @@ +# coding: utf-8 +"""Test that tokens are created correctly for contractions.""" + + +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_basic_contraction(en_tokenizer): + text = "don't giggle" + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert 
tokens[1].text == "n't" + text = "i said don't!" + tokens = en_tokenizer(text) + assert len(tokens) == 5 + assert tokens[4].text == "!" + + +@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"]) +def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")]) +def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text): + tokens = en_tokenizer(text_poss) + assert len(tokens) == 2 + assert tokens[0].text == text + assert tokens[1].text == "'s" + + +@pytest.mark.parametrize('text', ["schools'", "Alexis'"]) +def test_tokenizer_splits_trailing_apos(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == text.split("'")[0] + assert tokens[1].text == "'" + + +@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"]) +def test_tokenizer_doesnt_split_apos_exc(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].text == text + + +@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"]) +def test_tokenizer_handles_ll_contraction(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == text.split("'")[0] + assert tokens[1].text == "'ll" + assert tokens[1].lemma_ == "will" + + +@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")]) +def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title): + tokens_lower = en_tokenizer(text_lower) + tokens_title = en_tokenizer(text_title) + assert tokens_title[0].text == tokens_lower[0].text.title() + assert tokens_lower[0].text == tokens_title[0].text.lower() + assert tokens_lower[1].text == tokens_title[1].text + + +@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"]) +@pytest.mark.parametrize('contraction', ["'ll", "'d"]) 
+def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction): + tokens = en_tokenizer(pron + contraction) + assert tokens[0].text == pron + assert tokens[1].text == contraction + + +@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"]) +def test_tokenizer_excludes_ambiguous(en_tokenizer, exc): + tokens = en_tokenizer(exc) + assert len(tokens) == 1 + + +@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")]) +def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct): + tokens = en_tokenizer(wo_punct) + assert len(tokens) == 2 + tokens = en_tokenizer(w_punct) + assert len(tokens) == 3 diff --git a/spacy/tests/en/tokenizer/test_exceptions.py b/spacy/tests/en/tokenizer/test_exceptions.py new file mode 100644 index 000000000..ac7ed452f --- /dev/null +++ b/spacy/tests/en/tokenizer/test_exceptions.py @@ -0,0 +1,20 @@ +# coding: utf-8 +"""Test that tokenizer exceptions are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."]) +def test_tokenizer_handles_abbr(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 1 + + +def test_tokenizer_handles_exc_in_text(en_tokenizer): + text = "It's mediocre i.e. bad." + tokens = en_tokenizer(text) + assert len(tokens) == 6 + assert tokens[3].text == "i.e." 
diff --git a/spacy/tests/tokenizer/test_indices.py b/spacy/tests/en/tokenizer/test_indices.py similarity index 91% rename from spacy/tests/tokenizer/test_indices.py rename to spacy/tests/en/tokenizer/test_indices.py index 5df7bcc59..0ed6ca4dc 100644 --- a/spacy/tests/tokenizer/test_indices.py +++ b/spacy/tests/en/tokenizer/test_indices.py @@ -1,12 +1,14 @@ +# coding: utf-8 """Test that token.idx correctly computes index into the original string.""" + from __future__ import unicode_literals import pytest def test_simple_punct(en_tokenizer): - text = 'to walk, do foo' + text = "to walk, do foo" tokens = en_tokenizer(text) assert tokens[0].idx == 0 assert tokens[1].idx == 3 @@ -16,7 +18,7 @@ def test_simple_punct(en_tokenizer): def test_complex_punct(en_tokenizer): - text = 'Tom (D., Ill.)!' + text = "Tom (D., Ill.)!" tokens = en_tokenizer(text) assert tokens[0].idx == 0 assert len(tokens[0]) == 3 diff --git a/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py new file mode 100644 index 000000000..042934d4e --- /dev/null +++ b/spacy/tests/en/tokenizer/test_prefix_suffix_infix.py @@ -0,0 +1,136 @@ +# coding: utf-8 +"""Test that tokenizer prefixes, suffixes and infixes are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["(can)"]) +def test_tokenizer_splits_no_special(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["can't"]) +def test_tokenizer_splits_no_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(can't"]) +def test_tokenizer_splits_prefix_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["can't)"]) +def test_tokenizer_splits_suffix_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + 
+@pytest.mark.parametrize('text', ["(can't)"]) +def test_tokenizer_splits_even_wrap(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["(can't?)"]) +def test_tokenizer_splits_uneven_wrap(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)]) +def test_tokenizer_splits_prefix_interact(en_tokenizer, text, length): + tokens = en_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize('text', ["U.S.)"]) +def test_tokenizer_splits_suffix_interact(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize('text', ["(U.S.)"]) +def test_tokenizer_splits_even_wrap_interact(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["(U.S.?)"]) +def test_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize('text', ["best-known"]) +def test_tokenizer_splits_hyphens(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) +def test_tokenizer_splits_numeric_range(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["best.Known", "Hello.World"]) +def test_tokenizer_splits_period_infix(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text', ["Hello,world", "one,two"]) +def test_tokenizer_splits_comma_infix(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[0].text == text.split(",")[0] + assert tokens[1].text == "," + assert tokens[2].text == text.split(",")[1] + + +@pytest.mark.parametrize('text', ["best...Known", "best...known"]) +def 
test_tokenizer_splits_ellipsis_infix(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +def test_tokenizer_splits_double_hyphen_infix(en_tokenizer): + tokens = en_tokenizer("No decent--let alone well-bred--people.") + assert tokens[0].text == "No" + assert tokens[1].text == "decent" + assert tokens[2].text == "--" + assert tokens[3].text == "let" + assert tokens[4].text == "alone" + assert tokens[5].text == "well" + assert tokens[6].text == "-" + assert tokens[7].text == "bred" + assert tokens[8].text == "--" + assert tokens[9].text == "people" + + +@pytest.mark.xfail +def test_tokenizer_splits_period_abbr(en_tokenizer): + text = "Today is Tuesday.Mr." + tokens = en_tokenizer(text) + assert len(tokens) == 5 + assert tokens[0].text == "Today" + assert tokens[1].text == "is" + assert tokens[2].text == "Tuesday" + assert tokens[3].text == "." + assert tokens[4].text == "Mr." + + +@pytest.mark.xfail +def test_tokenizer_splits_em_dash_infix(en_tokenizer): + # Re Issue #225 + tokens = en_tokenizer("""Will this road take me to Puddleton?\u2014No, """ + """you'll have to walk there.\u2014Ariel.""") + assert tokens[6].text == "Puddleton" + assert tokens[7].text == "?" 
+ assert tokens[8].text == "\u2014" diff --git a/spacy/tests/en/tokenizer/test_punct.py b/spacy/tests/en/tokenizer/test_punct.py new file mode 100644 index 000000000..b6ae9224d --- /dev/null +++ b/spacy/tests/en/tokenizer/test_punct.py @@ -0,0 +1,132 @@ +# coding: utf-8 +"""Test that open, closed and paired punctuation is split off correctly.""" + + +from __future__ import unicode_literals + +import pytest + +from ....util import compile_prefix_regex +from ....language_data import TOKENIZER_PREFIXES + + + +en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search + +PUNCT_OPEN = ['(', '[', '{', '*'] +PUNCT_CLOSE = [')', ']', '}', '*'] +PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] + + +@pytest.mark.parametrize('text', ["(", "((", "<"]) +def test_tokenizer_handles_only_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == len(text) + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_open_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(punct + text) + assert len(tokens) == 2 + assert tokens[0].text == punct + assert tokens[1].text == text + + +@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_close_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(text + punct) + assert len(tokens) == 2 + assert tokens[0].text == text + assert tokens[1].text == punct + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('punct_add', ["`"]) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text): + tokens = en_tokenizer(punct + punct_add + text) + assert len(tokens) == 3 + assert tokens[0].text == punct + assert tokens[1].text == punct_add + assert tokens[2].text == text + + +@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('punct_add', ["'"]) 
+@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text): + tokens = en_tokenizer(text + punct + punct_add) + assert len(tokens) == 3 + assert tokens[0].text == text + assert tokens[1].text == punct + assert tokens[2].text == punct_add + + +@pytest.mark.parametrize('punct', PUNCT_OPEN) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(punct + punct + punct + text) + assert len(tokens) == 4 + assert tokens[0].text == punct + assert tokens[3].text == text + + +@pytest.mark.parametrize('punct', PUNCT_CLOSE) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text): + tokens = en_tokenizer(text + punct + punct + punct) + assert len(tokens) == 4 + assert tokens[0].text == text + assert tokens[1].text == punct + + +@pytest.mark.parametrize('text', ["'The"]) +def test_tokenizer_splits_open_appostrophe(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == "'" + + +@pytest.mark.parametrize('text', ["Hello''"]) +def test_tokenizer_splits_double_end_quote(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + tokens_punct = en_tokenizer("''") + assert len(tokens_punct) == 1 + + +@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) +@pytest.mark.parametrize('text', ["Hello"]) +def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text): + tokens = en_tokenizer(punct_open + text + punct_close) + assert len(tokens) == 3 + assert tokens[0].text == punct_open + assert tokens[1].text == text + assert tokens[2].text == punct_close + + +@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) +@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")]) +@pytest.mark.parametrize('text', ["Hello"]) +def test_two_different(en_tokenizer, 
punct_open, punct_close, punct_open_add, punct_close_add, text): + tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add) + assert len(tokens) == 5 + assert tokens[0].text == punct_open_add + assert tokens[1].text == punct_open + assert tokens[2].text == text + assert tokens[3].text == punct_close + assert tokens[4].text == punct_close_add + + +@pytest.mark.parametrize('text,punct', [("(can't", "(")]) +def test_tokenizer_splits_pre_punct_regex(text, punct): + match = en_search_prefixes(text) + assert match.group() == punct + + +def test_tokenizer_splits_bracket_period(en_tokenizer): + text = "(And a 6a.m. run through Washington Park)." + tokens = en_tokenizer(text) + assert tokens[len(tokens) - 1].text == "." diff --git a/spacy/tests/en/tokenizer/test_text.py b/spacy/tests/en/tokenizer/test_text.py new file mode 100644 index 000000000..c7178fbf9 --- /dev/null +++ b/spacy/tests/en/tokenizer/test_text.py @@ -0,0 +1,36 @@ +# coding: utf-8 +"""Test that longer and mixed texts are tokenized correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_long_text(en_tokenizer): + text = """Tributes pour in for late British Labour Party leader + +Tributes poured in from around the world Thursday +to the late Labour Party leader John Smith, who died earlier from a massive +heart attack aged 55. + +In Washington, the US State Department issued a statement regretting "the +untimely death" of the rapier-tongued Scottish barrister and parliamentarian. + +"Mr. Smith, throughout his distinguished""" + tokens = en_tokenizer(text) + assert len(tokens) == 76 + + +@pytest.mark.parametrize('text,length', [ + ("The U.S. Army likes Shock and Awe.", 8), + ("U.N. regulations are not a part of their concern.", 10), + ("“Isn't it?”", 6), + ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15), + ("""'Me too!', Mr. P. Delaware cried. 
""", 11), + ("They ran about 10km.", 6), + # ("But then the 6,000-year ice age came...", 10) + ]) +def test_tokenizer_handles_cnts(en_tokenizer, text, length): + tokens = en_tokenizer(text) + assert len(tokens) == length diff --git a/spacy/tests/hu/conftest.py b/spacy/tests/hu/conftest.py new file mode 100644 index 000000000..222bd1b00 --- /dev/null +++ b/spacy/tests/hu/conftest.py @@ -0,0 +1,11 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +from ...hu import Hungarian + + +@pytest.fixture +def hu_tokenizer(): + return Hungarian.Defaults.create_tokenizer() diff --git a/spacy/tests/hu/tokenizer/test_tokenizer.py b/spacy/tests/hu/tokenizer/test_tokenizer.py index 2bfbfdf36..0b76da0c6 100644 --- a/spacy/tests/hu/tokenizer/test_tokenizer.py +++ b/spacy/tests/hu/tokenizer/test_tokenizer.py @@ -2,25 +2,27 @@ from __future__ import unicode_literals import pytest -from spacy.hu import Hungarian -_DEFAULT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), - ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), - ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), - ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), - ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), - ('A .hu.', ['A', '.hu', '.']), - ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), - ('A pl.', ['A', 'pl.']), - ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), - ('Egy..ket.', ['Egy', '..', 'ket', '.']), - ('Valami... van.', ['Valami', '...', 'van', '.']), - ('Valami ...van...', ['Valami', '...', 'van', '...']), - ('Valami...', ['Valami', '...']), - ('Valami ...', ['Valami', '...']), - ('Valami ... más.', ['Valami', '...', 'más', '.'])] -_HYPHEN_TESTS = [ +DEFAULT_TESTS = [ + ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), + ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), + ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), + ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), + ('A S.M.A.R.T. 
szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), + ('A .hu.', ['A', '.hu', '.']), + ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), + ('A pl.', ['A', 'pl.']), + ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), + ('Egy..ket.', ['Egy', '..', 'ket', '.']), + ('Valami... van.', ['Valami', '...', 'van', '.']), + ('Valami ...van...', ['Valami', '...', 'van', '...']), + ('Valami...', ['Valami', '...']), + ('Valami ...', ['Valami', '...']), + ('Valami ... más.', ['Valami', '...', 'más', '.']) +] + +HYPHEN_TESTS = [ ('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']), ('Egy -nak.', ['Egy', '-nak', '.']), ('Egy bel-.', ['Egy', 'bel-', '.']), @@ -39,195 +41,194 @@ _HYPHEN_TESTS = [ ('A 7-es.', ['A', '7-es', '.']), ('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']), ('A %-sal.', ['A', '%-sal', '.']), - ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])] + ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.']) +] -_NUMBER_TESTS = [('A 2b van.', ['A', '2b', 'van', '.']), - ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']), - ('A 2b.', ['A', '2b', '.']), - ('A 2b-ben.', ['A', '2b-ben', '.']), - ('A 3.b van.', ['A', '3.b', 'van', '.']), - ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']), - ('A 3.b.', ['A', '3.b', '.']), - ('A 3.b-ben.', ['A', '3.b-ben', '.']), - ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']), - ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']), - ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']), - ('A 1:35 van.', ['A', '1:35', 'van', '.']), - ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']), - ('A 1:35-ben.', ['A', '1:35-ben', '.']), - ('A 1.35 van.', ['A', '1.35', 'van', '.']), - ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']), - ('A 1.35-ben.', ['A', '1.35-ben', '.']), - ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']), - ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']), - ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']), - ('A 10--12 van.', ['A', '10--12', 'van', '.']), - ('A 10--12-ben van.', ['A', 
'10--12-ben', 'van', '.']), - ('A 10--12-ben.', ['A', '10--12-ben', '.']), - ('A 10‐12 van.', ['A', '10‐12', 'van', '.']), - ('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']), - ('A 10‐12-ben.', ['A', '10‐12-ben', '.']), - ('A 10‑12 van.', ['A', '10‑12', 'van', '.']), - ('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']), - ('A 10‑12-ben.', ['A', '10‑12-ben', '.']), - ('A 10‒12 van.', ['A', '10‒12', 'van', '.']), - ('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']), - ('A 10‒12-ben.', ['A', '10‒12-ben', '.']), - ('A 10–12 van.', ['A', '10–12', 'van', '.']), - ('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']), - ('A 10–12-ben.', ['A', '10–12-ben', '.']), - ('A 10—12 van.', ['A', '10—12', 'van', '.']), - ('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']), - ('A 10—12-ben.', ['A', '10—12-ben', '.']), - ('A 10―12 van.', ['A', '10―12', 'van', '.']), - ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']), - ('A 10―12-ben.', ['A', '10―12-ben', '.']), - ('A -23,12 van.', ['A', '-23,12', 'van', '.']), - ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']), - ('A -23,12-ben.', ['A', '-23,12-ben', '.']), - ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A C++ van.', ['A', 'C++', 'van', '.']), - ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']), - ('A C++.', ['A', 'C++', '.']), - ('A C++-ben.', ['A', 'C++-ben', '.']), - ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']), - ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']), - ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']), - ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']), - ('A 2003. 01. 06. 
van.', ['A', '2003.', '01.', '06.', 'van', '.']), - ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']), - ('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']), - ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']), - ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']), - ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']), - ('A IV. 12.', ['A', 'IV.', '12.']), - ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']), - ('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']), - ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']), - ('A 2003.01.06.', ['A', '2003.01.06.']), - ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']), - ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']), - ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']), - ('A IV.12.', ['A', 'IV.12.']), - ('A IV.12-ben.', ['A', 'IV.12-ben', '.']), - ('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']), - ('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']), - ('A 1.1.2.', ['A', '1.1.2.']), - ('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']), - ('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']), - ('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']), - ('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']), - ('A 3,14 van.', ['A', '3,14', 'van', '.']), - ('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']), - ('A 3,14-ben.', ['A', '3,14-ben', '.']), - ('A 3.14 van.', ['A', '3.14', 'van', '.']), - ('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']), - ('A 3.14-ben.', ['A', '3.14-ben', '.']), - ('A 15. van.', ['A', '15.', 'van', '.']), - ('A 15-ben van.', ['A', '15-ben', 'van', '.']), - ('A 15-ben.', ['A', '15-ben', '.']), - ('A 15.-ben van.', ['A', '15.-ben', 'van', '.']), - ('A 15.-ben.', ['A', '15.-ben', '.']), - ('A 2002--2003. 
van.', ['A', '2002--2003.', 'van', '.']), - ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']), - ('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']), - ('A -0,99% van.', ['A', '-0,99%', 'van', '.']), - ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']), - ('A -0,99%.', ['A', '-0,99%', '.']), - ('A -0,99%-ben.', ['A', '-0,99%-ben', '.']), - ('A 10--20% van.', ['A', '10--20%', 'van', '.']), - ('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']), - ('A 10--20%.', ['A', '10--20%', '.']), - ('A 10--20%-ben.', ['A', '10--20%-ben', '.']), - ('A 99§ van.', ['A', '99§', 'van', '.']), - ('A 99§-ben van.', ['A', '99§-ben', 'van', '.']), - ('A 99§-ben.', ['A', '99§-ben', '.']), - ('A 10--20§ van.', ['A', '10--20§', 'van', '.']), - ('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']), - ('A 10--20§-ben.', ['A', '10--20§-ben', '.']), - ('A 99° van.', ['A', '99°', 'van', '.']), - ('A 99°-ben van.', ['A', '99°-ben', 'van', '.']), - ('A 99°-ben.', ['A', '99°-ben', '.']), - ('A 10--20° van.', ['A', '10--20°', 'van', '.']), - ('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']), - ('A 10--20°-ben.', ['A', '10--20°-ben', '.']), - ('A °C van.', ['A', '°C', 'van', '.']), - ('A °C-ben van.', ['A', '°C-ben', 'van', '.']), - ('A °C.', ['A', '°C', '.']), - ('A °C-ben.', ['A', '°C-ben', '.']), - ('A 100°C van.', ['A', '100°C', 'van', '.']), - ('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']), - ('A 100°C.', ['A', '100°C', '.']), - ('A 100°C-ben.', ['A', '100°C-ben', '.']), - ('A 800x600 van.', ['A', '800x600', 'van', '.']), - ('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']), - ('A 800x600-ben.', ['A', '800x600-ben', '.']), - ('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']), - ('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']), - ('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']), - ('A 5/J van.', ['A', '5/J', 'van', '.']), - ('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']), - ('A 5/J-ben.', ['A', '5/J-ben', '.']), - ('A 5/J. 
van.', ['A', '5/J.', 'van', '.']), - ('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']), - ('A 5/J.-ben.', ['A', '5/J.-ben', '.']), - ('A III/1 van.', ['A', 'III/1', 'van', '.']), - ('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']), - ('A III/1-ben.', ['A', 'III/1-ben', '.']), - ('A III/1. van.', ['A', 'III/1.', 'van', '.']), - ('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']), - ('A III/1.-ben.', ['A', 'III/1.-ben', '.']), - ('A III/c van.', ['A', 'III/c', 'van', '.']), - ('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']), - ('A III/c.', ['A', 'III/c', '.']), - ('A III/c-ben.', ['A', 'III/c-ben', '.']), - ('A TU–154 van.', ['A', 'TU–154', 'van', '.']), - ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']), - ('A TU–154-ben.', ['A', 'TU–154-ben', '.'])] +NUMBER_TESTS = [ + ('A 2b van.', ['A', '2b', 'van', '.']), + ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']), + ('A 2b.', ['A', '2b', '.']), + ('A 2b-ben.', ['A', '2b-ben', '.']), + ('A 3.b van.', ['A', '3.b', 'van', '.']), + ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']), + ('A 3.b.', ['A', '3.b', '.']), + ('A 3.b-ben.', ['A', '3.b-ben', '.']), + ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']), + ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']), + ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']), + ('A 1:35 van.', ['A', '1:35', 'van', '.']), + ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']), + ('A 1:35-ben.', ['A', '1:35-ben', '.']), + ('A 1.35 van.', ['A', '1.35', 'van', '.']), + ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']), + ('A 1.35-ben.', ['A', '1.35-ben', '.']), + ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']), + ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']), + ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']), + ('A 10--12 van.', ['A', '10--12', 'van', '.']), + ('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']), + ('A 10--12-ben.', ['A', '10--12-ben', '.']), + ('A 10‐12 van.', ['A', '10‐12', 'van', '.']), + ('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', 
'.']), + ('A 10‐12-ben.', ['A', '10‐12-ben', '.']), + ('A 10‑12 van.', ['A', '10‑12', 'van', '.']), + ('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']), + ('A 10‑12-ben.', ['A', '10‑12-ben', '.']), + ('A 10‒12 van.', ['A', '10‒12', 'van', '.']), + ('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']), + ('A 10‒12-ben.', ['A', '10‒12-ben', '.']), + ('A 10–12 van.', ['A', '10–12', 'van', '.']), + ('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']), + ('A 10–12-ben.', ['A', '10–12-ben', '.']), + ('A 10—12 van.', ['A', '10—12', 'van', '.']), + ('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']), + ('A 10—12-ben.', ['A', '10—12-ben', '.']), + ('A 10―12 van.', ['A', '10―12', 'van', '.']), + ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']), + ('A 10―12-ben.', ['A', '10―12-ben', '.']), + ('A -23,12 van.', ['A', '-23,12', 'van', '.']), + ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']), + ('A -23,12-ben.', ['A', '-23,12-ben', '.']), + ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A C++ van.', ['A', 'C++', 'van', '.']), + ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']), + ('A C++.', ['A', 'C++', '.']), + ('A C++-ben.', ['A', 'C++-ben', '.']), + ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']), + ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']), + ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']), + ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']), + ('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']), + ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']), + ('A 2003. 01. 
06.', ['A', '2003.', '01.', '06.']), + ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']), + ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']), + ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']), + ('A IV. 12.', ['A', 'IV.', '12.']), + ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']), + ('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']), + ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']), + ('A 2003.01.06.', ['A', '2003.01.06.']), + ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']), + ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']), + ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']), + ('A IV.12.', ['A', 'IV.12.']), + ('A IV.12-ben.', ['A', 'IV.12-ben', '.']), + ('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']), + ('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']), + ('A 1.1.2.', ['A', '1.1.2.']), + ('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']), + ('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']), + ('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']), + ('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']), + ('A 3,14 van.', ['A', '3,14', 'van', '.']), + ('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']), + ('A 3,14-ben.', ['A', '3,14-ben', '.']), + ('A 3.14 van.', ['A', '3.14', 'van', '.']), + ('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']), + ('A 3.14-ben.', ['A', '3.14-ben', '.']), + ('A 15. van.', ['A', '15.', 'van', '.']), + ('A 15-ben van.', ['A', '15-ben', 'van', '.']), + ('A 15-ben.', ['A', '15-ben', '.']), + ('A 15.-ben van.', ['A', '15.-ben', 'van', '.']), + ('A 15.-ben.', ['A', '15.-ben', '.']), + ('A 2002--2003. 
van.', ['A', '2002--2003.', 'van', '.']), + ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']), + ('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']), + ('A -0,99% van.', ['A', '-0,99%', 'van', '.']), + ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']), + ('A -0,99%.', ['A', '-0,99%', '.']), + ('A -0,99%-ben.', ['A', '-0,99%-ben', '.']), + ('A 10--20% van.', ['A', '10--20%', 'van', '.']), + ('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']), + ('A 10--20%.', ['A', '10--20%', '.']), + ('A 10--20%-ben.', ['A', '10--20%-ben', '.']), + ('A 99§ van.', ['A', '99§', 'van', '.']), + ('A 99§-ben van.', ['A', '99§-ben', 'van', '.']), + ('A 99§-ben.', ['A', '99§-ben', '.']), + ('A 10--20§ van.', ['A', '10--20§', 'van', '.']), + ('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']), + ('A 10--20§-ben.', ['A', '10--20§-ben', '.']), + ('A 99° van.', ['A', '99°', 'van', '.']), + ('A 99°-ben van.', ['A', '99°-ben', 'van', '.']), + ('A 99°-ben.', ['A', '99°-ben', '.']), + ('A 10--20° van.', ['A', '10--20°', 'van', '.']), + ('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']), + ('A 10--20°-ben.', ['A', '10--20°-ben', '.']), + ('A °C van.', ['A', '°C', 'van', '.']), + ('A °C-ben van.', ['A', '°C-ben', 'van', '.']), + ('A °C.', ['A', '°C', '.']), + ('A °C-ben.', ['A', '°C-ben', '.']), + ('A 100°C van.', ['A', '100°C', 'van', '.']), + ('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']), + ('A 100°C.', ['A', '100°C', '.']), + ('A 100°C-ben.', ['A', '100°C-ben', '.']), + ('A 800x600 van.', ['A', '800x600', 'van', '.']), + ('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']), + ('A 800x600-ben.', ['A', '800x600-ben', '.']), + ('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']), + ('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']), + ('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']), + ('A 5/J van.', ['A', '5/J', 'van', '.']), + ('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']), + ('A 5/J-ben.', ['A', '5/J-ben', '.']), + ('A 5/J. 
van.', ['A', '5/J.', 'van', '.']), + ('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']), + ('A 5/J.-ben.', ['A', '5/J.-ben', '.']), + ('A III/1 van.', ['A', 'III/1', 'van', '.']), + ('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']), + ('A III/1-ben.', ['A', 'III/1-ben', '.']), + ('A III/1. van.', ['A', 'III/1.', 'van', '.']), + ('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']), + ('A III/1.-ben.', ['A', 'III/1.-ben', '.']), + ('A III/c van.', ['A', 'III/c', 'van', '.']), + ('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']), + ('A III/c.', ['A', 'III/c', '.']), + ('A III/c-ben.', ['A', 'III/c-ben', '.']), + ('A TU–154 van.', ['A', 'TU–154', 'van', '.']), + ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']), + ('A TU–154-ben.', ['A', 'TU–154-ben', '.']) +] -_QUOTE_TESTS = [('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), - ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), - ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), - ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']), - ("A don't van.", ['A', "don't", 'van', '.'])] +QUOTE_TESTS = [ + ('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), + ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), + ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), + ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']), + ("A don't van.", ['A', "don't", 'van', '.']) +] -_DOT_TESTS = [('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), - ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), - ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), - ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), - ('A S.M.A.R.T. 
szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), - ('A .hu.', ['A', '.hu', '.']), - ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), - ('A pl.', ['A', 'pl.']), - ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), - ('Egy..ket.', ['Egy', '..', 'ket', '.']), - ('Valami... van.', ['Valami', '...', 'van', '.']), - ('Valami ...van...', ['Valami', '...', 'van', '...']), - ('Valami...', ['Valami', '...']), - ('Valami ...', ['Valami', '...']), - ('Valami ... más.', ['Valami', '...', 'más', '.'])] +DOT_TESTS = [ + ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), + ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), + ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), + ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), + ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), + ('A .hu.', ['A', '.hu', '.']), + ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), + ('A pl.', ['A', 'pl.']), + ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), + ('Egy..ket.', ['Egy', '..', 'ket', '.']), + ('Valami... van.', ['Valami', '...', 'van', '.']), + ('Valami ...van...', ['Valami', '...', 'van', '...']), + ('Valami...', ['Valami', '...']), + ('Valami ...', ['Valami', '...']), + ('Valami ... 
más.', ['Valami', '...', 'más', '.']) +] -@pytest.fixture(scope="session") -def HU(): - return Hungarian() +TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS # + NUMBER_TESTS + HYPHEN_TESTS -@pytest.fixture(scope="module") -def hu_tokenizer(HU): - return HU.tokenizer - - -@pytest.mark.parametrize(("input", "expected_tokens"), - _DEFAULT_TESTS + _HYPHEN_TESTS + _NUMBER_TESTS + _DOT_TESTS + _QUOTE_TESTS) -def test_testcases(hu_tokenizer, input, expected_tokens): - tokens = hu_tokenizer(input) - token_list = [token.orth_ for token in tokens if not token.is_space] +@pytest.mark.parametrize('text,expected_tokens', TESTCASES) +def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens): + tokens = hu_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list diff --git a/spacy/tests/regression/test_issue351.py b/spacy/tests/regression/test_issue351.py new file mode 100644 index 000000000..84d4398c5 --- /dev/null +++ b/spacy/tests/regression/test_issue351.py @@ -0,0 +1,16 @@ +from __future__ import unicode_literals +from ...en import English + +import pytest + + +@pytest.fixture +def en_tokenizer(): + return English.Defaults.create_tokenizer() + + +def test_issue351(en_tokenizer): + doc = en_tokenizer(" This is a cat.") + assert doc[0].idx == 0 + assert len(doc[0]) == 3 + assert doc[1].idx == 3 diff --git a/spacy/tests/regression/test_issue360.py b/spacy/tests/regression/test_issue360.py new file mode 100644 index 000000000..018289030 --- /dev/null +++ b/spacy/tests/regression/test_issue360.py @@ -0,0 +1,14 @@ +from __future__ import unicode_literals +from ...en import English + +import pytest + + +@pytest.fixture +def en_tokenizer(): + return English.Defaults.create_tokenizer() + + +def test_big_ellipsis(en_tokenizer): + tokens = en_tokenizer(u'$45...............Asking') + assert len(tokens) > 2 diff --git a/spacy/tests/sun.tokens b/spacy/tests/sun.tokens deleted file mode 100644 index 
4b912e18e..000000000 --- a/spacy/tests/sun.tokens +++ /dev/null @@ -1,4 +0,0 @@ -The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ] - -The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. 
[ 18 ] [ 19 ] -Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ] diff --git a/spacy/tests/tokenizer/conftest.py b/spacy/tests/tokenizer/conftest.py index 06ccde7b3..c8e340208 100644 --- a/spacy/tests/tokenizer/conftest.py +++ b/spacy/tests/tokenizer/conftest.py @@ -1,7 +1,23 @@ +# coding: utf-8 +from __future__ import unicode_literals + import pytest -from spacy.en import English + +from ...en import English +from ...de import German +from ...es import Spanish +from ...it import Italian +from ...fr import French +from ...pt import Portuguese +from ...nl import Dutch +from ...sv import Swedish +from ...hu import Hungarian -@pytest.fixture(scope="module") -def en_tokenizer(EN): - return EN.tokenizer +LANGUAGES = [English, German, Spanish, Italian, French, Dutch, Swedish, Hungarian] + + +@pytest.fixture(params=LANGUAGES) +def tokenizer(request): + lang = request.param + return lang.Defaults.create_tokenizer() diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py deleted file mode 100644 index 568e34704..000000000 --- a/spacy/tests/tokenizer/test_contractions.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import unicode_literals -import pytest - 
- -def test_possess(en_tokenizer): - tokens = en_tokenizer("Mike's") - assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike" - assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s" - assert len(tokens) == 2 - - -def test_apostrophe(en_tokenizer): - tokens = en_tokenizer("schools'") - assert len(tokens) == 2 - assert tokens[1].orth_ == "'" - assert tokens[0].orth_ == "schools" - - -def test_LL(en_tokenizer): - tokens = en_tokenizer("we'll") - assert len(tokens) == 2 - assert tokens[1].orth_ == "'ll" - assert tokens[1].lemma_ == "will" - assert tokens[0].orth_ == "we" - - -def test_aint(en_tokenizer): - tokens = en_tokenizer("ain't") - assert len(tokens) == 2 - assert tokens[0].orth_ == "ai" - assert tokens[0].lemma_ == "be" - assert tokens[1].orth_ == "n't" - assert tokens[1].lemma_ == "not" - -def test_capitalized(en_tokenizer): - tokens = en_tokenizer("can't") - assert len(tokens) == 2 - tokens = en_tokenizer("Can't") - assert len(tokens) == 2 - tokens = en_tokenizer("Ain't") - assert len(tokens) == 2 - assert tokens[0].orth_ == "Ai" - assert tokens[0].lemma_ == "be" - - -def test_punct(en_tokenizer): - tokens = en_tokenizer("We've") - assert len(tokens) == 2 - tokens = en_tokenizer("``We've") - assert len(tokens) == 3 - - -@pytest.mark.xfail -def test_therell(en_tokenizer): - tokens = en_tokenizer("there'll") - assert len(tokens) == 2 - assert tokens[0].text == "there" - assert tokens[1].text == "there" diff --git a/spacy/tests/tokenizer/test_emoticons.py b/spacy/tests/tokenizer/test_emoticons.py deleted file mode 100644 index e0022dbbd..000000000 --- a/spacy/tests/tokenizer/test_emoticons.py +++ /dev/null @@ -1,35 +0,0 @@ -from __future__ import unicode_literals -import pytest - - -def test_tweebo_challenge(en_tokenizer): - text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" - tokens = en_tokenizer(text) - assert tokens[0].orth_ == ":o" - assert tokens[1].orth_ == ":/" - assert tokens[2].orth_ == ":'(" - assert 
tokens[3].orth_ == ">:o" - assert tokens[4].orth_ == "(:" - assert tokens[5].orth_ == ":)" - assert tokens[6].orth_ == ">.<" - assert tokens[7].orth_ == "XD" - assert tokens[8].orth_ == "-__-" - assert tokens[9].orth_ == "o.O" - assert tokens[10].orth_ == ";D" - assert tokens[11].orth_ == ":-)" - assert tokens[12].orth_ == "@_@" - assert tokens[13].orth_ == ":P" - assert tokens[14].orth_ == "8D" - assert tokens[15].orth_ == ":1" - assert tokens[16].orth_ == ">:(" - assert tokens[17].orth_ == ":D" - assert tokens[18].orth_ == "=|" - assert tokens[19].orth_ == '")' - assert tokens[20].orth_ == ':>' - assert tokens[21].orth_ == '....' - - -def test_false_positive(en_tokenizer): - text = "example:)" - tokens = en_tokenizer(text) - assert len(tokens) == 3 diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py new file mode 100644 index 000000000..aab27714e --- /dev/null +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -0,0 +1,41 @@ +# coding: utf-8 +"""Test that tokenizer exceptions and emoticons are handled correctly.""" + + +from __future__ import unicode_literals + +import pytest + + +def test_tokenizer_handles_emoticons(tokenizer): + # Tweebo challenge (CMU) + text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" + tokens = tokenizer(text) + assert tokens[0].text == ":o" + assert tokens[1].text == ":/" + assert tokens[2].text == ":'(" + assert tokens[3].text == ">:o" + assert tokens[4].text == "(:" + assert tokens[5].text == ":)" + assert tokens[6].text == ">.<" + assert tokens[7].text == "XD" + assert tokens[8].text == "-__-" + assert tokens[9].text == "o.O" + assert tokens[10].text == ";D" + assert tokens[11].text == ":-)" + assert tokens[12].text == "@_@" + assert tokens[13].text == ":P" + assert tokens[14].text == "8D" + assert tokens[15].text == ":1" + assert tokens[16].text == ">:(" + assert tokens[17].text == ":D" + assert tokens[18].text == "=|" + assert tokens[19].text == '")' + 
assert tokens[20].text == ':>' + assert tokens[21].text == '....' + + +@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)]) +def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): + tokens = tokenizer(text) + assert len(tokens) == length diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py deleted file mode 100644 index 1b7cbaa7b..000000000 --- a/spacy/tests/tokenizer/test_infix.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import unicode_literals - -import pytest - -def test_hyphen(en_tokenizer): - tokens = en_tokenizer('best-known') - assert len(tokens) == 3 - - -def test_numeric_range(en_tokenizer): - tokens = en_tokenizer('0.1-13.5') - assert len(tokens) == 3 - -def test_period(en_tokenizer): - tokens = en_tokenizer('best.Known') - assert len(tokens) == 3 - tokens = en_tokenizer('zombo.com') - assert len(tokens) == 1 - - -def test_ellipsis(en_tokenizer): - tokens = en_tokenizer('best...Known') - assert len(tokens) == 3 - tokens = en_tokenizer('best...known') - assert len(tokens) == 3 - -def test_big_ellipsis(en_tokenizer): - '''Test regression identified in Issue #360''' - tokens = en_tokenizer(u'$45...............Asking') - assert len(tokens) > 2 - - - -def test_email(en_tokenizer): - tokens = en_tokenizer('hello@example.com') - assert len(tokens) == 1 - tokens = en_tokenizer('hi+there@gmail.it') - assert len(tokens) == 1 - - -def test_double_hyphen(en_tokenizer): - tokens = en_tokenizer(u'No decent--let alone well-bred--people.') - assert tokens[0].text == u'No' - assert tokens[1].text == u'decent' - assert tokens[2].text == u'--' - assert tokens[3].text == u'let' - assert tokens[4].text == u'alone' - assert tokens[5].text == u'well' - assert tokens[6].text == u'-' - # TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter - # on infixes. 
- assert tokens[7].text == u'bred' - assert tokens[8].text == u'--' - assert tokens[9].text == u'people' - - -def test_infix_comma(en_tokenizer): - # Re issue #326 - tokens = en_tokenizer(u'Hello,world') - assert tokens[0].text == u'Hello' - assert tokens[1].text == u',' - assert tokens[2].text == u'world' diff --git a/spacy/tests/tokenizer/test_only_punct.py b/spacy/tests/tokenizer/test_only_punct.py deleted file mode 100644 index 12c958088..000000000 --- a/spacy/tests/tokenizer/test_only_punct.py +++ /dev/null @@ -1,9 +0,0 @@ -from __future__ import unicode_literals - - -def test_only_pre1(en_tokenizer): - assert len(en_tokenizer("(")) == 1 - - -def test_only_pre2(en_tokenizer): - assert len(en_tokenizer("((")) == 2 diff --git a/spacy/tests/tokenizer/test_post_punct.py b/spacy/tests/tokenizer/test_post_punct.py deleted file mode 100644 index ff1120c63..000000000 --- a/spacy/tests/tokenizer/test_post_punct.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals -import pytest - - -@pytest.fixture -def close_puncts(): - return [')', ']', '}', '*'] - - -def test_close(close_puncts, en_tokenizer): - word_str = 'Hello' - for p in close_puncts: - string = word_str + p - tokens = en_tokenizer(string) - assert len(tokens) == 2 - assert tokens[1].string == p - assert tokens[0].string == word_str - - -def test_two_different_close(close_puncts, en_tokenizer): - word_str = 'Hello' - for p in close_puncts: - string = word_str + p + "'" - tokens = en_tokenizer(string) - assert len(tokens) == 3 - assert tokens[0].string == word_str - assert tokens[1].string == p - assert tokens[2].string == "'" - - -def test_three_same_close(close_puncts, en_tokenizer): - word_str = 'Hello' - for p in close_puncts: - string = word_str + p + p + p - tokens = en_tokenizer(string) - assert len(tokens) == 4 - assert tokens[0].string == word_str - assert tokens[1].string == p - - -def test_double_end_quote(en_tokenizer): - assert len(en_tokenizer("Hello''")) == 2 - assert 
len(en_tokenizer("''")) == 1 diff --git a/spacy/tests/tokenizer/test_pre_punct.py b/spacy/tests/tokenizer/test_pre_punct.py deleted file mode 100644 index 9aec1dc7b..000000000 --- a/spacy/tests/tokenizer/test_pre_punct.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import unicode_literals - -import pytest - - -@pytest.fixture -def open_puncts(): - return ['(', '[', '{', '*'] - - -def test_open(open_puncts, en_tokenizer): - word_str = 'Hello' - for p in open_puncts: - string = p + word_str - tokens = en_tokenizer(string) - assert len(tokens) == 2 - assert tokens[0].orth_ == p - assert tokens[1].orth_ == word_str - - -def test_two_different_open(open_puncts, en_tokenizer): - word_str = 'Hello' - for p in open_puncts: - string = p + "`" + word_str - tokens = en_tokenizer(string) - assert len(tokens) == 3 - assert tokens[0].orth_ == p - assert tokens[1].orth_ == "`" - assert tokens[2].orth_ == word_str - - -def test_three_same_open(open_puncts, en_tokenizer): - word_str = 'Hello' - for p in open_puncts: - string = p + p + p + word_str - tokens = en_tokenizer(string) - assert len(tokens) == 4 - assert tokens[0].orth_ == p - assert tokens[3].orth_ == word_str - - -def test_open_appostrophe(en_tokenizer): - string = "'The" - tokens = en_tokenizer(string) - assert len(tokens) == 2 - assert tokens[0].orth_ == "'" diff --git a/spacy/tests/tokenizer/test_special_affix.py b/spacy/tests/tokenizer/test_special_affix.py deleted file mode 100644 index 62cf114f1..000000000 --- a/spacy/tests/tokenizer/test_special_affix.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Test entries in the tokenization special-case interacting with prefix -and suffix punctuation.""" -from __future__ import unicode_literals -import pytest - - -def test_no_special(en_tokenizer): - assert len(en_tokenizer("(can)")) == 3 - - -def test_no_punct(en_tokenizer): - assert len(en_tokenizer("can't")) == 2 - - -def test_prefix(en_tokenizer): - assert len(en_tokenizer("(can't")) == 3 - - -def test_suffix(en_tokenizer): - 
assert len(en_tokenizer("can't)")) == 3 - - -def test_wrap(en_tokenizer): - assert len(en_tokenizer("(can't)")) == 4 - - -def test_uneven_wrap(en_tokenizer): - assert len(en_tokenizer("(can't?)")) == 5 - - -def test_prefix_interact(en_tokenizer): - assert len(en_tokenizer("U.S.")) == 1 - assert len(en_tokenizer("us.")) == 2 - assert len(en_tokenizer("(U.S.")) == 2 - - -def test_suffix_interact(en_tokenizer): - assert len(en_tokenizer("U.S.)")) == 2 - - -def test_even_wrap_interact(en_tokenizer): - assert len(en_tokenizer("(U.S.)")) == 3 - - -def test_uneven_wrap_interact(en_tokenizer): - assert len(en_tokenizer("(U.S.?)")) == 4 diff --git a/spacy/tests/tokenizer/test_string_loading.py b/spacy/tests/tokenizer/test_string_loading.py deleted file mode 100644 index 1bc5539bc..000000000 --- a/spacy/tests/tokenizer/test_string_loading.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Test suspected freeing of strings""" -from __future__ import unicode_literals - - -def test_one(en_tokenizer): - tokens = en_tokenizer('Betty Botter bought a pound of butter.') - assert tokens[0].orth_ == 'Betty' - tokens2 = en_tokenizer('Betty also bought a pound of butter.') - assert tokens2[0].orth_ == 'Betty' diff --git a/spacy/tests/tokenizer/test_surround_punct.py b/spacy/tests/tokenizer/test_surround_punct.py deleted file mode 100644 index 7c7a50904..000000000 --- a/spacy/tests/tokenizer/test_surround_punct.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import unicode_literals -import pytest - - -@pytest.fixture -def paired_puncts(): - return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] - - -def test_token(paired_puncts, en_tokenizer): - word_str = 'Hello' - for open_, close_ in paired_puncts: - string = open_ + word_str + close_ - tokens = en_tokenizer(string) - assert len(tokens) == 3 - assert tokens[0].orth_ == open_ - assert tokens[1].orth_ == word_str - assert tokens[2].orth_ == close_ - - -def test_two_different(paired_puncts, en_tokenizer): - word_str = 'Hello' - for open_, close_ 
in paired_puncts: - string = "`" + open_ + word_str + close_ + "'" - tokens = en_tokenizer(string) - assert len(tokens) == 5 - assert tokens[0].orth_ == "`" - assert tokens[1].orth_ == open_ - assert tokens[2].orth_ == word_str - assert tokens[2].orth_ == word_str - assert tokens[3].orth_ == close_ - assert tokens[4].orth_ == "'" diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 091561ae3..9db007d7e 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -1,172 +1,83 @@ # coding: utf-8 from __future__ import unicode_literals +from os import path import pytest -import io -import pickle -import cloudpickle -import tempfile -from ... import util -from ...language_data import TOKENIZER_PREFIXES +from ...util import utf8open -en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search -# @pytest.mark.xfail -# def test_pickle(en_tokenizer): -# file_ = io.BytesIO() -# cloudpickle.dump(en_tokenizer, file_) -# file_.seek(0) -# loaded = pickle.load(file_) -# assert loaded is not None - -def test_pre_punct_regex(): - string = "(can't" - match = en_search_prefixes(string) - assert match.group() == "(" - -def test_no_word(en_tokenizer): - tokens = en_tokenizer(u'') +def test_tokenizer_handles_no_word(tokenizer): + tokens = tokenizer("") assert len(tokens) == 0 -def test_single_word(en_tokenizer): - tokens = en_tokenizer(u'hello') - assert tokens[0].orth_ == 'hello' +@pytest.mark.parametrize('text', ["lorem"]) +def test_tokenizer_handles_single_word(tokenizer, text): + tokens = tokenizer(text) + assert tokens[0].text == text -def test_two_words(en_tokenizer): - tokens = en_tokenizer('hello possums') - assert len(tokens) == 2 - assert tokens[0].orth_ != tokens[1].orth_ - - -def test_punct(en_tokenizer): - tokens = en_tokenizer('hello, possums.') +def test_tokenizer_handles_punct(tokenizer): + text = "Lorem, ipsum." 
+ tokens = tokenizer(text) assert len(tokens) == 4 - assert tokens[0].orth_ == 'hello' - assert tokens[1].orth_ == ',' - assert tokens[2].orth_ == 'possums' - assert tokens[1].orth_ != 'hello' + assert tokens[0].text == "Lorem" + assert tokens[1].text == "," + assert tokens[2].text == "ipsum" + assert tokens[1].text != "Lorem" -def test_digits(en_tokenizer): - tokens = en_tokenizer('The year: 1984.') - assert len(tokens) == 5 - assert tokens[0].orth == en_tokenizer.vocab['The'].orth - assert tokens[3].orth == en_tokenizer.vocab['1984'].orth +def test_tokenizer_handles_digits(tokenizer): + exceptions = ["hu"] + text = "Lorem ipsum: 1984." + tokens = tokenizer(text) + + if tokens[0].lang_ not in exceptions: + assert len(tokens) == 5 + assert tokens[0].text == "Lorem" + assert tokens[3].text == "1984" -def test_contraction(en_tokenizer): - tokens = en_tokenizer("don't giggle") - assert len(tokens) == 3 - assert tokens[1].orth == en_tokenizer.vocab["n't"].orth - tokens = en_tokenizer("i said don't!") - assert len(tokens) == 5 - assert tokens[4].orth == en_tokenizer.vocab['!'].orth - -def test_contraction_punct(en_tokenizer): - tokens = [w.text for w in en_tokenizer("(can't")] - assert tokens == ['(', 'ca', "n't"] - tokens = en_tokenizer("`ain't") - assert len(tokens) == 3 - tokens = en_tokenizer('''"isn't''') - assert len(tokens) == 3 - tokens = en_tokenizer("can't!") - assert len(tokens) == 3 +@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"]) +def test_tokenizer_keep_urls(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 1 -def test_sample(en_tokenizer): - text = """Tributes pour in for late British Labour Party leader +@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"]) +def test_tokenizer_keeps_email(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 1 -Tributes poured in from around the world Thursday -to the late Labour Party leader John 
Smith, who died earlier from a massive -heart attack aged 55. -In Washington, the US State Department issued a statement regretting "the -untimely death" of the rapier-tongued Scottish barrister and parliamentarian. +def test_tokenizer_handles_long_text(tokenizer): + text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit -"Mr. Smith, throughout his distinguished""" +Cras egestas orci non porttitor maximus. +Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate. - tokens = en_tokenizer(text) +Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris. + +"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo.""" + + tokens = tokenizer(text) assert len(tokens) > 5 -def test_cnts1(en_tokenizer): - text = u"""The U.S. Army likes Shock and Awe.""" - tokens = en_tokenizer(text) - assert len(tokens) == 8 +@pytest.mark.parametrize('file_name', ["sun.txt"]) +def test_tokenizer_handle_text_from_file(tokenizer, file_name): + loc = path.join(path.dirname(__file__), '..', file_name) + text = utf8open(loc).read() + assert len(text) != 0 + tokens = tokenizer(text) + assert len(tokens) > 100 -def test_cnts2(en_tokenizer): - text = u"""U.N. regulations are not a part of their concern.""" - tokens = en_tokenizer(text) - assert len(tokens) == 10 - - -def test_cnts3(en_tokenizer): - text = u"“Isn't it?”" - tokens = en_tokenizer(text) - words = [t.orth_ for t in tokens] - assert len(words) == 6 - - -def test_cnts4(en_tokenizer): - text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """ - tokens = en_tokenizer(text) - words = [t.orth_ for t in tokens] - assert len(words) == 15 - - -def test_cnts5(en_tokenizer): - text = """'Me too!', Mr. P. Delaware cried. 
""" - tokens = en_tokenizer(text) - assert len(tokens) == 11 - - -@pytest.mark.xfail -def test_mr(en_tokenizer): - text = """Today is Tuesday.Mr.""" - tokens = en_tokenizer(text) - assert len(tokens) == 5 - assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.'] - - -def test_cnts6(en_tokenizer): - text = u'They ran about 10km.' - tokens = en_tokenizer(text) - words = [t.orth_ for t in tokens] - assert len(words) == 6 - -def test_bracket_period(en_tokenizer): - text = u'(And a 6a.m. run through Washington Park).' - tokens = en_tokenizer(text) - assert tokens[len(tokens) - 1].orth_ == u'.' - - -def test_ie(en_tokenizer): - text = u"It's mediocre i.e. bad." - tokens = en_tokenizer(text) - assert len(tokens) == 6 - assert tokens[3].orth_ == "i.e." - - -def test_two_whitespace(en_tokenizer): - orig_str = u'there are 2 spaces after this ' - tokens = en_tokenizer(orig_str) - assert repr(tokens.text_with_ws) == repr(orig_str) - - -@pytest.mark.xfail -def test_em_dash_infix(en_tokenizer): - # Re Issue #225 - tokens = en_tokenizer('''Will this road take me to Puddleton?\u2014No, ''' - '''you'll have to walk there.\u2014Ariel.''') - assert tokens[6].text == 'Puddleton' - assert tokens[7].text == '?' - assert tokens[8].text == '\u2014' - -#def test_cnts7(): -# text = 'But then the 6,000-year ice age came...' -# tokens = EN.tokenize(text) -# assert len(tokens) == 10 +def test_tokenizer_suspected_freeing_strings(tokenizer): + text1 = "Lorem dolor sit amet, consectetur adipiscing elit." + text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit." 
+ tokens1 = tokenizer(text1) + tokens2 = tokenizer(text2) + assert tokens1[0].text == "Lorem" + assert tokens2[0].text == "Lorem" diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py index ad34c8791..7ff3106a8 100644 --- a/spacy/tests/tokenizer/test_whitespace.py +++ b/spacy/tests/tokenizer/test_whitespace.py @@ -1,67 +1,51 @@ +# coding: utf-8 """Test that tokens are created correctly for whitespace.""" + + from __future__ import unicode_literals import pytest -def test_single_space(en_tokenizer): - tokens = en_tokenizer('hello possums') +@pytest.mark.parametrize('text', ["lorem ipsum"]) +def test_tokenizer_splits_single_space(tokenizer, text): + tokens = tokenizer(text) assert len(tokens) == 2 -def test_double_space(en_tokenizer): - tokens = en_tokenizer('hello possums') +@pytest.mark.parametrize('text', ["lorem ipsum"]) +def test_tokenizer_splits_double_space(tokenizer, text): + tokens = tokenizer(text) assert len(tokens) == 3 - assert tokens[1].orth_ == ' ' + assert tokens[1].text == " " -def test_newline(en_tokenizer): - tokens = en_tokenizer('hello\npossums') +@pytest.mark.parametrize('text', ["lorem ipsum "]) +def test_tokenizer_handles_double_trainling_ws(tokenizer, text): + tokens = tokenizer(text) + assert repr(tokens.text_with_ws) == repr(text) + + +@pytest.mark.parametrize('text', ["lorem\nipsum"]) +def test_tokenizer_splits_newline(tokenizer, text): + tokens = tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == "\n" + + +@pytest.mark.parametrize('text', ["lorem \nipsum"]) +def test_tokenizer_splits_newline_space(tokenizer, text): + tokens = tokenizer(text) assert len(tokens) == 3 -def test_newline_space(en_tokenizer): - tokens = en_tokenizer('hello \npossums') +@pytest.mark.parametrize('text', ["lorem \nipsum"]) +def test_tokenizer_splits_newline_double_space(tokenizer, text): + tokens = tokenizer(text) assert len(tokens) == 3 -def test_newline_double_space(en_tokenizer): - tokens = 
en_tokenizer('hello \npossums') +@pytest.mark.parametrize('text', ["lorem \n ipsum"]) +def test_tokenizer_splits_newline_space_wrap(tokenizer, text): + tokens = tokenizer(text) assert len(tokens) == 3 - - -def test_newline_space_wrap(en_tokenizer): - tokens = en_tokenizer('hello \n possums') - assert len(tokens) == 3 - - -def test_leading_space_offsets(en_tokenizer): - '''Issue #351 - # this works - - text1 = u"This is a cat." - a = english_spacy(text1) - - tok0 = list(a.sents)[0][0] - print tok0, tok0.idx, text1[tok0.idx] - - tok1 = list(a.sents)[0][1] - print tok1, tok1.idx, text1[tok1.idx] - - print "==" - - # this does not work - - text2 = u" This is a cat." - b = english_spacy(text2) - - tok0 = list(b.sents)[0][0] -print tok0, tok0.idx, text2[tok0.idx] - - tok1 = list(b.sents)[0][1] - print tok1, tok1.idx, text2[tok1.idx] - ''' - doc = en_tokenizer(u" This is a cat.") - assert doc[0].idx == 0 - assert len(doc[0]) == 3 - assert doc[1].idx == 3 diff --git a/spacy/tests/tokenizer/test_wiki_sun.py b/spacy/tests/tokenizer/test_wiki_sun.py deleted file mode 100644 index 8d2a6682e..000000000 --- a/spacy/tests/tokenizer/test_wiki_sun.py +++ /dev/null @@ -1,21 +0,0 @@ -from __future__ import unicode_literals - -from spacy.util import utf8open - -import pytest -from os import path - - -HERE = path.dirname(__file__) - - -@pytest.fixture -def sun_txt(): - loc = path.join(HERE, '..', 'sun.txt') - return utf8open(loc).read() - - -def test_tokenize(sun_txt, en_tokenizer): - assert len(sun_txt) != 0 - tokens = en_tokenizer(sun_txt) - assert len(tokens) > 100 diff --git a/spacy/tests/website/conftest.py b/spacy/tests/website/conftest.py deleted file mode 100644 index 4f533ae76..000000000 --- a/spacy/tests/website/conftest.py +++ /dev/null @@ -1,20 +0,0 @@ -from __future__ import unicode_literals -import pytest -import os - - -@pytest.fixture(scope='session') -def nlp(): - from spacy.en import English - if os.environ.get('SPACY_DATA'): - data_dir = os.environ.get('SPACY_DATA') 
- else: - data_dir = True - return English(path=data_dir) - - -@pytest.fixture() -def doc(nlp): - for word in ['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']: - _ = nlp.vocab[word] - return nlp('Hello, world. Here are two sentences.') diff --git a/spacy/tests/website/test_api.py b/spacy/tests/website/test_api.py deleted file mode 100644 index 6a7379d87..000000000 --- a/spacy/tests/website/test_api.py +++ /dev/null @@ -1,172 +0,0 @@ -from __future__ import unicode_literals -import pytest -from spacy.attrs import HEAD -import numpy - - -@pytest.mark.xfail -def test_example_war_and_peace(nlp): - # from spacy.en import English - from spacy._doc_examples import download_war_and_peace - - unprocessed_unicode = download_war_and_peace() - - # nlp = English() - # TODO: ImportError: No module named _doc_examples - doc = nlp(unprocessed_unicode) - - -def test_main_entry_point(nlp): - # from spacy.en import English - # nlp = English() - doc = nlp('Some text.') # Applies tagger, parser, entity - doc = nlp('Some text.', parse=False) # Applies tagger and entity, not parser - doc = nlp('Some text.', entity=False) # Applies tagger and parser, not entity - doc = nlp('Some text.', tag=False) # Does not apply tagger, entity or parser - doc = nlp('') # Zero-length tokens, not an error - # doc = nlp(b'Some text') <-- Error: need unicode - doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. - - -@pytest.mark.models -def test_sentence_spans(nlp): - # from spacy.en import English - # nlp = English() - doc = nlp("This is a sentence. Here's another...") - assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] - - -@pytest.mark.models -def test_entity_spans(nlp): - # from spacy.en import English - # nlp = English() - tokens = nlp('Mr. 
Best flew to New York on Saturday morning.') - ents = list(tokens.ents) - assert ents[0].label == 346 - assert ents[0].label_ == 'PERSON' - assert ents[0].orth_ == 'Best' - assert ents[0].string == ents[0].string - - -@pytest.mark.models -def test_noun_chunk_spans(nlp): - # from spacy.en import English - # nlp = English() - doc = nlp('The sentence in this example has three noun chunks.') - for chunk in doc.noun_chunks: - print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_) - - # NP The sentence <-- has - # NP this example <-- in - # NP three noun chunks <-- has - - -@pytest.mark.models -def test_count_by(nlp): - # from spacy.en import English, attrs - # nlp = English() - import numpy - from spacy import attrs - tokens = nlp('apple apple orange banana') - assert tokens.count_by(attrs.ORTH) == {3699: 2, 3750: 1, 5965: 1} - assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[3699], - [3699], - [3750], - [5965]], dtype=numpy.int32)) - -@pytest.mark.models -def test_read_bytes(nlp): - from spacy.tokens.doc import Doc - loc = 'test_serialize.bin' - with open(loc, 'wb') as file_: - file_.write(nlp(u'This is a document.').to_bytes()) - file_.write(nlp(u'This is another.').to_bytes()) - docs = [] - with open(loc, 'rb') as file_: - for byte_string in Doc.read_bytes(file_): - docs.append(Doc(nlp.vocab).from_bytes(byte_string)) - assert len(docs) == 2 - - -def test_token_span(doc): - span = doc[4:6] - token = span[0] - assert token.i == 4 - - -@pytest.mark.models -def test_example_i_like_new_york1(nlp): - toks = nlp('I like New York in Autumn.') - - -@pytest.fixture -def toks(nlp): - doc = nlp('I like New York in Autumn.') - doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T) - return doc - - -def test_example_i_like_new_york2(toks): - i, like, new, york, in_, autumn, dot = range(len(toks)) - - -@pytest.fixture -def tok(toks, tok): - i, like, new, york, in_, autumn, dot = range(len(toks)) - return locals()[tok] - - 
-@pytest.fixture -def new(toks): - return tok(toks, "new") - - -@pytest.fixture -def york(toks): - return tok(toks, "york") - - -@pytest.fixture -def autumn(toks): - return tok(toks, "autumn") - - -@pytest.fixture -def dot(toks): - return tok(toks, "dot") - - -def test_example_i_like_new_york3(toks, new, york): - assert toks[new].head.orth_ == 'York' - assert toks[york].head.orth_ == 'like' - - -def test_example_i_like_new_york4(toks, new, york): - new_york = toks[new:york+1] - assert new_york.root.orth_ == 'York' - - -def test_example_i_like_new_york5(toks, autumn, dot): - assert toks[autumn].head.orth_ == 'in' - assert toks[dot].head.orth_ == 'like' - autumn_dot = toks[autumn:] - assert autumn_dot.root.orth_ == 'Autumn' - - -def test_navigating_the_parse_tree_lefts(doc): - # TODO: where does the span object come from? - span = doc[:2] - lefts = [span.doc[i] for i in range(0, span.start) - if span.doc[i].head in span] - - -def test_navigating_the_parse_tree_rights(doc): - span = doc[:2] - rights = [span.doc[i] for i in range(span.end, len(span.doc)) - if span.doc[i].head in span] - - -def test_string_store(doc): - string_store = doc.vocab.strings - for i, string in enumerate(string_store): - assert i == string_store[string] diff --git a/spacy/tests/website/test_home.py b/spacy/tests/website/test_home.py deleted file mode 100644 index 95c0ec3bb..000000000 --- a/spacy/tests/website/test_home.py +++ /dev/null @@ -1,180 +0,0 @@ -from __future__ import unicode_literals -import pytest -import spacy -import os - - -try: - xrange -except NameError: - xrange = range - - -@pytest.fixture() -def token(doc): - return doc[0] - - -@pytest.mark.models -def test_load_resources_and_process_text(): - from spacy.en import English - nlp = English() - doc = nlp(u'Hello, world. Here are two sentences.') - - -@pytest.mark.models -def test_get_tokens_and_sentences(doc): - token = doc[0] - sentence = next(doc.sents) - assert token is sentence[0] - assert sentence.text == 'Hello, world.' 
- - -@pytest.mark.models -def test_use_integer_ids_for_any_strings(nlp, token): - hello_id = nlp.vocab.strings['Hello'] - hello_str = nlp.vocab.strings[hello_id] - - assert token.orth == hello_id == 3125 - assert token.orth_ == hello_str == 'Hello' - - -def test_get_and_set_string_views_and_flags(nlp, token): - assert token.shape_ == 'Xxxxx' - for lexeme in nlp.vocab: - if lexeme.is_alpha: - lexeme.shape_ = 'W' - elif lexeme.is_digit: - lexeme.shape_ = 'D' - elif lexeme.is_punct: - lexeme.shape_ = 'P' - else: - lexeme.shape_ = 'M' - assert token.shape_ == 'W' - - -def test_export_to_numpy_arrays(nlp, doc): - from spacy.attrs import ORTH, LIKE_URL, IS_OOV - - attr_ids = [ORTH, LIKE_URL, IS_OOV] - doc_array = doc.to_array(attr_ids) - assert doc_array.shape == (len(doc), len(attr_ids)) - assert doc[0].orth == doc_array[0, 0] - assert doc[1].orth == doc_array[1, 0] - assert doc[0].like_url == doc_array[0, 1] - assert list(doc_array[:, 1]) == [t.like_url for t in doc] - - -@pytest.mark.models -def test_word_vectors(nlp): - doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") - - apples = doc[0] - oranges = doc[2] - boots = doc[6] - hippos = doc[8] - - assert apples.similarity(oranges) > boots.similarity(hippos) - - -@pytest.mark.models -def test_part_of_speech_tags(nlp): - from spacy.parts_of_speech import ADV - - def is_adverb(token): - return token.pos == spacy.parts_of_speech.ADV - - # These are data-specific, so no constants are provided. You have to look - # up the IDs from the StringStore. 
- NNS = nlp.vocab.strings['NNS'] - NNPS = nlp.vocab.strings['NNPS'] - def is_plural_noun(token): - return token.tag == NNS or token.tag == NNPS - - def print_coarse_pos(token): - print(token.pos_) - - def print_fine_pos(token): - print(token.tag_) - - -@pytest.mark.models -def test_syntactic_dependencies(): - def dependency_labels_to_root(token): - '''Walk up the syntactic tree, collecting the arc labels.''' - dep_labels = [] - while token.head is not token: - dep_labels.append(token.dep) - token = token.head - return dep_labels - - -@pytest.mark.models -def test_named_entities(): - def iter_products(docs): - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PRODUCT': - yield ent - - def word_is_in_entity(word): - return word.ent_type != 0 - - def count_parent_verb_by_person(docs): - counts = defaultdict(defaultdict(int)) - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: - counts[ent.orth_][ent.root.head.lemma_] += 1 - return counts - - -def test_calculate_inline_mark_up_on_original_string(): - def put_spans_around_tokens(doc, get_classes): - '''Given some function to compute class names, put each token in a - span element, with the appropriate classes computed. - - All whitespace is preserved, outside of the spans. (Yes, I know HTML - won't display it. But the point is no information is lost, so you can - calculate what you need, e.g.
tags,

tags, etc.) - ''' - output = [] - template = '{word}{space}' - for token in doc: - if token.is_space: - output.append(token.orth_) - else: - output.append( - template.format( - classes=' '.join(get_classes(token)), - word=token.orth_, - space=token.whitespace_)) - string = ''.join(output) - string = string.replace('\n', '') - string = string.replace('\t', ' ') - return string - - -@pytest.mark.models -def test_efficient_binary_serialization(doc): - from spacy.tokens.doc import Doc - - byte_string = doc.to_bytes() - open('moby_dick.bin', 'wb').write(byte_string) - - nlp = spacy.en.English() - for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')): - doc = Doc(nlp.vocab) - doc.from_bytes(byte_string) - - -@pytest.mark.models -def test_multithreading(nlp): - texts = [u'One document.', u'...', u'Lots of documents'] - # .pipe streams input, and produces streaming output - iter_texts = (texts[i % 3] for i in xrange(100000000)) - for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)): - assert doc.is_parsed - if i == 100: - break - diff --git a/spacy/util.py b/spacy/util.py index afed4142e..457534302 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -94,8 +94,13 @@ def read_regex(path): def compile_prefix_regex(entries): - expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) - return re.compile(expression) + if '(' in entries: + # Handle deprecated data + expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) + return re.compile(expression) + else: + expression = '|'.join(['^' + piece for piece in entries if piece.strip()]) + return re.compile(expression) def compile_suffix_regex(entries): diff --git a/website/_harp.json b/website/_harp.json index bc8cf4d84..04a66f772 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -22,7 +22,8 @@ "twitter": "spacy_io", "github": "explosion", "reddit": "spacynlp", - "codepen": "explosion" + "codepen": "explosion", + "gitter": 
"explosion/spaCy" }, "NAVIGATION": { @@ -53,7 +54,7 @@ } }, - "V_CSS": "1.10", + "V_CSS": "1.14", "V_JS": "1.0", "DEFAULT_SYNTAX" : "python", "ANALYTICS": "UA-58931649-1", diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 27f195690..bc8b85557 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -1,6 +1,7 @@ //- 💫 MIXINS > BASE //- Aside wrapper + label - [string] aside label mixin aside-wrapper(label) aside.c-aside @@ -21,6 +22,10 @@ mixin date(input) //- SVG from map + file - [string] SVG file name in /assets/img/ + name - [string] SVG symbol id + width - [integer] width in px + height - [integer] height in px (default: same as width) mixin svg(file, name, width, height) svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) @@ -28,19 +33,23 @@ mixin svg(file, name, width, height) //- Icon + name - [string] icon name, should be SVG symbol ID + size - [integer] icon width and height (default: 20) mixin icon(name, size) - +svg("icons", "icon-" + name, size || 20).o-icon&attributes(attributes) + +svg("icons", name, size || 20).o-icon&attributes(attributes) //- Pro/Con/Neutral icon + icon - [string] "pro", "con" or "neutral" (default: "neutral") mixin procon(icon) - - colors = { pro: "green", con: "red" } + - colors = { pro: "green", con: "red", neutral: "yellow" } +icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) //- Headlines Helper Mixin + level - [integer] 1, 2, 3, 4, or 5 mixin headline(level) if level == 1 @@ -65,6 +74,7 @@ mixin headline(level) //- Permalink rendering + id - [string] permalink ID used for link anchor mixin permalink(id) if id @@ -77,6 +87,7 @@ mixin permalink(id) //- Terminal-style code window + label - [string] title displayed in top bar of terminal window mixin terminal(label) .x-terminal @@ -87,6 +98,18 @@ mixin terminal(label) block +//- 
Gitter chat button and widget + button - [string] text shown on button + label - [string] title of chat window (default: same as button) + +mixin gitter(button, label) + aside.js-gitter.c-chat.is-collapsed(data-title=(label || button)) + + button.js-gitter-button.c-chat__button.u-text-small + +icon("chat").o-icon--inline + !=button + + //- Logo mixin logo() diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 8fe24b11b..8a42024c1 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -44,7 +44,7 @@ mixin api(path) +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block block - | #[+icon("book", 18).o-icon--inline.u-help.u-color-subtle] + | #[+icon("book", 18).o-icon--inline.u-color-subtle] //- Aside for text diff --git a/website/_includes/_page-docs.jade b/website/_includes/_page-docs.jade index 09cbfa6a5..72db134cd 100644 --- a/website/_includes/_page-docs.jade +++ b/website/_includes/_page-docs.jade @@ -24,4 +24,6 @@ main.o-main.o-main--sidebar.o-main--aside .o-inline-list +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)] + +gitter("spaCy chat") + include _footer diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade new file mode 100644 index 000000000..544cf0977 --- /dev/null +++ b/website/_includes/_scripts.jade @@ -0,0 +1,23 @@ +//- 💫 INCLUDES > SCRIPTS + +script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript") +script(src="/assets/js/prism.js", type="text/javascript") + +if SECTION == "docs" + script. 
+ ((window.gitter = {}).chat = {}).options = { + useStyles: false, + activationElement: '.js-gitter-button', + targetElement: '.js-gitter', + room: '!{SOCIAL.gitter}' + }; + + script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) + +if environment == "deploy" + script + | window.ga=window.ga||function(){ + | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; + | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); + + script(async src="https://www.google-analytics.com/analytics.js") diff --git a/website/_layout.jade b/website/_layout.jade index b04c4b5f3..d5c52df3f 100644 --- a/website/_layout.jade +++ b/website/_layout.jade @@ -52,13 +52,4 @@ html(lang="en") main!=yield include _includes/_footer - script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript") - script(src="/assets/js/prism.js", type="text/javascript") - - if environment == "deploy" - script - | window.ga=window.ga||function(){ - | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; - | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); - - script(async src="https://www.google-analytics.com/analytics.js") + include _includes/_scripts diff --git a/website/assets/css/_base/_fonts.sass b/website/assets/css/_base/_fonts.sass index 72aaf97f8..be113798c 100644 --- a/website/assets/css/_base/_fonts.sass +++ b/website/assets/css/_base/_fonts.sass @@ -6,36 +6,36 @@ font-family: "Source Sans Pro" font-style: normal font-weight: 400 - src: url("../fonts/sourcesanspro-regular.eot") - src: url("../fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-regular.woff2") format("woff2"), url("../fonts/sourcesanspro-regular.woff") format("woff"), url("../fonts/sourcesanspro-regular.ttf") format("truetype"), url("../fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg") + src: url("/assets/fonts/sourcesanspro-regular.eot") + src: url("/assets/fonts/sourcesanspro-regular.eot?#iefix") format("embedded-opentype"), 
url("/assets/fonts/sourcesanspro-regular.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-regular.woff") format("woff"), url("/assets/fonts/sourcesanspro-regular.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-regular.svg#source_sans_proregular") format("svg") @font-face font-family: "Source Sans Pro" font-style: italic font-weight: 400 - src: url("../fonts/sourcesanspro-italic.eot") - src: url("../fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-italic.woff2") format("woff2"), url("../fonts/sourcesanspro-italic.woff") format("woff"), url("../fonts/sourcesanspro-italic.ttf") format("truetype"), url("../fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg") + src: url("/assets/fonts/sourcesanspro-italic.eot") + src: url("/assets/fonts/sourcesanspro-italic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-italic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-italic.woff") format("woff"), url("/assets/fonts/sourcesanspro-italic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-italic.svg#source_sans_proitalic") format("svg") @font-face font-family: "Source Sans Pro" font-style: normal font-weight: 700 - src: url("../fonts/sourcesanspro-bold.eot") - src: url("../fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bold.woff2") format("woff2"), url("../fonts/sourcesanspro-bold.woff") format("woff"), url("../fonts/sourcesanspro-bold.ttf") format("truetype"), url("../fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg") + src: url("/assets/fonts/sourcesanspro-bold.eot") + src: url("/assets/fonts/sourcesanspro-bold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bold.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bold.woff") format("woff"), url("/assets/fonts/sourcesanspro-bold.ttf") format("truetype"), 
url("/assets/fonts/sourcesanspro-bold.svg#source_sans_probold") format("svg") @font-face font-family: "Source Sans Pro" font-style: italic font-weight: 700 - src: url("../fonts/sourcesanspro-bolditalic.eot") - src: url("../fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("../fonts/sourcesanspro-bolditalic.woff") format("woff"), url("../fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("../fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg") + src: url("/assets/fonts/sourcesanspro-bolditalic.eot") + src: url("/assets/fonts/sourcesanspro-bolditalic.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcesanspro-bolditalic.woff2") format("woff2"), url("/assets/fonts/sourcesanspro-bolditalic.woff") format("woff"), url("/assets/fonts/sourcesanspro-bolditalic.ttf") format("truetype"), url("/assets/fonts/sourcesanspro-bolditalic.svg#source_sans_probold_italic") format("svg") // Source Code Pro @font-face - font-family: "Source Code Pro" - font-style: normal - font-weight: 600 - src: url("../fonts/sourcecodepro-semibold.eot") - src: url("../fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("../fonts/sourcecodepro-semibold.woff") format("woff"), url("../fonts/sourcecodepro-semibold.ttf") format("truetype"), url("../fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg") + font-family: "Source Code Pro" + font-style: normal + font-weight: 600 + src: url("/assets/fonts/sourcecodepro-semibold.eot") + src: url("/assets/fonts/sourcecodepro-semibold.eot?#iefix") format("embedded-opentype"), url("/assets/fonts/sourcecodepro-semibold.woff") format("woff"), url("/assets/fonts/sourcecodepro-semibold.ttf") format("truetype"), url("/assets/fonts/sourcecodepro-semibold.svg#sourcecodepro_semibold") format("svg") diff --git a/website/assets/css/_base/_objects.sass b/website/assets/css/_base/_objects.sass index 
2b037dca7..7aaaef787 100644 --- a/website/assets/css/_base/_objects.sass +++ b/website/assets/css/_base/_objects.sass @@ -60,7 +60,7 @@ background: $color-back border-radius: 2px border: 1px solid $color-subtle - padding: 3.5% 2.5% + padding: 3rem 2.5% //- Icons diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass index 95be81bcd..2c40858a8 100644 --- a/website/assets/css/_base/_utilities.sass +++ b/website/assets/css/_base/_utilities.sass @@ -141,12 +141,6 @@ background: $pattern -//- Cursors - -.u-help - cursor: help - - //- Hidden elements .u-hidden diff --git a/website/assets/css/_components/_chat.sass b/website/assets/css/_components/_chat.sass new file mode 100644 index 000000000..2a1e5cc3d --- /dev/null +++ b/website/assets/css/_components/_chat.sass @@ -0,0 +1,100 @@ +//- 💫 CSS > COMPONENTS > CHAT + +.c-chat + @include position(fixed, top, left, 0, 60%) + bottom: 0 + right: 0 + display: flex + flex-flow: column nowrap + background: $color-back + transition: transform 0.3s cubic-bezier(0.16, 0.22, 0.22, 1.7) + box-shadow: -0.25rem 0 1rem 0 rgba($color-front, 0.25) + z-index: 100 + + @include breakpoint(min, md) + left: calc(100% - #{$aside-width} - #{$aside-padding}) + + @include breakpoint(max, sm) + left: 50% + + @include breakpoint(max, xs) + left: 0 + + &.is-collapsed:not(.is-loading) + transform: translateX(110%) + + &:before + @include position(absolute, top, left, 1rem, 2rem) + content: attr(data-title) + font: bold 1.4rem $font-code + text-transform: uppercase + color: $color-back + + &:after + @include position(absolute, top, left, 0, 100%) + content: "" + z-index: -1 + bottom: 0 + right: -100% + background: $color-back + + & > iframe + width: 100% + flex: 1 1 calc(100% - #{$nav-height}) + border: 0 + + .gitter-chat-embed-loading-wrapper + @include position(absolute, top, left, 0, 0) + right: 0 + bottom: 0 + display: none + justify-content: center + align-items: center + + .is-loading & + display: flex + + 
.gitter-chat-embed-action-bar, + .gitter-chat-embed-action-bar-item + display: flex + + .gitter-chat-embed-action-bar + align-items: center + justify-content: flex-end + background: $color-theme + padding: 0 1rem 0 2rem + flex: 0 0 $nav-height + + .gitter-chat-embed-action-bar-item + @include size(40px) + padding: 0 + opacity: 0.75 + background-position: 50% + background-repeat: no-repeat + background-size: 22px 22px + border: 0 + cursor: pointer + transition: all 0.2s ease + + &:focus, + &:hover + opacity: 1 + + &.gitter-chat-embed-action-bar-item-pop-out + background-image: url() + margin-right: -4px + + &.gitter-chat-embed-action-bar-item-collapse-chat + background-image: url() + +.c-chat__button + @include position(fixed, bottom, right, 0, 2rem) + padding: 1rem 1.5rem + background: $color-front + color: $color-back + border-top-left-radius: 4px + border-top-right-radius: 4px + z-index: 20 + border-color: $color-theme + border-style: solid + border-width: 1px 1px 0 1px diff --git a/website/assets/css/style.sass b/website/assets/css/style.sass index 5ab135ab9..a8d2edad4 100644 --- a/website/assets/css/style.sass +++ b/website/assets/css/style.sass @@ -24,6 +24,7 @@ $theme: blue !default @import _components/asides @import _components/buttons +@import _components/chat @import _components/code @import _components/landing @import _components/lists diff --git a/website/assets/img/graphics.svg b/website/assets/img/graphics.svg index 23036f4ca..dc69deda4 100644 --- a/website/assets/img/graphics.svg +++ b/website/assets/img/graphics.svg @@ -64,5 +64,6 @@ matt-signature + diff --git a/website/assets/img/icons.svg b/website/assets/img/icons.svg index 9237c9994..224224084 100644 --- a/website/assets/img/icons.svg +++ b/website/assets/img/icons.svg @@ -1,32 +1,28 @@ - - github - + + - - code - + + - - anchor - + + - - book - + + - - pro - + + - - con - + + - - neutral - + + + + + diff --git a/website/docs/api/index.jade b/website/docs/api/index.jade index 
20995df2e..24f3d4458 100644 --- a/website/docs/api/index.jade +++ b/website/docs/api/index.jade @@ -23,7 +23,7 @@ p +row +cell Multi-language support - each icon in [ "con", "pro", "pro", "pro" ] + each icon in [ "neutral", "pro", "pro", "pro" ] +cell.u-text-center #[+procon(icon)] +row diff --git a/website/docs/index.jade b/website/docs/index.jade index d2949b8c4..c19602002 100644 --- a/website/docs/index.jade +++ b/website/docs/index.jade @@ -2,8 +2,6 @@ include ../_includes/_mixins -p=lorem_short - +aside("Help us improve the docs") | Did you spot a mistake or come across explanations that | are unclear? You can find a "Suggest edits" button at the diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index 4b62a290b..a96df5694 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -57,7 +57,7 @@ p doc.ents = [Span(0, 1, label='GPE')] assert doc[0].ent_type_ == 'GPE' doc.ents = [] - doc.ents = [(u'LondonCity', 0, 1, u'GPE')] + doc.ents = [(u'LondonCity', u'GPE', 0, 1)] p | The value you assign should be a sequence, the values of which diff --git a/website/docs/usage/resources.jade b/website/docs/usage/resources.jade index a09c7358d..2b80ebe48 100644 --- a/website/docs/usage/resources.jade +++ b/website/docs/usage/resources.jade @@ -30,6 +30,13 @@ p Many of the associated tools and resources that we're developing alongside spa +cell | REST microservices for spaCy demos and visualisers. + +row + +cell + +src(gh("spacy-notebooks")) spaCy Notebooks + + +cell + | Jupyter notebooks for spaCy examples and tutorials. 
+ +h(2, "libraries") Libraries and projects +table(["Name", "Description"]) +row diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index bedadb0d3..fde9ee4d7 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -141,7 +141,7 @@ p span.merge(label=label, tag='NNP' if label else span.root.tag_) matcher.add_entity('GoogleNow', on_match=merge_phrases) - matcher.add_pattern('GoogleNow', {ORTH: 'Google'}, {ORTH: 'Now'}]) + matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}]) doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded']) matcher(doc) print([w.text for w in doc])