From 71954d5fe7b9e98e34f894af04df32eafbf56147 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 10:32:53 +0200 Subject: [PATCH 01/28] Update Thinc version --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7cd5fba43..ae50be598 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ pathlib numpy>=1.7 cymem>=1.30,<1.32 preshed>=1.0.0,<2.0.0 -thinc>=6.7.1,<6.8.0 +thinc>=6.7.2,<6.8.0 murmurhash>=0.28,<0.29 plac<1.0.0,>=0.9.6 six diff --git a/setup.py b/setup.py index a16b35748..c317c537f 100755 --- a/setup.py +++ b/setup.py @@ -191,7 +191,7 @@ def setup_package(): 'murmurhash>=0.28,<0.29', 'cymem>=1.30,<1.32', 'preshed>=1.0.0,<2.0.0', - 'thinc>=6.7.1,<6.8.0', + 'thinc>=6.7.2,<6.8.0', 'plac<1.0.0,>=0.9.6', 'pip>=9.0.0,<10.0.0', 'six', From 5109bba91018729952a3263418ad0f5ab114fce1 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 11:31:11 +0200 Subject: [PATCH 02/28] Port over fix from #1070 --- spacy/tokens/doc.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 84b39d454..e22a35875 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -437,7 +437,8 @@ cdef class Doc: """ def __get__(self): if 'sents' in self.user_hooks: - return self.user_hooks['sents'](self) + yield from self.user_hooks['sents'](self) + return if not self.is_parsed: raise ValueError( From 459a1e8470f244623804aea9bef13d394562d558 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 11:31:18 +0200 Subject: [PATCH 03/28] Fix whitespace --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index e22a35875..b2706ea6f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -741,7 +741,7 @@ cdef class Doc: token.spacy = self.c[end-1].spacy for attr_name, attr_value in attributes.items(): if attr_name == TAG: - 
self.vocab.morphology.assign_tag(token, attr_value) + self.vocab.morphology.assign_tag(token, attr_value) else: Token.set_struct_attr(token, attr_name, attr_value) # Begin by setting all the head indices to absolute token positions From c60431357de50de6caada0802b514c6e618b6c2a Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 11:31:30 +0200 Subject: [PATCH 04/28] Port over docs typo corrections --- website/docs/api/doc.jade | 2 +- website/docs/usage/customizing-tokenizer.jade | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 9b8392fcb..4228aed8f 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -64,7 +64,7 @@ p doc = nlp(u'Give it back! He pleaded.') assert doc[0].text == 'Give' assert doc[-1].text == '.' - span = doc[1:1] + span = doc[1:3] assert span.text == 'it back' +table(["Name", "Type", "Description"]) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index 5c9a9fd78..f56ce9fb1 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -141,7 +141,7 @@ p else: tokens.append(substring) substring = '' - tokens.extend(suffixes) + tokens.extend(reversed(suffixes)) return tokens p From 70fbba7d085fb756c976021cebec9d0474b8e336 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 13:24:43 +0200 Subject: [PATCH 05/28] Clone Doc to never merge punctuation on original Doc --- spacy/displacy/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index b27370909..3bb0b8aec 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -65,12 +65,13 @@ def app(environ, start_response): return [res] -def parse_deps(doc, options={}): +def parse_deps(orig_doc, options={}): """Generate dependency parse in {'words': [], 'arcs': []} format. 
doc (Doc): Document do parse. RETURNS (dict): Generated dependency parse keyed by words and arcs. """ + doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes()) if options.get('collapse_punct', True): spans = [] for word in doc[:-1]: From cc8c8617a4e078afcb6ed8de0235be505561dea1 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 13:24:56 +0200 Subject: [PATCH 06/28] Shut down displaCy server on KeyboardInterrupt --- spacy/displacy/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 3bb0b8aec..8468720cd 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -56,7 +56,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False, render(docs, style=style, page=page, minify=minify, options=options, manual=manual) httpd = simple_server.make_server('0.0.0.0', port, app) prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port) - httpd.serve_forever() + try: + httpd.serve_forever() + except KeyboardInterrupt: + prints("Shutting down server on port %d." 
% port) + finally: + httpd.server_close() def app(environ, start_response): From 32c6f05de91b8ae7a189bec4c4efb11f50d78947 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 13:25:32 +0200 Subject: [PATCH 07/28] Adjust spacing and sizing in compact mode --- spacy/displacy/render.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index e9b792881..1050ffa87 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -18,12 +18,11 @@ class DependencyRenderer(object): offset_x, color, bg, font) """ self.compact = options.get('compact', False) - distance, arrow_width = (85, 8) if self.compact else (175, 10) self.word_spacing = options.get('word_spacing', 45) - self.arrow_spacing = options.get('arrow_spacing', 20) - self.arrow_width = options.get('arrow_width', arrow_width) + self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20) + self.arrow_width = options.get('arrow_width', 6 if self.compact else 10) self.arrow_stroke = options.get('arrow_stroke', 2) - self.distance = options.get('distance', distance) + self.distance = options.get('distance', 150 if self.compact else 175) self.offset_x = options.get('offset_x', 50) self.color = options.get('color', '#000000') self.bg = options.get('bg', '#ffffff') @@ -99,6 +98,8 @@ class DependencyRenderer(object): x_end = (self.offset_x+(end-start)*self.distance+start*self.distance -self.arrow_spacing*(self.highest_level-level)/4) y_curve = self.offset_y-level*self.distance/2 + if self.compact: + y_curve = self.offset_y-level*self.distance/6 if y_curve == 0 and len(self.levels) > 5: y_curve = -self.distance arrowhead = self.get_arrowhead(direction, x_start, y, x_end) From 82154a1861170538e4afe705baa285440ab30476 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 13:25:41 +0200 Subject: [PATCH 08/28] Add letter spacing to arrow label --- spacy/displacy/templates.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index 54df44489..2f6fc22de 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -21,7 +21,7 @@ TPL_DEP_WORDS = """ TPL_DEP_ARCS = """ - + {label} From 0153b66a861e023ba23dc0d23e6b5a0cc9ca0519 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 13:26:13 +0200 Subject: [PATCH 09/28] Return self in Tokenizer.from_bytes --- spacy/tokenizer.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 20d2d7a47..a7067f69e 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -392,3 +392,4 @@ cdef class Tokenizer: self.token_match = re.compile(data['token_match']).search for string, substrings in data.get('rules', {}).items(): self.add_special_case(string, substrings) + return self From de974f7bef19dbddc046f07bb2a58b8afa3dba09 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 13:26:34 +0200 Subject: [PATCH 10/28] Add serializer tests for tokenizer --- .../serialize/test_serialize_tokenizer.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 spacy/tests/serialize/test_serialize_tokenizer.py diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py new file mode 100644 index 000000000..2e3d78c14 --- /dev/null +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -0,0 +1,25 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ..util import make_tempdir + +import pytest + + +@pytest.mark.parametrize('text', ["I can't do this"]) +def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text): + tokenizer_b = en_tokenizer.to_bytes() + new_tokenizer = en_tokenizer.from_bytes(tokenizer_b) + assert new_tokenizer.to_bytes() == tokenizer_b + doc1 = en_tokenizer(text) + doc2 = new_tokenizer(text) + assert [token.text for token in doc1] == [token.text for token in doc2] + + +def 
test_serialize_tokenizer_roundtrip_disk(en_tokenizer): + tokenizer = en_tokenizer + with make_tempdir() as d: + file_path = d / 'tokenizer' + tokenizer.to_disk(file_path) + tokenizer_d = en_tokenizer.from_disk(file_path) + assert tokenizer.to_bytes() == tokenizer_d.to_bytes() From b0225183c2487ac1a5ca617e2169b40b3c67bff7 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 13:27:06 +0200 Subject: [PATCH 11/28] Update displaCy defaults --- website/docs/api/displacy.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/displacy.jade b/website/docs/api/displacy.jade index 415fab77d..59fcca3ca 100644 --- a/website/docs/api/displacy.jade +++ b/website/docs/api/displacy.jade @@ -205,7 +205,7 @@ p +cell #[code arrow_spacing] +cell int +cell Spacing between arrows in px to avoid overlaps. - +cell #[code 20] + +cell #[code 20] / #[code 12] (compact) +row +cell #[code word_spacing] From 9acf8686f7bcaae05ed7a411c8f3b2581dc093b7 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 13:31:16 +0200 Subject: [PATCH 12/28] Update note on compact mode issues --- website/docs/usage/visualizers.jade | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade index b26fbc27a..62dc8e871 100644 --- a/website/docs/usage/visualizers.jade +++ b/website/docs/usage/visualizers.jade @@ -59,9 +59,11 @@ p | to customise the layout, for example: +aside("Important note") - | There's currently a known issue with the #[code compact] mode for long - | sentences with arrow spacing. If the spacing is larger than the arc - | itself, it'll cause the arc and its label to flip. + | There's currently a known issue with the #[code compact] mode for + | sentences with short arrows and long dependency labels, that causes labels + | longer than the arrow to wrap. 
So if you come across this problem, + | especially when using custom labels, you'll have to increase the + | #[code distance] setting in the #[code options] to allow longer arcs. +table(["Name", "Type", "Description", "Default"]) +row From 1ebd0d3f276d09a7de72a386d7b52808c3e6ce56 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 17:04:30 +0200 Subject: [PATCH 13/28] Add assert_packed_msg_equal util function --- spacy/tests/util.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 7f8884235..56aeb5223 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -10,6 +10,7 @@ import numpy import tempfile import shutil import contextlib +import msgpack from pathlib import Path @@ -105,3 +106,13 @@ def assert_docs_equal(doc1, doc2): assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ] assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ] assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ] + + +def assert_packed_msg_equal(b1, b2): + """Assert that two packed msgpack messages are equal.""" + msg1 = msgpack.loads(b1, encoding='utf8') + msg2 = msgpack.loads(b2, encoding='utf8') + assert sorted(msg1.keys()) == sorted(msg2.keys()) + for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())): + assert k1 == k2 + assert v1 == v2 From 7c919aeb09eec6888d1b6918ff4421921b5cc90f Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 17:05:09 +0200 Subject: [PATCH 14/28] Make sure serializers and deserializers are ordered --- spacy/tokenizer.pyx | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index a7067f69e..de184baba 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -2,6 +2,7 @@ # coding: utf8 from __future__ import unicode_literals +from collections import OrderedDict from cython.operator cimport dereference as deref from cython.operator 
cimport preincrement as preinc from cymem.cymem cimport Pool @@ -355,14 +356,14 @@ cdef class Tokenizer: **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `Tokenizer` object. """ - serializers = { - 'vocab': lambda: self.vocab.to_bytes(), - 'prefix_search': lambda: self.prefix_search.__self__.pattern, - 'suffix_search': lambda: self.suffix_search.__self__.pattern, - 'infix_finditer': lambda: self.infix_finditer.__self__.pattern, - 'token_match': lambda: self.token_match.__self__.pattern, - 'exceptions': lambda: self._rules - } + serializers = OrderedDict(( + ('vocab', lambda: self.vocab.to_bytes()), + ('prefix_search', lambda: self.prefix_search.__self__.pattern), + ('suffix_search', lambda: self.suffix_search.__self__.pattern), + ('infix_finditer', lambda: self.infix_finditer.__self__.pattern), + ('token_match', lambda: self.token_match.__self__.pattern), + ('exceptions', lambda: OrderedDict(sorted(self._rules.items()))) + )) return util.to_bytes(serializers, exclude) def from_bytes(self, bytes_data, **exclude): @@ -372,15 +373,15 @@ cdef class Tokenizer: **exclude: Named attributes to prevent from being loaded. RETURNS (Tokenizer): The `Tokenizer` object. 
""" - data = {} - deserializers = { - 'vocab': lambda b: self.vocab.from_bytes(b), - 'prefix_search': lambda b: data.setdefault('prefix', b), - 'suffix_search': lambda b: data.setdefault('suffix_search', b), - 'infix_finditer': lambda b: data.setdefault('infix_finditer', b), - 'token_match': lambda b: data.setdefault('token_match', b), - 'exceptions': lambda b: data.setdefault('rules', b) - } + data = OrderedDict() + deserializers = OrderedDict(( + ('vocab', lambda b: self.vocab.from_bytes(b)), + ('prefix_search', lambda b: data.setdefault('prefix', b)), + ('suffix_search', lambda b: data.setdefault('suffix_search', b)), + ('infix_finditer', lambda b: data.setdefault('infix_finditer', b)), + ('token_match', lambda b: data.setdefault('token_match', b)), + ('exceptions', lambda b: data.setdefault('rules', b)) + )) msg = util.from_bytes(bytes_data, deserializers, exclude) if 'prefix_search' in data: self.prefix_search = re.compile(data['prefix_search']).search From 3152ee5ca2f21708e428faac5eaadbb403d0a1dc Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 17:05:28 +0200 Subject: [PATCH 15/28] Update serialization tests for tokenizer --- .../serialize/test_serialize_tokenizer.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index 2e3d78c14..e893d3a77 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -1,17 +1,25 @@ # coding: utf-8 from __future__ import unicode_literals -from ..util import make_tempdir +from ...util import get_lang_class +from ..util import make_tempdir, assert_packed_msg_equal import pytest -@pytest.mark.parametrize('text', ["I can't do this"]) +def load_tokenizer(b): + tok = get_lang_class('en').Defaults.create_tokenizer() + tok.from_bytes(b) + return tok + + +@pytest.mark.parametrize('text', ["I💜you", "they’re", "“hello”"]) def 
test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text): - tokenizer_b = en_tokenizer.to_bytes() - new_tokenizer = en_tokenizer.from_bytes(tokenizer_b) - assert new_tokenizer.to_bytes() == tokenizer_b - doc1 = en_tokenizer(text) + tokenizer = en_tokenizer + new_tokenizer = load_tokenizer(tokenizer.to_bytes()) + assert_packed_msg_equal(new_tokenizer.to_bytes(), tokenizer.to_bytes()) + # assert new_tokenizer.to_bytes() == tokenizer.to_bytes() + doc1 = tokenizer(text) doc2 = new_tokenizer(text) assert [token.text for token in doc1] == [token.text for token in doc2] From 05fe6758a71c0e524405d59b005eab0656f41098 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 19:44:39 +0200 Subject: [PATCH 16/28] Set lexeme attributes for tokenizer special cases --- spacy/vocab.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d3aa426cd..6655925e4 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -231,11 +231,13 @@ cdef class Vocab: props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True) token = &tokens[i] # Set the special tokens up to have arbitrary attributes - token.lex = self.get_by_orth(self.mem, props[attrs.ORTH]) + lex = self.get_by_orth(self.mem, props[attrs.ORTH]) + token.lex = lex if attrs.TAG in props: self.morphology.assign_tag(token, props[attrs.TAG]) for attr_id, value in props.items(): Token.set_struct_attr(token, attr_id, value) + Lexeme.set_struct_attr(lex, attr_id, value) return tokens @property From 4c2bbc3ccc2c6830846764376a52edb307ef592e Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 19:44:47 +0200 Subject: [PATCH 17/28] Add add_lookups util function --- spacy/util.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index 55f2a49bb..469123479 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -299,6 +299,22 @@ def compile_infix_regex(entries): return re.compile(expression) +def add_lookups(default_func, 
*lookups): + """Extend an attribute function with special cases. If a word is in the + lookups, the value is returned. Otherwise the previous function is used. + + default_func (callable): The default function to execute. + *lookups (dict): Lookup dictionary mapping string to attribute value. + RETURNS (callable): Lexical attribute getter. + """ + def get_attr(string): + for lookup in lookups: + if string in lookup: + return lookup[string] + return default_func(string) + return get_attr + + def update_exc(base_exceptions, *addition_dicts): """Update and validate tokenizer exceptions. Will overwrite exceptions. From e5d426406ad3661a2863c06339f896da451d9450 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 20:27:05 +0200 Subject: [PATCH 18/28] Add base norm exceptions --- spacy/lang/norm_exceptions.py | 46 +++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 spacy/lang/norm_exceptions.py diff --git a/spacy/lang/norm_exceptions.py b/spacy/lang/norm_exceptions.py new file mode 100644 index 000000000..b02dda2c8 --- /dev/null +++ b/spacy/lang/norm_exceptions.py @@ -0,0 +1,46 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# These exceptions are used to add NORM values based on a token's ORTH value. +# Individual languages can also add their own exceptions and overwrite them - +# for example, British vs. American spelling in English. + +# Norms are only set if no alternative is provided in the tokenizer exceptions. +# Note that this does not change any other token attributes. Its main purpose +# is to normalise the word representations so that equivalent tokens receive +# similar representations. For example: $ and € are very different, but they're +# both currency symbols. By normalising currency symbols to $, all symbols are +# seen as similar, no matter how common they are in the training data. 
+ + +BASE_NORMS = { + "'s": "'s", + "'S": "'s", + "’s": "'s", + "’S": "'s", + "’": "'", + "‘": "'", + "´": "'", + "`": "'", + "”": '"', + "“": '"', + "''": '"', + "``": '"', + "´´": '"', + "„": '"', + "»": '"', + "«": '"', + "…": "...",
+ "—": "-", + "–": "-", + "--": "-", + "---": "-", + "€": "$", + "£": "$", + "¥": "$", + "฿": "$", + "US$": "$", + "C$": "$", + "A$": "$" +} From 095eeeb12f208fb368b1fcd5eae6a9b99eaa2c8b Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 20:27:16 +0200 Subject: [PATCH 19/28] Update English tokenizer exceptions and add norms --- spacy/lang/en/tokenizer_exceptions.py | 366 +++++++++++++------------- 1 file changed, 187 insertions(+), 179 deletions(-) diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 5c6e3f893..392532619 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell", for pron in ["i"]: for orth in [pron, pron.title()]: _exc[orth + "'m"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}] + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}] _exc[orth + "m"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }] _exc[orth + "'ma"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: "'m", LEMMA: "be", NORM: "am"}, {ORTH: "a", LEMMA: "going to", NORM: "gonna"}] _exc[orth + "ma"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: "m", LEMMA: "be", NORM: "am"}, {ORTH: "a", LEMMA: "going to", NORM: "gonna"}] @@ -36,72 +36,72 @@ for pron in ["i"]: for pron in ["i", "you", "he", "she", "it", "we", "they"]: for orth in [pron, pron.title()]: _exc[orth + "'ll"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"}] + {ORTH: orth, LEMMA: 
PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}] _exc[orth + "ll"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"}] + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}] _exc[orth + "'ll've"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}] _exc[orth + "llve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"}] + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] _exc[orth + "'d"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}] + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}] _exc[orth + "d"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}] + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}] _exc[orth + "'d've"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}] _exc[orth + "dve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"}] + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "d", LEMMA: 
"would", NORM: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] for pron in ["i", "you", "we", "they"]: for orth in [pron, pron.title()]: _exc[orth + "'ve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}] _exc[orth + "ve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"}] + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] for pron in ["you", "we", "they"]: for orth in [pron, pron.title()]: _exc[orth + "'re"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: "'re", LEMMA: "be", NORM: "are"}] _exc[orth + "re"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}] for pron in ["he", "she", "it"]: for orth in [pron, pron.title()]: _exc[orth + "'s"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'s"}] + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, + {ORTH: "'s", NORM: "'s"}] _exc[orth + "s"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: "s"}] @@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]: for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: for orth in [word, word.title()]: _exc[orth + "'s"] = [ - {ORTH: orth, LEMMA: word}, - {ORTH: "'s"}] + {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: "'s", NORM: "'s"}] _exc[orth + "s"] = [ - {ORTH: orth, LEMMA: word}, + {ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "s"}] _exc[orth + "'ll"] = [ - {ORTH: orth, LEMMA: word}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"}] + {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: "'ll", LEMMA: "will", NORM: 
"will", TAG: "MD"}] _exc[orth + "ll"] = [ - {ORTH: orth, LEMMA: word}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"}] + {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}] _exc[orth + "'ll've"] = [ - {ORTH: orth, LEMMA: word}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] + {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}] _exc[orth + "llve"] = [ - {ORTH: orth, LEMMA: word}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"}] + {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] _exc[orth + "'re"] = [ - {ORTH: orth, LEMMA: word}, + {ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "'re", LEMMA: "be", NORM: "are"}] _exc[orth + "re"] = [ - {ORTH: orth, LEMMA: word}, + {ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "re", LEMMA: "be", NORM: "are"}] _exc[orth + "'ve"] = [ - {ORTH: orth}, + {ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] _exc[orth + "ve"] = [ {ORTH: orth, LEMMA: word}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"}] + {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] _exc[orth + "'d"] = [ - {ORTH: orth, LEMMA: word}, - {ORTH: "'d"}] + {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: "'d", NORM: "'d"}] _exc[orth + "d"] = [ - {ORTH: orth, LEMMA: word}, + {ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "d"}] _exc[orth + "'d've"] = [ - {ORTH: orth, LEMMA: word}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] + {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}] _exc[orth + "dve"] = [ - {ORTH: orth, LEMMA: word}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: 
"VB"}] + {ORTH: orth, LEMMA: word, NORM: word}, + {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] # Verbs for verb_data in [ - {ORTH: "ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "could", TAG: "MD"}, - {ORTH: "do", LEMMA: "do"}, - {ORTH: "does", LEMMA: "do"}, - {ORTH: "did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "may", TAG: "MD"}, - {ORTH: "might", TAG: "MD"}, - {ORTH: "must", TAG: "MD"}, - {ORTH: "need"}, - {ORTH: "ought"}, - {ORTH: "sha", LEMMA: "shall", TAG: "MD"}, - {ORTH: "should", TAG: "MD"}, - {ORTH: "wo", LEMMA: "will", TAG: "MD"}, - {ORTH: "would", TAG: "MD"}]: + {ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"}, + {ORTH: "could", NORM: "could", TAG: "MD"}, + {ORTH: "do", LEMMA: "do", NORM: "do"}, + {ORTH: "does", LEMMA: "do", NORM: "does"}, + {ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"}, + {ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"}, + {ORTH: "may", NORM: "may", TAG: "MD"}, + {ORTH: "might", NORM: "might", TAG: "MD"}, + {ORTH: "must", NORM: "must", TAG: "MD"}, + {ORTH: "need", NORM: "need"}, + {ORTH: "ought", NORM: "ought", TAG: "MD"}, + {ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"}, + {ORTH: "should", NORM: "should", TAG: "MD"}, + {ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"}, + {ORTH: "would", NORM: "would", TAG: "MD"}]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() for data in [verb_data, verb_data_tc]: _exc[data[ORTH] + "n't"] = [ dict(data), - {ORTH: "n't", LEMMA: "not", TAG: "RB"}] + {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}] _exc[data[ORTH] + "nt"] = [ dict(data), - {ORTH: "nt", LEMMA: "not", TAG: "RB"}] + {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}] _exc[data[ORTH] + "n't've"] = [ dict(data), - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] + {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", NORM: 
"have", TAG: "VB"}] _exc[data[ORTH] + "ntve"] = [ dict(data), - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"}] + {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] for verb_data in [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "might"}, - {ORTH: "must"}, - {ORTH: "should"}]: + {ORTH: "could", NORM: "could", TAG: "MD"}, + {ORTH: "might", NORM: "might", TAG: "MD"}, + {ORTH: "must", NORM: "must", TAG: "MD"}, + {ORTH: "should", NORM: "should", TAG: "MD"}]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() for data in [verb_data, verb_data_tc]: @@ -228,21 +228,21 @@ for verb_data in [ for verb_data in [ - {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, - {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "was", LEMMA: "be"}, - {ORTH: "were", LEMMA: "be"}]: + {ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2}, + {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2}, + {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"}, + {ORTH: "was", LEMMA: "be", NORM: "was"}, + {ORTH: "were", LEMMA: "be", NORM: "were"}]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() for data in [verb_data, verb_data_tc]: _exc[data[ORTH] + "n't"] = [ dict(data), - {ORTH: "n't", LEMMA: "not", TAG: "RB"}] + {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}] _exc[data[ORTH] + "nt"] = [ dict(data), - {ORTH: "nt", LEMMA: "not", TAG: "RB"}] + {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}] # Other contractions with trailing apostrophe @@ -250,10 +250,10 @@ for verb_data in [ for exc_data in [ {ORTH: "doin", LEMMA: "do", NORM: "doing"}, {ORTH: "goin", LEMMA: "go", NORM: "going"}, - {ORTH: "nothin", LEMMA: "nothing"}, - {ORTH: "nuthin", LEMMA: "nothing"}, - {ORTH: "ol", LEMMA: "old"}, - {ORTH: "somethin", LEMMA: "something"}]: + {ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"}, + {ORTH: 
"nuthin", LEMMA: "nothing", NORM: "nothing"}, + {ORTH: "ol", LEMMA: "old", NORM: "old"}, + {ORTH: "somethin", LEMMA: "something", NORM: "something"}]: exc_data_tc = dict(exc_data) exc_data_tc[ORTH] = exc_data_tc[ORTH].title() for data in [exc_data, exc_data_tc]: @@ -266,10 +266,10 @@ for exc_data in [ # Other contractions with leading apostrophe for exc_data in [ - {ORTH: "cause", LEMMA: "because"}, + {ORTH: "cause", LEMMA: "because", NORM: "because"}, {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, - {ORTH: "ll", LEMMA: "will"}, - {ORTH: "nuff", LEMMA: "enough"}]: + {ORTH: "ll", LEMMA: "will", NORM: "will"}, + {ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]: exc_data_apos = dict(exc_data) exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] for data in [exc_data, exc_data_apos]: @@ -282,11 +282,11 @@ for h in range(1, 12 + 1): for period in ["a.m.", "am"]: _exc["%d%s" % (h, period)] = [ {ORTH: "%d" % h}, - {ORTH: period, LEMMA: "a.m."}] + {ORTH: period, LEMMA: "a.m.", NORM: "a.m."}] for period in ["p.m.", "pm"]: _exc["%d%s" % (h, period)] = [ {ORTH: "%d" % h}, - {ORTH: period, LEMMA: "p.m."}] + {ORTH: period, LEMMA: "p.m.", NORM: "p.m."}] # Rest @@ -306,56 +306,56 @@ _other_exc = { {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], "How'd'y": [ - {ORTH: "How", LEMMA: "how"}, + {ORTH: "How", LEMMA: "how", NORM: "how"}, {ORTH: "'d", LEMMA: "do"}, {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], "not've": [ {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"}], + {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}], "notve": [ {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"}], + {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}], "Not've": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"}], + {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}], "Notve": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: 
"have", TAG: "VB"}], + {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}], "cannot": [ {ORTH: "can", LEMMA: "can", TAG: "MD"}, {ORTH: "not", LEMMA: "not", TAG: "RB"}], "Cannot": [ - {ORTH: "Can", LEMMA: "can", TAG: "MD"}, + {ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"}, {ORTH: "not", LEMMA: "not", TAG: "RB"}], "gonna": [ {ORTH: "gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"}], + {ORTH: "na", LEMMA: "to", NORM: "to"}], "Gonna": [ {ORTH: "Gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"}], + {ORTH: "na", LEMMA: "to", NORM: "to"}], "gotta": [ {ORTH: "got"}, - {ORTH: "ta", LEMMA: "to"}], + {ORTH: "ta", LEMMA: "to", NORM: "to"}], "Gotta": [ - {ORTH: "Got"}, - {ORTH: "ta", LEMMA: "to"}], + {ORTH: "Got", NORM: "got"}, + {ORTH: "ta", LEMMA: "to", NORM: "to"}], "let's": [ {ORTH: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}], "Let's": [ - {ORTH: "Let", LEMMA: "let"}, + {ORTH: "Let", LEMMA: "let", NORM: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}] } @@ -363,72 +363,80 @@ _exc.update(_other_exc) for exc_data in [ - {ORTH: "'S", LEMMA: "'s"}, - {ORTH: "'s", LEMMA: "'s"}, - {ORTH: "\u2018S", LEMMA: "'s"}, - {ORTH: "\u2018s", LEMMA: "'s"}, - {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}, + {ORTH: "'S", LEMMA: "'s", NORM: "'s"}, + {ORTH: "'s", LEMMA: "'s", NORM: "'s"}, + {ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"}, + {ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"}, + {ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"}, + {ORTH: "w/o", LEMMA: "without", NORM: "without"}, {ORTH: "'re", LEMMA: "be", NORM: "are"}, - {ORTH: "'Cause", LEMMA: "because"}, - {ORTH: "'cause", LEMMA: "because"}, - {ORTH: "ma'am", LEMMA: "madam"}, - {ORTH: "Ma'am", LEMMA: "madam"}, - {ORTH: "o'clock", LEMMA: "o'clock"}, - {ORTH: "O'clock", LEMMA: "o'clock"}, + {ORTH: "'Cause", LEMMA: "because", NORM: "because"}, + {ORTH: "'cause", LEMMA: "because", NORM: "because"}, + {ORTH: "'cos", LEMMA: "because", 
NORM: "because"}, + {ORTH: "'Cos", LEMMA: "because", NORM: "because"}, + {ORTH: "'coz", LEMMA: "because", NORM: "because"}, + {ORTH: "'Coz", LEMMA: "because", NORM: "because"}, + {ORTH: "'cuz", LEMMA: "because", NORM: "because"}, + {ORTH: "'Cuz", LEMMA: "because", NORM: "because"}, + {ORTH: "'bout", LEMMA: "about", NORM: "about"}, + {ORTH: "ma'am", LEMMA: "madam", NORM: "madam"}, + {ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"}, + {ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"}, + {ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"}, - {ORTH: "Mt.", LEMMA: "Mount"}, - {ORTH: "Ak.", LEMMA: "Alaska"}, - {ORTH: "Ala.", LEMMA: "Alabama"}, - {ORTH: "Apr.", LEMMA: "April"}, - {ORTH: "Ariz.", LEMMA: "Arizona"}, - {ORTH: "Ark.", LEMMA: "Arkansas"}, - {ORTH: "Aug.", LEMMA: "August"}, - {ORTH: "Calif.", LEMMA: "California"}, - {ORTH: "Colo.", LEMMA: "Colorado"}, - {ORTH: "Conn.", LEMMA: "Connecticut"}, - {ORTH: "Dec.", LEMMA: "December"}, - {ORTH: "Del.", LEMMA: "Delaware"}, - {ORTH: "Feb.", LEMMA: "February"}, - {ORTH: "Fla.", LEMMA: "Florida"}, - {ORTH: "Ga.", LEMMA: "Georgia"}, - {ORTH: "Ia.", LEMMA: "Iowa"}, - {ORTH: "Id.", LEMMA: "Idaho"}, - {ORTH: "Ill.", LEMMA: "Illinois"}, - {ORTH: "Ind.", LEMMA: "Indiana"}, - {ORTH: "Jan.", LEMMA: "January"}, - {ORTH: "Jul.", LEMMA: "July"}, - {ORTH: "Jun.", LEMMA: "June"}, - {ORTH: "Kan.", LEMMA: "Kansas"}, - {ORTH: "Kans.", LEMMA: "Kansas"}, - {ORTH: "Ky.", LEMMA: "Kentucky"}, - {ORTH: "La.", LEMMA: "Louisiana"}, - {ORTH: "Mar.", LEMMA: "March"}, - {ORTH: "Mass.", LEMMA: "Massachusetts"}, - {ORTH: "May.", LEMMA: "May"}, - {ORTH: "Mich.", LEMMA: "Michigan"}, - {ORTH: "Minn.", LEMMA: "Minnesota"}, - {ORTH: "Miss.", LEMMA: "Mississippi"}, - {ORTH: "N.C.", LEMMA: "North Carolina"}, - {ORTH: "N.D.", LEMMA: "North Dakota"}, - {ORTH: "N.H.", LEMMA: "New Hampshire"}, - {ORTH: "N.J.", LEMMA: "New Jersey"}, - {ORTH: "N.M.", LEMMA: "New Mexico"}, - {ORTH: "N.Y.", LEMMA: "New York"}, - {ORTH: "Neb.", LEMMA: "Nebraska"}, - {ORTH: "Nebr.", 
LEMMA: "Nebraska"}, - {ORTH: "Nev.", LEMMA: "Nevada"}, - {ORTH: "Nov.", LEMMA: "November"}, - {ORTH: "Oct.", LEMMA: "October"}, - {ORTH: "Okla.", LEMMA: "Oklahoma"}, - {ORTH: "Ore.", LEMMA: "Oregon"}, - {ORTH: "Pa.", LEMMA: "Pennsylvania"}, - {ORTH: "S.C.", LEMMA: "South Carolina"}, - {ORTH: "Sep.", LEMMA: "September"}, - {ORTH: "Sept.", LEMMA: "September"}, - {ORTH: "Tenn.", LEMMA: "Tennessee"}, - {ORTH: "Va.", LEMMA: "Virginia"}, - {ORTH: "Wash.", LEMMA: "Washington"}, - {ORTH: "Wis.", LEMMA: "Wisconsin"}]: + {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"}, + {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"}, + {ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"}, + {ORTH: "Apr.", LEMMA: "April", NORM: "April"}, + {ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"}, + {ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"}, + {ORTH: "Aug.", LEMMA: "August", NORM: "August"}, + {ORTH: "Calif.", LEMMA: "California", NORM: "California"}, + {ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"}, + {ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"}, + {ORTH: "Dec.", LEMMA: "December", NORM: "December"}, + {ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"}, + {ORTH: "Feb.", LEMMA: "February", NORM: "February"}, + {ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"}, + {ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"}, + {ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"}, + {ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"}, + {ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"}, + {ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"}, + {ORTH: "Jan.", LEMMA: "January", NORM: "January"}, + {ORTH: "Jul.", LEMMA: "July", NORM: "July"}, + {ORTH: "Jun.", LEMMA: "June", NORM: "June"}, + {ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"}, + {ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"}, + {ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"}, + {ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"}, + {ORTH: "Mar.", LEMMA: "March", NORM: "March"}, + {ORTH: "Mass.", LEMMA: "Massachusetts", NORM: 
"Massachusetts"}, + {ORTH: "May.", LEMMA: "May", NORM: "May"}, + {ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"}, + {ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"}, + {ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"}, + {ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"}, + {ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"}, + {ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"}, + {ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"}, + {ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"}, + {ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"}, + {ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"}, + {ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"}, + {ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"}, + {ORTH: "Nov.", LEMMA: "November", NORM: "November"}, + {ORTH: "Oct.", LEMMA: "October", NORM: "October"}, + {ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"}, + {ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"}, + {ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"}, + {ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"}, + {ORTH: "Sep.", LEMMA: "September", NORM: "September"}, + {ORTH: "Sept.", LEMMA: "September", NORM: "September"}, + {ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"}, + {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"}, + {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"}, + {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]: _exc[exc_data[ORTH]] = [dict(exc_data)] From 746653880ce2fd24a511ae03f7d5f0eaa4d861ca Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 20:27:28 +0200 Subject: [PATCH 20/28] Add English norm exceptions to lex_attrs --- spacy/lang/en/__init__.py | 8 +- spacy/lang/en/norm_exceptions.py | 1761 ++++++++++++++++++++++++++++++ 2 files changed, 1767 insertions(+), 2 deletions(-) create mode 100644 spacy/lang/en/norm_exceptions.py diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 7e1da789b..3f422b834 100644 --- 
a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .norm_exceptions import NORM_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -10,14 +11,17 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class EnglishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'en' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], + BASE_NORMS, NORM_EXCEPTIONS) lex_attr_getters.update(LEX_ATTRS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) diff --git a/spacy/lang/en/norm_exceptions.py b/spacy/lang/en/norm_exceptions.py new file mode 100644 index 000000000..ec106b960 --- /dev/null +++ b/spacy/lang/en/norm_exceptions.py @@ -0,0 +1,1761 @@ +# coding: utf8 +from __future__ import unicode_literals + + +_exc = { + # Slang and abbreviations + "cos": "because", + "cuz": "because", + "fav": "favorite", + "fave": "favorite", + "misc": "miscellaneous", + "plz": "please", + "pls": "please", + "thx": "thanks", + + # US vs. 
UK spelling + "accessorise": "accessorize", + "accessorised": "accessorized", + "accessorises": "accessorizes", + "accessorising": "accessorizing", + "acclimatisation": "acclimatization", + "acclimatise": "acclimatize", + "acclimatised": "acclimatized", + "acclimatises": "acclimatizes", + "acclimatising": "acclimatizing", + "accoutrements": "accouterments", + "aeon": "eon", + "aeons": "eons", + "aerogramme": "aerogram", + "aerogrammes": "aerograms", + "aeroplane": "airplane", + "aeroplanes": "airplanes", + "aesthete": "esthete", + "aesthetes": "esthetes", + "aesthetic": "esthetic", + "aesthetically": "esthetically", + "aesthetics": "esthetics", + "aetiology": "etiology", + "ageing": "aging", + "aggrandisement": "aggrandizement", + "agonise": "agonize", + "agonised": "agonized", + "agonises": "agonizes", + "agonising": "agonizing", + "agonisingly": "agonizingly", + "almanack": "almanac", + "almanacks": "almanacs", + "aluminium": "aluminum", + "amortisable": "amortizable", + "amortisation": "amortization", + "amortisations": "amortizations", + "amortise": "amortize", + "amortised": "amortized", + "amortises": "amortizes", + "amortising": "amortizing", + "amphitheatre": "amphitheater", + "amphitheatres": "amphitheaters", + "anaemia": "anemia", + "anaemic": "anemic", + "anaesthesia": "anesthesia", + "anaesthetic": "anesthetic", + "anaesthetics": "anesthetics", + "anaesthetise": "anesthetize", + "anaesthetised": "anesthetized", + "anaesthetises": "anesthetizes", + "anaesthetising": "anesthetizing", + "anaesthetist": "anesthetist", + "anaesthetists": "anesthetists", + "anaesthetize": "anesthetize", + "anaesthetized": "anesthetized", + "anaesthetizes": "anesthetizes", + "anaesthetizing": "anesthetizing", + "analogue": "analog", + "analogues": "analogs", + "analyse": "analyze", + "analysed": "analyzed", + "analyses": "analyzes", + "analysing": "analyzing", + "anglicise": "anglicize", + "anglicised": "anglicized", + "anglicises": "anglicizes", + "anglicising": 
"anglicizing", + "annualised": "annualized", + "antagonise": "antagonize", + "antagonised": "antagonized", + "antagonises": "antagonizes", + "antagonising": "antagonizing", + "apologise": "apologize", + "apologised": "apologized", + "apologises": "apologizes", + "apologising": "apologizing", + "appal": "appall", + "appals": "appalls", + "appetiser": "appetizer", + "appetisers": "appetizers", + "appetising": "appetizing", + "appetisingly": "appetizingly", + "arbour": "arbor", + "arbours": "arbors", + "archaeological": "archeological", + "archaeologically": "archeologically", + "archaeologist": "archeologist", + "archaeologists": "archeologists", + "archaeology": "archeology", + "ardour": "ardor", + "armour": "armor", + "armoured": "armored", + "armourer": "armorer", + "armourers": "armorers", + "armouries": "armories", + "armoury": "armory", + "artefact": "artifact", + "artefacts": "artifacts", + "authorise": "authorize", + "authorised": "authorized", + "authorises": "authorizes", + "authorising": "authorizing", + "axe": "ax", + "backpedalled": "backpedaled", + "backpedalling": "backpedaling", + "bannister": "banister", + "bannisters": "banisters", + "baptise": "baptize", + "baptised": "baptized", + "baptises": "baptizes", + "baptising": "baptizing", + "bastardise": "bastardize", + "bastardised": "bastardized", + "bastardises": "bastardizes", + "bastardising": "bastardizing", + "battleaxe": "battleax", + "baulk": "balk", + "baulked": "balked", + "baulking": "balking", + "baulks": "balks", + "bedevilled": "bedeviled", + "bedevilling": "bedeviling", + "behaviour": "behavior", + "behavioural": "behavioral", + "behaviourism": "behaviorism", + "behaviourist": "behaviorist", + "behaviourists": "behaviorists", + "behaviours": "behaviors", + "behove": "behoove", + "behoved": "behooved", + "behoves": "behooves", + "bejewelled": "bejeweled", + "belabour": "belabor", + "belaboured": "belabored", + "belabouring": "belaboring", + "belabours": "belabors", + "bevelled": "beveled", 
+ "bevvies": "bevies", + "bevvy": "bevy", + "biassed": "biased", + "biassing": "biasing", + "bingeing": "binging", + "bougainvillaea": "bougainvillea", + "bougainvillaeas": "bougainvilleas", + "bowdlerise": "bowdlerize", + "bowdlerised": "bowdlerized", + "bowdlerises": "bowdlerizes", + "bowdlerising": "bowdlerizing", + "breathalyse": "breathalyze", + "breathalysed": "breathalyzed", + "breathalyser": "breathalyzer", + "breathalysers": "breathalyzers", + "breathalyses": "breathalyzes", + "breathalysing": "breathalyzing", + "brutalise": "brutalize", + "brutalised": "brutalized", + "brutalises": "brutalizes", + "brutalising": "brutalizing", + "buses": "busses", + "busing": "bussing", + "caesarean": "cesarean", + "caesareans": "cesareans", + "calibre": "caliber", + "calibres": "calibers", + "calliper": "caliper", + "callipers": "calipers", + "callisthenics": "calisthenics", + "canalise": "canalize", + "canalised": "canalized", + "canalises": "canalizes", + "canalising": "canalizing", + "cancellation": "cancelation", + "cancellations": "cancelations", + "cancelled": "canceled", + "cancelling": "canceling", + "candour": "candor", + "cannibalise": "cannibalize", + "cannibalised": "cannibalized", + "cannibalises": "cannibalizes", + "cannibalising": "cannibalizing", + "canonise": "canonize", + "canonised": "canonized", + "canonises": "canonizes", + "canonising": "canonizing", + "capitalise": "capitalize", + "capitalised": "capitalized", + "capitalises": "capitalizes", + "capitalising": "capitalizing", + "caramelise": "caramelize", + "caramelised": "caramelized", + "caramelises": "caramelizes", + "caramelising": "caramelizing", + "carbonise": "carbonize", + "carbonised": "carbonized", + "carbonises": "carbonizes", + "carbonising": "carbonizing", + "carolled": "caroled", + "carolling": "caroling", + "catalogue": "catalog", + "catalogued": "cataloged", + "catalogues": "catalogs", + "cataloguing": "cataloging", + "catalyse": "catalyze", + "catalysed": "catalyzed", + "catalyses": 
"catalyzes", + "catalysing": "catalyzing", + "categorise": "categorize", + "categorised": "categorized", + "categorises": "categorizes", + "categorising": "categorizing", + "cauterise": "cauterize", + "cauterised": "cauterized", + "cauterises": "cauterizes", + "cauterising": "cauterizing", + "cavilled": "caviled", + "cavilling": "caviling", + "centigramme": "centigram", + "centigrammes": "centigrams", + "centilitre": "centiliter", + "centilitres": "centiliters", + "centimetre": "centimeter", + "centimetres": "centimeters", + "centralise": "centralize", + "centralised": "centralized", + "centralises": "centralizes", + "centralising": "centralizing", + "centre": "center", + "centred": "centered", + "centrefold": "centerfold", + "centrefolds": "centerfolds", + "centrepiece": "centerpiece", + "centrepieces": "centerpieces", + "centres": "centers", + "channelled": "channeled", + "channelling": "channeling", + "characterise": "characterize", + "characterised": "characterized", + "characterises": "characterizes", + "characterising": "characterizing", + "cheque": "check", + "chequebook": "checkbook", + "chequebooks": "checkbooks", + "chequered": "checkered", + "cheques": "checks", + "chilli": "chili", + "chimaera": "chimera", + "chimaeras": "chimeras", + "chiselled": "chiseled", + "chiselling": "chiseling", + "circularise": "circularize", + "circularised": "circularized", + "circularises": "circularizes", + "circularising": "circularizing", + "civilise": "civilize", + "civilised": "civilized", + "civilises": "civilizes", + "civilising": "civilizing", + "clamour": "clamor", + "clamoured": "clamored", + "clamouring": "clamoring", + "clamours": "clamors", + "clangour": "clangor", + "clarinettist": "clarinetist", + "clarinettists": "clarinetists", + "collectivise": "collectivize", + "collectivised": "collectivized", + "collectivises": "collectivizes", + "collectivising": "collectivizing", + "colonisation": "colonization", + "colonise": "colonize", + "colonised": "colonized", + 
"coloniser": "colonizer", + "colonisers": "colonizers", + "colonises": "colonizes", + "colonising": "colonizing", + "colour": "color", + "colourant": "colorant", + "colourants": "colorants", + "coloured": "colored", + "coloureds": "coloreds", + "colourful": "colorful", + "colourfully": "colorfully", + "colouring": "coloring", + "colourize": "colorize", + "colourized": "colorized", + "colourizes": "colorizes", + "colourizing": "colorizing", + "colourless": "colorless", + "colours": "colors", + "commercialise": "commercialize", + "commercialised": "commercialized", + "commercialises": "commercializes", + "commercialising": "commercializing", + "compartmentalise": "compartmentalize", + "compartmentalised": "compartmentalized", + "compartmentalises": "compartmentalizes", + "compartmentalising": "compartmentalizing", + "computerise": "computerize", + "computerised": "computerized", + "computerises": "computerizes", + "computerising": "computerizing", + "conceptualise": "conceptualize", + "conceptualised": "conceptualized", + "conceptualises": "conceptualizes", + "conceptualising": "conceptualizing", + "connexion": "connection", + "connexions": "connections", + "contextualise": "contextualize", + "contextualised": "contextualized", + "contextualises": "contextualizes", + "contextualising": "contextualizing", + "cosier": "cozier", + "cosies": "cozies", + "cosiest": "coziest", + "cosily": "cozily", + "cosiness": "coziness", + "cosy": "cozy", + "councillor": "councilor", + "councillors": "councilors", + "counselled": "counseled", + "counselling": "counseling", + "counsellor": "counselor", + "counsellors": "counselors", + "crenellated": "crenelated", + "criminalise": "criminalize", + "criminalised": "criminalized", + "criminalises": "criminalizes", + "criminalising": "criminalizing", + "criticise": "criticize", + "criticised": "criticized", + "criticises": "criticizes", + "criticising": "criticizing", + "crueller": "crueler", + "cruellest": "cruelest", + "crystallisation": 
"crystallization", + "crystallise": "crystallize", + "crystallised": "crystallized", + "crystallises": "crystallizes", + "crystallising": "crystallizing", + "cudgelled": "cudgeled", + "cudgelling": "cudgeling", + "customise": "customize", + "customised": "customized", + "customises": "customizes", + "customising": "customizing", + "cypher": "cipher", + "cyphers": "ciphers", + "decentralisation": "decentralization", + "decentralise": "decentralize", + "decentralised": "decentralized", + "decentralises": "decentralizes", + "decentralising": "decentralizing", + "decriminalisation": "decriminalization", + "decriminalise": "decriminalize", + "decriminalised": "decriminalized", + "decriminalises": "decriminalizes", + "decriminalising": "decriminalizing", + "defence": "defense", + "defenceless": "defenseless", + "defences": "defenses", + "dehumanisation": "dehumanization", + "dehumanise": "dehumanize", + "dehumanised": "dehumanized", + "dehumanises": "dehumanizes", + "dehumanising": "dehumanizing", + "demeanour": "demeanor", + "demilitarisation": "demilitarization", + "demilitarise": "demilitarize", + "demilitarised": "demilitarized", + "demilitarises": "demilitarizes", + "demilitarising": "demilitarizing", + "demobilisation": "demobilization", + "demobilise": "demobilize", + "demobilised": "demobilized", + "demobilises": "demobilizes", + "demobilising": "demobilizing", + "democratisation": "democratization", + "democratise": "democratize", + "democratised": "democratized", + "democratises": "democratizes", + "democratising": "democratizing", + "demonise": "demonize", + "demonised": "demonized", + "demonises": "demonizes", + "demonising": "demonizing", + "demoralisation": "demoralization", + "demoralise": "demoralize", + "demoralised": "demoralized", + "demoralises": "demoralizes", + "demoralising": "demoralizing", + "denationalisation": "denationalization", + "denationalise": "denationalize", + "denationalised": "denationalized", + "denationalises": "denationalizes", + 
"denationalising": "denationalizing", + "deodorise": "deodorize", + "deodorised": "deodorized", + "deodorises": "deodorizes", + "deodorising": "deodorizing", + "depersonalise": "depersonalize", + "depersonalised": "depersonalized", + "depersonalises": "depersonalizes", + "depersonalising": "depersonalizing", + "deputise": "deputize", + "deputised": "deputized", + "deputises": "deputizes", + "deputising": "deputizing", + "desensitisation": "desensitization", + "desensitise": "desensitize", + "desensitised": "desensitized", + "desensitises": "desensitizes", + "desensitising": "desensitizing", + "destabilisation": "destabilization", + "destabilise": "destabilize", + "destabilised": "destabilized", + "destabilises": "destabilizes", + "destabilising": "destabilizing", + "dialled": "dialed", + "dialling": "dialing", + "dialogue": "dialog", + "dialogues": "dialogs", + "diarrhoea": "diarrhea", + "digitise": "digitize", + "digitised": "digitized", + "digitises": "digitizes", + "digitising": "digitizing", + "disc": "disk", + "discolour": "discolor", + "discoloured": "discolored", + "discolouring": "discoloring", + "discolours": "discolors", + "discs": "disks", + "disembowelled": "disemboweled", + "disembowelling": "disemboweling", + "disfavour": "disfavor", + "dishevelled": "disheveled", + "dishonour": "dishonor", + "dishonourable": "dishonorable", + "dishonourably": "dishonorably", + "dishonoured": "dishonored", + "dishonouring": "dishonoring", + "dishonours": "dishonors", + "disorganisation": "disorganization", + "disorganised": "disorganized", + "distil": "distill", + "distils": "distills", + "dramatisation": "dramatization", + "dramatisations": "dramatizations", + "dramatise": "dramatize", + "dramatised": "dramatized", + "dramatises": "dramatizes", + "dramatising": "dramatizing", + "draught": "draft", + "draughtboard": "draftboard", + "draughtboards": "draftboards", + "draughtier": "draftier", + "draughtiest": "draftiest", + "draughts": "drafts", + "draughtsman": 
"draftsman", + "draughtsmanship": "draftsmanship", + "draughtsmen": "draftsmen", + "draughtswoman": "draftswoman", + "draughtswomen": "draftswomen", + "draughty": "drafty", + "drivelled": "driveled", + "drivelling": "driveling", + "duelled": "dueled", + "duelling": "dueling", + "economise": "economize", + "economised": "economized", + "economises": "economizes", + "economising": "economizing", + "edoema": "edema", + "editorialise": "editorialize", + "editorialised": "editorialized", + "editorialises": "editorializes", + "editorialising": "editorializing", + "empathise": "empathize", + "empathised": "empathized", + "empathises": "empathizes", + "empathising": "empathizing", + "emphasise": "emphasize", + "emphasised": "emphasized", + "emphasises": "emphasizes", + "emphasising": "emphasizing", + "enamelled": "enameled", + "enamelling": "enameling", + "enamoured": "enamored", + "encyclopaedia": "encyclopedia", + "encyclopaedias": "encyclopedias", + "encyclopaedic": "encyclopedic", + "endeavour": "endeavor", + "endeavoured": "endeavored", + "endeavouring": "endeavoring", + "endeavours": "endeavors", + "energise": "energize", + "energised": "energized", + "energises": "energizes", + "energising": "energizing", + "enrol": "enroll", + "enrols": "enrolls", + "enthral": "enthrall", + "enthrals": "enthralls", + "epaulette": "epaulet", + "epaulettes": "epaulets", + "epicentre": "epicenter", + "epicentres": "epicenters", + "epilogue": "epilog", + "epilogues": "epilogs", + "epitomise": "epitomize", + "epitomised": "epitomized", + "epitomises": "epitomizes", + "epitomising": "epitomizing", + "equalisation": "equalization", + "equalise": "equalize", + "equalised": "equalized", + "equaliser": "equalizer", + "equalisers": "equalizers", + "equalises": "equalizes", + "equalising": "equalizing", + "eulogise": "eulogize", + "eulogised": "eulogized", + "eulogises": "eulogizes", + "eulogising": "eulogizing", + "evangelise": "evangelize", + "evangelised": "evangelized", + "evangelises": 
"evangelizes", + "evangelising": "evangelizing", + "exorcise": "exorcize", + "exorcised": "exorcized", + "exorcises": "exorcizes", + "exorcising": "exorcizing", + "extemporisation": "extemporization", + "extemporise": "extemporize", + "extemporised": "extemporized", + "extemporises": "extemporizes", + "extemporising": "extemporizing", + "externalisation": "externalization", + "externalisations": "externalizations", + "externalise": "externalize", + "externalised": "externalized", + "externalises": "externalizes", + "externalising": "externalizing", + "factorise": "factorize", + "factorised": "factorized", + "factorises": "factorizes", + "factorising": "factorizing", + "faecal": "fecal", + "faeces": "feces", + "familiarisation": "familiarization", + "familiarise": "familiarize", + "familiarised": "familiarized", + "familiarises": "familiarizes", + "familiarising": "familiarizing", + "fantasise": "fantasize", + "fantasised": "fantasized", + "fantasises": "fantasizes", + "fantasising": "fantasizing", + "favour": "favor", + "favourable": "favorable", + "favourably": "favorably", + "favoured": "favored", + "favouring": "favoring", + "favourite": "favorite", + "favourites": "favorites", + "favouritism": "favoritism", + "favours": "favors", + "feminise": "feminize", + "feminised": "feminized", + "feminises": "feminizes", + "feminising": "feminizing", + "fertilisation": "fertilization", + "fertilise": "fertilize", + "fertilised": "fertilized", + "fertiliser": "fertilizer", + "fertilisers": "fertilizers", + "fertilises": "fertilizes", + "fertilising": "fertilizing", + "fervour": "fervor", + "fibre": "fiber", + "fibreglass": "fiberglass", + "fibres": "fibers", + "fictionalisation": "fictionalization", + "fictionalisations": "fictionalizations", + "fictionalise": "fictionalize", + "fictionalised": "fictionalized", + "fictionalises": "fictionalizes", + "fictionalising": "fictionalizing", + "fillet": "filet", + "filleted": "fileted", + "filleting": "fileting", + "fillets": 
"filets", + "finalisation": "finalization", + "finalise": "finalize", + "finalised": "finalized", + "finalises": "finalizes", + "finalising": "finalizing", + "flautist": "flutist", + "flautists": "flutists", + "flavour": "flavor", + "flavoured": "flavored", + "flavouring": "flavoring", + "flavourings": "flavorings", + "flavourless": "flavorless", + "flavours": "flavors", + "flavoursome": "flavorsome", + "flyer / flier ": "flier / flyer ", + "foetal": "fetal", + "foetid": "fetid", + "foetus": "fetus", + "foetuses": "fetuses", + "formalisation": "formalization", + "formalise": "formalize", + "formalised": "formalized", + "formalises": "formalizes", + "formalising": "formalizing", + "fossilisation": "fossilization", + "fossilise": "fossilize", + "fossilised": "fossilized", + "fossilises": "fossilizes", + "fossilising": "fossilizing", + "fraternisation": "fraternization", + "fraternise": "fraternize", + "fraternised": "fraternized", + "fraternises": "fraternizes", + "fraternising": "fraternizing", + "fulfil": "fulfill", + "fulfilment": "fulfillment", + "fulfils": "fulfills", + "funnelled": "funneled", + "funnelling": "funneling", + "galvanise": "galvanize", + "galvanised": "galvanized", + "galvanises": "galvanizes", + "galvanising": "galvanizing", + "gambolled": "gamboled", + "gambolling": "gamboling", + "gaol": "jail", + "gaolbird": "jailbird", + "gaolbirds": "jailbirds", + "gaolbreak": "jailbreak", + "gaolbreaks": "jailbreaks", + "gaoled": "jailed", + "gaoler": "jailer", + "gaolers": "jailers", + "gaoling": "jailing", + "gaols": "jails", + "gases": "gasses", + "gauge": "gage", + "gauged": "gaged", + "gauges": "gages", + "gauging": "gaging", + "generalisation": "generalization", + "generalisations": "generalizations", + "generalise": "generalize", + "generalised": "generalized", + "generalises": "generalizes", + "generalising": "generalizing", + "ghettoise": "ghettoize", + "ghettoised": "ghettoized", + "ghettoises": "ghettoizes", + "ghettoising": "ghettoizing", +
"gipsies": "gypsies", + "glamorise": "glamorize", + "glamorised": "glamorized", + "glamorises": "glamorizes", + "glamorising": "glamorizing", + "glamour": "glamor", + "globalisation": "globalization", + "globalise": "globalize", + "globalised": "globalized", + "globalises": "globalizes", + "globalising": "globalizing", + "glueing": "gluing", + "goitre": "goiter", + "goitres": "goiters", + "gonorrhoea": "gonorrhea", + "gramme": "gram", + "grammes": "grams", + "gravelled": "graveled", + "grey": "gray", + "greyed": "grayed", + "greying": "graying", + "greyish": "grayish", + "greyness": "grayness", + "greys": "grays", + "grovelled": "groveled", + "grovelling": "groveling", + "groyne": "groin", + "groynes": "groins", + "gruelling": "grueling", + "gruellingly": "gruelingly", + "gryphon": "griffin", + "gryphons": "griffins", + "gynaecological": "gynecological", + "gynaecologist": "gynecologist", + "gynaecologists": "gynecologists", + "gynaecology": "gynecology", + "haematological": "hematological", + "haematologist": "hematologist", + "haematologists": "hematologists", + "haematology": "hematology", + "haemoglobin": "hemoglobin", + "haemophilia": "hemophilia", + "haemophiliac": "hemophiliac", + "haemophiliacs": "hemophiliacs", + "haemorrhage": "hemorrhage", + "haemorrhaged": "hemorrhaged", + "haemorrhages": "hemorrhages", + "haemorrhaging": "hemorrhaging", + "haemorrhoids": "hemorrhoids", + "harbour": "harbor", + "harboured": "harbored", + "harbouring": "harboring", + "harbours": "harbors", + "harmonisation": "harmonization", + "harmonise": "harmonize", + "harmonised": "harmonized", + "harmonises": "harmonizes", + "harmonising": "harmonizing", + "homoeopath": "homeopath", + "homoeopathic": "homeopathic", + "homoeopaths": "homeopaths", + "homoeopathy": "homeopathy", + "homogenise": "homogenize", + "homogenised": "homogenized", + "homogenises": "homogenizes", + "homogenising": "homogenizing", + "honour": "honor", + "honourable": "honorable", + "honourably": "honorably",
+ "honoured": "honored", + "honouring": "honoring", + "honours": "honors", + "hospitalisation": "hospitalization", + "hospitalise": "hospitalize", + "hospitalised": "hospitalized", + "hospitalises": "hospitalizes", + "hospitalising": "hospitalizing", + "humanise": "humanize", + "humanised": "humanized", + "humanises": "humanizes", + "humanising": "humanizing", + "humour": "humor", + "humoured": "humored", + "humouring": "humoring", + "humourless": "humorless", + "humours": "humors", + "hybridise": "hybridize", + "hybridised": "hybridized", + "hybridises": "hybridizes", + "hybridising": "hybridizing", + "hypnotise": "hypnotize", + "hypnotised": "hypnotized", + "hypnotises": "hypnotizes", + "hypnotising": "hypnotizing", + "hypothesise": "hypothesize", + "hypothesised": "hypothesized", + "hypothesises": "hypothesizes", + "hypothesising": "hypothesizing", + "idealisation": "idealization", + "idealise": "idealize", + "idealised": "idealized", + "idealises": "idealizes", + "idealising": "idealizing", + "idolise": "idolize", + "idolised": "idolized", + "idolises": "idolizes", + "idolising": "idolizing", + "immobilisation": "immobilization", + "immobilise": "immobilize", + "immobilised": "immobilized", + "immobiliser": "immobilizer", + "immobilisers": "immobilizers", + "immobilises": "immobilizes", + "immobilising": "immobilizing", + "immortalise": "immortalize", + "immortalised": "immortalized", + "immortalises": "immortalizes", + "immortalising": "immortalizing", + "immunisation": "immunization", + "immunise": "immunize", + "immunised": "immunized", + "immunises": "immunizes", + "immunising": "immunizing", + "impanelled": "impaneled", + "impanelling": "impaneling", + "imperilled": "imperiled", + "imperilling": "imperiling", + "individualise": "individualize", + "individualised": "individualized", + "individualises": "individualizes", + "individualising": "individualizing", + "industrialise": "industrialize", + "industrialised": "industrialized", + "industrialises": 
"industrializes", + "industrialising": "industrializing", + "inflexion": "inflection", + "inflexions": "inflections", + "initialise": "initialize", + "initialised": "initialized", + "initialises": "initializes", + "initialising": "initializing", + "initialled": "initialed", + "initialling": "initialing", + "instal": "install", + "instalment": "installment", + "instalments": "installments", + "instals": "installs", + "instil": "instill", + "instils": "instills", + "institutionalisation": "institutionalization", + "institutionalise": "institutionalize", + "institutionalised": "institutionalized", + "institutionalises": "institutionalizes", + "institutionalising": "institutionalizing", + "intellectualise": "intellectualize", + "intellectualised": "intellectualized", + "intellectualises": "intellectualizes", + "intellectualising": "intellectualizing", + "internalisation": "internalization", + "internalise": "internalize", + "internalised": "internalized", + "internalises": "internalizes", + "internalising": "internalizing", + "internationalisation": "internationalization", + "internationalise": "internationalize", + "internationalised": "internationalized", + "internationalises": "internationalizes", + "internationalising": "internationalizing", + "ionisation": "ionization", + "ionise": "ionize", + "ionised": "ionized", + "ioniser": "ionizer", + "ionisers": "ionizers", + "ionises": "ionizes", + "ionising": "ionizing", + "italicise": "italicize", + "italicised": "italicized", + "italicises": "italicizes", + "italicising": "italicizing", + "itemise": "itemize", + "itemised": "itemized", + "itemises": "itemizes", + "itemising": "itemizing", + "jeopardise": "jeopardize", + "jeopardised": "jeopardized", + "jeopardises": "jeopardizes", + "jeopardising": "jeopardizing", + "jewelled": "jeweled", + "jeweller": "jeweler", + "jewellers": "jewelers", + "jewellery": "jewelry", + "judgement ": "judgment", + "kilogramme": "kilogram", + "kilogrammes": "kilograms", + "kilometre": 
"kilometer", + "kilometres": "kilometers", + "labelled": "labeled", + "labelling": "labeling", + "labour": "labor", + "laboured": "labored", + "labourer": "laborer", + "labourers": "laborers", + "labouring": "laboring", + "labours": "labors", + "lacklustre": "lackluster", + "legalisation": "legalization", + "legalise": "legalize", + "legalised": "legalized", + "legalises": "legalizes", + "legalising": "legalizing", + "legitimise": "legitimize", + "legitimised": "legitimized", + "legitimises": "legitimizes", + "legitimising": "legitimizing", + "leukaemia": "leukemia", + "levelled": "leveled", + "leveller": "leveler", + "levellers": "levelers", + "levelling": "leveling", + "libelled": "libeled", + "libelling": "libeling", + "libellous": "libelous", + "liberalisation": "liberalization", + "liberalise": "liberalize", + "liberalised": "liberalized", + "liberalises": "liberalizes", + "liberalising": "liberalizing", + "licence": "license", + "licenced": "licensed", + "licences": "licenses", + "licencing": "licensing", + "likeable": "likable ", + "lionisation": "lionization", + "lionise": "lionize", + "lionised": "lionized", + "lionises": "lionizes", + "lionising": "lionizing", + "liquidise": "liquidize", + "liquidised": "liquidized", + "liquidiser": "liquidizer", + "liquidisers": "liquidizers", + "liquidises": "liquidizes", + "liquidising": "liquidizing", + "litre": "liter", + "litres": "liters", + "localise": "localize", + "localised": "localized", + "localises": "localizes", + "localising": "localizing", + "louvre": "louver", + "louvred": "louvered", + "louvres": "louvers ", + "lustre": "luster", + "magnetise": "magnetize", + "magnetised": "magnetized", + "magnetises": "magnetizes", + "magnetising": "magnetizing", + "manoeuvrability": "maneuverability", + "manoeuvrable": "maneuverable", + "manoeuvre": "maneuver", + "manoeuvred": "maneuvered", + "manoeuvres": "maneuvers", + "manoeuvring": "maneuvering", + "manoeuvrings": "maneuverings", + "marginalisation": 
"marginalization", + "marginalise": "marginalize", + "marginalised": "marginalized", + "marginalises": "marginalizes", + "marginalising": "marginalizing", + "marshalled": "marshaled", + "marshalling": "marshaling", + "marvelled": "marveled", + "marvelling": "marveling", + "marvellous": "marvelous", + "marvellously": "marvelously", + "materialisation": "materialization", + "materialise": "materialize", + "materialised": "materialized", + "materialises": "materializes", + "materialising": "materializing", + "maximisation": "maximization", + "maximise": "maximize", + "maximised": "maximized", + "maximises": "maximizes", + "maximising": "maximizing", + "meagre": "meager", + "mechanisation": "mechanization", + "mechanise": "mechanize", + "mechanised": "mechanized", + "mechanises": "mechanizes", + "mechanising": "mechanizing", + "mediaeval": "medieval", + "memorialise": "memorialize", + "memorialised": "memorialized", + "memorialises": "memorializes", + "memorialising": "memorializing", + "memorise": "memorize", + "memorised": "memorized", + "memorises": "memorizes", + "memorising": "memorizing", + "mesmerise": "mesmerize", + "mesmerised": "mesmerized", + "mesmerises": "mesmerizes", + "mesmerising": "mesmerizing", + "metabolise": "metabolize", + "metabolised": "metabolized", + "metabolises": "metabolizes", + "metabolising": "metabolizing", + "metre": "meter", + "metres": "meters", + "micrometre": "micrometer", + "micrometres": "micrometers", + "militarise": "militarize", + "militarised": "militarized", + "militarises": "militarizes", + "militarising": "militarizing", + "milligramme": "milligram", + "milligrammes": "milligrams", + "millilitre": "milliliter", + "millilitres": "milliliters", + "millimetre": "millimeter", + "millimetres": "millimeters", + "miniaturisation": "miniaturization", + "miniaturise": "miniaturize", + "miniaturised": "miniaturized", + "miniaturises": "miniaturizes", + "miniaturising": "miniaturizing", + "minibuses": "minibusses ", + "minimise": 
"minimize", + "minimised": "minimized", + "minimises": "minimizes", + "minimising": "minimizing", + "misbehaviour": "misbehavior", + "misdemeanour": "misdemeanor", + "misdemeanours": "misdemeanors", + "misspelt": "misspelled ", + "mitre": "miter", + "mitres": "miters", + "mobilisation": "mobilization", + "mobilise": "mobilize", + "mobilised": "mobilized", + "mobilises": "mobilizes", + "mobilising": "mobilizing", + "modelled": "modeled", + "modeller": "modeler", + "modellers": "modelers", + "modelling": "modeling", + "modernise": "modernize", + "modernised": "modernized", + "modernises": "modernizes", + "modernising": "modernizing", + "moisturise": "moisturize", + "moisturised": "moisturized", + "moisturiser": "moisturizer", + "moisturisers": "moisturizers", + "moisturises": "moisturizes", + "moisturising": "moisturizing", + "monologue": "monolog", + "monologues": "monologs", + "monopolisation": "monopolization", + "monopolise": "monopolize", + "monopolised": "monopolized", + "monopolises": "monopolizes", + "monopolising": "monopolizing", + "moralise": "moralize", + "moralised": "moralized", + "moralises": "moralizes", + "moralising": "moralizing", + "motorised": "motorized", + "mould": "mold", + "moulded": "molded", + "moulder": "molder", + "mouldered": "moldered", + "mouldering": "moldering", + "moulders": "molders", + "mouldier": "moldier", + "mouldiest": "moldiest", + "moulding": "molding", + "mouldings": "moldings", + "moulds": "molds", + "mouldy": "moldy", + "moult": "molt", + "moulted": "molted", + "moulting": "molting", + "moults": "molts", + "moustache": "mustache", + "moustached": "mustached", + "moustaches": "mustaches", + "moustachioed": "mustachioed", + "multicoloured": "multicolored", + "nationalisation": "nationalization", + "nationalisations": "nationalizations", + "nationalise": "nationalize", + "nationalised": "nationalized", + "nationalises": "nationalizes", + "nationalising": "nationalizing", + "naturalisation": "naturalization", + "naturalise": 
"naturalize", + "naturalised": "naturalized", + "naturalises": "naturalizes", + "naturalising": "naturalizing", + "neighbour": "neighbor", + "neighbourhood": "neighborhood", + "neighbourhoods": "neighborhoods", + "neighbouring": "neighboring", + "neighbourliness": "neighborliness", + "neighbourly": "neighborly", + "neighbours": "neighbors", + "neutralisation": "neutralization", + "neutralise": "neutralize", + "neutralised": "neutralized", + "neutralises": "neutralizes", + "neutralising": "neutralizing", + "normalisation": "normalization", + "normalise": "normalize", + "normalised": "normalized", + "normalises": "normalizes", + "normalising": "normalizing", + "odour": "odor", + "odourless": "odorless", + "odours": "odors", + "oesophagus": "esophagus", + "oesophaguses": "esophaguses", + "oestrogen": "estrogen", + "offence": "offense", + "offences": "offenses", + "omelette": "omelet", + "omelettes": "omelets", + "optimise": "optimize", + "optimised": "optimized", + "optimises": "optimizes", + "optimising": "optimizing", + "organisation": "organization", + "organisational": "organizational", + "organisations": "organizations", + "organise": "organize", + "organised": "organized", + "organiser": "organizer", + "organisers": "organizers", + "organises": "organizes", + "organising": "organizing", + "orthopaedic": "orthopedic", + "orthopaedics": "orthopedics", + "ostracise": "ostracize", + "ostracised": "ostracized", + "ostracises": "ostracizes", + "ostracising": "ostracizing", + "outmanoeuvre": "outmaneuver", + "outmanoeuvred": "outmaneuvered", + "outmanoeuvres": "outmaneuvers", + "outmanoeuvring": "outmaneuvering", + "overemphasise": "overemphasize", + "overemphasised": "overemphasized", + "overemphasises": "overemphasizes", + "overemphasising": "overemphasizing", + "oxidisation": "oxidization", + "oxidise": "oxidize", + "oxidised": "oxidized", + "oxidises": "oxidizes", + "oxidising": "oxidizing", + "paederast": "pederast", + "paederasts": "pederasts", + "paediatric": 
"pediatric", + "paediatrician": "pediatrician", + "paediatricians": "pediatricians", + "paediatrics": "pediatrics", + "paedophile": "pedophile", + "paedophiles": "pedophiles", + "paedophilia": "pedophilia", + "palaeolithic": "paleolithic", + "palaeontologist": "paleontologist", + "palaeontologists": "paleontologists", + "palaeontology": "paleontology", + "panelled": "paneled", + "panelling": "paneling", + "panellist": "panelist", + "panellists": "panelists", + "paralyse": "paralyze", + "paralysed": "paralyzed", + "paralyses": "paralyzes", + "paralysing": "paralyzing", + "parcelled": "parceled", + "parcelling": "parceling", + "parlour": "parlor", + "parlours": "parlors", + "particularise": "particularize", + "particularised": "particularized", + "particularises": "particularizes", + "particularising": "particularizing", + "passivisation": "passivization", + "passivise": "passivize", + "passivised": "passivized", + "passivises": "passivizes", + "passivising": "passivizing", + "pasteurisation": "pasteurization", + "pasteurise": "pasteurize", + "pasteurised": "pasteurized", + "pasteurises": "pasteurizes", + "pasteurising": "pasteurizing", + "patronise": "patronize", + "patronised": "patronized", + "patronises": "patronizes", + "patronising": "patronizing", + "patronisingly": "patronizingly", + "pedalled": "pedaled", + "pedalling": "pedaling", + "pedestrianisation": "pedestrianization", + "pedestrianise": "pedestrianize", + "pedestrianised": "pedestrianized", + "pedestrianises": "pedestrianizes", + "pedestrianising": "pedestrianizing", + "penalise": "penalize", + "penalised": "penalized", + "penalises": "penalizes", + "penalising": "penalizing", + "pencilled": "penciled", + "pencilling": "penciling", + "personalise": "personalize", + "personalised": "personalized", + "personalises": "personalizes", + "personalising": "personalizing", + "pharmacopoeia": "pharmacopeia", + "pharmacopoeias": "pharmacopeias", + "philosophise": "philosophize", + "philosophised": 
"philosophized", + "philosophises": "philosophizes", + "philosophising": "philosophizing", + "philtre": "filter", + "philtres": "filters", + "phoney ": "phony ", + "plagiarise": "plagiarize", + "plagiarised": "plagiarized", + "plagiarises": "plagiarizes", + "plagiarising": "plagiarizing", + "plough": "plow", + "ploughed": "plowed", + "ploughing": "plowing", + "ploughman": "plowman", + "ploughmen": "plowmen", + "ploughs": "plows", + "ploughshare": "plowshare", + "ploughshares": "plowshares", + "polarisation": "polarization", + "polarise": "polarize", + "polarised": "polarized", + "polarises": "polarizes", + "polarising": "polarizing", + "politicisation": "politicization", + "politicise": "politicize", + "politicised": "politicized", + "politicises": "politicizes", + "politicising": "politicizing", + "popularisation": "popularization", + "popularise": "popularize", + "popularised": "popularized", + "popularises": "popularizes", + "popularising": "popularizing", + "pouffe": "pouf", + "pouffes": "poufs", + "practise": "practice", + "practised": "practiced", + "practises": "practices", + "practising ": "practicing ", + "praesidium": "presidium", + "praesidiums ": "presidiums ", + "pressurisation": "pressurization", + "pressurise": "pressurize", + "pressurised": "pressurized", + "pressurises": "pressurizes", + "pressurising": "pressurizing", + "pretence": "pretense", + "pretences": "pretenses", + "primaeval": "primeval", + "prioritisation": "prioritization", + "prioritise": "prioritize", + "prioritised": "prioritized", + "prioritises": "prioritizes", + "prioritising": "prioritizing", + "privatisation": "privatization", + "privatisations": "privatizations", + "privatise": "privatize", + "privatised": "privatized", + "privatises": "privatizes", + "privatising": "privatizing", + "professionalisation": "professionalization", + "professionalise": "professionalize", + "professionalised": "professionalized", + "professionalises": "professionalizes", + "professionalising": 
"professionalizing", + "programme": "program", + "programmes": "programs", + "prologue": "prolog", + "prologues": "prologs", + "propagandise": "propagandize", + "propagandised": "propagandized", + "propagandises": "propagandizes", + "propagandising": "propagandizing", + "proselytise": "proselytize", + "proselytised": "proselytized", + "proselytiser": "proselytizer", + "proselytisers": "proselytizers", + "proselytises": "proselytizes", + "proselytising": "proselytizing", + "psychoanalyse": "psychoanalyze", + "psychoanalysed": "psychoanalyzed", + "psychoanalyses": "psychoanalyzes", + "psychoanalysing": "psychoanalyzing", + "publicise": "publicize", + "publicised": "publicized", + "publicises": "publicizes", + "publicising": "publicizing", + "pulverisation": "pulverization", + "pulverise": "pulverize", + "pulverised": "pulverized", + "pulverises": "pulverizes", + "pulverising": "pulverizing", + "pummelled": "pummel", + "pummelling": "pummeled", + "pyjama": "pajama", + "pyjamas": "pajamas", + "pzazz": "pizzazz", + "quarrelled": "quarreled", + "quarrelling": "quarreling", + "radicalise": "radicalize", + "radicalised": "radicalized", + "radicalises": "radicalizes", + "radicalising": "radicalizing", + "rancour": "rancor", + "randomise": "randomize", + "randomised": "randomized", + "randomises": "randomizes", + "randomising": "randomizing", + "rationalisation": "rationalization", + "rationalisations": "rationalizations", + "rationalise": "rationalize", + "rationalised": "rationalized", + "rationalises": "rationalizes", + "rationalising": "rationalizing", + "ravelled": "raveled", + "ravelling": "raveling", + "realisable": "realizable", + "realisation": "realization", + "realisations": "realizations", + "realise": "realize", + "realised": "realized", + "realises": "realizes", + "realising": "realizing", + "recognisable": "recognizable", + "recognisably": "recognizably", + "recognisance": "recognizance", + "recognise": "recognize", + "recognised": "recognized", + 
"recognises": "recognizes", + "recognising": "recognizing", + "reconnoitre": "reconnoiter", + "reconnoitred": "reconnoitered", + "reconnoitres": "reconnoiters", + "reconnoitring": "reconnoitering", + "refuelled": "refueled", + "refuelling": "refueling", + "regularisation": "regularization", + "regularise": "regularize", + "regularised": "regularized", + "regularises": "regularizes", + "regularising": "regularizing", + "remodelled": "remodeled", + "remodelling": "remodeling", + "remould": "remold", + "remoulded": "remolded", + "remoulding": "remolding", + "remoulds": "remolds", + "reorganisation": "reorganization", + "reorganisations": "reorganizations", + "reorganise": "reorganize", + "reorganised": "reorganized", + "reorganises": "reorganizes", + "reorganising": "reorganizing", + "revelled": "reveled", + "reveller": "reveler", + "revellers": "revelers", + "revelling": "reveling", + "revitalise": "revitalize", + "revitalised": "revitalized", + "revitalises": "revitalizes", + "revitalising": "revitalizing", + "revolutionise": "revolutionize", + "revolutionised": "revolutionized", + "revolutionises": "revolutionizes", + "revolutionising": "revolutionizing", + "rhapsodise": "rhapsodize", + "rhapsodised": "rhapsodized", + "rhapsodises": "rhapsodizes", + "rhapsodising": "rhapsodizing", + "rigour": "rigor", + "rigours": "rigors", + "ritualised": "ritualized", + "rivalled": "rivaled", + "rivalling": "rivaling", + "romanticise": "romanticize", + "romanticised": "romanticized", + "romanticises": "romanticizes", + "romanticising": "romanticizing", + "rumour": "rumor", + "rumoured": "rumored", + "rumours": "rumors", + "sabre": "saber", + "sabres": "sabers", + "saltpetre": "saltpeter", + "sanitise": "sanitize", + "sanitised": "sanitized", + "sanitises": "sanitizes", + "sanitising": "sanitizing", + "satirise": "satirize", + "satirised": "satirized", + "satirises": "satirizes", + "satirising": "satirizing", + "saviour": "savior", + "saviours": "saviors", + "savour": "savor", + 
"savoured": "savored", + "savouries": "savories", + "savouring": "savoring", + "savours": "savors", + "savoury": "savory", + "scandalise": "scandalize", + "scandalised": "scandalized", + "scandalises": "scandalizes", + "scandalising": "scandalizing", + "sceptic": "skeptic", + "sceptical": "skeptical", + "sceptically": "skeptically", + "scepticism": "skepticism", + "sceptics": "skeptics", + "sceptre": "scepter", + "sceptres": "scepters", + "scrutinise": "scrutinize", + "scrutinised": "scrutinized", + "scrutinises": "scrutinizes", + "scrutinising": "scrutinizing", + "secularisation": "secularization", + "secularise": "secularize", + "secularised": "secularized", + "secularises": "secularizes", + "secularising": "secularizing", + "sensationalise": "sensationalize", + "sensationalised": "sensationalized", + "sensationalises": "sensationalizes", + "sensationalising": "sensationalizing", + "sensitise": "sensitize", + "sensitised": "sensitized", + "sensitises": "sensitizes", + "sensitising": "sensitizing", + "sentimentalise": "sentimentalize", + "sentimentalised": "sentimentalized", + "sentimentalises": "sentimentalizes", + "sentimentalising": "sentimentalizing", + "sepulchre": "sepulcher", + "sepulchres": "sepulchers ", + "serialisation": "serialization", + "serialisations": "serializations", + "serialise": "serialize", + "serialised": "serialized", + "serialises": "serializes", + "serialising": "serializing", + "sermonise": "sermonize", + "sermonised": "sermonized", + "sermonises": "sermonizes", + "sermonising": "sermonizing", + "sheikh ": "sheik ", + "shovelled": "shoveled", + "shovelling": "shoveling", + "shrivelled": "shriveled", + "shrivelling": "shriveling", + "signalise": "signalize", + "signalised": "signalized", + "signalises": "signalizes", + "signalising": "signalizing", + "signalled": "signaled", + "signalling": "signaling", + "smoulder": "smolder", + "smouldered": "smoldered", + "smouldering": "smoldering", + "smoulders": "smolders", + "snivelled": 
"sniveled", + "snivelling": "sniveling", + "snorkelled": "snorkeled", + "snorkelling": "snorkeling", + "snowplough": "snowplow", + "snowploughs": "snowplow", + "socialisation": "socialization", + "socialise": "socialize", + "socialised": "socialized", + "socialises": "socializes", + "socialising": "socializing", + "sodomise": "sodomize", + "sodomised": "sodomized", + "sodomises": "sodomizes", + "sodomising": "sodomizing", + "solemnise": "solemnize", + "solemnised": "solemnized", + "solemnises": "solemnizes", + "solemnising": "solemnizing", + "sombre": "somber", + "specialisation": "specialization", + "specialisations": "specializations", + "specialise": "specialize", + "specialised": "specialized", + "specialises": "specializes", + "specialising": "specializing", + "spectre": "specter", + "spectres": "specters", + "spiralled": "spiraled", + "spiralling": "spiraling", + "splendour": "splendor", + "splendours": "splendors", + "squirrelled": "squirreled", + "squirrelling": "squirreling", + "stabilisation": "stabilization", + "stabilise": "stabilize", + "stabilised": "stabilized", + "stabiliser": "stabilizer", + "stabilisers": "stabilizers", + "stabilises": "stabilizes", + "stabilising": "stabilizing", + "standardisation": "standardization", + "standardise": "standardize", + "standardised": "standardized", + "standardises": "standardizes", + "standardising": "standardizing", + "stencilled": "stenciled", + "stencilling": "stenciling", + "sterilisation": "sterilization", + "sterilisations": "sterilizations", + "sterilise": "sterilize", + "sterilised": "sterilized", + "steriliser": "sterilizer", + "sterilisers": "sterilizers", + "sterilises": "sterilizes", + "sterilising": "sterilizing", + "stigmatisation": "stigmatization", + "stigmatise": "stigmatize", + "stigmatised": "stigmatized", + "stigmatises": "stigmatizes", + "stigmatising": "stigmatizing", + "storey": "story", + "storeys": "stories", + "subsidisation": "subsidization", + "subsidise": "subsidize", + 
"subsidised": "subsidized", + "subsidiser": "subsidizer", + "subsidisers": "subsidizers", + "subsidises": "subsidizes", + "subsidising": "subsidizing", + "succour": "succor", + "succoured": "succored", + "succouring": "succoring", + "succours": "succors", + "sulphate": "sulfate", + "sulphates": "sulfates", + "sulphide": "sulfide", + "sulphides": "sulfides", + "sulphur": "sulfur", + "sulphurous": "sulfurous", + "summarise": "summarize", + "summarised": "summarized", + "summarises": "summarizes", + "summarising": "summarizing", + "swivelled": "swiveled", + "swivelling": "swiveling", + "symbolise": "symbolize", + "symbolised": "symbolized", + "symbolises": "symbolizes", + "symbolising": "symbolizing", + "sympathise": "sympathize", + "sympathised": "sympathized", + "sympathiser": "sympathizer", + "sympathisers": "sympathizers", + "sympathises": "sympathizes", + "sympathising": "sympathizing", + "synchronisation": "synchronization", + "synchronise": "synchronize", + "synchronised": "synchronized", + "synchronises": "synchronizes", + "synchronising": "synchronizing", + "synthesise": "synthesize", + "synthesised": "synthesized", + "synthesiser": "synthesizer", + "synthesisers": "synthesizers", + "synthesises": "synthesizes", + "synthesising": "synthesizing", + "syphon": "siphon", + "syphoned": "siphoned", + "syphoning": "siphoning", + "syphons": "siphons", + "systematisation": "systematization", + "systematise": "systematize", + "systematised": "systematized", + "systematises": "systematizes", + "systematising": "systematizing", + "tantalise": "tantalize", + "tantalised": "tantalized", + "tantalises": "tantalizes", + "tantalising": "tantalizing", + "tantalisingly": "tantalizingly", + "tasselled": "tasseled", + "technicolour": "technicolor", + "temporise": "temporize", + "temporised": "temporized", + "temporises": "temporizes", + "temporising": "temporizing", + "tenderise": "tenderize", + "tenderised": "tenderized", + "tenderises": "tenderizes", + "tenderising": 
"tenderizing", + "terrorise": "terrorize", + "terrorised": "terrorized", + "terrorises": "terrorizes", + "terrorising": "terrorizing", + "theatre": "theater", + "theatregoer": "theatergoer", + "theatregoers": "theatergoers", + "theatres": "theaters", + "theorise": "theorize", + "theorised": "theorized", + "theorises": "theorizes", + "theorising": "theorizing", + "tonne": "ton", + "tonnes": "tons", + "towelled": "toweled", + "towelling": "toweling", + "toxaemia": "toxemia", + "tranquillise": "tranquilize", + "tranquillised": "tranquilized", + "tranquilliser": "tranquilizer", + "tranquillisers": "tranquilizers", + "tranquillises": "tranquilizes", + "tranquillising": "tranquilizing", + "tranquillity": "tranquility", + "tranquillize": "tranquilize", + "tranquillized": "tranquilized", + "tranquillizer": "tranquilizer", + "tranquillizers": "tranquilizers", + "tranquillizes": "tranquilizes", + "tranquillizing": "tranquilizing", + "tranquilly": "tranquility", + "transistorised": "transistorized", + "traumatise": "traumatize", + "traumatised": "traumatized", + "traumatises": "traumatizes", + "traumatising": "traumatizing", + "travelled": "traveled", + "traveller": "traveler", + "travellers": "travelers", + "travelling": "traveling", + "travelogue": "travelog", + "travelogues ": "travelogs ", + "trialled": "trialed", + "trialling": "trialing", + "tricolour": "tricolor", + "tricolours": "tricolors", + "trivialise": "trivialize", + "trivialised": "trivialized", + "trivialises": "trivializes", + "trivialising": "trivializing", + "tumour": "tumor", + "tumours": "tumors", + "tunnelled": "tunneled", + "tunnelling": "tunneling", + "tyrannise": "tyrannize", + "tyrannised": "tyrannized", + "tyrannises": "tyrannizes", + "tyrannising": "tyrannizing", + "tyre": "tire", + "tyres": "tires", + "unauthorised": "unauthorized", + "uncivilised": "uncivilized", + "underutilised": "underutilized", + "unequalled": "unequaled", + "unfavourable": "unfavorable", + "unfavourably": "unfavorably", + 
"unionisation": "unionization", + "unionise": "unionize", + "unionised": "unionized", + "unionises": "unionizes", + "unionising": "unionizing", + "unorganised": "unorganized", + "unravelled": "unraveled", + "unravelling": "unraveling", + "unrecognisable": "unrecognizable", + "unrecognised": "unrecognized", + "unrivalled": "unrivaled", + "unsavoury": "unsavory", + "untrammelled": "untrammeled", + "urbanisation": "urbanization", + "urbanise": "urbanize", + "urbanised": "urbanized", + "urbanises": "urbanizes", + "urbanising": "urbanizing", + "utilisable": "utilizable", + "utilisation": "utilization", + "utilise": "utilize", + "utilised": "utilized", + "utilises": "utilizes", + "utilising": "utilizing", + "valour": "valor", + "vandalise": "vandalize", + "vandalised": "vandalized", + "vandalises": "vandalizes", + "vandalising": "vandalizing", + "vaporisation": "vaporization", + "vaporise": "vaporize", + "vaporised": "vaporized", + "vaporises": "vaporizes", + "vaporising": "vaporizing", + "vapour": "vapor", + "vapours": "vapors", + "verbalise": "verbalize", + "verbalised": "verbalized", + "verbalises": "verbalizes", + "verbalising": "verbalizing", + "victimisation": "victimization", + "victimise": "victimize", + "victimised": "victimized", + "victimises": "victimizes", + "victimising": "victimizing", + "videodisc": "videodisk", + "videodiscs": "videodisks", + "vigour": "vigor", + "visualisation": "visualization", + "visualisations": "visualizations", + "visualise": "visualize", + "visualised": "visualized", + "visualises": "visualizes", + "visualising": "visualizing", + "vocalisation": "vocalization", + "vocalisations": "vocalizations", + "vocalise": "vocalize", + "vocalised": "vocalized", + "vocalises": "vocalizes", + "vocalising": "vocalizing", + "vulcanised": "vulcanized", + "vulgarisation": "vulgarization", + "vulgarise": "vulgarize", + "vulgarised": "vulgarized", + "vulgarises": "vulgarizes", + "vulgarising": "vulgarizing", + "waggon": "wagon", + "waggons": 
"wagons", + "watercolour": "watercolor", + "watercolours": "watercolors", + "weaselled": "weaseled", + "weaselling": "weaseling", + "westernisation": "westernization", + "westernise": "westernize", + "westernised": "westernized", + "westernises": "westernizes", + "westernising": "westernizing", + "womanise": "womanize", + "womanised": "womanized", + "womaniser": "womanizer", + "womanisers": "womanizers", + "womanises": "womanizes", + "womanising": "womanizing", + "woollen": "woolen", + "woollens": "woolens", + "woollies": "woolies", + "woolly": "wooly", + "worshipped ": "worshiped", + "worshipping ": "worshiping ", + "worshipper": "worshiper", + "yodelled": "yodeled", + "yodelling": "yodeling", + "yoghourt": "yogurt", + "yoghourts": "yogurts", + "yoghurt": "yogurt", + "yoghurts": "yogurts" +} + + +for string, norm in _exc.items(): + _exc[string.title()] = norm + + +NORM_EXCEPTIONS = _exc From 43353b5413285ff9409978c5336d88b108d60ca2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Jun 2017 13:28:20 -0500 Subject: [PATCH 21/28] Improve train CLI script --- spacy/cli/train.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index a2c06c571..bc0664917 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -28,15 +28,17 @@ from .. import displacy n_iter=("number of iterations", "option", "n", int), n_sents=("number of sentences", "option", "ns", int), use_gpu=("Use GPU", "flag", "G", bool), + resume=("Whether to resume training", "flag", "R", bool), no_tagger=("Don't train tagger", "flag", "T", bool), no_parser=("Don't train parser", "flag", "P", bool), no_entities=("Don't train NER", "flag", "N", bool) ) def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, - use_gpu=False, no_tagger=False, no_parser=False, no_entities=False): + use_gpu=False, resume=False, no_tagger=False, no_parser=False, no_entities=False): """ Train a model. 
Expects data in spaCy's JSON format. """ + util.set_env_log(True) n_sents = n_sents or None output_path = util.ensure_path(output_dir) train_path = util.ensure_path(train_data) @@ -66,7 +68,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, util.env_opt('batch_to', 64), util.env_opt('batch_compound', 1.001)) - nlp = lang_class(pipeline=pipeline) + if resume: + prints(output_path / 'model19.pickle', title="Resuming training") + nlp = dill.load((output_path / 'model19.pickle').open('rb')) + else: + nlp = lang_class(pipeline=pipeline) corpus = GoldCorpus(train_path, dev_path, limit=n_sents) n_train_docs = corpus.count_train() @@ -75,6 +81,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") try: for i in range(n_iter): + if resume: + i += 20 with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar: train_docs = corpus.train_docs(nlp, projectivize=True, gold_preproc=False, max_length=0) @@ -86,14 +94,18 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, pbar.update(len(docs)) with nlp.use_params(optimizer.averages): + util.set_env_log(False) + epoch_model_path = output_path / ('model%d' % i) + nlp.to_disk(epoch_model_path) with (output_path / ('model%d.pickle' % i)).open('wb') as file_: dill.dump(nlp, file_, -1) - with (output_path / ('model%d.bin' % i)).open('wb') as file_: - file_.write(nlp.to_bytes()) - with (output_path / ('model%d.bin' % i)).open('rb') as file_: - nlp_loaded = lang_class(pipeline=pipeline) - nlp_loaded.from_bytes(file_.read()) - scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False)) + nlp_loaded = lang_class(pipeline=pipeline) + nlp_loaded = nlp_loaded.from_disk(epoch_model_path) + scorer = nlp_loaded.evaluate( + corpus.dev_docs( + nlp_loaded, + gold_preproc=False)) + util.set_env_log(True) print_progress(i, losses, scorer.scores) finally: print("Saving model...") 
From e62f46d39f2ec9098f68c4df52f0690119d7930d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Jun 2017 13:28:52 -0500 Subject: [PATCH 22/28] Clarify gold.pyx slightly --- spacy/gold.pyx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index de48501fb..a16dc1f2a 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -211,7 +211,7 @@ class GoldCorpus(object): def dev_docs(self, nlp, gold_preproc=False): gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc) - gold_docs = nlp.preprocess_gold(gold_docs) + #gold_docs = nlp.preprocess_gold(gold_docs) yield from gold_docs @classmethod @@ -226,7 +226,7 @@ class GoldCorpus(object): gold_preproc) golds = cls._make_golds(docs, paragraph_tuples) for doc, gold in zip(docs, golds): - if not max_length or len(doc) < max_length: + if (not max_length) or len(doc) < max_length: yield doc, gold @classmethod @@ -234,17 +234,17 @@ class GoldCorpus(object): if raw_text is not None: return [nlp.make_doc(raw_text)] else: - return [Doc(nlp.vocab, words=sent_tuples[0][1]) - for sent_tuples in paragraph_tuples] + return [Doc(nlp.vocab, words=sent_tuples[1]) + for (sent_tuples, brackets) in paragraph_tuples] @classmethod def _make_golds(cls, docs, paragraph_tuples): + assert len(docs) == len(paragraph_tuples) if len(docs) == 1: - return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0]) - for sent_tuples in paragraph_tuples] + return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])] else: - return [GoldParse.from_annot_tuples(doc, sent_tuples[0]) - for doc, sent_tuples in zip(docs, paragraph_tuples)] + return [GoldParse.from_annot_tuples(doc, sent_tuples) + for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)] @staticmethod def walk_corpus(path): From 805495af279cba209996432617fc0684982cbb4a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Jun 2017 13:29:23 -0500 Subject: [PATCH 23/28] Fix off-by-one in number of 
tags --- spacy/morphology.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 50bec3115..b79fcaeef 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -38,7 +38,7 @@ cdef class Morphology: self.strings = string_store self.tag_map = {} self.lemmatizer = lemmatizer - self.n_tags = len(tag_map) + 1 + self.n_tags = len(tag_map) self.tag_names = tuple(sorted(tag_map.keys())) self.reverse_index = {} From fea1144e6dafef70093c2f92b1c803bf1aa5c2d7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Jun 2017 13:31:33 -0500 Subject: [PATCH 24/28] Set max batch size in evaluate --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 394919dcf..acbf169b4 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -301,7 +301,7 @@ class Language(object): def evaluate(self, docs_golds): docs, golds = zip(*docs_golds) scorer = Scorer() - for doc, gold in zip(self.pipe(docs), golds): + for doc, gold in zip(self.pipe(docs, batch_size=32), golds): scorer.score(doc, gold) doc.tensor = None return scorer From 5bd311c77ed90f929b895ebf8aa419c5d2499179 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 20:54:09 +0200 Subject: [PATCH 25/28] Fix update of norm exceptions --- spacy/lang/en/norm_exceptions.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/lang/en/norm_exceptions.py b/spacy/lang/en/norm_exceptions.py index ec106b960..c5f7baad5 100644 --- a/spacy/lang/en/norm_exceptions.py +++ b/spacy/lang/en/norm_exceptions.py @@ -1754,8 +1754,7 @@ _exc = { } +NORM_EXCEPTIONS = {} + for string, norm in _exc.items(): - _exc[string.title()] = norm - - -NORM_EXCEPTIONS = _exc + NORM_EXCEPTIONS[string.title()] = norm From 0d6fa8b241d1d29a99a0e12015a7fadaec217cf5 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 20:54:18 +0200 Subject: [PATCH 26/28] Add German norm exceptions --- 
spacy/lang/de/__init__.py | 8 ++++++-- spacy/lang/de/norm_exceptions.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 spacy/lang/de/norm_exceptions.py diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index fa957a6f5..0a161e80e 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -2,21 +2,25 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .norm_exceptions import NORM_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lemmatizer import LOOKUP from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS from ...language import Language from ...lemmatizerlookup import Lemmatizer -from ...attrs import LANG -from ...util import update_exc +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups class GermanDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'de' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], + BASE_NORMS, NORM_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = dict(TAG_MAP) diff --git a/spacy/lang/de/norm_exceptions.py b/spacy/lang/de/norm_exceptions.py new file mode 100644 index 000000000..6116aa9be --- /dev/null +++ b/spacy/lang/de/norm_exceptions.py @@ -0,0 +1,17 @@ +# coding: utf8 +from __future__ import unicode_literals + +# Here we only want to include the absolute most common words. Otherwise, +# this list would get impossibly long for German – especially considering the +# old vs. new spelling rules, and all possible cases. 
+ + +_exc = { + "daß": "dass" +} + + +NORM_EXCEPTIONS = {} + +for string, norm in _exc.items(): + NORM_EXCEPTIONS[string.title()] = norm From d77c2cc8bb9d9e3c73cb30e3bd766d73f7308865 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 20:59:50 +0200 Subject: [PATCH 27/28] Add tests for English norm exceptions --- spacy/tests/lang/en/test_exceptions.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index a49c0c421..736f760d7 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -102,3 +102,16 @@ def test_en_tokenizer_handles_times(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 assert tokens[1].lemma_ in ["a.m.", "p.m."] + + +@pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])]) +def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms): + tokens = en_tokenizer(text) + assert [token.norm_ for token in tokens] == norms + + +@pytest.mark.xfail +@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")]) +def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm): + tokens = en_tokenizer(text) + assert tokens[0].norm_ == norm From e47eef5e034b645a868a812b64874547cf267a76 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 3 Jun 2017 21:07:44 +0200 Subject: [PATCH 28/28] Update German tokenizer exceptions and tests --- spacy/lang/de/tokenizer_exceptions.py | 170 ++++++++++++------------- spacy/tests/lang/de/test_exceptions.py | 19 ++- 2 files changed, 101 insertions(+), 88 deletions(-) diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py index 080311f4e..184d88104 100644 --- a/spacy/lang/de/tokenizer_exceptions.py +++ b/spacy/lang/de/tokenizer_exceptions.py @@ -8,7 +8,7 @@ from ...deprecated import PRON_LEMMA _exc = { "auf'm": [ {ORTH: "auf", LEMMA: "auf"}, - {ORTH: "'m", LEMMA: 
"der", NORM: "dem" }], + {ORTH: "'m", LEMMA: "der", NORM: "dem"}], "du's": [ {ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"}, @@ -53,97 +53,97 @@ _exc = { for exc_data in [ - {ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"}, - {ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"}, - {ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"}, + {ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, + {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, + {ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, + {ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, {ORTH: "'n", LEMMA: "ein", NORM: "ein"}, {ORTH: "'ne", LEMMA: "eine", NORM: "eine"}, {ORTH: "'nen", LEMMA: "ein", NORM: "einen"}, {ORTH: "'nem", LEMMA: "ein", NORM: "einem"}, - {ORTH: "Abb.", LEMMA: "Abbildung"}, - {ORTH: "Abk.", LEMMA: "AbkĂŒrzung"}, - {ORTH: "Abt.", LEMMA: "Abteilung"}, - {ORTH: "Apr.", LEMMA: "April"}, - {ORTH: "Aug.", LEMMA: "August"}, - {ORTH: "Bd.", LEMMA: "Band"}, - {ORTH: "Betr.", LEMMA: "Betreff"}, - {ORTH: "Bf.", LEMMA: "Bahnhof"}, - {ORTH: "Bhf.", LEMMA: "Bahnhof"}, - {ORTH: "Bsp.", LEMMA: "Beispiel"}, - {ORTH: "Dez.", LEMMA: "Dezember"}, - {ORTH: "Di.", LEMMA: "Dienstag"}, - {ORTH: "Do.", LEMMA: "Donnerstag"}, - {ORTH: "Fa.", LEMMA: "Firma"}, - {ORTH: "Fam.", LEMMA: "Familie"}, - {ORTH: "Feb.", LEMMA: "Februar"}, - {ORTH: "Fr.", LEMMA: "Frau"}, - {ORTH: "Frl.", LEMMA: "FrĂ€ulein"}, - {ORTH: "Hbf.", LEMMA: "Hauptbahnhof"}, - {ORTH: "Hr.", LEMMA: "Herr"}, - {ORTH: "Hrn.", LEMMA: "Herr"}, - {ORTH: "Jan.", LEMMA: "Januar"}, - {ORTH: "Jh.", LEMMA: "Jahrhundert"}, - {ORTH: "Jhd.", LEMMA: "Jahrhundert"}, - {ORTH: "Jul.", LEMMA: "Juli"}, - {ORTH: "Jun.", LEMMA: "Juni"}, - {ORTH: "Mi.", LEMMA: "Mittwoch"}, - {ORTH: "Mio.", LEMMA: "Million"}, - {ORTH: "Mo.", LEMMA: "Montag"}, - {ORTH: "Mrd.", LEMMA: "Milliarde"}, - {ORTH: "Mrz.", LEMMA: "MĂ€rz"}, - {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"}, - {ORTH: "MĂ€r.", LEMMA: "MĂ€rz"}, - {ORTH: "Nov.", LEMMA: 
"November"}, - {ORTH: "Nr.", LEMMA: "Nummer"}, - {ORTH: "Okt.", LEMMA: "Oktober"}, - {ORTH: "Orig.", LEMMA: "Original"}, - {ORTH: "Pkt.", LEMMA: "Punkt"}, - {ORTH: "Prof.", LEMMA: "Professor"}, - {ORTH: "Red.", LEMMA: "Redaktion"}, - {ORTH: "Sa.", LEMMA: "Samstag"}, - {ORTH: "Sep.", LEMMA: "September"}, - {ORTH: "Sept.", LEMMA: "September"}, - {ORTH: "So.", LEMMA: "Sonntag"}, - {ORTH: "Std.", LEMMA: "Stunde"}, - {ORTH: "Str.", LEMMA: "Straße"}, - {ORTH: "Tel.", LEMMA: "Telefon"}, - {ORTH: "Tsd.", LEMMA: "Tausend"}, - {ORTH: "Univ.", LEMMA: "UniversitĂ€t"}, - {ORTH: "abzgl.", LEMMA: "abzĂŒglich"}, - {ORTH: "allg.", LEMMA: "allgemein"}, - {ORTH: "bspw.", LEMMA: "beispielsweise"}, - {ORTH: "bzgl.", LEMMA: "bezĂŒglich"}, - {ORTH: "bzw.", LEMMA: "beziehungsweise"}, + {ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"}, + {ORTH: "Abk.", LEMMA: "AbkĂŒrzung", NORM: "AbkĂŒrzung"}, + {ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"}, + {ORTH: "Apr.", LEMMA: "April", NORM: "April"}, + {ORTH: "Aug.", LEMMA: "August", NORM: "August"}, + {ORTH: "Bd.", LEMMA: "Band", NORM: "Band"}, + {ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"}, + {ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"}, + {ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"}, + {ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"}, + {ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"}, + {ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"}, + {ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"}, + {ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"}, + {ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"}, + {ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"}, + {ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"}, + {ORTH: "Frl.", LEMMA: "FrĂ€ulein", NORM: "FrĂ€ulein"}, + {ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"}, + {ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"}, + {ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"}, + {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}, + {ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"}, + 
{ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"}, + {ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"}, + {ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"}, + {ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"}, + {ORTH: "Mio.", LEMMA: "Million", NORM: "Million"}, + {ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"}, + {ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"}, + {ORTH: "Mrz.", LEMMA: "MĂ€rz", NORM: "MĂ€rz"}, + {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"}, + {ORTH: "MĂ€r.", LEMMA: "MĂ€rz", NORM: "MĂ€rz"}, + {ORTH: "Nov.", LEMMA: "November", NORM: "November"}, + {ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"}, + {ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"}, + {ORTH: "Orig.", LEMMA: "Original", NORM: "Original"}, + {ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"}, + {ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"}, + {ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"}, + {ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"}, + {ORTH: "Sep.", LEMMA: "September", NORM: "September"}, + {ORTH: "Sept.", LEMMA: "September", NORM: "September"}, + {ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"}, + {ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"}, + {ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"}, + {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"}, + {ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"}, + {ORTH: "Univ.", LEMMA: "UniversitĂ€t", NORM: "UniversitĂ€t"}, + {ORTH: "abzgl.", LEMMA: "abzĂŒglich", NORM: "abzĂŒglich"}, + {ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"}, + {ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"}, + {ORTH: "bzgl.", LEMMA: "bezĂŒglich", NORM: "bezĂŒglich"}, + {ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"}, {ORTH: "d.h.", LEMMA: "das heißt"}, - {ORTH: "dgl.", LEMMA: "dergleichen"}, - {ORTH: "ebd.", LEMMA: "ebenda"}, - {ORTH: "eigtl.", LEMMA: "eigentlich"}, - {ORTH: "engl.", LEMMA: "englisch"}, - {ORTH: "evtl.", LEMMA: "eventuell"}, - {ORTH: "frz.", LEMMA: "französisch"}, - {ORTH: 
"gegr.", LEMMA: "gegrĂŒndet"}, - {ORTH: "ggf.", LEMMA: "gegebenenfalls"}, - {ORTH: "ggfs.", LEMMA: "gegebenenfalls"}, - {ORTH: "ggĂŒ.", LEMMA: "gegenĂŒber"}, + {ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"}, + {ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"}, + {ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"}, + {ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"}, + {ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"}, + {ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"}, + {ORTH: "gegr.", LEMMA: "gegrĂŒndet", NORM: "gegrĂŒndet"}, + {ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"}, + {ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"}, + {ORTH: "ggĂŒ.", LEMMA: "gegenĂŒber", NORM: "gegenĂŒber"}, {ORTH: "i.O.", LEMMA: "in Ordnung"}, {ORTH: "i.d.R.", LEMMA: "in der Regel"}, - {ORTH: "incl.", LEMMA: "inklusive"}, - {ORTH: "inkl.", LEMMA: "inklusive"}, - {ORTH: "insb.", LEMMA: "insbesondere"}, - {ORTH: "kath.", LEMMA: "katholisch"}, - {ORTH: "lt.", LEMMA: "laut"}, - {ORTH: "max.", LEMMA: "maximal"}, - {ORTH: "min.", LEMMA: "minimal"}, - {ORTH: "mind.", LEMMA: "mindestens"}, - {ORTH: "mtl.", LEMMA: "monatlich"}, + {ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"}, + {ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"}, + {ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"}, + {ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"}, + {ORTH: "lt.", LEMMA: "laut", NORM: "laut"}, + {ORTH: "max.", LEMMA: "maximal", NORM: "maximal"}, + {ORTH: "min.", LEMMA: "minimal", NORM: "minimal"}, + {ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"}, + {ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"}, {ORTH: "n.Chr.", LEMMA: "nach Christus"}, - {ORTH: "orig.", LEMMA: "original"}, - {ORTH: "röm.", LEMMA: "römisch"}, + {ORTH: "orig.", LEMMA: "original", NORM: "original"}, + {ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"}, {ORTH: "s.o.", LEMMA: "siehe oben"}, {ORTH: "sog.", LEMMA: "so genannt"}, {ORTH: 
"stellv.", LEMMA: "stellvertretend"}, - {ORTH: "tĂ€gl.", LEMMA: "tĂ€glich"}, + {ORTH: "tĂ€gl.", LEMMA: "tĂ€glich", NORM: "tĂ€glich"}, {ORTH: "u.U.", LEMMA: "unter UmstĂ€nden"}, {ORTH: "u.s.w.", LEMMA: "und so weiter"}, {ORTH: "u.v.m.", LEMMA: "und vieles mehr"}, @@ -153,9 +153,9 @@ for exc_data in [ {ORTH: "v.Chr.", LEMMA: "vor Christus"}, {ORTH: "v.a.", LEMMA: "vor allem"}, {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}, - {ORTH: "vgl.", LEMMA: "vergleiche"}, - {ORTH: "vllt.", LEMMA: "vielleicht"}, - {ORTH: "vlt.", LEMMA: "vielleicht"}, + {ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"}, + {ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"}, + {ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"}, {ORTH: "z.B.", LEMMA: "zum Beispiel"}, {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}, {ORTH: "z.T.", LEMMA: "zum Teil"}, @@ -163,7 +163,7 @@ for exc_data in [ {ORTH: "z.Zt.", LEMMA: "zur Zeit"}, {ORTH: "z.b.", LEMMA: "zum Beispiel"}, {ORTH: "zzgl.", LEMMA: "zuzĂŒglich"}, - {ORTH: "österr.", LEMMA: "österreichisch"}]: + {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]: _exc[exc_data[ORTH]] = [dict(exc_data)] diff --git a/spacy/tests/lang/de/test_exceptions.py b/spacy/tests/lang/de/test_exceptions.py index 13da3dc33..f7db648c9 100644 --- a/spacy/tests/lang/de/test_exceptions.py +++ b/spacy/tests/lang/de/test_exceptions.py @@ -8,20 +8,33 @@ import pytest @pytest.mark.parametrize('text', ["auf'm", "du's", "ĂŒber'm", "wir's"]) -def test_tokenizer_splits_contractions(de_tokenizer, text): +def test_de_tokenizer_splits_contractions(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 2 @pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]) -def test_tokenizer_handles_abbr(de_tokenizer, text): +def test_de_tokenizer_handles_abbr(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 1 -def test_tokenizer_handles_exc_in_text(de_tokenizer): +def test_de_tokenizer_handles_exc_in_text(de_tokenizer): 
text = "Ich bin z.Zt. im Urlaub." tokens = de_tokenizer(text) assert len(tokens) == 6 assert tokens[2].text == "z.Zt." assert tokens[2].lemma_ == "zur Zeit" + + +@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])]) +def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms): + tokens = de_tokenizer(text) + assert [token.norm_ for token in tokens] == norms + + +@pytest.mark.xfail +@pytest.mark.parametrize('text,norm', [("daß", "dass")]) +def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm): + tokens = de_tokenizer(text) + assert tokens[0].norm_ == norm