From 493c77462a236fae204920e8a3fa22d70833d2fc Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Mon, 6 Apr 2020 18:46:51 +0200
Subject: [PATCH 01/69] issue5230: test cases
covering known sources of resource warnings
---
spacy/tests/regression/test_issue5230.py | 112 +++++++++++++++++++++++
1 file changed, 112 insertions(+)
create mode 100644 spacy/tests/regression/test_issue5230.py
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
new file mode 100644
index 000000000..e3d7c7e82
--- /dev/null
+++ b/spacy/tests/regression/test_issue5230.py
@@ -0,0 +1,112 @@
+import warnings
+
+import numpy
+import pytest
+import srsly
+
+from spacy.kb import KnowledgeBase
+from spacy.vectors import Vectors
+from spacy.language import Language
+from spacy.pipeline import Pipe
+from spacy.tests.util import make_tempdir
+
+
+@pytest.mark.xfail
+def test_language_to_disk_resource_warning():
+ nlp = Language()
+ with make_tempdir() as d:
+ with warnings.catch_warnings(record=True) as w:
+ # catch only warnings raised in spacy.language since there may be others from other components or pipelines
+ warnings.filterwarnings(
+ "always", module="spacy.language", category=ResourceWarning
+ )
+ nlp.to_disk(d)
+ assert len(w) == 0
+
+
+@pytest.mark.xfail
+def test_vectors_to_disk_resource_warning():
+ data = numpy.zeros((3, 300), dtype="f")
+ keys = ["cat", "dog", "rat"]
+ vectors = Vectors(data=data, keys=keys)
+ with make_tempdir() as d:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always", category=ResourceWarning)
+ vectors.to_disk(d)
+ assert len(w) == 0
+
+
+@pytest.mark.xfail
+def test_custom_pipes_to_disk_resource_warning():
+ # create dummy pipe partially implementing interface -- only want to test to_disk
+ class SerializableDummy(object):
+ def __init__(self, **cfg):
+ if cfg:
+ self.cfg = cfg
+ else:
+ self.cfg = None
+ super(SerializableDummy, self).__init__()
+
+ def to_bytes(self, exclude=tuple(), disable=None, **kwargs):
+ return srsly.msgpack_dumps({"dummy": srsly.json_dumps(None)})
+
+ def from_bytes(self, bytes_data, exclude):
+ return self
+
+ def to_disk(self, path, exclude=tuple(), **kwargs):
+ pass
+
+ def from_disk(self, path, exclude=tuple(), **kwargs):
+ return self
+
+ class MyPipe(Pipe):
+ def __init__(self, vocab, model=True, **cfg):
+ if cfg:
+ self.cfg = cfg
+ else:
+ self.cfg = None
+ self.model = SerializableDummy()
+ self.vocab = SerializableDummy()
+
+ pipe = MyPipe(None)
+ with make_tempdir() as d:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always", category=ResourceWarning)
+ pipe.to_disk(d)
+ assert len(w) == 0
+
+
+@pytest.mark.xfail
+def test_tagger_to_disk_resource_warning():
+ nlp = Language()
+ nlp.add_pipe(nlp.create_pipe("tagger"))
+ tagger = nlp.get_pipe("tagger")
+ # need to add model for two reasons:
+ # 1. no model leads to error in serialization,
+ # 2. the affected line is the one for model serialization
+ tagger.begin_training(pipeline=nlp.pipeline)
+
+ with make_tempdir() as d:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always", category=ResourceWarning)
+ tagger.to_disk(d)
+ assert len(w) == 0
+
+
+@pytest.mark.xfail
+def test_entity_linker_to_disk_resource_warning():
+ nlp = Language()
+ nlp.add_pipe(nlp.create_pipe("entity_linker"))
+ entity_linker = nlp.get_pipe("entity_linker")
+ # need to add model for two reasons:
+ # 1. no model leads to error in serialization,
+ # 2. the affected line is the one for model serialization
+ kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ entity_linker.set_kb(kb)
+ entity_linker.begin_training(pipeline=nlp.pipeline)
+
+ with make_tempdir() as d:
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings("always", category=ResourceWarning)
+ entity_linker.to_disk(d)
+ assert len(w) == 0
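For readers following along, here is a minimal, self-contained sketch (not part of the patch) of the mechanism these tests rely on: a file handle that is opened but never closed triggers a ResourceWarning when the file object is finalized, and `warnings.catch_warnings(record=True)` combined with `filterwarnings("always", category=ResourceWarning)` records it. The temporary file name is just a placeholder.

```python
import gc
import tempfile
import warnings

# create a throwaway file to write to
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    path = tmp.name

with warnings.catch_warnings(record=True) as caught:
    warnings.filterwarnings("always", category=ResourceWarning)
    open(path, "w").write("data")  # handle is never closed explicitly
    gc.collect()                   # force finalization so the warning is emitted

assert any(issubclass(w.category, ResourceWarning) for w in caught)
```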
From 1cd975d4a5cf50eb5a2b16a30e8b520c7778af40 Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Mon, 6 Apr 2020 18:54:32 +0200
Subject: [PATCH 02/69] issue5230: fixed resource warnings in language
---
spacy/language.py | 5 ++---
spacy/tests/regression/test_issue5230.py | 1 -
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/spacy/language.py b/spacy/language.py
index 56619080d..0eb062eae 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -903,9 +903,8 @@ class Language(object):
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
p, exclude=["vocab"]
)
- serializers["meta.json"] = lambda p: p.open("w").write(
- srsly.json_dumps(self.meta)
- )
+ serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
+
for name, proc in self.pipeline:
if not hasattr(proc, "name"):
continue
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index e3d7c7e82..be84875e7 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -11,7 +11,6 @@ from spacy.pipeline import Pipe
from spacy.tests.util import make_tempdir
-@pytest.mark.xfail
def test_language_to_disk_resource_warning():
nlp = Language()
with make_tempdir() as d:
From 273ed452bb4ba148d491dcec4b321a6293bdcd30 Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Mon, 6 Apr 2020 19:22:32 +0200
Subject: [PATCH 03/69] issue5230: added unicode declaration at top of the file
---
spacy/tests/regression/test_issue5230.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index be84875e7..9cfa3fc05 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -1,3 +1,4 @@
+# coding: utf8
import warnings
import numpy
From 71cc903d65b8946a4c6cd04cb2ca38b8a19eb5c4 Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Mon, 6 Apr 2020 20:30:41 +0200
Subject: [PATCH 04/69] issue5230: replaced open statements on path objects so
 that serialization still works and files are closed
---
spacy/pipeline/pipes.pyx | 6 +++---
spacy/tests/regression/test_issue5230.py | 4 ----
spacy/vectors.pyx | 10 +++++++++-
3 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index a20c9b6df..ce95b2752 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -202,7 +202,7 @@ class Pipe(object):
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
if self.model not in (None, True, False):
- serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
+ serialize["model"] = self.model.to_disk
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
@@ -625,7 +625,7 @@ class Tagger(Pipe):
serialize = OrderedDict((
("vocab", lambda p: self.vocab.to_disk(p)),
("tag_map", lambda p: srsly.write_msgpack(p, tag_map)),
- ("model", lambda p: p.open("wb").write(self.model.to_bytes())),
+ ("model", self.model.to_disk),
("cfg", lambda p: srsly.write_json(p, self.cfg))
))
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
@@ -1394,7 +1394,7 @@ class EntityLinker(Pipe):
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["kb"] = lambda p: self.kb.dump(p)
if self.model not in (None, True, False):
- serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
+ serialize["model"] = self.model.to_disk
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 9cfa3fc05..716a4624b 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -24,7 +24,6 @@ def test_language_to_disk_resource_warning():
assert len(w) == 0
-@pytest.mark.xfail
def test_vectors_to_disk_resource_warning():
data = numpy.zeros((3, 300), dtype="f")
keys = ["cat", "dog", "rat"]
@@ -36,7 +35,6 @@ def test_vectors_to_disk_resource_warning():
assert len(w) == 0
-@pytest.mark.xfail
def test_custom_pipes_to_disk_resource_warning():
# create dummy pipe partially implementing interface -- only want to test to_disk
class SerializableDummy(object):
@@ -76,7 +74,6 @@ def test_custom_pipes_to_disk_resource_warning():
assert len(w) == 0
-@pytest.mark.xfail
def test_tagger_to_disk_resource_warning():
nlp = Language()
nlp.add_pipe(nlp.create_pipe("tagger"))
@@ -93,7 +90,6 @@ def test_tagger_to_disk_resource_warning():
assert len(w) == 0
-@pytest.mark.xfail
def test_entity_linker_to_disk_resource_warning():
nlp = Language()
nlp.add_pipe(nlp.create_pipe("entity_linker"))
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index f3c20fb7f..62d176c6c 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -376,8 +376,16 @@ cdef class Vectors:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
else:
save_array = lambda arr, file_: xp.save(file_, arr)
+
+ def save_vectors(path):
+ # the source of numpy.save indicates that the file object is closed after use.
+ # but it seems that somehow this does not happen, as ResourceWarnings are raised here.
+ # in order to not rely on this, wrap in context manager.
+ with path.open("wb") as _file:
+ save_array(self.data, _file)
+
serializers = OrderedDict((
- ("vectors", lambda p: save_array(self.data, p.open("wb"))),
+ ("vectors", save_vectors),
("key2row", lambda p: srsly.write_msgpack(p, self.key2row))
))
return util.to_disk(path, serializers, [])
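To make the change in `vectors.pyx` concrete, here is an illustrative sketch (assuming only that numpy is installed; the output path is a placeholder) contrasting the pattern the patch removes with the one it introduces. Handing `numpy.save` a file object obtained from `path.open("wb")` and never closing it leaves cleanup to the garbage collector, which is what produced the ResourceWarnings.

```python
import numpy
from pathlib import Path

data = numpy.zeros((3, 4), dtype="f")
path = Path("vectors.npy")

# leaky pattern: the file object returned by open() is never explicitly closed
numpy.save(path.open("wb"), data, allow_pickle=False)

# fixed pattern: the context manager guarantees the handle is closed
with path.open("wb") as file_:
    numpy.save(file_, data, allow_pickle=False)
```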
From cde96f6c64220bf6a82cf4288f6e2bfbbc97eb0a Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Mon, 6 Apr 2020 20:51:12 +0200
Subject: [PATCH 05/69] issue5230: optimized unit test a bit
---
spacy/tests/regression/test_issue5230.py | 61 +++++++++---------------
1 file changed, 23 insertions(+), 38 deletions(-)
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 716a4624b..76d4d3e96 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -1,41 +1,28 @@
# coding: utf8
import warnings
-import numpy
import pytest
import srsly
-
+from numpy import zeros
from spacy.kb import KnowledgeBase
from spacy.vectors import Vectors
+
from spacy.language import Language
from spacy.pipeline import Pipe
from spacy.tests.util import make_tempdir
-def test_language_to_disk_resource_warning():
- nlp = Language()
- with make_tempdir() as d:
- with warnings.catch_warnings(record=True) as w:
- # catch only warnings raised in spacy.language since there may be others from other components or pipelines
- warnings.filterwarnings(
- "always", module="spacy.language", category=ResourceWarning
- )
- nlp.to_disk(d)
- assert len(w) == 0
+def nlp():
+ return Language()
-def test_vectors_to_disk_resource_warning():
- data = numpy.zeros((3, 300), dtype="f")
+def vectors():
+ data = zeros((3, 1), dtype="f")
keys = ["cat", "dog", "rat"]
- vectors = Vectors(data=data, keys=keys)
- with make_tempdir() as d:
- with warnings.catch_warnings(record=True) as w:
- warnings.filterwarnings("always", category=ResourceWarning)
- vectors.to_disk(d)
- assert len(w) == 0
+ return Vectors(data=data, keys=keys)
-def test_custom_pipes_to_disk_resource_warning():
+def custom_pipe():
# create dummy pipe partially implementing interface -- only want to test to_disk
class SerializableDummy(object):
def __init__(self, **cfg):
@@ -66,15 +53,10 @@ def test_custom_pipes_to_disk_resource_warning():
self.model = SerializableDummy()
self.vocab = SerializableDummy()
- pipe = MyPipe(None)
- with make_tempdir() as d:
- with warnings.catch_warnings(record=True) as w:
- warnings.filterwarnings("always", category=ResourceWarning)
- pipe.to_disk(d)
- assert len(w) == 0
+ return MyPipe(None)
-def test_tagger_to_disk_resource_warning():
+def tagger():
nlp = Language()
nlp.add_pipe(nlp.create_pipe("tagger"))
tagger = nlp.get_pipe("tagger")
@@ -82,15 +64,10 @@ def test_tagger_to_disk_resource_warning():
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
tagger.begin_training(pipeline=nlp.pipeline)
-
- with make_tempdir() as d:
- with warnings.catch_warnings(record=True) as w:
- warnings.filterwarnings("always", category=ResourceWarning)
- tagger.to_disk(d)
- assert len(w) == 0
+ return tagger
-def test_entity_linker_to_disk_resource_warning():
+def entity_linker():
nlp = Language()
nlp.add_pipe(nlp.create_pipe("entity_linker"))
entity_linker = nlp.get_pipe("entity_linker")
@@ -100,9 +77,17 @@ def test_entity_linker_to_disk_resource_warning():
kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
entity_linker.set_kb(kb)
entity_linker.begin_training(pipeline=nlp.pipeline)
+ return entity_linker
+
+@pytest.mark.parametrize(
+ "obj",
+ [nlp(), vectors(), custom_pipe(), tagger(), entity_linker()],
+ ids=["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"],
+)
+def test_to_disk_resource_warning(obj):
with make_tempdir() as d:
- with warnings.catch_warnings(record=True) as w:
+ with warnings.catch_warnings(record=True) as warnings_list:
warnings.filterwarnings("always", category=ResourceWarning)
- entity_linker.to_disk(d)
- assert len(w) == 0
+ obj.to_disk(d)
+ assert len(warnings_list) == 0
From b63871ceff4497ca61bd066c8432603bc73c6a8b Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Mon, 6 Apr 2020 21:04:06 +0200
Subject: [PATCH 06/69] issue5230: added contributors agreement
---
.github/contributors/lfiedler.md | 106 +++++++++++++++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 .github/contributors/lfiedler.md
diff --git a/.github/contributors/lfiedler.md b/.github/contributors/lfiedler.md
new file mode 100644
index 000000000..61f8ffeb4
--- /dev/null
+++ b/.github/contributors/lfiedler.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Leander Fiedler |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 06 April 2020 |
+| GitHub username | lfiedler |
+| Website (optional) | |
\ No newline at end of file
From e1e25c7e302876b85dc7a95c0f5cf768fbac3f1d Mon Sep 17 00:00:00 2001
From: lfiedler
Date: Mon, 6 Apr 2020 21:36:02 +0200
Subject: [PATCH 07/69] issue5230: added unittest test case for completeness
---
spacy/tests/regression/test_issue5230.py | 28 +++++++++++++++++++-----
1 file changed, 23 insertions(+), 5 deletions(-)
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 76d4d3e96..1a03fa0d2 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -1,5 +1,6 @@
# coding: utf8
import warnings
+from unittest import TestCase
import pytest
import srsly
@@ -80,14 +81,31 @@ def entity_linker():
return entity_linker
-@pytest.mark.parametrize(
- "obj",
+objects_to_test = (
[nlp(), vectors(), custom_pipe(), tagger(), entity_linker()],
- ids=["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"],
+ ["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"],
)
-def test_to_disk_resource_warning(obj):
+
+
+def write_obj_and_catch_warnings(obj):
with make_tempdir() as d:
with warnings.catch_warnings(record=True) as warnings_list:
warnings.filterwarnings("always", category=ResourceWarning)
obj.to_disk(d)
- assert len(warnings_list) == 0
+ return warnings_list
+
+
+@pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1])
+def test_to_disk_resource_warning(obj):
+ warnings_list = write_obj_and_catch_warnings(obj)
+ assert len(warnings_list) == 0
+
+
+class TestToDiskResourceWarningUnittest(TestCase):
+ def test_resource_warning(self):
+ scenarios = zip(*objects_to_test)
+
+ for scenario in scenarios:
+ with self.subTest(msg=scenario[1]):
+ warnings_list = write_obj_and_catch_warnings(scenario[0])
+ self.assertEqual(len(warnings_list), 0)
From 8c1d0d628fb196abd33859b18a597eb0414e6c55 Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Fri, 10 Apr 2020 20:35:52 +0200
Subject: [PATCH 08/69] issue5230 writer now checks instance of loc parameter
before trying to operate on it
---
spacy/kb.pyx | 4 ++--
spacy/tests/regression/test_issue5230.py | 15 ++++++++++++++-
2 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 63eb41b42..7c6865eed 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -446,10 +446,10 @@ cdef class KnowledgeBase:
cdef class Writer:
def __init__(self, object loc):
- if path.exists(loc):
- assert not path.isdir(loc), "%s is directory." % loc
if isinstance(loc, Path):
loc = bytes(loc)
+ if path.exists(loc):
+ assert not path.isdir(loc), "%s is directory." % loc
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self._fp = fopen(bytes_loc, 'wb')
if not self._fp:
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 1a03fa0d2..b7c6b9b1d 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -5,7 +5,7 @@ from unittest import TestCase
import pytest
import srsly
from numpy import zeros
-from spacy.kb import KnowledgeBase
+from spacy.kb import KnowledgeBase, Writer
from spacy.vectors import Vectors
from spacy.language import Language
@@ -101,6 +101,19 @@ def test_to_disk_resource_warning(obj):
assert len(warnings_list) == 0
+def test_writer_with_path_py35():
+ writer = None
+ with make_tempdir() as d:
+ path = d / "test"
+ try:
+ writer = Writer(path)
+ except Exception as e:
+ pytest.fail(str(e))
+ finally:
+ if writer:
+ writer.close()
+
+
class TestToDiskResourceWarningUnittest(TestCase):
def test_resource_warning(self):
scenarios = zip(*objects_to_test)
From a7bdfe42e13bdb2e61edcb3b4bf9203e041ef3f0 Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Fri, 10 Apr 2020 21:14:33 +0200
Subject: [PATCH 09/69] issue5230 added print statement to warnings filter to
remotely debug failing python35(win) setup
---
spacy/tests/regression/test_issue5230.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index b7c6b9b1d..03027fe39 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -98,6 +98,8 @@ def write_obj_and_catch_warnings(obj):
@pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1])
def test_to_disk_resource_warning(obj):
warnings_list = write_obj_and_catch_warnings(obj)
+ for warning in warnings_list:
+ print(warning.message)
assert len(warnings_list) == 0
From 88ca40a15d010fe50da383f4664f8064046f7540 Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Fri, 10 Apr 2020 21:45:53 +0200
Subject: [PATCH 10/69] issue5230 raise warnings as errors to remotely debug
failing python35(win) setup
---
spacy/tests/regression/test_issue5230.py | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 03027fe39..adc9307ce 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -90,7 +90,7 @@ objects_to_test = (
def write_obj_and_catch_warnings(obj):
with make_tempdir() as d:
with warnings.catch_warnings(record=True) as warnings_list:
- warnings.filterwarnings("always", category=ResourceWarning)
+ warnings.filterwarnings("error", category=ResourceWarning)
obj.to_disk(d)
return warnings_list
@@ -98,8 +98,6 @@ def write_obj_and_catch_warnings(obj):
@pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1])
def test_to_disk_resource_warning(obj):
warnings_list = write_obj_and_catch_warnings(obj)
- for warning in warnings_list:
- print(warning.message)
assert len(warnings_list) == 0
From ca2a7a44db29b3ffbcf24459a8c0332742c8b676 Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Fri, 10 Apr 2020 22:26:55 +0200
Subject: [PATCH 11/69] issue5230 store string values of warnings to remotely
debug failing python35(win) setup
---
spacy/tests/regression/test_issue5230.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index adc9307ce..c78a84ad7 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -90,9 +90,9 @@ objects_to_test = (
def write_obj_and_catch_warnings(obj):
with make_tempdir() as d:
with warnings.catch_warnings(record=True) as warnings_list:
- warnings.filterwarnings("error", category=ResourceWarning)
+ warnings.filterwarnings("always", category=ResourceWarning)
obj.to_disk(d)
- return warnings_list
+ return list(map(lambda w: w.message, warnings_list))
@pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1])
From d2bb649227ce5a24e53d7526cf7892643eb297c9 Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Fri, 10 Apr 2020 23:21:13 +0200
Subject: [PATCH 12/69] issue5230 filter warnings in addition to filterwarnings
 to prevent deprecation warnings from popping up in the python35(win) setup
---
spacy/tests/regression/test_issue5230.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index c78a84ad7..ae735c7bd 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -92,7 +92,8 @@ def write_obj_and_catch_warnings(obj):
with warnings.catch_warnings(record=True) as warnings_list:
warnings.filterwarnings("always", category=ResourceWarning)
obj.to_disk(d)
- return list(map(lambda w: w.message, warnings_list))
+ # in python3.5 it seems that deprecation warnings are not filtered by filterwarnings
+ return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list))
@pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1])
From d60e2d3ebf33fc0c4280117b08f6e3ef9ad63ff9 Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Sun, 12 Apr 2020 09:08:41 +0200
Subject: [PATCH 13/69] issue5230 added unit test for dumping and loading
 knowledge base
---
spacy/tests/regression/test_issue5230.py | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index ae735c7bd..337c82255 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -115,6 +115,23 @@ def test_writer_with_path_py35():
writer.close()
+def test_save_and_load_knowledge_base():
+ nlp = Language()
+ kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ with make_tempdir() as d:
+ path = d / "kb"
+ try:
+ kb.dump(path)
+ except Exception as e:
+ pytest.fail(str(e))
+
+ try:
+ kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ kb_loaded.load_bulk(path)
+ except Exception as e:
+ pytest.fail(str(e))
+
+
class TestToDiskResourceWarningUnittest(TestCase):
def test_resource_warning(self):
scenarios = zip(*objects_to_test)
From 67000068304b9a125ec792f32bed8491767dbed1 Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Sun, 12 Apr 2020 09:34:54 +0200
Subject: [PATCH 14/69] issue5230 attempted fix of pytest segfault for
python3.5
---
spacy/kb.pyx | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 7c6865eed..14327f0d6 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -491,10 +491,10 @@ cdef class Writer:
cdef class Reader:
def __init__(self, object loc):
- assert path.exists(loc)
- assert not path.isdir(loc)
if isinstance(loc, Path):
loc = bytes(loc)
+ assert path.exists(loc)
+ assert not path.isdir(loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self._fp = fopen(bytes_loc, 'rb')
if not self._fp:
From cef0c909b9dc1afd37511db4cbfd1863f27a371a Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Wed, 15 Apr 2020 19:28:33 +0200
Subject: [PATCH 15/69] issue5230 replaced bound-method reference with an
 anonymous function
---
spacy/pipeline/pipes.pyx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index ce95b2752..8af76a0fb 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -202,7 +202,7 @@ class Pipe(object):
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
if self.model not in (None, True, False):
- serialize["model"] = self.model.to_disk
+ serialize["model"] = lambda p: self.model.to_disk(p)
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
From a3401b11946b9aba06dd3e83a1877c156e7ddeb4 Mon Sep 17 00:00:00 2001
From: Leander Fiedler
Date: Wed, 15 Apr 2020 21:52:52 +0200
Subject: [PATCH 16/69] issue5230 replaced bound-method reference with an
 anonymous function
---
spacy/pipeline/pipes.pyx | 4 ++--
spacy/vectors.pyx | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 8af76a0fb..fc077fc82 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -625,7 +625,7 @@ class Tagger(Pipe):
serialize = OrderedDict((
("vocab", lambda p: self.vocab.to_disk(p)),
("tag_map", lambda p: srsly.write_msgpack(p, tag_map)),
- ("model", self.model.to_disk),
+ ("model", lambda p: self.model.to_disk(p)),
("cfg", lambda p: srsly.write_json(p, self.cfg))
))
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
@@ -1394,7 +1394,7 @@ class EntityLinker(Pipe):
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["kb"] = lambda p: self.kb.dump(p)
if self.model not in (None, True, False):
- serialize["model"] = self.model.to_disk
+ serialize["model"] = lambda p: self.model.to_disk(p)
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 62d176c6c..2877d2d7d 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -385,7 +385,7 @@ cdef class Vectors:
save_array(self.data, _file)
serializers = OrderedDict((
- ("vectors", save_vectors),
+ ("vectors", lambda p: save_vectors(p)),
("key2row", lambda p: srsly.write_msgpack(p, self.key2row))
))
return util.to_disk(path, serializers, [])
From ebaed7dcfa31ced738212f726d17285049291d7a Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 30 Apr 2020 10:17:06 +0200
Subject: [PATCH 17/69] Few more updates to the EL documentation
---
examples/training/train_entity_linker.py | 2 +-
website/docs/usage/training.md | 45 ++++++++++++------------
2 files changed, 24 insertions(+), 23 deletions(-)
diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py
index c7eba8a30..3a8deb7a0 100644
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@@ -64,7 +64,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
"""Create a blank model with the specified vocab, set up the pipeline and train the entity linker.
The `vocab` should be the one used during creation of the KB."""
vocab = Vocab().from_disk(vocab_path)
- # create blank Language class with correct vocab
+ # create blank English model with correct vocab
nlp = spacy.blank("en", vocab=vocab)
nlp.vocab.vectors.name = "spacy_pretrained_vectors"
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index ecdc6720b..0be14df69 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -619,25 +619,24 @@ https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py
#### Step by step guide {#step-by-step-kb}
-1. **Load the model** you want to start with, or create an **empty model** using
- [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language and
- a pre-defined [`vocab`](/api/vocab) object.
-2. **Pretrain the entity embeddings** by running the descriptions of the
- entities through a simple encoder-decoder network. The current implementation
- requires the `nlp` model to have access to pretrained word embeddings, but a
- custom implementation of this encoding step can also be used.
-3. **Construct the KB** by defining all entities with their pretrained vectors,
- and all aliases with their prior probabilities.
+1. **Load the model** you want to start with. It should contain pretrained word
+ vectors.
+2. **Obtain the entity embeddings** by running the descriptions of the entities
+ through the `nlp` model and taking the average of all words with
+ `nlp(desc).vector`. At this point, a custom encoding step can also be used.
+3. **Construct the KB** by defining all entities with their embeddings, and all
+ aliases with their prior probabilities.
4. **Save** the KB using [`kb.dump`](/api/kb#dump).
-5. **Test** the KB to make sure the entities were added correctly.
+5. **Print** the contents of the KB to make sure the entities were added
+ correctly.
### Training an entity linking model {#entity-linker-model}
This example shows how to create an entity linker pipe using a previously
-created knowledge base. The entity linker pipe is then trained with your own
-examples. To do so, you'll need to provide **example texts**, and the
-**character offsets** and **knowledge base identifiers** of each entity
-contained in the texts.
+created knowledge base. The entity linker is then trained with a set of custom
+examples. To do so, you need to provide **example texts**, and the **character
+offsets** and **knowledge base identifiers** of each entity contained in the
+texts.
```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py
@@ -647,14 +646,16 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_li
1. **Load the KB** you want to start with, and specify the path to the `Vocab`
object that was used to create this KB. Then, create an **empty model** using
- [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language.
- Don't forget to add the KB to the entity linker, and to add the entity linker
- to the pipeline. In practical applications, you will want a more advanced
- pipeline including also a component for
- [named entity recognition](/usage/training#ner). If you're using a model with
- additional components, make sure to disable all other pipeline components
- during training using [`nlp.disable_pipes`](/api/language#disable_pipes).
- This way, you'll only be training the entity linker.
+ [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. Add
+ a component for recognizing sentences and one for identifying relevant
+ entities. In practical applications, you will want a more advanced pipeline
+ including also a component for
+ [named entity recognition](/usage/training#ner). Then, create a new entity
+ linker component, add the KB to it, and then add the entity linker to the
+ pipeline. If you're using a model with additional components, make sure to
+ disable all other pipeline components during training using
+ [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be
+ training the entity linker.
2. **Shuffle and loop over** the examples. For each example, **update the
model** by calling [`nlp.update`](/api/language#update), which steps through
the annotated examples of the input. For each combination of a mention in
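As a rough companion to the updated step-by-step guide, the following sketch condenses the described pipeline assembly into code (spaCy v2 API; the paths and `entity_vector_length` are placeholders, and the vocab/KB files are assumed to already exist on disk):

```python
import spacy
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

vocab = Vocab().from_disk("path/to/vocab")            # vocab used to create the KB
kb = KnowledgeBase(vocab, entity_vector_length=64)
kb.load_bulk("path/to/kb")

nlp = spacy.blank("en", vocab=vocab)
nlp.add_pipe(nlp.create_pipe("sentencizer"))          # component for recognizing sentences

entity_linker = nlp.create_pipe("entity_linker")
entity_linker.set_kb(kb)
nlp.add_pipe(entity_linker, last=True)

# train only the entity linker, with all other components disabled
other_pipes = [name for name in nlp.pipe_names if name != "entity_linker"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
```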
From 5e55bfa8214835cf8d407ca6a6a5f8797b4ea005 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Samuel=20Rodr=C3=ADguez=20Medina?=
Date: Tue, 5 May 2020 14:06:27 +0200
Subject: [PATCH 18/69] Fixed tests for Swedish that were written in Danish.
(#5395)
---
spacy/tests/lang/sv/test_exceptions.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/spacy/tests/lang/sv/test_exceptions.py b/spacy/tests/lang/sv/test_exceptions.py
index c977a4183..7c6fd5464 100644
--- a/spacy/tests/lang/sv/test_exceptions.py
+++ b/spacy/tests/lang/sv/test_exceptions.py
@@ -47,15 +47,15 @@ def test_sv_tokenizer_handles_ambiguous_abbr(sv_tokenizer, text):
def test_sv_tokenizer_handles_exc_in_text(sv_tokenizer):
- text = "Det er bl.a. ikke meningen"
+ text = "Det är bl.a. inte meningen"
tokens = sv_tokenizer(text)
assert len(tokens) == 5
assert tokens[2].text == "bl.a."
def test_sv_tokenizer_handles_custom_base_exc(sv_tokenizer):
- text = "Her er noget du kan kigge i."
+ text = "Här är något du kan titta på."
tokens = sv_tokenizer(text)
assert len(tokens) == 8
- assert tokens[6].text == "i"
+ assert tokens[6].text == "på"
assert tokens[7].text == "."
From a2345618f111552e141e128e1d48dd1d0a672a6b Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 8 May 2020 10:25:02 +0200
Subject: [PATCH 19/69] Fix Token API docs from #5375 (#5418)
---
website/docs/api/token.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index 7280ac796..24a9dce79 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -351,6 +351,7 @@ property to `0` for the first word of the document.
- assert doc[4].sent_start == 1
+ assert doc[4].is_sent_start == True
```
+
## Token.is_sent_end {#is_sent_end tag="property" new="2"}
From 4a15b559bab705c11acc7d5fce62a73daa5135e7 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 8 May 2020 10:36:25 +0200
Subject: [PATCH 20/69] Clarify Token.pos as UPOS (#5419)
---
website/docs/api/token.md | 4 ++--
website/docs/usage/101/_pos-deps.md | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index 24a9dce79..69dac23d6 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -477,8 +477,8 @@ The L2 norm of the token's vector representation.
| `like_email` | bool | Does the token resemble an email address? |
| `is_oov` | bool | Is the token out-of-vocabulary? |
| `is_stop` | bool | Is the token part of a "stop list"? |
-| `pos` | int | Coarse-grained part-of-speech. |
-| `pos_` | unicode | Coarse-grained part-of-speech. |
+| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |
+| `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |
| `tag` | int | Fine-grained part-of-speech. |
| `tag_` | unicode | Fine-grained part-of-speech. |
| `dep` | int | Syntactic dependency relation. |
diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md
index 9d04d6ffc..1a438e424 100644
--- a/website/docs/usage/101/_pos-deps.md
+++ b/website/docs/usage/101/_pos-deps.md
@@ -25,7 +25,7 @@ for token in doc:
> - **Text:** The original word text.
> - **Lemma:** The base form of the word.
-> - **POS:** The simple part-of-speech tag.
+> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/) part-of-speech tag.
> - **Tag:** The detailed part-of-speech tag.
> - **Dep:** Syntactic dependency, i.e. the relation between tokens.
> - **Shape:** The word shape – capitalization, punctuation, digits.
From c963e269bac9c41222d81abf82131b1937912325 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 8 May 2020 11:21:46 +0200
Subject: [PATCH 21/69] Add method to update / reset pkuseg user dict (#5404)
---
spacy/lang/zh/__init__.py | 16 ++++++++++++++++
spacy/tests/lang/zh/test_tokenizer.py | 13 +++++++++++++
2 files changed, 29 insertions(+)
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 701e696a4..ed0b3eb74 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -104,6 +104,22 @@ class ChineseTokenizer(DummyTokenizer):
(words, spaces) = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
+ def pkuseg_update_user_dict(self, words, reset=False):
+ if self.pkuseg_seg:
+ if reset:
+ try:
+ import pkuseg
+ self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
+ except ImportError:
+ if self.use_pkuseg:
+ msg = (
+ "pkuseg not installed: unable to reset pkuseg "
+ "user dict. Please " + _PKUSEG_INSTALL_MSG
+ )
+ raise ImportError(msg)
+ for word in words:
+ self.pkuseg_seg.preprocesser.insert(word.strip(), '')
+
def _get_config(self):
config = OrderedDict(
(
diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py
index bff7b1ed1..035798aa1 100644
--- a/spacy/tests/lang/zh/test_tokenizer.py
+++ b/spacy/tests/lang/zh/test_tokenizer.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import pytest
+from spacy.lang.zh import _get_pkuseg_trie_data
# fmt: off
@@ -39,6 +40,18 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
assert tokens == expected_tokens
+def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
+ user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+ zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
+ updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+ assert len(user_dict) == len(updated_user_dict) - 1
+
+ # reset user dict
+ zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
+ reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+ assert len(reset_user_dict) == 0
+
+
def test_extra_spaces(zh_tokenizer_char):
# note: three spaces after "I"
tokens = zh_tokenizer_char("I like cheese.")
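For orientation, a usage sketch of the new `pkuseg_update_user_dict` method (not part of the patch). It assumes the `pkuseg` package and its "default" model are installed, and that the Chinese tokenizer of this spaCy version accepts the `pkuseg_model`/`require_pkuseg` config keys; the dictionary entry is just a placeholder.

```python
from spacy.lang.zh import Chinese

cfg = {"pkuseg_model": "default", "require_pkuseg": True}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})

# add an entry to the pkuseg user dictionary, then reset the dictionary again
nlp.tokenizer.pkuseg_update_user_dict(["自然语言处理"])
nlp.tokenizer.pkuseg_update_user_dict([], reset=True)
```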
From d4cc18b7464e6713d5f0d6f368190cfbdd5c1e18 Mon Sep 17 00:00:00 2001
From: Travis Hoppe
Date: Fri, 8 May 2020 02:28:54 -0700
Subject: [PATCH 22/69] Added author information for NLPre (#5414)
* Add author links for NLPre and update category
* Add contributor statement
---
.github/contributors/thoppe.md | 106 +++++++++++++++++++++++++++++++++
website/meta/universe.json | 8 ++-
2 files changed, 113 insertions(+), 1 deletion(-)
create mode 100644 .github/contributors/thoppe.md
diff --git a/.github/contributors/thoppe.md b/.github/contributors/thoppe.md
new file mode 100644
index 000000000..9271a2601
--- /dev/null
+++ b/.github/contributors/thoppe.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Travis Hoppe |
+| Company name (if applicable) | |
+| Title or role (if applicable) | Data Scientist |
+| Date | 07 May 2020 |
+| GitHub username | thoppe |
+| Website (optional) | http://thoppe.github.io/ |
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 50977b39c..cf587f5f0 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -114,7 +114,13 @@
" text = f(text)",
"print(text)"
],
- "category": ["scientific"]
+ "category": ["scientific", "biomedical"],
+ "author": "Travis Hoppe",
+ "author_links": {
+ "github": "thoppe",
+ "twitter":"metasemantic",
+ "website" : "http://thoppe.github.io/"
+ }
},
{
"id": "Chatterbot",
From 440b81bddc24669ffe89ef7501fb8c75f98b60d2 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 8 May 2020 15:10:57 +0200
Subject: [PATCH 23/69] Improve exceptions for 'd (would/had) in English
(#5379)
Instead of treating `'d` in contractions like `I'd` as `would` in all
cases in the tokenizer exceptions, leave the tagging and lemmatization
up to later components.
---
spacy/lang/en/tokenizer_exceptions.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index c45197771..62de81912 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -77,12 +77,12 @@ for pron in ["i", "you", "he", "she", "it", "we", "they"]:
_exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
- {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
+ {ORTH: "'d", NORM: "'d"},
]
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
- {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
+ {ORTH: "d", NORM: "'d"},
]
_exc[orth + "'d've"] = [
@@ -195,7 +195,10 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
{ORTH: "'d", NORM: "'d"},
]
- _exc[orth + "d"] = [{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "d"}]
+ _exc[orth + "d"] = [
+ {ORTH: orth, LEMMA: word, NORM: word},
+ {ORTH: "d", NORM: "'d"}
+ ]
_exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word, NORM: word},
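A rough illustration of the behavioral effect (assuming a blank English pipeline built from this branch): the `'d` token keeps its surface form with norm `'d` instead of being hard-coded to "would" with tag `MD`, leaving the decision between "would" and "had" to later components.

```python
from spacy.lang.en import English

nlp = English()
doc = nlp("I'd have gone")
print([(t.text, t.norm_) for t in doc])
# expected along the lines of: [('I', 'i'), ("'d", "'d"), ('have', 'have'), ('gone', 'gone')]
```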
From 24e7108f80dd9e4a882b22fe62beda89b73158b6 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 13 May 2020 10:25:05 +0200
Subject: [PATCH 24/69] Modify array type to accommodate OOV_RANK (#5429)
Modify indices array type in `Vocab.prune_vectors` to accommodate
OOV_RANK index as max(uint64).
---
spacy/vocab.pyx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 0f3223025..e31d26f85 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -315,7 +315,7 @@ cdef class Vocab:
priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth)
for lex in self if lex.orth in self.vectors.key2row]
priority.sort()
- indices = xp.asarray([i for (prob, i, key) in priority], dtype="i")
+ indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64")
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
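A quick numeric check (sketch only) of why the dtype change is needed: `OOV_RANK` is stored as the maximum uint64 value, which the previously used `"i"` (int32) dtype cannot represent.

```python
import numpy

OOV_RANK = numpy.iinfo(numpy.uint64).max      # 18446744073709551615
print(numpy.iinfo("i").max)                   # 2147483647 -- far too small for OOV_RANK
print(numpy.iinfo("uint64").max >= OOV_RANK)  # True
```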
From 07639dd6ac9db6f874d1f01ccb5e37a910924feb Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 13 May 2020 10:25:54 +0200
Subject: [PATCH 25/69] Remove TAG from da/sv tokenizer exceptions (#5428)
Remove `TAG` value from Danish and Swedish tokenizer exceptions because
it may not be included in a tag map (and these settings are problematic
as tokenizer exceptions anyway).
---
spacy/lang/da/tokenizer_exceptions.py | 6 +++---
spacy/lang/sv/tokenizer_exceptions.py | 4 ++--
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index 89b083186..9e4637bfb 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -6,7 +6,7 @@ Source: https://forkortelse.dk/ and various others.
from __future__ import unicode_literals
-from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT
+from ...symbols import ORTH, LEMMA, NORM
_exc = {}
@@ -52,7 +52,7 @@ for exc_data in [
{ORTH: "Ons.", LEMMA: "onsdag"},
{ORTH: "Fre.", LEMMA: "fredag"},
{ORTH: "Lør.", LEMMA: "lørdag"},
- {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
+ {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"},
]:
_exc[exc_data[ORTH]] = [exc_data]
@@ -577,7 +577,7 @@ for h in range(1, 31 + 1):
for period in ["."]:
_exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
-_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]}
+_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
_exc.update(_custom_base_exc)
TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index dd0976aa6..e95c67f37 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
-from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA, PUNCT, TAG
+from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA
_exc = {}
@@ -155,6 +155,6 @@ for orth in ABBREVIATIONS:
# Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
# should be tokenized as two separate tokens.
for orth in ["i", "m"]:
- _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: ".", TAG: PUNCT}]
+ _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
TOKENIZER_EXCEPTIONS = _exc
From 113e7981d0c60f1e200eb0177c97b282927f61ac Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 13 May 2020 22:08:28 +0200
Subject: [PATCH 26/69] Check that row is within bounds when adding vector
(#5430)
Check that row is within bounds for the vector data array when adding a
vector.
Don't add vectors with rank OOV_RANK in `init-model` (change is due to
shift from OOV as 0 to OOV as OOV_RANK).
---
spacy/cli/init_model.py | 2 +-
spacy/errors.py | 2 ++
spacy/tests/vocab_vectors/test_vectors.py | 3 +++
spacy/vectors.pyx | 6 +++++-
spacy/vocab.pyx | 2 +-
5 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 31d627e9b..618266633 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -181,7 +181,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
for lex in nlp.vocab:
- if lex.rank:
+ if lex.rank and lex.rank != OOV_RANK:
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
else:
if vectors_loc:
diff --git a/spacy/errors.py b/spacy/errors.py
index 779980490..32ccd3df7 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
+
def add_codes(err_cls):
"""Add error codes to string messages via class attribute names."""
@@ -555,6 +556,7 @@ class Errors(object):
E195 = ("Matcher can be called on {good} only, got {got}.")
E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can "
"only be fixed with token.is_sent_start.")
+ E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
@add_codes
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 8987b7c89..322ef462a 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -307,6 +307,9 @@ def test_vocab_add_vector():
dog = vocab["dog"]
assert list(dog.vector) == [2.0, 2.0, 2.0]
+ with pytest.raises(ValueError):
+ vocab.vectors.add(vocab["hamster"].orth, row=1000000)
+
def test_vocab_prune_vectors():
vocab = Vocab(vectors_name="test_vocab_prune_vectors")
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index f3c20fb7f..2973ddb5b 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -9,6 +9,7 @@ import functools
import numpy
from collections import OrderedDict
import srsly
+import warnings
from thinc.neural.util import get_array_module
from thinc.neural._classes.model import Model
@@ -303,7 +304,10 @@ cdef class Vectors:
raise ValueError(Errors.E060.format(rows=self.data.shape[0],
cols=self.data.shape[1]))
row = deref(self._unset.begin())
- self.key2row[key] = row
+ if row < self.data.shape[0]:
+ self.key2row[key] = row
+ else:
+ raise ValueError(Errors.E197.format(row=row, key=key))
if vector is not None:
self.data[row] = vector
if self._unset.count(row):
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index e31d26f85..ef2e86bcc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -319,7 +319,7 @@ cdef class Vocab:
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
- self.vectors = Vectors(data=keep, keys=keys, name=self.vectors.name)
+ self.vectors = Vectors(data=keep, keys=keys[:nr_row], name=self.vectors.name)
syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
remap = {}
for i, key in enumerate(keys[nr_row:]):
From b04738903e3afc16f10bc3182c256742222ee3f6 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 13 May 2020 22:08:50 +0200
Subject: [PATCH 27/69] prevent None in gold fields (#5425)
* set gold fields to empty list instead of keeping them as None
* add unit test
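A rough sketch of what this enables (assuming the v2 GoldParse API; previously these fields stayed None for zero-length docs):
    from spacy.lang.en import English
    from spacy.gold import GoldParse

    nlp = English()
    doc = nlp.make_doc("")  # zero-length doc
    gold = GoldParse(doc, entities=[])
    # gold fields are now empty lists rather than None
    assert gold.words == [] and gold.ner == []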
---
spacy/gold.pyx | 10 +++++++++-
spacy/tests/parser/test_ner.py | 27 ++++++++++++++++++++++++++-
2 files changed, 35 insertions(+), 2 deletions(-)
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 034bba08f..4b8a4f52d 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -658,7 +658,15 @@ cdef class GoldParse:
entdoc = None
# avoid allocating memory if the doc does not contain any tokens
- if self.length > 0:
+ if self.length == 0:
+ self.words = []
+ self.tags = []
+ self.heads = []
+ self.labels = []
+ self.ner = []
+ self.morphology = []
+
+ else:
if words is None:
words = [token.text for token in doc]
if tags is None:
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 8329391ca..244e9fa25 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -7,7 +7,7 @@ from spacy.lang.en import English
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.vocab import Vocab
from spacy.syntax.ner import BiluoPushDown
-from spacy.gold import GoldParse
+from spacy.gold import GoldParse, minibatch
from spacy.tokens import Doc
@@ -174,6 +174,31 @@ def test_accept_blocked_token():
assert ner2.moves.is_valid(state2, "U-")
+def test_train_empty():
+ """Test that training an empty text does not throw errors."""
+ train_data = [
+ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
+ ("", {"entities": []}),
+ ]
+
+ nlp = English()
+ ner = nlp.create_pipe("ner")
+ ner.add_label("PERSON")
+ nlp.add_pipe(ner, last=True)
+
+ nlp.begin_training()
+ for itn in range(2):
+ losses = {}
+ batches = minibatch(train_data)
+ for batch in batches:
+ texts, annotations = zip(*batch)
+ nlp.update(
+ texts, # batch of texts
+ annotations, # batch of annotations
+ losses=losses,
+ )
+
+
def test_overwrite_token():
nlp = English()
ner1 = nlp.create_pipe("ner")
From 9ce059dd067ecc3f097d04023e3cfa0d70d35bb8 Mon Sep 17 00:00:00 2001
From: Vishnu Priya VR
Date: Thu, 14 May 2020 16:28:06 +0530
Subject: [PATCH 28/69] Limiting noun_chunks for specific languages (#5396)
* Limiting noun_chunks for specific languages
* Limiting noun_chunks for specific languages
Contributor Agreement
* Addressing review comments
* Removed unused fixtures and imports
* Add fa_tokenizer in test suite
* Use fa_tokenizer in test
* Undo extraneous reformatting
Co-authored-by: adrianeboyd
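A condensed sketch of the new behaviour, equivalent to the per-language tests added below (German shown here; the other languages behave the same way):
    import pytest
    from spacy.lang.de import German

    nlp = German()
    doc = nlp.make_doc("Er lag auf seinem")
    doc.is_parsed = False
    # the language-specific syntax iterator now raises E029 itself
    with pytest.raises(ValueError):
        list(doc.noun_chunks)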
---
.github/contributors/vishnupriyavr.md | 106 ++++++++++++++++++++++++
spacy/lang/de/syntax_iterators.py | 5 ++
spacy/lang/el/syntax_iterators.py | 5 ++
spacy/lang/en/syntax_iterators.py | 5 ++
spacy/lang/es/syntax_iterators.py | 5 ++
spacy/lang/fa/syntax_iterators.py | 5 ++
spacy/lang/fr/syntax_iterators.py | 5 ++
spacy/lang/id/syntax_iterators.py | 5 ++
spacy/lang/nb/syntax_iterators.py | 5 ++
spacy/lang/sv/syntax_iterators.py | 5 ++
spacy/tests/conftest.py | 5 ++
spacy/tests/lang/de/test_noun_chunks.py | 16 ++++
spacy/tests/lang/el/test_noun_chunks.py | 16 ++++
spacy/tests/lang/en/test_noun_chunks.py | 15 ++++
spacy/tests/lang/es/test_noun_chunks.py | 16 ++++
spacy/tests/lang/fa/test_noun_chunks.py | 17 ++++
spacy/tests/lang/fr/test_noun_chunks.py | 16 ++++
spacy/tests/lang/id/test_noun_chunks.py | 16 ++++
spacy/tests/lang/nb/test_noun_chunks.py | 16 ++++
spacy/tests/lang/sv/test_noun_chunks.py | 13 +++
spacy/tokens/doc.pyx | 3 +-
21 files changed, 298 insertions(+), 2 deletions(-)
create mode 100644 .github/contributors/vishnupriyavr.md
create mode 100644 spacy/tests/lang/de/test_noun_chunks.py
create mode 100644 spacy/tests/lang/el/test_noun_chunks.py
create mode 100644 spacy/tests/lang/es/test_noun_chunks.py
create mode 100644 spacy/tests/lang/fa/test_noun_chunks.py
create mode 100644 spacy/tests/lang/fr/test_noun_chunks.py
create mode 100644 spacy/tests/lang/id/test_noun_chunks.py
create mode 100644 spacy/tests/lang/nb/test_noun_chunks.py
diff --git a/.github/contributors/vishnupriyavr.md b/.github/contributors/vishnupriyavr.md
new file mode 100644
index 000000000..73657a772
--- /dev/null
+++ b/.github/contributors/vishnupriyavr.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | ------------------------ |
+| Name | Vishnu Priya VR |
+| Company name (if applicable) | Uniphore |
+| Title or role (if applicable) | NLP/AI Engineer |
+| Date | 2020-05-03 |
+| GitHub username | vishnupriyavr |
+| Website (optional) | |
diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py
index 89d784a0c..13bb857ca 100644
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
def noun_chunks(obj):
@@ -28,6 +29,10 @@ def noun_chunks(obj):
"app",
]
doc = obj.doc # Ensure works on both Doc and Span.
+
+ if not doc.is_parsed:
+ raise ValueError(Errors.E029)
+
np_label = doc.vocab.strings.add("NP")
np_deps = set(doc.vocab.strings.add(label) for label in labels)
close_app = doc.vocab.strings.add("nk")
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index 5dfd44f07..f02619ac9 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
def noun_chunks(obj):
@@ -14,6 +15,10 @@ def noun_chunks(obj):
# Further improvement of the models will eliminate the need for this tag.
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
doc = obj.doc # Ensure works on both Doc and Span.
+
+ if not doc.is_parsed:
+ raise ValueError(Errors.E029)
+
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
nmod = doc.vocab.strings.add("nmod")
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index ed665ef29..5ff848124 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
def noun_chunks(obj):
@@ -20,6 +21,10 @@ def noun_chunks(obj):
"ROOT",
]
doc = obj.doc # Ensure works on both Doc and Span.
+
+ if not doc.is_parsed:
+ raise ValueError(Errors.E029)
+
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index 6a78d86f7..0badddca1 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -2,10 +2,15 @@
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from ...errors import Errors
def noun_chunks(obj):
doc = obj.doc
+
+ if not doc.is_parsed:
+ raise ValueError(Errors.E029)
+
if not len(doc):
return
np_label = doc.vocab.strings.add("NP")
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index ed665ef29..5ff848124 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
def noun_chunks(obj):
@@ -20,6 +21,10 @@ def noun_chunks(obj):
"ROOT",
]
doc = obj.doc # Ensure works on both Doc and Span.
+
+ if not doc.is_parsed:
+ raise ValueError(Errors.E029)
+
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index 4712d34d9..9495dcf1e 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
def noun_chunks(obj):
@@ -19,6 +20,10 @@ def noun_chunks(obj):
"nmod:poss",
]
doc = obj.doc # Ensure works on both Doc and Span.
+
+ if not doc.is_parsed:
+ raise ValueError(Errors.E029)
+
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index 4712d34d9..9495dcf1e 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
def noun_chunks(obj):
@@ -19,6 +20,10 @@ def noun_chunks(obj):
"nmod:poss",
]
doc = obj.doc # Ensure works on both Doc and Span.
+
+ if not doc.is_parsed:
+ raise ValueError(Errors.E029)
+
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index 4712d34d9..9495dcf1e 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
def noun_chunks(obj):
@@ -19,6 +20,10 @@ def noun_chunks(obj):
"nmod:poss",
]
doc = obj.doc # Ensure works on both Doc and Span.
+
+ if not doc.is_parsed:
+ raise ValueError(Errors.E029)
+
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index 7a82e6b59..148884efe 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
def noun_chunks(obj):
@@ -20,6 +21,10 @@ def noun_chunks(obj):
"nmod:poss",
]
doc = obj.doc # Ensure works on both Doc and Span.
+
+ if not doc.is_parsed:
+ raise ValueError(Errors.E029)
+
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index e52c5155f..d26f0ce5c 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -88,6 +88,11 @@ def eu_tokenizer():
return get_lang_class("eu").Defaults.create_tokenizer()
+@pytest.fixture(scope="session")
+def fa_tokenizer():
+ return get_lang_class("fa").Defaults.create_tokenizer()
+
+
@pytest.fixture(scope="session")
def fi_tokenizer():
return get_lang_class("fi").Defaults.create_tokenizer()
diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py
new file mode 100644
index 000000000..12ece84b5
--- /dev/null
+++ b/spacy/tests/lang/de/test_noun_chunks.py
@@ -0,0 +1,16 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_noun_chunks_is_parsed_de(de_tokenizer):
+ """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
+ To check this test, we're constructing a Doc
+ with a new Vocab here and forcing is_parsed to 'False'
+ to make sure the noun chunks don't run.
+ """
+ doc = de_tokenizer("Er lag auf seinem")
+ doc.is_parsed = False
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py
new file mode 100644
index 000000000..be14acc81
--- /dev/null
+++ b/spacy/tests/lang/el/test_noun_chunks.py
@@ -0,0 +1,16 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_noun_chunks_is_parsed_el(el_tokenizer):
+ """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
+ To check this test, we're constructing a Doc
+ with a new Vocab here and forcing is_parsed to 'False'
+ to make sure the noun chunks don't run.
+ """
+ doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
+ doc.is_parsed = False
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py
index 7dc47f9cc..1109af150 100644
--- a/spacy/tests/lang/en/test_noun_chunks.py
+++ b/spacy/tests/lang/en/test_noun_chunks.py
@@ -6,9 +6,24 @@ from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
from spacy.lang.en.syntax_iterators import SYNTAX_ITERATORS
+import pytest
+
+
from ...util import get_doc
+def test_noun_chunks_is_parsed(en_tokenizer):
+ """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
+ To check this test, we're constructing a Doc
+ with a new Vocab here and forcing is_parsed to 'False'
+ to make sure the noun chunks don't run.
+ """
+ doc = en_tokenizer("This is a sentence")
+ doc.is_parsed = False
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
+
+
def test_en_noun_chunks_not_nested(en_vocab):
words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
heads = [1, 0, 4, 3, -1, -2, -5]
diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py
new file mode 100644
index 000000000..71069d313
--- /dev/null
+++ b/spacy/tests/lang/es/test_noun_chunks.py
@@ -0,0 +1,16 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_noun_chunks_is_parsed_es(es_tokenizer):
+ """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
+ To check this test, we're constructing a Doc
+ with a new Vocab here and forcing is_parsed to 'False'
+ to make sure the noun chunks don't run.
+ """
+ doc = es_tokenizer("en Oxford este verano")
+ doc.is_parsed = False
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py
new file mode 100644
index 000000000..a98aae061
--- /dev/null
+++ b/spacy/tests/lang/fa/test_noun_chunks.py
@@ -0,0 +1,17 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_noun_chunks_is_parsed_fa(fa_tokenizer):
+ """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
+ To check this test, we're constructing a Doc
+ with a new Vocab here and forcing is_parsed to 'False'
+ to make sure the noun chunks don't run.
+ """
+
+ doc = fa_tokenizer("این یک جمله نمونه می باشد.")
+ doc.is_parsed = False
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py
new file mode 100644
index 000000000..876bc0ea4
--- /dev/null
+++ b/spacy/tests/lang/fr/test_noun_chunks.py
@@ -0,0 +1,16 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_noun_chunks_is_parsed_fr(fr_tokenizer):
+ """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
+ To check this test, we're constructing a Doc
+ with a new Vocab here and forcing is_parsed to 'False'
+ to make sure the noun chunks don't run.
+ """
+ doc = fr_tokenizer("trouver des travaux antérieurs")
+ doc.is_parsed = False
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py
new file mode 100644
index 000000000..7bac808b3
--- /dev/null
+++ b/spacy/tests/lang/id/test_noun_chunks.py
@@ -0,0 +1,16 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_noun_chunks_is_parsed_id(id_tokenizer):
+ """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
+ To check this test, we're constructing a Doc
+ with a new Vocab here and forcing is_parsed to 'False'
+ to make sure the noun chunks don't run.
+ """
+ doc = id_tokenizer("sebelas")
+ doc.is_parsed = False
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py
new file mode 100644
index 000000000..17ec6cfda
--- /dev/null
+++ b/spacy/tests/lang/nb/test_noun_chunks.py
@@ -0,0 +1,16 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_noun_chunks_is_parsed_nb(nb_tokenizer):
+ """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
+ To check this test, we're constructing a Doc
+ with a new Vocab here and forcing is_parsed to 'False'
+ to make sure the noun chunks don't run.
+ """
+ doc = nb_tokenizer("Smørsausen brukes bl.a. til")
+ doc.is_parsed = False
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py
index ac7c066ba..38086c255 100644
--- a/spacy/tests/lang/sv/test_noun_chunks.py
+++ b/spacy/tests/lang/sv/test_noun_chunks.py
@@ -2,9 +2,22 @@
from __future__ import unicode_literals
import pytest
+
from ...util import get_doc
+def test_noun_chunks_is_parsed_sv(sv_tokenizer):
+ """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
+ To check this test, we're constructing a Doc
+ with a new Vocab here and forcing is_parsed to 'False'
+ to make sure the noun chunks don't run.
+ """
+ doc = sv_tokenizer("Studenten läste den bästa boken")
+ doc.is_parsed = False
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
+
+
SV_NP_TEST_EXAMPLES = [
(
"En student läste en bok", # A student read a book
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 4dc438695..25a147208 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -597,8 +597,7 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#noun_chunks
"""
- if not self.is_parsed:
- raise ValueError(Errors.E029)
+
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts
From a987e9e45d4084f30964a4cec9914ae6ed25a73c Mon Sep 17 00:00:00 2001
From: Ilia Ivanov
Date: Thu, 14 May 2020 14:14:15 +0200
Subject: [PATCH 29/69] Fix ErrorsWithCodes().__class__ return value
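Before this change, `__getattribute__` formatted every attribute lookup, so dunder access such as `Errors.__class__` came back as a formatted string and broke introspection. A condensed sketch of the now-passing behaviour, mirroring the new test below:
    from inspect import isclass
    from spacy.errors import add_codes

    @add_codes
    class Errors(object):
        E001 = "error description"

    # error codes are still formatted, but dunder attributes pass through untouched
    assert Errors.E001 == "[E001] error description"
    assert isclass(Errors.__class__)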
---
spacy/errors.py | 7 +++++--
spacy/tests/test_errors.py | 13 +++++++++++++
2 files changed, 18 insertions(+), 2 deletions(-)
create mode 100644 spacy/tests/test_errors.py
diff --git a/spacy/errors.py b/spacy/errors.py
index 32ccd3df7..b97ef3a8e 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -7,8 +7,11 @@ def add_codes(err_cls):
class ErrorsWithCodes(object):
def __getattribute__(self, code):
- msg = getattr(err_cls, code)
- return "[{code}] {msg}".format(code=code, msg=msg)
+ if not code.startswith('__'):
+ msg = getattr(err_cls, code)
+ return "[{code}] {msg}".format(code=code, msg=msg)
+ else:
+ return super().__getattribute__(code)
return ErrorsWithCodes()
diff --git a/spacy/tests/test_errors.py b/spacy/tests/test_errors.py
new file mode 100644
index 000000000..ba24f4456
--- /dev/null
+++ b/spacy/tests/test_errors.py
@@ -0,0 +1,13 @@
+from inspect import isclass
+
+from spacy.errors import add_codes
+
+
+@add_codes
+class Errors(object):
+ E001 = "error description"
+
+
+def test_add_codes():
+ assert Errors.E001 == "[E001] error description"
+ assert isclass(Errors.__class__)
From 712d9d4820e902abe17b9b7a8ec5ac373b0b8e2d Mon Sep 17 00:00:00 2001
From: Ilia Ivanov
Date: Thu, 14 May 2020 15:45:58 +0200
Subject: [PATCH 30/69] fixup! Fix ErrorsWithCodes().__class__ return value
---
spacy/errors.py | 10 +++++-----
spacy/tests/test_errors.py | 4 ++++
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/spacy/errors.py b/spacy/errors.py
index b97ef3a8e..d99c96922 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -5,13 +5,13 @@ from __future__ import unicode_literals
def add_codes(err_cls):
"""Add error codes to string messages via class attribute names."""
- class ErrorsWithCodes(object):
+ class ErrorsWithCodes(err_cls):
def __getattribute__(self, code):
- if not code.startswith('__'):
- msg = getattr(err_cls, code)
- return "[{code}] {msg}".format(code=code, msg=msg)
+ msg = super().__getattribute__(code)
+ if code.startswith('__'): # python system attributes like __class__
+ return msg
else:
- return super().__getattribute__(code)
+ return "[{code}] {msg}".format(code=code, msg=msg)
return ErrorsWithCodes()
diff --git a/spacy/tests/test_errors.py b/spacy/tests/test_errors.py
index ba24f4456..1bd4eec7f 100644
--- a/spacy/tests/test_errors.py
+++ b/spacy/tests/test_errors.py
@@ -1,5 +1,7 @@
from inspect import isclass
+import pytest
+
from spacy.errors import add_codes
@@ -10,4 +12,6 @@ class Errors(object):
def test_add_codes():
assert Errors.E001 == "[E001] error description"
+ with pytest.raises(AttributeError):
+ Errors.E002
assert isclass(Errors.__class__)
From ee8fe37474ac9a0c092acc99ad1f13e8c4b97e2e Mon Sep 17 00:00:00 2001
From: Ilia Ivanov
Date: Thu, 14 May 2020 15:59:06 +0200
Subject: [PATCH 31/69] Add ilivans' contributor agreement
---
.github/contributors/ilivans.md | 106 ++++++++++++++++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 .github/contributors/ilivans.md
diff --git a/.github/contributors/ilivans.md b/.github/contributors/ilivans.md
new file mode 100644
index 000000000..d471fde48
--- /dev/null
+++ b/.github/contributors/ilivans.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | ------------------------ |
+| Name | Ilia Ivanov |
+| Company name (if applicable) | Chattermill |
+| Title or role (if applicable) | DL Engineer |
+| Date | 2020-05-14 |
+| GitHub username | ilivans |
+| Website (optional) | |
From 780b86934548661817813612debd50964b2e37d3 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 14 May 2020 16:51:03 +0200
Subject: [PATCH 32/69] Fix syntax iterators for Persian (#5437)
---
spacy/lang/fa/__init__.py | 2 ++
spacy/tests/lang/fa/__init__.py | 0
2 files changed, 2 insertions(+)
create mode 100644 spacy/tests/lang/fa/__init__.py
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index 9d85f814a..c93bca671 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -10,6 +10,7 @@ from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .punctuation import TOKENIZER_SUFFIXES
+from .syntax_iterators import SYNTAX_ITERATORS
class PersianDefaults(Language.Defaults):
@@ -24,6 +25,7 @@ class PersianDefaults(Language.Defaults):
tag_map = TAG_MAP
suffixes = TOKENIZER_SUFFIXES
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
+ syntax_iterators = SYNTAX_ITERATORS
class Persian(Language):
diff --git a/spacy/tests/lang/fa/__init__.py b/spacy/tests/lang/fa/__init__.py
new file mode 100644
index 000000000..e69de29bb
From e63880e0812b4bf45a8f4a96bc26c3f4a10d9fb7 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 14 May 2020 18:22:51 +0200
Subject: [PATCH 33/69] Use Token.sent_start for Span.sent (#5439)
Use `Token.sent_start` for sentence boundaries in `Span.sent` so that
`Doc.sents` and `Span.sent` return the same sentence boundaries.
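A rough illustration of the intended equivalence (assuming a plain sentencizer pipeline; the sentence texts are illustrative):
    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("This is one sentence. This is another.")

    span = doc[6:8]  # tokens inside the second sentence
    # Span.sent is now derived from Token.sent_start, so it matches Doc.sents
    assert span.sent.text == list(doc.sents)[1].text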
---
spacy/tokens/span.pyx | 14 ++------------
1 file changed, 2 insertions(+), 12 deletions(-)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 347916a0a..2f1418a5b 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -389,19 +389,9 @@ cdef class Span:
return self.doc.user_span_hooks["sent"](self)
# This should raise if not parsed / no custom sentence boundaries
self.doc.sents
- # If doc is parsed we can use the deps to find the sentence
- # otherwise we use the `sent_start` token attribute
+ # Use `sent_start` token attribute to find sentence boundaries
cdef int n = 0
- cdef int i
- if self.doc.is_parsed:
- root = &self.doc.c[self.start]
- while root.head != 0:
- root += root.head
- n += 1
- if n >= self.doc.length:
- raise RuntimeError(Errors.E038)
- return self.doc[root.l_edge:root.r_edge + 1]
- elif self.doc.is_sentenced:
+ if self.doc.is_sentenced:
# Find start of the sentence
start = self.start
while self.doc.c[start].sent_start != 1 and start > 0:
From f49e2810e6ea5c8b848df5b0f393c27ee31bb7f4 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 14 May 2020 18:23:19 +0200
Subject: [PATCH 34/69] Add Polish lemmatizer (#5413)
* Add Polish lemmatizer
Contributed by @ryszardtuora
* Add missing import
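A rough usage sketch (the lookup table contents below are made up for illustration; the real per-POS tables ship with spacy-lookups-data):
    from spacy.lookups import Lookups
    from spacy.lang.pl.lemmatizer import PolishLemmatizer

    lookups = Lookups()
    # the lemmatizer reads per-POS tables named lemma_lookup_<pos>
    lookups.add_table("lemma_lookup_adj", {"lepszy": "dobry"})
    lemmatizer = PolishLemmatizer(lookups)

    # 'naj' / 'nie' prefixes are stripped before the lookup
    assert lemmatizer("najlepszy", "ADJ") == ["dobry"]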
---
setup.cfg | 2 +-
spacy/lang/pl/__init__.py | 8 +++
spacy/lang/pl/lemmatizer.py | 107 ++++++++++++++++++++++++++++++++++++
3 files changed, 116 insertions(+), 1 deletion(-)
create mode 100644 spacy/lang/pl/lemmatizer.py
diff --git a/setup.cfg b/setup.cfg
index 3e0acd12f..af3579f88 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -59,7 +59,7 @@ install_requires =
[options.extras_require]
lookups =
- spacy_lookups_data>=0.0.5,<0.2.0
+ spacy_lookups_data>=0.3.1,<0.4.0
cuda =
cupy>=5.0.0b4,<9.0.0
cuda80 =
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 702a19063..0540bf535 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -6,12 +6,14 @@ from .punctuation import TOKENIZER_INFIXES
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
+from .lemmatizer import PolishLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
+from ...lookups import Lookups
class PolishDefaults(Language.Defaults):
@@ -26,6 +28,12 @@ class PolishDefaults(Language.Defaults):
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES
+ @classmethod
+ def create_lemmatizer(cls, nlp=None, lookups=None):
+ if lookups is None:
+ lookups = Lookups()
+ return PolishLemmatizer(lookups)
+
class Polish(Language):
lang = "pl"
diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py
new file mode 100644
index 000000000..2be4b0fb7
--- /dev/null
+++ b/spacy/lang/pl/lemmatizer.py
@@ -0,0 +1,107 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ...lemmatizer import Lemmatizer
+from ...parts_of_speech import NAMES
+from ...errors import Errors
+
+
+class PolishLemmatizer(Lemmatizer):
+ # This lemmatizer implements lookup lemmatization based on the
+ # Morfeusz dictionary (morfeusz.sgjp.pl/en) by the Institute of
+ # Computer Science PAS. It uses some prefix-based improvements for
+ # verb and adjective lemmatization, as well as case-sensitive
+ # lemmatization for nouns.
+ def __init__(self, lookups, *args, **kwargs):
+ # This lemmatizer is lookup-based, so it does not require an index, exception list, or rules.
+ super().__init__(lookups)
+ self.lemma_lookups = {}
+ for tag in [
+ "ADJ",
+ "ADP",
+ "ADV",
+ "AUX",
+ "NOUN",
+ "NUM",
+ "PART",
+ "PRON",
+ "VERB",
+ "X",
+ ]:
+ self.lemma_lookups[tag] = self.lookups.get_table(
+ "lemma_lookup_" + tag.lower(), {}
+ )
+ self.lemma_lookups["DET"] = self.lemma_lookups["X"]
+ self.lemma_lookups["PROPN"] = self.lemma_lookups["NOUN"]
+
+ def __call__(self, string, univ_pos, morphology=None):
+ if isinstance(univ_pos, int):
+ univ_pos = NAMES.get(univ_pos, "X")
+ univ_pos = univ_pos.upper()
+
+ if univ_pos == "NOUN":
+ return self.lemmatize_noun(string, morphology)
+
+ if univ_pos != "PROPN":
+ string = string.lower()
+
+ if univ_pos == "ADJ":
+ return self.lemmatize_adj(string, morphology)
+ elif univ_pos == "VERB":
+ return self.lemmatize_verb(string, morphology)
+
+ lemma_dict = self.lemma_lookups.get(univ_pos, {})
+ return [lemma_dict.get(string, string.lower())]
+
+ def lemmatize_adj(self, string, morphology):
+ # this method utilizes different procedures for adjectives
+ # with 'nie' and 'naj' prefixes
+ lemma_dict = self.lemma_lookups["ADJ"]
+
+ if string[:3] == "nie":
+ search_string = string[3:]
+ if search_string[:3] == "naj":
+ naj_search_string = search_string[3:]
+ if naj_search_string in lemma_dict:
+ return [lemma_dict[naj_search_string]]
+ if search_string in lemma_dict:
+ return [lemma_dict[search_string]]
+
+ if string[:3] == "naj":
+ naj_search_string = string[3:]
+ if naj_search_string in lemma_dict:
+ return [lemma_dict[naj_search_string]]
+
+ return [lemma_dict.get(string, string)]
+
+ def lemmatize_verb(self, string, morphology):
+ # this method utilizes a different procedure for verbs
+ # with 'nie' prefix
+ lemma_dict = self.lemma_lookups["VERB"]
+
+ if string[:3] == "nie":
+ search_string = string[3:]
+ if search_string in lemma_dict:
+ return [lemma_dict[search_string]]
+
+ return [lemma_dict.get(string, string)]
+
+ def lemmatize_noun(self, string, morphology):
+ # this method is case-sensitive, in order to work
+ # for incorrectly tagged proper names
+ lemma_dict = self.lemma_lookups["NOUN"]
+
+ if string != string.lower():
+ if string.lower() in lemma_dict:
+ return [lemma_dict[string.lower()]]
+ elif string in lemma_dict:
+ return [lemma_dict[string]]
+ return [string.lower()]
+
+ return [lemma_dict.get(string, string)]
+
+ def lookup(self, string, orth=None):
+ return string.lower()
+
+ def lemmatize(self, string, index, exceptions, rules):
+ raise NotImplementedError
From 908dea39399bbc0c966c131796f339af5de54140 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 14 May 2020 18:26:12 +0200
Subject: [PATCH 35/69] Skip duplicate lexeme rank setting (#5401)
Skip duplicate lexeme rank setting within
`_fix_pretrained_vectors_name()`.
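A minimal sketch of the new keyword (illustrative only; this is an internal helper):
    from spacy.lang.en import English
    from spacy._ml import link_vectors_to_models

    nlp = English()
    # ranks were already assigned elsewhere (e.g. by `init-model`), so skip the second pass
    link_vectors_to_models(nlp.vocab, skip_rank=True)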
---
spacy/_ml.py | 13 +++++++------
spacy/language.py | 2 +-
2 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 5cccabac1..60a0bbee0 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -279,18 +279,19 @@ class PrecomputableAffine(Model):
break
-def link_vectors_to_models(vocab):
+def link_vectors_to_models(vocab, skip_rank=False):
vectors = vocab.vectors
if vectors.name is None:
vectors.name = VECTORS_KEY
if vectors.data.size != 0:
warnings.warn(Warnings.W020.format(shape=vectors.data.shape))
ops = Model.ops
- for word in vocab:
- if word.orth in vectors.key2row:
- word.rank = vectors.key2row[word.orth]
- else:
- word.rank = util.OOV_RANK
+ if not skip_rank:
+ for word in vocab:
+ if word.orth in vectors.key2row:
+ word.rank = vectors.key2row[word.orth]
+ else:
+ word.rank = util.OOV_RANK
data = ops.asarray(vectors.data)
# Set an entry here, so that vectors are accessed by StaticVectors
# (unideal, I know)
diff --git a/spacy/language.py b/spacy/language.py
index e89f80f08..f23776def 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1072,7 +1072,7 @@ def _fix_pretrained_vectors_name(nlp):
else:
raise ValueError(Errors.E092)
if nlp.vocab.vectors.size != 0:
- link_vectors_to_models(nlp.vocab)
+ link_vectors_to_models(nlp.vocab, skip_rank=True)
for name, proc in nlp.pipeline:
if not hasattr(proc, "cfg"):
continue
From 72a25c9cef5c69316517650850b2ad7c04b63e01 Mon Sep 17 00:00:00 2001
From: Ilkyu Ju
Date: Sun, 17 May 2020 20:43:34 +0900
Subject: [PATCH 36/69] Very minor issues in Korean example sentences (#5446)
* Add contributor agreement
* Improve ko translation of example sentences
I fixed unnatural translations and word spacing errors.
* Update osori.md
---
.github/contributors/osori.md | 106 ++++++++++++++++++++++++++++++++++
spacy/lang/ko/examples.py | 6 +-
2 files changed, 109 insertions(+), 3 deletions(-)
create mode 100644 .github/contributors/osori.md
diff --git a/.github/contributors/osori.md b/.github/contributors/osori.md
new file mode 100644
index 000000000..93b5c7dd4
--- /dev/null
+++ b/.github/contributors/osori.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Ilkyu Ju |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2020-05-17 |
+| GitHub username | osori |
+| Website (optional) | |
diff --git a/spacy/lang/ko/examples.py b/spacy/lang/ko/examples.py
index 7885ad801..0306e5db8 100644
--- a/spacy/lang/ko/examples.py
+++ b/spacy/lang/ko/examples.py
@@ -9,8 +9,8 @@ Example sentences to test spaCy and its language models.
"""
sentences = [
- "애플이 영국의 신생 기업을 10억 달러에 구매를 고려중이다.",
- "자동 운전 자동차의 손해 배상 책임에 자동차 메이커에 일정한 부담을 요구하겠다.",
- "자동 배달 로봇이 보도를 주행하는 것을 샌프란시스코시가 금지를 검토중이라고 합니다.",
+ "애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.",
+ "자율주행 자동차의 손해 배상 책임이 제조 업체로 옮겨 가다",
+ "샌프란시스코 시가 자동 배달 로봇의 보도 주행 금지를 검토 중이라고 합니다.",
"런던은 영국의 수도이자 가장 큰 도시입니다.",
]
From a5cd2032843b26fbff9d6e0b53637e9477af3f7f Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 19 May 2020 15:59:14 +0200
Subject: [PATCH 37/69] Reduce stored lexemes data, move feats to lookups
(#5238)
* Reduce stored lexemes data, move feats to lookups
* Move non-derivable lexeme features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
* Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
* Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in
lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
* Remove `SerializedLexemeC`
* Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
* Always create `Vocab.lookups` table `lexeme_norm` for
normalization exceptions
* Load base exceptions from `lang.norm_exceptions`, but load
language-specific exceptions from lookups
* Set `lex_attr_getter[NORM]` including new lookups table in
`BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
existing normalizations with the new normalizations (as a replacement
for the previous step that replaced all lexemes data with the
deserialized data)
* Skip English normalization test
Skip English normalization test because the data is now in
`spacy-lookups-data`.
* Remove norm exceptions
Moved to spacy-lookups-data.
* Move norm exceptions test to spacy-lookups-data
* Load extra lookups from spacy-lookups-data lazily
Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.
* Skip creating lexeme cache on load
To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.
* Identify numeric values in Lexeme.set_attrs()
With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.
* Skip lexeme cache init in from_bytes
* Unskip and update lookups tests for python3.6+
* Update vocab pickle to include lookups_extra
* Update vocab serialization tests
Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".
* Re-skip lookups test because of python3.5
* Skip PROB/float values in Lexeme.set_attrs
* Convert is_oov from lexeme flag to lex in vectors
Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.
Co-authored-by: Matthew Honnibal
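A rough sketch of the new `is_oov` behaviour (vector width and words chosen for illustration):
    import numpy
    from spacy.lang.en import English

    nlp = English()
    nlp.vocab.set_vector("cat", numpy.ones((4,), dtype="f"))
    # is_oov is no longer a stored lexeme flag; it reports vector coverage
    assert not nlp.vocab["cat"].is_oov
    assert nlp.vocab["dog"].is_oov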
---
spacy/attrs.pxd | 2 +-
spacy/attrs.pyx | 2 +-
spacy/cli/init_model.py | 7 +-
spacy/cli/train.py | 10 -
spacy/lang/da/__init__.py | 8 +-
spacy/lang/da/norm_exceptions.py | 527 ----
spacy/lang/de/__init__.py | 9 +-
spacy/lang/de/norm_exceptions.py | 16 -
spacy/lang/el/__init__.py | 9 +-
spacy/lang/el/norm_exceptions.py | 2642 -----------------
spacy/lang/en/__init__.py | 9 +-
spacy/lang/en/norm_exceptions.py | 1768 -----------
spacy/lang/id/__init__.py | 9 +-
spacy/lang/id/norm_exceptions.py | 532 ----
spacy/lang/lb/__init__.py | 9 +-
spacy/lang/lb/norm_exceptions.py | 16 -
spacy/lang/lex_attrs.py | 15 -
spacy/lang/pt/__init__.py | 9 +-
spacy/lang/pt/norm_exceptions.py | 23 -
spacy/lang/ru/__init__.py | 9 +-
spacy/lang/ru/norm_exceptions.py | 36 -
spacy/lang/sr/__init__.py | 9 +-
spacy/lang/sr/norm_exceptions.py | 26 -
spacy/lang/ta/norm_exceptions.py | 139 -
spacy/lang/th/__init__.py | 9 +-
spacy/lang/th/norm_exceptions.py | 113 -
spacy/language.py | 6 +-
spacy/lexeme.pxd | 24 +-
spacy/lexeme.pyx | 84 +-
spacy/lookups.py | 8 +-
spacy/structs.pxd | 23 -
spacy/symbols.pxd | 2 +-
spacy/symbols.pyx | 2 +-
spacy/tests/lang/da/test_exceptions.py | 8 -
spacy/tests/lang/de/test_exceptions.py | 14 -
spacy/tests/lang/en/test_exceptions.py | 1 +
spacy/tests/lang/lb/test_exceptions.py | 6 -
.../serialize/test_serialize_vocab_strings.py | 24 +-
spacy/tests/test_lemmatizer.py | 2 +-
spacy/tests/vocab_vectors/test_lexeme.py | 13 -
spacy/tests/vocab_vectors/test_lookups.py | 6 +-
spacy/tests/vocab_vectors/test_vectors.py | 12 +
spacy/tokens/token.pyx | 10 +-
spacy/vocab.pxd | 1 +
spacy/vocab.pyx | 134 +-
45 files changed, 161 insertions(+), 6182 deletions(-)
delete mode 100644 spacy/lang/da/norm_exceptions.py
delete mode 100644 spacy/lang/de/norm_exceptions.py
delete mode 100644 spacy/lang/el/norm_exceptions.py
delete mode 100644 spacy/lang/en/norm_exceptions.py
delete mode 100644 spacy/lang/id/norm_exceptions.py
delete mode 100644 spacy/lang/lb/norm_exceptions.py
delete mode 100644 spacy/lang/pt/norm_exceptions.py
delete mode 100644 spacy/lang/ru/norm_exceptions.py
delete mode 100644 spacy/lang/sr/norm_exceptions.py
delete mode 100644 spacy/lang/ta/norm_exceptions.py
delete mode 100644 spacy/lang/th/norm_exceptions.py
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index 8f583b3a3..805dc2950 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -15,7 +15,7 @@ cdef enum attr_id_t:
LIKE_NUM
LIKE_EMAIL
IS_STOP
- IS_OOV
+ IS_OOV_DEPRECATED
IS_BRACKET
IS_QUOTE
IS_LEFT_PUNCT
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 2187f3c65..fe9895d06 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -16,7 +16,7 @@ IDS = {
"LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
- "IS_OOV": IS_OOV,
+ "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
"IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 618266633..3311a5120 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -157,15 +157,11 @@ def create_model(lang, lex_attrs, name=None):
nlp = lang_class()
for lexeme in nlp.vocab:
lexeme.rank = OOV_RANK
- lex_added = 0
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
- lexeme.is_oov = False
- lex_added += 1
- lex_added += 1
if len(nlp.vocab):
oov_prob = min(lex.prob for lex in nlp.vocab) - 1
else:
@@ -193,8 +189,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
if vector_keys is not None:
for word in vector_keys:
if word not in nlp.vocab:
- lexeme = nlp.vocab[word]
- lexeme.is_oov = False
+ nlp.vocab[word]
if vectors_data is not None:
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
if name is None:
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 6e6423131..7cb2d9745 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -15,7 +15,6 @@ import random
from .._ml import create_default_optimizer
from ..util import use_gpu as set_gpu
-from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus
from ..compat import path2str
from .. import util
@@ -630,15 +629,6 @@ def _create_progress_bar(total):
def _load_vectors(nlp, vectors):
util.load_model(vectors, vocab=nlp.vocab)
- for lex in nlp.vocab:
- values = {}
- for attr, func in nlp.vocab.lex_attr_getters.items():
- # These attrs are expected to be set by data. Others should
- # be set by calling the language functions.
- if attr not in (CLUSTER, PROB, IS_OOV, LANG):
- values[lex.vocab.strings[attr]] = func(lex.orth_)
- lex.set_attrs(**values)
- lex.is_oov = False
def _load_pretrained_tok2vec(nlp, loc):
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index ac8c04954..92eec44b2 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
@@ -12,17 +11,14 @@ from ..tag_map import TAG_MAP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
class DanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "da"
- lex_attr_getters[NORM] = add_lookups(
- Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
- )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
morph_rules = MORPH_RULES
infixes = TOKENIZER_INFIXES
diff --git a/spacy/lang/da/norm_exceptions.py b/spacy/lang/da/norm_exceptions.py
deleted file mode 100644
index dbffdb88b..000000000
--- a/spacy/lang/da/norm_exceptions.py
+++ /dev/null
@@ -1,527 +0,0 @@
-# coding: utf8
-"""
-Special-case rules for normalizing tokens to improve the model's predictions.
-For example 'mysterium' vs 'mysterie' and similar.
-"""
-from __future__ import unicode_literals
-
-
-# Sources:
-# 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/
-# 2: http://www.tjerry-korrektur.dk/ord-med-flere-stavemaader/
-
-_exc = {
- # Alternative spelling
- "a-kraft-værk": "a-kraftværk", # 1
- "ålborg": "aalborg", # 2
- "århus": "aarhus",
- "accessoirer": "accessoires", # 1
- "affektert": "affekteret", # 1
- "afrikander": "afrikaaner", # 1
- "aftabuere": "aftabuisere", # 1
- "aftabuering": "aftabuisering", # 1
- "akvarium": "akvarie", # 1
- "alenefader": "alenefar", # 1
- "alenemoder": "alenemor", # 1
- "alkoholambulatorium": "alkoholambulatorie", # 1
- "ambulatorium": "ambulatorie", # 1
- "ananassene": "ananasserne", # 2
- "anførelsestegn": "anførselstegn", # 1
- "anseelig": "anselig", # 2
- "antioxydant": "antioxidant", # 1
- "artrig": "artsrig", # 1
- "auditorium": "auditorie", # 1
- "avocado": "avokado", # 2
- "bagerst": "bagest", # 2
- "bagstræv": "bagstræb", # 1
- "bagstræver": "bagstræber", # 1
- "bagstræverisk": "bagstræberisk", # 1
- "balde": "balle", # 2
- "barselorlov": "barselsorlov", # 1
- "barselvikar": "barselsvikar", # 1
- "baskien": "baskerlandet", # 1
- "bayrisk": "bayersk", # 1
- "bedstefader": "bedstefar", # 1
- "bedstemoder": "bedstemor", # 1
- "behefte": "behæfte", # 1
- "beheftelse": "behæftelse", # 1
- "bidragydende": "bidragsydende", # 1
- "bidragyder": "bidragsyder", # 1
- "billiondel": "billiontedel", # 1
- "blaseret": "blasert", # 1
- "bleskifte": "bleskift", # 1
- "blodbroder": "blodsbroder", # 2
- "blyantspidser": "blyantsspidser", # 2
- "boligministerium": "boligministerie", # 1
- "borhul": "borehul", # 1
- "broder": "bror", # 2
- "buldog": "bulldog", # 2
- "bådhus": "bådehus", # 1
- "børnepleje": "barnepleje", # 1
- "børneseng": "barneseng", # 1
- "børnestol": "barnestol", # 1
- "cairo": "kairo", # 1
- "cambodia": "cambodja", # 1
- "cambodianer": "cambodjaner", # 1
- "cambodiansk": "cambodjansk", # 1
- "camouflage": "kamuflage", # 2
- "campylobacter": "kampylobakter", # 1
- "centeret": "centret", # 2
- "chefskahyt": "chefkahyt", # 1
- "chefspost": "chefpost", # 1
- "chefssekretær": "chefsekretær", # 1
- "chefsstol": "chefstol", # 1
- "cirkulærskrivelse": "cirkulæreskrivelse", # 1
- "cognacsglas": "cognacglas", # 1
- "columnist": "kolumnist", # 1
- "cricket": "kricket", # 2
- "dagplejemoder": "dagplejemor", # 1
- "damaskesdug": "damaskdug", # 1
- "damp-barn": "dampbarn", # 1
- "delfinarium": "delfinarie", # 1
- "dentallaboratorium": "dentallaboratorie", # 1
- "diaramme": "diasramme", # 1
- "diaré": "diarré", # 1
- "dioxyd": "dioxid", # 1
- "dommedagsprædiken": "dommedagspræken", # 1
- "donut": "doughnut", # 2
- "driftmæssig": "driftsmæssig", # 1
- "driftsikker": "driftssikker", # 1
- "driftsikring": "driftssikring", # 1
- "drikkejogurt": "drikkeyoghurt", # 1
- "drivein": "drive-in", # 1
- "driveinbiograf": "drive-in-biograf", # 1
- "drøvel": "drøbel", # 1
- "dødskriterium": "dødskriterie", # 1
- "e-mail-adresse": "e-mailadresse", # 1
- "e-post-adresse": "e-postadresse", # 1
- "egypten": "ægypten", # 2
- "ekskommunicere": "ekskommunikere", # 1
- "eksperimentarium": "eksperimentarie", # 1
- "elsass": "Alsace", # 1
- "elsasser": "alsacer", # 1
- "elsassisk": "alsacisk", # 1
- "elvetal": "ellevetal", # 1
- "elvetiden": "ellevetiden", # 1
- "elveårig": "elleveårig", # 1
- "elveårs": "elleveårs", # 1
- "elveårsbarn": "elleveårsbarn", # 1
- "elvte": "ellevte", # 1
- "elvtedel": "ellevtedel", # 1
- "energiministerium": "energiministerie", # 1
- "erhvervsministerium": "erhvervsministerie", # 1
- "espaliere": "spaliere", # 2
- "evangelium": "evangelie", # 1
- "fagministerium": "fagministerie", # 1
- "fakse": "faxe", # 1
- "fangstkvota": "fangstkvote", # 1
- "fader": "far", # 2
- "farbroder": "farbror", # 1
- "farfader": "farfar", # 1
- "farmoder": "farmor", # 1
- "federal": "føderal", # 1
- "federalisering": "føderalisering", # 1
- "federalisme": "føderalisme", # 1
- "federalist": "føderalist", # 1
- "federalistisk": "føderalistisk", # 1
- "federation": "føderation", # 1
- "federativ": "føderativ", # 1
- "fejlbeheftet": "fejlbehæftet", # 1
- "femetagers": "femetages", # 2
- "femhundredekroneseddel": "femhundredkroneseddel", # 2
- "filmpremiere": "filmpræmiere", # 2
- "finansimperium": "finansimperie", # 1
- "finansministerium": "finansministerie", # 1
- "firehjulstræk": "firhjulstræk", # 2
- "fjernstudium": "fjernstudie", # 1
- "formalier": "formalia", # 1
- "formandsskift": "formandsskifte", # 1
- "fornemst": "fornemmest", # 2
- "fornuftparti": "fornuftsparti", # 1
- "fornuftstridig": "fornuftsstridig", # 1
- "fornuftvæsen": "fornuftsvæsen", # 1
- "fornuftægteskab": "fornuftsægteskab", # 1
- "forretningsministerium": "forretningsministerie", # 1
- "forskningsministerium": "forskningsministerie", # 1
- "forstudium": "forstudie", # 1
- "forsvarsministerium": "forsvarsministerie", # 1
- "frilægge": "fritlægge", # 1
- "frilæggelse": "fritlæggelse", # 1
- "frilægning": "fritlægning", # 1
- "fristille": "fritstille", # 1
- "fristilling": "fritstilling", # 1
- "fuldttegnet": "fuldtegnet", # 1
- "fødestedskriterium": "fødestedskriterie", # 1
- "fødevareministerium": "fødevareministerie", # 1
- "følesløs": "følelsesløs", # 1
- "følgeligt": "følgelig", # 1
- "førne": "førn", # 1
- "gearskift": "gearskifte", # 2
- "gladeligt": "gladelig", # 1
- "glosehefte": "glosehæfte", # 1
- "glædeløs": "glædesløs", # 1
- "gonoré": "gonorré", # 1
- "grangiveligt": "grangivelig", # 1
- "grundliggende": "grundlæggende", # 2
- "grønsag": "grøntsag", # 2
- "gudbenådet": "gudsbenådet", # 1
- "gudfader": "gudfar", # 1
- "gudmoder": "gudmor", # 1
- "gulvmop": "gulvmoppe", # 1
- "gymnasium": "gymnasie", # 1
- "hackning": "hacking", # 1
- "halvbroder": "halvbror", # 1
- "halvelvetiden": "halvellevetiden", # 1
- "handelsgymnasium": "handelsgymnasie", # 1
- "hefte": "hæfte", # 1
- "hefteklamme": "hæfteklamme", # 1
- "heftelse": "hæftelse", # 1
- "heftemaskine": "hæftemaskine", # 1
- "heftepistol": "hæftepistol", # 1
- "hefteplaster": "hæfteplaster", # 1
- "heftestraf": "hæftestraf", # 1
- "heftning": "hæftning", # 1
- "helbroder": "helbror", # 1
- "hjemmeklasse": "hjemklasse", # 1
- "hjulspin": "hjulspind", # 1
- "huggevåben": "hugvåben", # 1
- "hulmurisolering": "hulmursisolering", # 1
- "hurtiggående": "hurtigtgående", # 2
- "hurtigttørrende": "hurtigtørrende", # 2
- "husmoder": "husmor", # 1
- "hydroxyd": "hydroxid", # 1
- "håndmikser": "håndmixer", # 1
- "højtaler": "højttaler", # 2
- "hønemoder": "hønemor", # 1
- "ide": "idé", # 2
- "imperium": "imperie", # 1
- "imponerthed": "imponerethed", # 1
- "inbox": "indboks", # 2
- "indenrigsministerium": "indenrigsministerie", # 1
- "indhefte": "indhæfte", # 1
- "indheftning": "indhæftning", # 1
- "indicium": "indicie", # 1
- "indkassere": "inkassere", # 2
- "iota": "jota", # 1
- "jobskift": "jobskifte", # 1
- "jogurt": "yoghurt", # 1
- "jukeboks": "jukebox", # 1
- "justitsministerium": "justitsministerie", # 1
- "kalorifere": "kalorifer", # 1
- "kandidatstipendium": "kandidatstipendie", # 1
- "kannevas": "kanvas", # 1
- "kaperssauce": "kaperssovs", # 1
- "kigge": "kikke", # 2
- "kirkeministerium": "kirkeministerie", # 1
- "klapmydse": "klapmyds", # 1
- "klimakterium": "klimakterie", # 1
- "klogeligt": "klogelig", # 1
- "knivblad": "knivsblad", # 1
- "kollegaer": "kolleger", # 2
- "kollegium": "kollegie", # 1
- "kollegiehefte": "kollegiehæfte", # 1
- "kollokviumx": "kollokvium", # 1
- "kommissorium": "kommissorie", # 1
- "kompendium": "kompendie", # 1
- "komplicerthed": "komplicerethed", # 1
- "konfederation": "konføderation", # 1
- "konfedereret": "konfødereret", # 1
- "konferensstudium": "konferensstudie", # 1
- "konservatorium": "konservatorie", # 1
- "konsulere": "konsultere", # 1
- "kradsbørstig": "krasbørstig", # 2
- "kravsspecifikation": "kravspecifikation", # 1
- "krematorium": "krematorie", # 1
- "krep": "crepe", # 1
- "krepnylon": "crepenylon", # 1
- "kreppapir": "crepepapir", # 1
- "kricket": "cricket", # 2
- "kriterium": "kriterie", # 1
- "kroat": "kroater", # 2
- "kroki": "croquis", # 1
- "kronprinsepar": "kronprinspar", # 2
- "kropdoven": "kropsdoven", # 1
- "kroplus": "kropslus", # 1
- "krøllefedt": "krølfedt", # 1
- "kulturministerium": "kulturministerie", # 1
- "kuponhefte": "kuponhæfte", # 1
- "kvota": "kvote", # 1
- "kvotaordning": "kvoteordning", # 1
- "laboratorium": "laboratorie", # 1
- "laksfarve": "laksefarve", # 1
- "laksfarvet": "laksefarvet", # 1
- "laksrød": "lakserød", # 1
- "laksyngel": "lakseyngel", # 1
- "laksørred": "lakseørred", # 1
- "landbrugsministerium": "landbrugsministerie", # 1
- "landskampstemning": "landskampsstemning", # 1
- "langust": "languster", # 1
- "lappegrejer": "lappegrej", # 1
- "lavløn": "lavtløn", # 1
- "lillebroder": "lillebror", # 1
- "linear": "lineær", # 1
- "loftlampe": "loftslampe", # 2
- "log-in": "login", # 1
- "login": "log-in", # 2
- "lovmedholdig": "lovmedholdelig", # 1
- "ludder": "luder", # 2
- "lysholder": "lyseholder", # 1
- "lægeskifte": "lægeskift", # 1
- "lærvillig": "lærevillig", # 1
- "løgsauce": "løgsovs", # 1
- "madmoder": "madmor", # 1
- "majonæse": "mayonnaise", # 1
- "mareridtagtig": "mareridtsagtig", # 1
- "margen": "margin", # 2
- "martyrium": "martyrie", # 1
- "mellemstatlig": "mellemstatslig", # 1
- "menneskene": "menneskerne", # 2
- "metropolis": "metropol", # 1
- "miks": "mix", # 1
- "mikse": "mixe", # 1
- "miksepult": "mixerpult", # 1
- "mikser": "mixer", # 1
- "mikserpult": "mixerpult", # 1
- "mikslån": "mixlån", # 1
- "miksning": "mixning", # 1
- "miljøministerium": "miljøministerie", # 1
- "milliarddel": "milliardtedel", # 1
- "milliondel": "milliontedel", # 1
- "ministerium": "ministerie", # 1
- "mop": "moppe", # 1
- "moder": "mor", # 2
- "moratorium": "moratorie", # 1
- "morbroder": "morbror", # 1
- "morfader": "morfar", # 1
- "mormoder": "mormor", # 1
- "musikkonservatorium": "musikkonservatorie", # 1
- "muslingskal": "muslingeskal", # 1
- "mysterium": "mysterie", # 1
- "naturalieydelse": "naturalydelse", # 1
- "naturalieøkonomi": "naturaløkonomi", # 1
- "navnebroder": "navnebror", # 1
- "nerium": "nerie", # 1
- "nådeløs": "nådesløs", # 1
- "nærforestående": "nærtforestående", # 1
- "nærstående": "nærtstående", # 1
- "observatorium": "observatorie", # 1
- "oldefader": "oldefar", # 1
- "oldemoder": "oldemor", # 1
- "opgraduere": "opgradere", # 1
- "opgraduering": "opgradering", # 1
- "oratorium": "oratorie", # 1
- "overbookning": "overbooking", # 1
- "overpræsidium": "overpræsidie", # 1
- "overstatlig": "overstatslig", # 1
- "oxyd": "oxid", # 1
- "oxydere": "oxidere", # 1
- "oxydering": "oxidering", # 1
- "pakkenellike": "pakkenelliker", # 1
- "papirtynd": "papirstynd", # 1
- "pastoralseminarium": "pastoralseminarie", # 1
- "peanutsene": "peanuttene", # 2
- "penalhus": "pennalhus", # 2
- "pensakrav": "pensumkrav", # 1
- "pepperoni": "peperoni", # 1
- "peruaner": "peruvianer", # 1
- "petrole": "petrol", # 1
- "piltast": "piletast", # 1
- "piltaste": "piletast", # 1
- "planetarium": "planetarie", # 1
- "plasteret": "plastret", # 2
- "plastic": "plastik", # 2
- "play-off-kamp": "playoffkamp", # 1
- "plejefader": "plejefar", # 1
- "plejemoder": "plejemor", # 1
- "podium": "podie", # 2
- "praha": "prag", # 2
- "preciøs": "pretiøs", # 2
- "privilegium": "privilegie", # 1
- "progredere": "progrediere", # 1
- "præsidium": "præsidie", # 1
- "psykodelisk": "psykedelisk", # 1
- "pudsegrejer": "pudsegrej", # 1
- "referensgruppe": "referencegruppe", # 1
- "referensramme": "referenceramme", # 1
- "refugium": "refugie", # 1
- "registeret": "registret", # 2
- "remedium": "remedie", # 1
- "remiks": "remix", # 1
- "reservert": "reserveret", # 1
- "ressortministerium": "ressortministerie", # 1
- "ressource": "resurse", # 2
- "resætte": "resette", # 1
- "rettelig": "retteligt", # 1
- "rettetaste": "rettetast", # 1
- "returtaste": "returtast", # 1
- "risici": "risikoer", # 2
- "roll-on": "rollon", # 1
- "rollehefte": "rollehæfte", # 1
- "rostbøf": "roastbeef", # 1
- "rygsæksturist": "rygsækturist", # 1
- "rødstjært": "rødstjert", # 1
- "saddel": "sadel", # 2
- "samaritan": "samaritaner", # 2
- "sanatorium": "sanatorie", # 1
- "sauce": "sovs", # 1
- "scanning": "skanning", # 2
- "sceneskifte": "sceneskift", # 1
- "scilla": "skilla", # 1
- "sejflydende": "sejtflydende", # 1
- "selvstudium": "selvstudie", # 1
- "seminarium": "seminarie", # 1
- "sennepssauce": "sennepssovs ", # 1
- "servitutbeheftet": "servitutbehæftet", # 1
- "sit-in": "sitin", # 1
- "skatteministerium": "skatteministerie", # 1
- "skifer": "skiffer", # 2
- "skyldsfølelse": "skyldfølelse", # 1
- "skysauce": "skysovs", # 1
- "sladdertaske": "sladretaske", # 2
- "sladdervorn": "sladrevorn", # 2
- "slagsbroder": "slagsbror", # 1
- "slettetaste": "slettetast", # 1
- "smørsauce": "smørsovs", # 1
- "snitsel": "schnitzel", # 1
- "snobbeeffekt": "snobeffekt", # 2
- "socialministerium": "socialministerie", # 1
- "solarium": "solarie", # 1
- "soldebroder": "soldebror", # 1
- "spagetti": "spaghetti", # 1
- "spagettistrop": "spaghettistrop", # 1
- "spagettiwestern": "spaghettiwestern", # 1
- "spin-off": "spinoff", # 1
- "spinnefiskeri": "spindefiskeri", # 1
- "spolorm": "spoleorm", # 1
- "sproglaboratorium": "sproglaboratorie", # 1
- "spækbræt": "spækkebræt", # 2
- "stand-in": "standin", # 1
- "stand-up-comedy": "standupcomedy", # 1
- "stand-up-komiker": "standupkomiker", # 1
- "statsministerium": "statsministerie", # 1
- "stedbroder": "stedbror", # 1
- "stedfader": "stedfar", # 1
- "stedmoder": "stedmor", # 1
- "stilehefte": "stilehæfte", # 1
- "stipendium": "stipendie", # 1
- "stjært": "stjert", # 1
- "stjærthage": "stjerthage", # 1
- "storebroder": "storebror", # 1
- "stortå": "storetå", # 1
- "strabads": "strabadser", # 1
- "strømlinjet": "strømlinet", # 1
- "studium": "studie", # 1
- "stænkelap": "stænklap", # 1
- "sundhedsministerium": "sundhedsministerie", # 1
- "suppositorium": "suppositorie", # 1
- "svejts": "schweiz", # 1
- "svejtser": "schweizer", # 1
- "svejtserfranc": "schweizerfranc", # 1
- "svejtserost": "schweizerost", # 1
- "svejtsisk": "schweizisk", # 1
- "svigerfader": "svigerfar", # 1
- "svigermoder": "svigermor", # 1
- "svirebroder": "svirebror", # 1
- "symposium": "symposie", # 1
- "sælarium": "sælarie", # 1
- "søreme": "sørme", # 2
- "søterritorium": "søterritorie", # 1
- "t-bone-steak": "t-bonesteak", # 1
- "tabgivende": "tabsgivende", # 1
- "tabuere": "tabuisere", # 1
- "tabuering": "tabuisering", # 1
- "tackle": "takle", # 2
- "tackling": "takling", # 2
- "taifun": "tyfon", # 1
- "take-off": "takeoff", # 1
- "taknemlig": "taknemmelig", # 2
- "talehørelærer": "tale-høre-lærer", # 1
- "talehøreundervisning": "tale-høre-undervisning", # 1
- "tandstik": "tandstikker", # 1
- "tao": "dao", # 1
- "taoisme": "daoisme", # 1
- "taoist": "daoist", # 1
- "taoistisk": "daoistisk", # 1
- "taverne": "taverna", # 1
- "teateret": "teatret", # 2
- "tekno": "techno", # 1
- "temposkifte": "temposkift", # 1
- "terrarium": "terrarie", # 1
- "territorium": "territorie", # 1
- "tesis": "tese", # 1
- "tidsstudium": "tidsstudie", # 1
- "tipoldefader": "tipoldefar", # 1
- "tipoldemoder": "tipoldemor", # 1
- "tomatsauce": "tomatsovs", # 1
- "tonart": "toneart", # 1
- "trafikministerium": "trafikministerie", # 1
- "tredve": "tredive", # 1
- "tredver": "trediver", # 1
- "tredveårig": "trediveårig", # 1
- "tredveårs": "trediveårs", # 1
- "tredveårsfødselsdag": "trediveårsfødselsdag", # 1
- "tredvte": "tredivte", # 1
- "tredvtedel": "tredivtedel", # 1
- "troldunge": "troldeunge", # 1
- "trommestikke": "trommestik", # 1
- "trubadur": "troubadour", # 2
- "trøstepræmie": "trøstpræmie", # 2
- "tummerum": "trummerum", # 1
- "tumultuarisk": "tumultarisk", # 1
- "tunghørighed": "tunghørhed", # 1
- "tus": "tusch", # 2
- "tusind": "tusinde", # 2
- "tvillingbroder": "tvillingebror", # 1
- "tvillingbror": "tvillingebror", # 1
- "tvillingebroder": "tvillingebror", # 1
- "ubeheftet": "ubehæftet", # 1
- "udenrigsministerium": "udenrigsministerie", # 1
- "udhulning": "udhuling", # 1
- "udslaggivende": "udslagsgivende", # 1
- "udspekulert": "udspekuleret", # 1
- "udviklingsministerium": "udviklingsministerie", # 1
- "uforpligtigende": "uforpligtende", # 1
- "uheldvarslende": "uheldsvarslende", # 1
- "uimponerthed": "uimponerethed", # 1
- "undervisningsministerium": "undervisningsministerie", # 1
- "unægtelig": "unægteligt", # 1
- "urinale": "urinal", # 1
- "uvederheftig": "uvederhæftig", # 1
- "vabel": "vable", # 2
- "vadi": "wadi", # 1
- "vaklevorn": "vakkelvorn", # 1
- "vanadin": "vanadium", # 1
- "vaselin": "vaseline", # 1
- "vederheftig": "vederhæftig", # 1
- "vedhefte": "vedhæfte", # 1
- "velar": "velær", # 1
- "videndeling": "vidensdeling", # 2
- "vinkelanførelsestegn": "vinkelanførselstegn", # 1
- "vipstjært": "vipstjert", # 1
- "vismut": "bismut", # 1
- "visvas": "vissevasse", # 1
- "voksværk": "vokseværk", # 1
- "værtdyr": "værtsdyr", # 1
- "værtplante": "værtsplante", # 1
- "wienersnitsel": "wienerschnitzel", # 1
- "yderliggående": "yderligtgående", # 2
- "zombi": "zombie", # 1
- "ægbakke": "æggebakke", # 1
- "ægformet": "æggeformet", # 1
- "ægleder": "æggeleder", # 1
- "ækvilibrist": "ekvilibrist", # 2
- "æselsøre": "æseløre", # 1
- "øjehule": "øjenhule", # 1
- "øjelåg": "øjenlåg", # 1
- "øjeåbner": "øjenåbner", # 1
- "økonomiministerium": "økonomiministerie", # 1
- "ørenring": "ørering", # 2
- "øvehefte": "øvehæfte", # 1
-}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
- NORM_EXCEPTIONS[string] = norm
- NORM_EXCEPTIONS[string.title()] = norm
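
For reference, the deleted module expanded each entry into a lowercase and a title-cased key, so one `_exc` line yielded two NORM exceptions; a standalone example of the same build loop:

    _exc = {"århus": "aarhus"}

    NORM_EXCEPTIONS = {}
    for string, norm in _exc.items():
        NORM_EXCEPTIONS[string] = norm
        NORM_EXCEPTIONS[string.title()] = norm

    # NORM_EXCEPTIONS == {"århus": "aarhus", "Århus": "aarhus"}
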
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index dee1841c8..ca01428ba 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES
from .tag_map import TAG_MAP
@@ -10,18 +9,14 @@ from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
class GermanDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "de"
- lex_attr_getters[NORM] = add_lookups(
- Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
- )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/de/norm_exceptions.py b/spacy/lang/de/norm_exceptions.py
deleted file mode 100644
index 3dbd4c7e3..000000000
--- a/spacy/lang/de/norm_exceptions.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# Here we only want to include the absolute most common words. Otherwise,
-# this list would get impossibly long for German – especially considering the
-# old vs. new spelling rules, and all possible cases.
-
-
-_exc = {"daß": "dass"}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
- NORM_EXCEPTIONS[string] = norm
- NORM_EXCEPTIONS[string.title()] = norm
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 6d551cc4e..d03a42da9 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -10,21 +10,16 @@ from .lemmatizer import GreekLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
class GreekDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "el"
- lex_attr_getters[NORM] = add_lookups(
- Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
- )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
tag_map = TAG_MAP
diff --git a/spacy/lang/el/norm_exceptions.py b/spacy/lang/el/norm_exceptions.py
deleted file mode 100644
index d4384ff3c..000000000
--- a/spacy/lang/el/norm_exceptions.py
+++ /dev/null
@@ -1,2642 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-# These exceptions are used to add NORM values based on a token's ORTH value.
-# Norms are only set if no alternative is provided in the tokenizer exceptions.
-
-_exc = {
- "αγιορίτης": "αγιορείτης",
- "αγόρι": "αγώρι",
- "έωλος": "αίολος",
- "αλλοίθωρος": "αλλήθωρος",
- "αλλοιώς": "αλλιώς",
- "αλλοιώτικος": "αλλκότικος",
- "αναµιγνύω": "αναµειγνύω",
- "ανάµιξη": "ανάµειξη",
- "ανανδρεία": "ανανδρία",
- "αναφιλυτό": "αναφιλητό",
- "ανελλειπώς": "ανελλιπώς",
- "ανεξιθρησκεία": "ανεξιθρησκία",
- "αντικρυνός": "αντικρινός",
- "απάγκιο": "απάγκεω",
- "αρµατωλός": "αρµατολός",
- "αρρώστεια": "αρρώστια",
- "ατόφιος": "ατόφυος",
- "αφίνω": "αφήνω",
- "χιβάδα": "χηβάδα",
- "αχρηστεία": "αχρηστία",
- "βαρυγκωµώ": "βαρυγγωµώ",
- "βεβαρυµένος": "βεβαρηµένος",
- "βερύκοκκο": "βερίκοκο",
- "βλήτο": "βλίτο",
- "βογκώ": "βογγώ",
- "βραδυά": "βραδιά",
- "βραδυάζει": "βραδίάζει",
- "Βρεταννία": "Βρετανία",
- "Βρεττανία": "Βρετανία",
- "βολοδέρνω": "βωλοδέρνω",
- "γέλοιο": "γέλιο",
- "γκάµα": "γκάµµα",
- "γλύφω": "γλείφω",
- "γλήνα": "γλίνα",
- "διαφήµηση": "διαφήµιση",
- "δικλείδα": "δικλίδα",
- "διοξείδιο": "διοξίδιο",
- "διορία": "διωρία",
- "δυόροφος": "διώροφος",
- "δυόµισυ": "δυόµισι",
- "διόσµος": "δυόσμος",
- "δυσφήμιση": "δυσφήµηση",
- "δοσίλογος": "δωσίλογος",
- "εγχείριση": "εγχείρηση",
- "ειδωλολατρεία": "ειδωλολατρία",
- "εληά": "ελιά",
- "ελιξίριο": "ελιξήριο",
- "έλκυθρο": "έλκηθρο",
- "ελλειπής": "ελλίπής",
- "ενάµισυς": "ενάµισης",
- "ενάµισυ": "ενάµισι",
- "ενανθρώπιση": "ενανθρώπηση",
- "έξη": "έξι",
- "επί τούτο": "επί τούτω",
- "εταιρία": "εταιρεία",
- "εφορεία": "εφορία",
- "ζηλειάρης": "ζηλιάρης",
- "Θεοφάνεια": "Θεοφάνια",
- "καυγάς": "καβγάς",
- "καθίκι": "καθοίκι",
- "καινούριος": "καινούργιος",
- "κακάβι": "κακκάβι",
- "κακαβιά": "κακκαβιά",
- "καµµία": "καµία",
- "κανέλα": "Καννέλα",
- "κανονιοφόρος": "κανονιοφόρος",
- "καντίλι": "καντήλι",
- "κατεβοδώνω": "κατευοδώνω",
- "κοίτοµαι": "κείτοµαι",
- "κελαϊδώ": "κελαηδώ",
- "κυάλια": "κιάλια",
- "κλύδωνας": "κλήδονας",
- "κλωτσώ": "κλοτσώ",
- "κολλιτσίδα": "κολλητσίδα",
- "κουκί": "κουκκί",
- "κουλός": "κουλλός",
- "κρεββάτι": "κρεβάτι",
- "κροκόδειλος": "κροκόδιλος",
- "κοβιός": "κωβιός",
- "λάκισα": "λάκησα",
- "λιµέρι": "ληµέρι",
- "λώξυγγας": "λόξυγγας",
- "µαγγούρα": "µαγκούρα",
- "µαζή": "μαζί",
- "µακρυά": "µακριά",
- "µαµή": "µαµµή",
- "µαµόθρεφτος": "µαµµόθρεφτος",
- "µίγµα": "µείγµα",
- "µίξη": "µείξη",
- "µετώπη": "µετόπη",
- "µυρολόι": "µοιρολόι",
- "µοτοσικλέτα": "µοτοσυκλέτα",
- "µπαλωµατής": "µπαλλωµατής",
- "µιζίθρα": "µυζήθρα",
- "νεοτερίζω": "νεωτερίζω",
- "νεοτερισµός": "νεωτερισμός",
- "νεοτεριστής": "νεωτεριστής",
- "νινί": "νηνί",
- "νοιώθω": "νιώθω",
- "νονός": "νοννός",
- "ξενιτιά": "ξενιτειά",
- "ξαίρω": "ξέρω",
- "ξίγκι": "ξίγγι",
- "ξείδι": "ξίδι",
- "ξώβεργα": "ξόβεργα",
- "ξιπάζω": "ξυπάζω",
- "ξιπασµένος": "ξυπασµένος",
- "ξυπόλητος": "ξυπόλυτος",
- "ξωκλήσι": "ξωκκλήσι",
- "οξυά": "οξιά",
- "ορθοπεδικός": "ορθοπαιδικός",
- "ωχ": "οχ",
- "παπάς": "παππάς",
- "παραγιός": "παραγυιός",
- "περηφάνεια": "περηφάνια",
- "πιλάλα": "πηλάλα",
- "πίννα": "πίνα",
- "πηρούνι": "πιρούνι",
- "πιτσιλώ": "πιτσυλώ",
- "πιτσιλίζω": "πιτσυλίζω",
- "πλατυάζω": "πλατειάζω",
- "πληµµυρίδα": "πληµυρίδα",
- "πληγούρι": "πλιγούρι",
- "πωπώ": "ποπό",
- "πουγγί": "πουγκί",
- "πρίγκηπας": "πρίγκιπας",
- "προάστειο": "προάστιο",
- "προεδρεία": "προεδρία",
- "πρίµα": "πράµα",
- "πρωτήτερα": "πρωτύτερα",
- "προτύτερα": "πρωτύτερα",
- "πόρωση": "πώρωση",
- "ρεβύθι": "ρεβίθι",
- "ρέγγα": "ρέΥκα",
- "ρηγώνω": "ριγώνω",
- "ρωµανικός": "ροµανικός",
- "ρίζι": "ρύζι",
- "Ρώσσος": "Ρώσος",
- "σακκούλα": "σακούλα",
- "συνάφι": "σινάφι",
- "σειρίτι": "σιρίτι",
- "σιφόνι": "σιφώνι",
- "συχαίνοµαι": "σιχαίνοµαι",
- "σκιρόδεµα": "σκυρόδεµα",
- "σπάγγος": "σπάγκος",
- "στυλιάρι": "στειλιάρι",
- "στοιβάδα": "στιβάδα",
- "στίβα": "στοίβα",
- "στριµώνω": "στρυµώνω",
- "στριμώχνω": "στρυμώχνω",
- "συγχύζω": "συγχίζω",
- "σηκώτι": "συκώτι",
- "σιναγρίδα": "συναγρίδα",
- "συνοδεία": "συνοδία",
- "σίφιλη": "σύφιλη",
- "τανιέµαι": "τανυέµαι",
- "τανίζω": "τανύζω",
- "τέσσερις": "τέσσερεις",
- "τζιτζιφιά": "τζιτζυφιά",
- "τόνος": "τόννος",
- "τοπείο": "τοπίο",
- "τρέλλα": "τρέλα",
- "τσαγγάρης": "τσαγκάρης",
- "τσανάκα": "τσαννάκα",
- "τσανακογλείφτης": "τσαννακογλείφτης",
- "τσιτώνω": "τσητώνω",
- "τσιγκλώ": "τσυγκλώ",
- "τσίµα": "τσύµα",
- "υννί": "υνί",
- "υπερηφάνια": "υπερηφάνεια",
- "υπόχρεως": "υπόχρεος",
- "φάκελλος": "φάκελος",
- "φείδι": "φίδι",
- "φιλονεικώ": "φιλονικώ",
- "φιλονεικία": "φιλονικία",
- "φυρί-φυρί": "φιρί-φιρί",
- "φτιάνω": "φτειάχνω",
- "φτιάχνω": "φτειάχνω",
- "φτώχεια": "φτώχια",
- "φυσαλίδα": "φυσαλλίδα",
- "χάνος": "χάννος",
- "χυνόπωρο": "χινόπωρο",
- "χεινόπωρο": "χινόπωρο",
- "χιµίζω": "χυµίζω",
- "χιμίζω": "χυμιζώ",
- "γκωλ": "γκολ",
- "αιρκοντίσιον": "ερκοντίσιον",
- "καρµπυρατέρ": "καρµπφατέρ",
- "κυλόττα": "κιλότα",
- "κλή ρινγκ": "κλίρινγκ",
- "κωλγκέρλ": "κολγκέρλ",
- "κοµπιναιζόν": "κοµπινεζόν",
- "κοπυράιτ": "κοπιράιτ",
- "µυλαίδη": "µιλέδη",
- "µποϋκοτάζ": "µποϊκοτάζ",
- "πέναλτυ": "πέναλτι",
- "πορτραίτο": "πορτρέτο",
- "ρεστωράν": "ρεστοράν",
- "ροσµπήφ": "ροσµπίφ",
- "σαντιγύ": "σαντιγί",
- "στριπτήζ": "στριπτίζ",
- "ταµπλώ": "ταµπλό",
- "τζόκεϋ": "τζόκεϊ",
- "φουτµπώλ": "φουτµπόλ",
- "τρόλλεϋ": "τρόλεϊ",
- "χίππυ": "χίπι",
- "φέρρυ-µπωτ": "φεριµπότ",
- "χειρούργος": "χειρουργός",
- "αβαείο": "αββαείο",
- "αβάς": "αββάς",
- "αβάσκαµα": "βάσκαµα",
- "αβασκανία": "βασκανία",
- "αβάφτιστος": "αβάπτιστος",
- "αβάφτιστη": "αβάπτιστη",
- "αβάφτιστο": "αβάπτιστο",
- "αβγίλα": "αβγουλίλα",
- "αυτί": "αφτί",
- "αβδέλλα": "βδέλλα",
- "Αβράµ": "'Αβραάµ",
- "αγγινάρα": "αγκινάρα",
- "αγγόνα": "εγγονή",
- "αγγόνι": "εγγόνι",
- "αγγονός": "εγγονός",
- "άγειρτος": "άγερτος",
- "άγειρτη": "άγερτη",
- "άγειρτο": "άγερτο",
- "αγέρας": "αέρας",
- "αγκλέουρας": "αγλέορας",
- "αγκλίτοα": "γκλίτσα",
- "Αγκόλα": "Ανγκόλα",
- "αγκορά": "ανγκορά",
- "αγκοστοίιρα": "ανγκοστούρα",
- "άγνεστος": "άγνεθος",
- "άγνεστη": "άγνεθη",
- "άγνεστο": "άγνεθο",
- "αγώρι": "αγόρι",
- "αγωρίστικος": "αγορίστικος",
- "αγωρίστικη": "αγορίστικη",
- "αγωρίστικο": "αγορίστικο",
- "αγωροκόριτσο": "αγοροκόριστο",
- "αγουρόλαδο": "αγουρέλαιο",
- "αγροικώ": "γροικώ",
- "αδάµαντας": "αδάµας",
- "αδερφή": "αδελφή",
- "αδέρφι": "αδέλφι",
- "αδερφικός": "αδελφικός",
- "αδερφική": "αδελφική",
- "αδερφικό": "αδελφικό",
- "αδερφοποιτός": "αδελφοποιτός",
- "αδερφός": "αδελφός",
- "αδερφοσύνη": "αδελφοσύνη",
- "αέρι": "αγέρι",
- "αερόµπικ": "αεροβική",
- "αεροστρόβιλος": "αεριοστρόβιλος",
- "αητός": "αετός",
- "αιµατοποσία": "αιµοποσία",
- "άιντε": "άντε",
- "αισθηµατισµός": "συναισθηµατισµός",
- "αιτιακός": "αιτιώδης",
- "αιτιακή": "αιτιώδης",
- "αιτιακό": "αιτιώδες",
- "ακατανόµαστος": "ακατονόµαστος",
- "ακατανόμαστη": "ακατονόμαστη",
- "ακατονόμαστο": "ακατανόμαστο",
- "ακέραιος": "ακέριος",
- "ακέραια": "ακέρια",
- "ακέραιο": "ακέριο",
- "άκρον": "άκρο",
- "ακτύπητος": "αχτύπητος",
- "ακτύπητη": "αχτύπητη",
- "ακτύπητο": "αχτύπητο",
- "ακυριολεκτώ": "ακυρολεκτώ",
- "ακυριολεξία": "ακυρολεξία",
- "αλάτι": "άλας",
- "αλατένιος": "αλάτινος",
- "αλατένια": "αλάτινη",
- "αλατένιο": "αλάτινο",
- "αλαφραίνω": "ελαφρώνω",
- "αλαφριός": "ελαφρύς",
- "αλαφριό": "ελαφρύ",
- "αλαφρόµυαλος": "ελαφρόµυαλος",
- "αλαφρόμυαλη": "ελαφρόμυαλη",
- "αλαφρόμυαλο": "ελαφρόμυαλο",
- "αλείβω": "αλείφω",
- "άλευρο": "αλεύρι",
- "αλησµονησιά": "λησµονιά",
- "αλκολίκι": "αλκοολίκι",
- "αλλέως": "αλλιώς",
- "αλληλοεπίδραση": "αλληλεπίδραση",
- "αλλήθωρος": "αλλοίθωρος",
- "αλλήθωρη": "αλλοίθωρη",
- "αλλήθωρο": "αλλοίθωρο",
- "αλλοίµονο": "αλίµονο",
- "αµνηστεία": "αµνηστία",
- "αµπαρόριζα": "αρµπαρόριζα",
- "αµπέχωνο": "αµπέχονο",
- "αµυγδαλάτος": "αµυγδαλωτός",
- "αμυγδαλάτη": "αμυγδαλωτή",
- "αμυγδαλάτο": "αμυγδαλωτό",
- "αµυγδαλόλαδο": "αµυγδαλέλαιο",
- "αµφίλογος": "αµφιλεγόµενος",
- "αμφίλογη": "αμφιλεγόμενη",
- "αμφίλογο": "αμφιλεγόμενο",
- "αναβατός": "ανεβατός",
- "αναβατή": "ανεβατή",
- "αναβατό": "ανεβατό",
- "αναδεχτός": "αναδεκτός",
- "αναθρέφω": "ανατρέφω",
- "ανακατώνω": "ανακατεύω",
- "ανακάτωση": "ανακάτεµα",
- "αναλίσκω": "αναλώνω",
- "αναμειγνύω": "αναμιγνύω",
- "αναμείκτης": "αναμίκτης",
- "ανάµεικτος": "ανάµικτος",
- "ανάμεικτη": "ανάμικτη",
- "ανάμεικτο": "ανάμικτο",
- "αναπαµός": "ανάπαυση",
- "αναπαρασταίνω": "αναπαριστάνω",
- "ανάπρωρος": "ανάπλωρος",
- "ανάπρωρη": "ανάπλωρη",
- "ανάπρωρο": "ανάπλωρο",
- "αναπτυγµένος": "ανεπτυγμένος",
- "αναπτυγµένη": "ανεπτυγμένη",
- "αναπτυγµένο": "ανεπτυγμένο",
- "άναστρος": "ανάστερος",
- "αναστυλώνω": "αναστηλώνω",
- "αναστύλωση": "αναστήλωση",
- "ανεγνωρισµένος": "αναγνωρισµένος",
- "αναγνωρισμένη": "αναγνωρισµένη",
- "αναγνωρισμένο": "αναγνωρισµένο",
- "ανέµυαλος": "άμυαλος",
- "ανέμυαλη": "άμυαλη",
- "ανέμυαλο": "άμυαλο",
- "ανεπάντεχος": "αναπάντεχος",
- "ανεπάντεχη": "αναπάντεχη",
- "ανεπάντεχο": "αναπάντεχο",
- "ανεψιά": "ανιψιά",
- "ανεψιός": "ανιψιός",
- "ανήρ": "άνδρας",
- "ανηφόρι": "ανήφορος",
- "ανηψιά": "ανιψιά",
- "ανηψιός": "ανιψιός",
- "άνθιση": "άνθηση",
- "ανταλλάζω": "ανταλλάσσω",
- "ανταπεξέρχοµαι": "αντεπεξέρχοµαι",
- "αντζούγια": "αντσούγια",
- "αντιεισαγγελέας": "αντεισαγγελέας",
- "αντικατασταίνω": "αντικαθιστώ",
- "αντικρύζω": "αντικρίζω",
- "αντιµολία": "αντιµωλία",
- "αντιπροσωπεία": "αντιπροσωπία",
- "αντισταµινικό": "αντιισταµινικός",
- "αντίχτυπος": "αντίκτυπος",
- "άντρας": "άνδρας",
- "αντρόγυνο": "ανδρόγυνο",
- "αντρώνω": "ανδρώνω",
- "άξια": "άξιος",
- "απακούµπι": "αποκούµπι",
- "απαλάµη": "παλάµη",
- "Απαλάχια": "Αππαλάχια",
- "απάνω": "επάνω",
- "απέδρασα": "αποδιδράσκω",
- "απλούς": "απλός",
- "απλούν": "απλό",
- "απόγαιο": "απόγειο",
- "αποδείχνω": "αποδεικνύω",
- "αποθαµός": "πεθαµός",
- "αποθανατίζω": "απαθανατίζω",
- "αποκεντροποίηση": "αποκέντρωση",
- "απολαυή": "απολαβή",
- "αποξεραίνω": "αποξηραίνω",
- "απόξυοη": "απόξεση",
- "απόξω": "απέξω",
- "απόσχω": "απέχω",
- "αποτίω": "αποτίνω",
- "αποτυχαίνω": "αποτυγχάνω",
- "αποχαιρετίζω": "αποχαιρετώ",
- "απόχτηµα": "απόκτηµα",
- "απόχτηση": "απόκτηση",
- "αποχτώ": "αποκτώ",
- "Απρίλης": "Απρίλιος",
- "αρκαντάσης": "καρντάσης",
- "αρµάρι": "ερµάριο",
- "άρµη": "άλµη",
- "αρµοστεία": "αρµοστία",
- "άρµπουρο": "άλµπουρο",
- "αρµύρα": "αλµύρα",
- "αρµυρίκι": "αλµυρίκι",
- "άρρην": "άρρεν",
- "αρσανάς": "ταρσανάς",
- "αρτύνω": "αρταίνω",
- "αρχινίζω": "αρχίζω",
- "αρχινώ": "αρχίζω",
- "αρχίτερα": "αρχύτερα",
- "ασκηµάδα": "ασχήµια",
- "ασκηµαίνω": "ασχηµαίνω",
- "ασκήµια": "ασχήµια",
- "ασκηµίζω": "ασχηµίζω",
- "άσσος": "άσος",
- "αστράπτω": "αστράφτω",
- "αστράπτω": "αστράφτω",
- "αταχτώ": "ατακτώ",
- "ατσάλινος": "ατσαλένιος",
- "ατσάλινη": "ατσαλένια",
- "ατσάλινο": "ατσαλένιο",
- "Ατσιγγάνος": "Τσιγγάνος",
- "Ατσίγγανος": "Τσιγγάνος",
- "αυγαταίνω": "αβγατίζω",
- "αυγατίζω": "αβγατίζω",
- "αυγό": "αβγό",
- "αυγοειδής": "αυγοειδής",
- "αυγοειδές": "αβγοειδές",
- "αυγοθήκη": "αβγοθήκη",
- "αυγοκόβω": "αβγοκόβω",
- "αυγοτάραχο": "αβγοτάραχο",
- "αύλακας": "αυλάκι",
- "αυτί": "αφτί",
- "αυτιάζοµαι": "αφτιάζοµαι",
- "αφορεσµός": "αφορισµός",
- "άφρονας": "άφρων",
- "αχείλι": "χείλι",
- "άχερο": "άχυρο",
- "αχερώνας": "αχυρώνας",
- "αχιβάδα": "αχηβάδα",
- "αχτίδα": "ακτίνα",
- "βαβουίνος": "µπαµπουίνος",
- "Βαγγέλης": "Ευάγγελος",
- "βαγγέλιο": "ευαγγέλιο",
- "Βάγια": "Βάί'α",
- "βαζιβουζούκος": "βασιβουζούκος",
- "βαθύνω": "βαθαίνω",
- "βάιο": "βάγιο",
- "βακαλάος": "µπακαλιάρος",
- "βαλάντιο": "βαλλάντιο",
- "βαλαντώνω": "βαλλαντώνω",
- "βάνω": "βάζω",
- "βαρειά": "βαριά",
- "βαριεστίζω": "βαργεστώ",
- "βαριεστώ": "βαργεστώ",
- "βαρώ": "βαράω",
- "βαρώνος": "βαρόνος",
- "βασιλέας": "βασιλιάς",
- "βασµούλος": "γασµούλος",
- "Βαυαρία": "Βαβαρία",
- "Βαυαροκρατία": "Βαβαροκρατία",
- "βαφτίζω": "βαπτίζω",
- "βάφτιση": "βάπτιση",
- "βάφτισµα": "βάπτισµα",
- "βαφτιστής": "βαπτιστής",
- "βαφτιστικός": "βαπτιστικός",
- "βαφτιστική": "βαπτιστική",
- "βαφτιστικιά": "βαπτιστική",
- "βαφτιστικό": "βαπτιστικό",
- "βδοµάδα": "εβδοµάδα",
- "βεγόνια": "µπιγκόνια",
- "βελανίδι": "βαλανίδι",
- "βελανιδιά": "βαλανιδιά",
- "βενζίνα": "βενζίνη",
- "βεράτιο": "µπεράτι",
- "βερόκοκο": "βερίκοκο",
- "βιγόνια": "µπιγκόνια",
- "βλάφτω": "βλάπτω",
- "βλογιά": "ευλογιά",
- "βλογάω": "ευλογώ",
- "βογγίζω": "βογγώ",
- "βόγγος": "βογγητό",
- "βογκητό": "βογγητό",
- "βοδάµαξα": "βοϊδάµαξα",
- "βόλλεϋ": "βόλεϊ",
- "βολοκοπώ": "βωλοκοπώ",
- "βόλος": "βώλος",
- "βουβάλι": "βούβαλος",
- "βουή": "βοή",
- "βούλα": "βούλλα",
- "βούλωµα": "βούλλωµα",
- "βουλώνω": "βουλλώνω",
- "βουρβόλακας": "βρικόλακας",
- "βουρκόλακας": "βρικόλακας",
- "βους": "βόδι",
- "βραδι": "βράδυ",
- "βρυκόλακας": "βρικόλακας",
- "βρώµα": "βρόµα",
- "βρώµη": "βρόµη",
- "βρωµιά": "βροµιά",
- "βρωµίζω": "βροµίζω",
- "βρώµιο": "βρόµιο",
- "βρωµώ": "βροµώ",
- "βωξίτης": "βοξίτης",
- "γάβρος": "γαύρος",
- "γαϊδάρα": "γαϊδούρα",
- "γαίµα": "αίµα",
- "γαλακτόπιτα": "γαλατόπιτα",
- "γάµα": "γάµµα",
- "γαµβρός": "γαµπρός",
- "γαρίφαλο": "γαρύφαλλο",
- "γαρούφαλλο": "γαρύφαλλο",
- "γαυγίζω": "γαβγίζω",
- "γελάδα": "αγελάδα",
- "γελέκο": "γιλέκο",
- "γένοµαι": "γίνοµαι",
- "γενότυπος": "γονότυπος",
- "Γένουα": "Γένοβα",
- "γεράζω": "γερνώ",
- "γέρακας": "γεράκι",
- "γερατειά": "γηρατειά",
- "γεροκοµείο": "γηροκοµείο",
- "γεροκοµώ": "γηροκοµώ",
- "Γεσθηµανή": "Γεθσηµανή",
- "γεώδης": "γαιώδης",
- "γαιώδες": "γαιώδες",
- "γηρασµός": "γήρανση",
- "Γιάννενα": "Ιωάννινα",
- "Γιάννινα": "Ιωάννινα",
- "γιάνω": "γιαίνω",
- "γιαουρτλού": "γιογουρτλού",
- "Γιαπωνέζος": "Ιαπωνέζος",
- "γιγαντεύω": "γιγαντώνω",
- "γιεγιές": "γεγές",
- "Γιεν": "γεν",
- "γιέσµαν": "γέσµαν",
- "γιόκας": "γυιόκας",
- "γιορτασµός": "εορτασµός",
- "γιος": "γυιος",
- "Γιούλης": "Ιούλιος",
- "Γιούνης": "Ιούνιος",
- "γιοφύρι": "γεφύρι",
- "Γιώργος": "Γεώργιος",
- "γιωτ": "γιοτ",
- "γιωτακισµός": "ιωτακισµός",
- "γκάγκστερ": "γκάνγκστερ",
- "γκαγκστερισµός": "γκανγκστερισµός",
- "γκαµήλα": "καµήλα",
- "γκεµπελίσκος": "γκαιµπελίσκος",
- "γκιουβέτσι": "γιουβέτσι",
- "γκιώνης": "γκιόνης",
- "γκλοµπ": "κλοµπ",
- "γκογκ": "γκονγκ",
- "Γκιόνα": "Γκιώνα",
- "γκόρφι": "γκόλφι",
- "γκρα": "γκρας",
- "Γκράβαρα": "Κράβαρα",
- "γκυ": "γκι",
- "γλαϋξ": "γλαύκα",
- "γλιτώνω": "γλυτώνω",
- "γλύκισµα": "γλύκυσµα",
- "γλυστρώ": "γλιστρώ",
- "γλωσσίδα": "γλωττίδα",
- "γνέφαλλο": "γνάφαλλο",
- "γνοιάζοµαι": "νοιάζοµαι",
- "γόµα": "γόµµα",
- "γόνα": "γόνατο",
- "γονιός": "γονέας",
- "γόπα": "γώπα",
- "γούµενος": "ηγούµενος",
- "γουµένισσα": "ηγουµένη",
- "γουώκµαν": "γουόκµαν",
- "γραία": "γριά",
- "Γράµος": "Γράµµος",
- "γρασίδι": "γρασσίδι",
- "γρεγολεβάντες": "γραιγολεβάντες",
- "γρέγος": "γραίγος",
- "γρικώ": "γροικώ",
- "Γροιλανδία": "Γροιλανδία",
- "γρίνια": "γκρίνια",
- "γροθοκοπώ": "γρονθοκοπώ",
- "γρούµπος": "γρόµπος",
- "γυαλοπωλείο": "υαλοπωλείο",
- "γυρνώ": "γυρίζω",
- "γόρωθε": "γύροθε",
- "γωβιός": "κωβιός",
- "δάγκάµα": "δάγκωµα",
- "δαγκαµατιά": "δαγκωµατιά",
- "δαγκανιά": "δαγκωνιά",
- "δαιµονοπληξία": "δαιµονιόπληκτος",
- "δαίµων": "δαίµονας",
- "δακτυλήθρα": "δαχτυλήθρα",
- "δακτυλίδι": "δαχτυλίδι",
- "∆αυίδ": "∆αβίδ",
- "δαχτυλογραφία": "δακτυλογραφία",
- "δαχτυλογράφος": "δακτυλογράφος",
- "δεικνύω": "δείχνω",
- "δείλι": "δειλινό",
- "δείχτης": "δείκτης",
- "δελής": "ντελής",
- "δενδρογαλή": "δεντρογαλιά",
- "δεντρολίβανο": "δενδρολίβανο",
- "δεντροστοιχία": "δενδροστοιχία",
- "δεντροφυτεία": "δενδροφυτεία",
- "δεντροφυτεύω": "δενδροφυτεύω",
- "δεντρόφυτος": "δενδρόφυτος",
- "δεξής": "δεξιό",
- "δερµατώδης": "δερµατοειδής",
- "δερματώδες": "δερµατοειδές",
- "δέσποτας": "δεσπότης",
- "δεφτέρι": "τεφτέρι",
- "διαβατάρης": "διαβάτης",
- "διάβηκα": "διαβαίνω",
- "διαβιβρώσκω": "διαβρώνω",
- "διαθρέψω": "διατρέφω",
- "διακόνεµα": "διακονιά",
- "διάολος": "διάβολος",
- "∆ιαµαντής": "Αδαµάντιος",
- "διαολιά": "διαβολιά",
- "διαολογυναίκα": "διαβολογυναίκα",
- "διαολοθήλυκο": "διαβολοθήλυκο",
- "διαολόκαιρος": "διαβολόκαιρος",
- "διαολοκόριτσο": "διαβολοκόριτσο",
- "διαολόπαιδο": "διαβολόπαιδο",
- "διάολος": "διάβολος",
- "διασκελιά": "δρασκελιά",
- "διαχύνω": "διαχέω",
- "δίδω": "δίνω",
- "δίκηο": "δίκιο",
- "δοβλέτι": "ντοβλέτι",
- "δοσίλογος": "δωσίλογος",
- "δράχνω": "αδράχνω",
- "δρέπανο": "δρεπάνι",
- "δρόσος": "δροσιά",
- "δώνω": "δίνω",
- "εγγίζω": "αγγίζω",
- "εδώθε": "δώθε",
- "εδωνά": "εδωδά",
- "εικοσάρι": "εικοσάρικο",
- "εικών": "εικόνα",
- "εισαγάγω": "εισάγω",
- "εισήγαγα": "εισάγω",
- "εισήχθην": "εισάγω",
- "έκαμα": "έκανα",
- "εκατόν": "εκατό",
- "εκατοστάρης": "κατοστάρης",
- "εκατοστάρι": "κατοστάρι",
- "εκατοστάρικο": "κατοστάρικο",
- "εκλαίρ": "εκλέρ",
- "Ελδοράδο": "Ελντοράντο",
- "ελευθεροτεκτονισµός": "τεκτονισµός",
- "ελευτεριά": "ελευθερία",
- "Ελεφαντοστού Ακτή": "Ακτή Ελεφαντοστού",
- "ελληνικάδικο": "ελληνάδικο",
- "Ελπίδα": "Ελπίς",
- "εµορφιά": "οµορφιά",
- "εµορφάδα": "οµορφιά",
- "έµπορας": "έµπορος",
- "εµώ": "εξεµώ",
- "ένδεκα": "έντεκα",
- "ενενήκοντα": "ενενήντα",
- "ενωρίς": "νωρίς",
- "εξανέστην": "εξανίσταµαι",
- "εξήκοντα": "εξήντα",
- "έξις": "έξη",
- "εξωκκλήσι": "ξωκκλήσι",
- "εξωµερίτης": "ξωµερίτης",
- "επανωφόρι": "πανωφόρι",
- "επιµειξία": "επιµιξία",
- "επίστοµα": "απίστοµα",
- "επτάζυµο": "εφτάζυµο",
- "επταήµερος": "εφταηµερος",
- "επταθέσιος": "εφταθέσιος",
- "επταµελής": "εφταµελης",
- "επταµηνία": "εφταµηνία",
- "επταµηνίτικος": "εφταµηνίτικος",
- "επταπλασιάζω": "εφταπλασιάζω",
- "επταπλάσιος": "εφταπλάσιος",
- "επτασύλλαβος": "εφτασύλλαβος",
- "επτατάξιος": "εφτατάξιος",
- "επτάτοµος": "εφτάτοµος",
- "επτάφυλλος": "εφτάφυλλος",
- "επτάχρονα": "εφτάχρονα",
- "επτάχρονος": "εφτάχρονος",
- "επταψήφιος": "εφταψήφιος",
- "επτάωρος": "εφτάωρος",
- "επταώροφος": "εφταώροφος",
- "έργον": "έργο",
- "ευκή": "ευχή",
- "ευρό": "ευρώ",
- "ευσπλαχνίζοµαι": "σπλαχνίζοµαι",
- "εφεντης": "αφέντης",
- "εφηµεριακός": "εφηµέριος",
- "εφημεριακή": "εφηµέρια",
- "εφημεριακό": "εφηµέριο",
- "εφτά": "επτά",
- "εφταετία": "επταετία",
- "εφτακόσια": "επτακόσια",
- "εφτακόσιοι": "επτακόσιοι",
- "εφτακοσιοστός": "επτακοσιοστός",
- "εχθές": "χθες",
- "ζάπι": "ζάφτι",
- "ζαχαριάζω": "ζαχαρώνω",
- "ζαχαροµύκητας": "σακχαροµύκητας",
- "ζεµανφού": "ζαµανφού",
- "ζεµανφουτισµός": "ζαµανφουτισµός",
- "ζέστα": "ζέστη",
- "ζεύλα": "ζεύγλα",
- "Ζηλανδία": "Νέα Ζηλανδία",
- "ζήλεια": "ζήλια",
- "ζιµπούλι": "ζουµπούλι",
- "ζο": "ζώο",
- "ζουρλαµάρα": "ζούρλα",
- "ζωοφόρος": "ζωφόρος",
- "ηλεκτροκόλληση": "ηλεκτροσυγκόλληση",
- "ηλεκτροοπτική": "ηλεκτροπτική",
- "ήλιο": "ήλιον",
- "ηµιόροφος": "ηµιώροφος",
- "θαλάµι": "θαλάµη",
- "θάµα": "θαύµα",
- "θαµπώνω": "θαµβώνω",
- "θάµπος": "θάµβος",
- "θάφτω": "θάβω",
- "θεοψία": "θεοπτία",
- "θέσει": "θέση",
- "θηλειά": "θηλιά",
- "Θόδωρος": "Θεόδωρος",
- "θρύβω": "θρύπτω",
- "θυµούµαι": "θυµάµαι",
- "Ιαµάϊκή": "Τζαµάικα",
- "ιατρεύω": "γιατρεύω",
- "ιατρός": "γιατρός",
- "ιατροσόφιο": "γιατροσόφι",
- "I.Q.": "αϊ-κιού",
- "ινατι": "γινάτι",
- "ιονίζω": "ιοντίζω",
- "ιονιστής": "ιοντιστής",
- "ιονόσφαιρα": "ιοντόσφαιρα",
- "Ιούλης": "Ιούλιος",
- "ίσασµα": "ίσιωµα",
- "ισιάζω": "ισιώνω",
- "ίσκιος": "ήσκιος",
- "ισκιώνω": "ησκιώνω",
- "ίσωµα": "ίσιωµα",
- "ισώνω": "ισιώνω",
- "ιχθύαση": "ιχθύωση",
- "ιώτα": "γιώτα",
- "καββαλισµός": "καβαλισµός",
- "κάβουρος": "κάβουρας",
- "καδής": "κατής",
- "καδρίλια": "καντρίλια",
- "Καζακστάν": "Καζαχστάν",
- "καθέκλα": "καρέκλα",
- "κάθησα": "κάθισα",
- "[1766]. καθίκι": "καθοίκι",
- "καΐλα": "καήλα",
- "καϊξής": "καϊκτσής",
- "καλδέρα": "καλντέρα",
- "καλεντάρι": "καλαντάρι",
- "καλήν εσπέρα": "καλησπέρα",
- "καλιά": "καλειά",
- "καλιακούδα": "καλοιακούδα",
- "κάλλια": "κάλλιο",
- "καλλιά": "κάλλιο",
- "καλόγηρος": "καλόγερος",
- "καλόρχεται": "καλοέρχεται",
- "καλσόν": "καλτσόν",
- "καλυµµαύκι": "καµιλαύκι",
- "καλύµπρα": "καλίµπρα",
- "καλωσύνη": "καλοσύνη",
- "καµαρωτός": "καµαρότος",
- "καµηλαύκι": "καµιλαύκι",
- "καµτσίκι": "καµουτσίκι",
- "καναβάτσο": "κανναβάτσο",
- "κανακίζω": "κανακεύω",
- "κανάτα": "καννάτα",
- "κανατάς": "καννατάς",
- "κανάτι": "καννάτι",
- "κανελής": "καννελής",
- "κανελιά": "καννελή",
- "κανελί": "καννελή",
- "κανελονι": "καννελόνι",
- "κανελλόνι": "καννελόνι",
- "κανένας": "κανείς",
- "κάνη": "κάννη",
- "κανί": "καννί",
- "κάνναβης": "κάνναβις",
- "καννιβαλισµός": "κανιβαλισµός",
- "καννίβαλος": "κανίβαλος",
- "κανοκιάλι": "καννοκιάλι",
- "κανόνι": "καννόνι",
- "κανονιά": "καννονιά",
- "κανονίδι": "καννονίδι",
- "κανονιέρης": "καννονιέρης",
- "κανονιοβολητής": "καννονιοβολητής",
- "κανονιοβολισµός": "καννονιοβολισµός",
- "κανονιοβολώ": "καννονιοβολώ",
- "κανονιοστάσιο": "καννονιοστάσιο",
- "κανονιοστοιχία": "καννονιοστοιχία",
- "κανονοθυρίδα": "καννονοθυρίδα",
- "κάνουλα": "κάννουλα",
- "κανών": "κανόνας",
- "κάπα": "κάππα",
- "κάπαρη": "κάππαρη",
- "καπαρντίνα": "καµπαρντίνα",
- "καραβόσκοινο": "καραβόσχοινο",
- "καρένα": "καρίνα",
- "κάρκάδο": "κάκαδο",
- "καροτίνη": "καρωτίνη",
- "καρότο": "καρώτο",
- "καροτόζουµο": "καρωτόζουµο",
- "καροτοσαλάτα": "καρωτοσαλάτα",
- "καρπούµαι": "καρπώνοµαι",
- "καρρώ": "καρό",
- "κάρυ": "κάρι",
- "καρυοφύλλι": "καριοφίλι",
- "καταΐφι": "κανταΐφι",
- "κατακάθηµαι": "κατακάθοµαι",
- "κατάντια": "κατάντηµα",
- "κατασκοπεία": "κατασκοπία",
- "καταφτάνω": "καταφθάνω",
- "καταχράσθηκα": "καταχράστηκα",
- "κατάχτηση": "κατάκτηση",
- "καταχτητής": "κατακτητής",
- "καταχτώ": "κατακτώ",
- "καταχωρώ": "καταχωρίζω",
- "κατέβαλα": "καταβάλλω",
- "Κατερίνα": "Αικατερίνη",
- "κατοστίζω": "εκατοστίζω",
- "κάτου": "κάτω",
- "κατρουλιό": "κατουρλιό",
- "καυναδίζω": "καβγαδίζω",
- "καϋµός": "καηµός",
- "'κεί": "εκεί",
- "κείθε": "εκείθε",
- "καψόνι": "καψώνι",
- "καψύλλιο": "καψούλι",
- "κελάρης": "κελλάρης",
- "κελί": "κελλί",
- "κεντήτρια": "κεντήστρα",
- "κεσέµι": "γκεσέµι",
- "κέσιο": "καίσιο",
- "κηπάριο": "κήπος",
- "κινάρα": "αγκινάρα",
- "κιοφτές": "κεφτές",
- "κλαίγω": "κλαίω",
- "κλαπάτσα": "χλαπάτσα",
- "κλασσικίζω": "κλασικίζω",
- "κλασσικιστής": "κλασικιστής",
- "κλέπτης": "κλέφτης",
- "κληθρα": "σκλήθρα",
- "κλήρινγκ": "κλίρινγκ",
- "κλιπ": "βιντεοκλίπ",
- "κλωσά": "κλώσσα",
- "κλωτσιά": "κλοτσιά",
- "κογκλάβιο": "κονκλάβιο",
- "κογκρέσο": "κονγκρέσο",
- "κοιµίσης": "κοίµησης",
- "κοιµούµαι": "κοιµάµαι",
- "κοιτώ": "κοιτάζω",
- "κοιτάω": "κοιτάζω",
- "κόκαλο": "κόκκαλο",
- "κοκίτης": "κοκκύτης",
- "κοκκίαση": "κοκκίωση",
- "κοκκοφοίνικας": "κοκοφοίνικας",
- "κολάζ": "κολλάζ",
- "κολαντρίζω": "κουλαντρίζω",
- "κολαρίζω": "κολλαρίζω",
- "κολεχτίβα": "κολεκτίβα",
- "κολεχτιβισµός": "κολεκτιβισµός",
- "κολιγιά": "κολληγιά",
- "κολίγος": "κολλήγας",
- "κολίγας": "κολλήγας",
- "κολικόπονος": "κωλικόπονος",
- "κολιός": "κολοιός",
- "κολιτσίνα": "κολτσίνα",
- "κολυµπήθρα": "κολυµβήθρα",
- "κολώνα": "κολόνα",
- "κολώνια": "κολόνια",
- "κοµβόι": "κονβόι",
- "κόµις": "κόµης",
- "κόµισσα": "κόµης",
- "κόµιτας": "κόµης",
- "κοµιτεία": "κοµητεία",
- "κόµµατα": "κοµµάτι",
- "κοµµούνα": "κοµούνα",
- "κοµµουναλισµός": "κοµουναλισµός",
- "κοµµούνι": "κοµούνι",
- "κοµµουνίζω": "κοµουνίζω",
- "κοµµουνισµός": "κοµουνισµός",
- "κοµµουνιστής": "κοµουνιστής",
- "κονδυλοειδής": "κονδυλώδης",
- "κονδυλοειδές": "κονδυλώδες",
- "κονσέρτο": "κοντσέρτο",
- "κόντραµπαντιέρης": "κοντραµπατζής",
- "κοντσίνα": "κολτσίνα",
- "κονφορµισµός": "κοµφορµισµός",
- "κονφορµιστής": "κομφορμιστής",
- "κοπελιά": "κοπέλα",
- "κοπλιµέντο": "κοµπλιµέντο",
- "κόπτω": "κόβω",
- "κόπυραιτ": "κοπιράιτ",
- "Κοριτσα": "Κορυτσά",
- "κοριτσόπουλο": "κορίτσι",
- "κορνέτο": "κορνέτα",
- "κορνιζώνω": "κορνιζάρω",
- "κορόιδεµα": "κοροϊδία",
- "κορόνα": "κορώνα",
- "κορφή": "κορυφή",
- "κοσάρι": "εικοσάρικο",
- "κοσάρικο": "εικοσάρικο",
- "κοσµετολογία": "κοσµητολογία",
- "κοτάω": "κοτώ",
- "κουβαρνταλίκι": "χουβαρνταλίκι",
- "κουβαρντάς": "χουβαρντάς",
- "κουβερνάντα": "γκουβερνάντα",
- "κούκος": "κούκκος",
- "κουλλουρτζής": "κουλλουράς",
- "κουλούρας": "κουλλουράς",
- "κουλούρι": "κουλλούρι",
- "κουλουριάζω": "κουλλουριάζω",
- "κουλουρτζής": "κουλλουράς",
- "κουρδιστής": "χορδιστής",
- "κουρντιστής": "χορδιστής",
- "κουρντίζω": "κουρδίζω",
- "κουρντιστήρι": "κουρδιστήρι",
- "κουστούµι": "κοστούµι",
- "κουτεπιέ": "κουντεπιέ",
- "κόφτης": "κόπτης",
- "κόχη": "κόγχη",
- "κοψοχείλης": "κοψαχείλης",
- "κρεµάζω": "κρεµώ",
- "κροντήρι": "κρωντήρι",
- "κροµµύδι": "κρεµµύδι",
- "κροµµυδίλα": "κρεµµυδίλα",
- "κρουσταλλιάζω": "κρυσταλλιάζω",
- "κτένα": "χτένα",
- "κτενάκι": "χτενάκι",
- "κτένι": "χτένι",
- "κτενίζω": "χτενίζω",
- "κτένισµα": "χτένισµα",
- "κτίριο": "κτήριο",
- "κυλίω": "κυλώ",
- "κυττάζω": "κοιτάζω",
- "κωλ-γκέρλ": "κολ-γκέρλ",
- "κωλοµπαράς": "κολοµπαράς",
- "κωσταντινάτο": "κωνσταντινάτο",
- "Κώστας": "Κωνσταντίνος",
- "κώχη": "κόγχη",
- "λάβδα": "λάµβδα",
- "λαγούτο": "λαούτο",
- "λαγύνι": "λαγήνι",
- "λαίδη": "λέδη",
- "λαϊκάντζα": "λαϊκούρα",
- "λαιµά": "λαιµός",
- "λαΐνι": "λαγήνι",
- "λαµπράδα": "λαµπρότητα",
- "λάρος": "γλάρος",
- "λατόµι": "λατοµείο",
- "λαύδανο": "λάβδανο",
- "λαυράκι": "λαβράκι",
- "λαφίνα": "ελαφίνα",
- "λαφόπουλο": "ελαφόπουλο",
- "λειβάδι": "λιβάδι",
- "Λειβαδιά": "Λιβάδια",
- "λεϊµόνι": "λεµόνι",
- "λεϊµονιά": "λεµονιά",
- "Λειψία": "Λιψία",
- "λέοντας": "λέων",
- "λεπτά": "λεφτά",
- "λεπτύνω": "λεπταίνω",
- "λευκαστής": "λευκαντής",
- "Λευτέρης": "Ελευθέριος",
- "λευτερώνω": "ελευθερώνω",
- "λέω": "λέγω",
- "λιανεµπόριο": "λειανεµπόριο",
- "λιανίζω": "λειανίζω",
- "λιανοτούφεκο": "λειανοτούφεκο",
- "λιανοντούφεκο": "λειανοντούφεκο",
- "λιανοπούληµα": "λειανοπούληµα",
- "λιανοπωλητής": "λειανοπωλητής",
- "λιανοτράγουδο": "λειανοτράγουδο",
- "λιγοψυχία": "ολιγοψυχία",
- "λιθρίνι": "λυθρίνι",
- "λιµένας": "λιµάνι",
- "λίµπρα": "λίβρα",
- "λιοβολιά": "ηλιοβολία",
- "λιόδεντρο": "ελαιόδεντρο",
- "λιόλαδο": "ελαιόλαδο",
- "λιόσπορος": "ηλιόσπορος",
- "λιοτρίβειο": "ελαιοτριβείο",
- "λιοτρόπι": "ηλιοτρόπιο",
- "λιόφως": "ηλιόφως",
- "λιχουδιά": "λειχουδιά",
- "λιώνω": "λειώνω",
- "λογιωτατίζω": "λογιοτατίζω",
- "λογιώτατος": "λογιότατος",
- "λόγκος": "λόγγος",
- "λόξιγκας": "λόξυγγας",
- "λοτόµος": "υλοτόµος",
- "Λουµπλιάνα": "Λιουµπλιάνα",
- "λούω": "λούζω",
- "λύγξ": "λύγκας",
- "λυµφατισµός": "λεµφατισµός",
- "λυντσάρω": "λιντσάρω",
- "λυσσιακό": "λυσσακό",
- "λυώνω": "λειώνω",
- "Λωξάντρα": "Λοξάντρα",
- "λωρένσιο": "λορένσιο",
- "λωρίδα": "λουρίδα",
- "µαγγάνιο": "µαγκάνιο",
- "µαγγιώρος": "µαγκιόρος",
- "µαγειριά": "µαγεριά",
- "µάγειρος": "µάγειρας",
- "µόγερας": "µάγειρας",
- "µαγιώ": "µαγιό",
- "µαγκανοπήγαδο": "µαγγανοπήγαδο",
- "µαγκώνω": "µαγγώνω",
- "µαγνόλια": "µανόλια",
- "Μαγυάρος": "Μαγιάρος",
- "µαζύ": "µαζί",
- "µαζώνω": "µαζεύω",
- "µαιζονέτα": "µεζονέτα",
- "µαιτρ": "µετρ",
- "µαιτρέσα": "µετρέσα",
- "µακριός": "µακρύς",
- "μακριά": "µακρυά",
- "μακριό": "µακρύ",
- "µαλάσσω": "µαλάζω",
- "µαµά": "µαµµά",
- "µαµouδι": "µαµούνι",
- "µάνα": "µάννα",
- "µανδαρινέα": "µανταρινιά",
- "µανδήλι": "µαντήλι",
- "µάνδρα": "µάντρα",
- "µανές": "αµανές",
- "Μανόλης": "Εµµανουήλ",
- "µαντζούνι": "µατζούνι",
- "µαντζουράνα": "µατζουράνα",
- "µαντίλα": "µαντήλα",
- "µαντίλι": "µαντήλι",
- "µαντµαζέλ": "µαµαζέλ",
- "µαντρίζω": "µαντρώνω",
- "µαντώ": "µαντό",
- "Μανώλης": "Εµµανουήλ",
- "µάρτυς": "µάρτυρας",
- "µασκάλη": "µασχάλη",
- "µατοκυλίζω": "αιµατοκυλίζω",
- "µατοκύλισµα": "αιµατοκυλίζω",
- "µατσέτα": "µασέτα",
- "µαυράδα": "µαυρίλα",
- "μεγαλόπολη": "µεγαλούπολη",
- "µεγαλοσπληνία": "σπληνοµεγαλία",
- "µέγγενη": "µέγκενη",
- "μείκτης": "µίκτης",
- "µελίγγι": "µηλίγγι",
- "µεντελισµός": "µενδελισµός",
- "µενχίρ": "µενίρ",
- "µέρα": "ηµέρα",
- "µεράδι": "µοιράδι",
- "µερεύω": "ηµερεύω",
- "µέρµηγκας": "µυρµήγκι",
- "µερµήγκι": "µυρµήγκι",
- "µερσίνα": "µυρσίνη",
- "µερσίνη": "µυρσίνη",
- "µέρωµα": "ηµερώνω",
- "µερώνω": "ηµερώνω",
- "µέσον": "µέσο",
- "µεσοούρανα": "µεσούρανα",
- "µεταλίκι": "µεταλλίκι",
- "µεταπούληση": "µεταπώληση",
- "µεταπουλω": "µεταπωλώ",
- "µετοχιάριος": "µετοχάρης",
- "µητάτο": "µιτάτο",
- "µητριά": "µητρυιά",
- "µητριός": "µητρυιός",
- "Μιανµάρ": "Μυανµάρ",
- "Μίκι Μάους": "Μίκυ Μάους",
- "µικρύνω": "µικραίνω",
- "µινουέτο": "µενουέτο",
- "µιξοπαρθένα": "µειξοπαρθένα",
- "µισοφόρι": "µεσοφόρι",
- "µίτζα": "µίζα",
- "µολογώ": "οµολογώ",
- "μολογάω": "οµολογώ",
- "µοµία": "µούµια",
- "µοµιοποίηση": "µουµιοποίηση",
- "µονάρχιδος": "µόνορχις",
- "µονιάζω": "µονοιάζω",
- "µορφιά": "οµορφιά",
- "µορφονιός": "οµορφονιός",
- "µοσκάρι": "µοσχάρι",
- "µοσκοβολιά": "µοσκοβολιά",
- "µοσκοβολώ": "µοσχοβολώ",
- "µοσκοκαρυδιά": "µοσχοκαρυδιά",
- "µοσκοκάρυδο": "µοσχοκάρυδο",
- "µοσκοκάρφι": "µοσχοκάρφι",
- "µοσκολίβανο": "µοσχολίβανο",
- "µοσκοµπίζελο": "µοσχοµπίζελο",
- "µοσκοµυρίζω": "µοσχοµυρίζω",
- "µοσκοπουλώ": "µοσχοπουλώ",
- "µόσκος": "µόσχος",
- "µοσκοσάπουνο": "µοσχοσάπουνο",
- "µοσκοστάφυλο": "µοσχοστάφυλο",
- "µόσχειος": "µοσχαρήσιος",
- "μόσχειο": "µοσχαρήσιο",
- "µουλώνω": "µουλαρώνω",
- "µουρταδέλα": "µορταδέλα",
- "µουσικάντης": "µουζικάντης",
- "µουσσώνας": "µουσώνας",
- "µουστάκα": "µουστάκι",
- "µουστακοφόρος": "µυστακοφόρος",
- "µπαγάζια": "µπαγκάζια",
- "πάγκα": "µπάνκα",
- "µπαγκαδορος": "µπανκαδόρος",
- "µπογκέρης": "µπανκέρης",
- "µπάγκος": "πάγκος",
- "µπαιν-µαρί": "µπεν-µαρί",
- "µπαλάντα": "µπαλλάντα",
- "µπαλαντέζα": "µπαλλαντέζα",
- "µπαλαντέρ": "µπαλλαντέρ",
- "µπαλάντζα": "παλάντζα",
- "µπαλένα": "µπαλαίνα",
- "µπαλέτο": "µπαλλέτο",
- "µπάλος": "µπάλλος",
- "µπάλσαµο": "βάλσαµο",
- "µπαλσάµωµα": "βαλσάµωµα",
- "µπαλσαµώνω": "βαλσαµώνω",
- "µπάλωµα": "µπάλλωµα",
- "µπαλώνω": "µπαλλώνω",
- "µπαµπάκι": "βαµβάκι",
- "µπαµπακόσπορος": "βαµβακόσπορος",
- "Μπάµπης": "Χαραλάµπης",
- "µπάµπω": "βάβω",
- "µπανέλα": "µπαναίλα",
- "µπαρµπρίζ": "παρµπρίζ",
- "µπατίστα": "βατίστα",
- "µπαχτσές": "µπαξές",
- "µπαχτσίσι": "µπαξίσι",
- "µπεζεβέγκης": "πεζεβέγκης",
- "µπελτές": "πελτές",
- "µπεντόνι": "µπιντόνι",
- "µπερδουκλώνω": "µπουρδουκλώνω",
- "µπερκέτι": "µπερεκέτι",
- "µπετόνι": "µπιτόνι",
- "µπεχαβιορισµός": "µπιχεβιορισµός",
- "µπεχλιβάνης": "πεχλιβάνης",
- "µπιγκουτί": "µπικουτί",
- "µπιµπίλα": "µπιρµπίλα",
- "µπιµπλό": "µπιµπελό",
- "µπιρσίµι": "µπρισίµι",
- "µπις": "µπιζ",
- "µπιστόλα": "πιστόλα",
- "µπιστόλι": "πιστόλι",
- "µπιστολιά": "πιστολιά",
- "µπιτόνι": "µπιντόνι",
- "µπογιάρος": "βογιάρος",
- "µπονάτσα": "µπουνάτσα",
- "µπονατσάρει": "µπουνατσάρει",
- "µπουά": "µποά",
- "µπουκαµβίλια": "βουκαµβίλια",
- "µποϋκοταζ": "µποϊκοτάζ",
- "µποϋκοτάρω": "µποϊκοτάρω",
- "µπουλβάρ": "βουλεβάρτο",
- "µπουρδέλο": "µπορντέλο",
- "µπουρµπουάρ": "πουρµπουάρ",
- "µπρίζα": "πρίζα",
- "µπριτζόλα": "µπριζόλα",
- "µπρος": "εµπρός",
- "µπύρα": "µπίρα",
- "µπυραρία": "µπιραρία",
- "µπυροποσία": "µπιροποσία",
- "µυγδαλιά": "αµυγδαλιά",
- "µύγδαλο": "αµύγδαλο",
- "µυλόρδος": "µιλόρδος",
- "μυρουδιά": "µυρωδιά",
- "µυτζήθρα": "µυζήθρα",
- "µύωψ": "µύωπας",
- "µώλος": "µόλος",
- "νέθω": "γνέθω",
- "νι": "νυ",
- "νίκελ": "νικέλιο",
- "νοµεύς": "νοµέας",
- "νοστιµίζω": "νοστιµεύω",
- "νουννός": "νοννός",
- "νταβάνι": "ταβάνι",
- "ντάβανος": "τάβανος",
- "νταβανόσκουπα": "ταβανόσκουπα",
- "νταβούλι": "νταούλι",
- "νταλαβέρι": "νταραβέρι",
- "νταµπλάς": "ταµπλάς",
- "ντελαπάρω": "ντεραπάρω",
- "ντενεκές": "τενεκές",
- "ντερβεναγος": "δερβέναγας",
- "ντερβένι": "δερβένι",
- "ντερβίσης": "δερβίσης",
- "ντερβισόπαιδο": "δερβισόπαιδο",
- "ντοκυµανταίρ": "ντοκιµαντέρ",
- "ντουνρού": "ντογρού",
- "ντουζ": "ντους",
- "ντουζιέρα": "ντουσιέρα",
- "Ντούµα": "∆ούµα",
- "ντούπλεξ": "ντούµπλεξ",
- "ντουφέκι": "τουφέκι",
- "ντουφεκίδι": "τουφεκίδι",
- "ντουφεκίζω": "τουφεκίζω",
- "ντουφεξής": "τουφεξής",
- "νύκτα": "νύχτα",
- "νυκτωδία": "νυχτωδία",
- "νωµατάρχης": "ενωµοτάρχης",
- "ξανεµίζω": "εξανεµίζω",
- "ξεγνοιάζω": "ξενοιάζω",
- "ξεγνοιασιά": "ξενοιασιά",
- "ξελαφρώνω": "ξαλαφρώνω",
- "ξεπίτηδες": "επίτηδες",
- "ξεπιτούτου": "εξεπιτούτου",
- "ξεσκάζω": "ξεσκάω",
- "ξεσπάζω": "ξεσπώ",
- "ξεσχίζω": "ξεσκίζω",
- "ξέσχισµα": "ξεσκίζω",
- "ξευτελίζω": "εξευτελίζω",
- "ξεφτίζω": "ξεφτύζω",
- "ξεφτίλα": "ξευτίλα",
- "ξεφτίλας": "ξευτίλας",
- "ξεφτιλίζω": "ξευτιλίζω",
- "ξεχάνω": "ξεχνώ",
- "ξηγώ": "εξηγώ",
- "ξηροφαγία": "ξεροφαγία",
- "ξηροφαγιά": "ξεροφαγία",
- "ξι": "ξει",
- "ξιπασιά": "ξυπασιά",
- "ξίπασµα": "ξύπασµα",
- "ξιπολησιά": "ξυπολυσιά",
- "ξιπολιέµαι": "ξυπολιέµαι",
- "εξοµολόγηση": "ξομολόγηση",
- "ξοµολογητής": "εξοµολογητής",
- "ξοµολόγος": "εξοµολόγος",
- "ξοµολογώ": "εξοµολογώ",
- "ξουράφι": "ξυράφι",
- "ξουράφια": "ξυραφιά",
- "ξόφληση": "εξόφληση",
- "ξύγγι": "ξίγγι",
- "ξύγκι": "ξίγγι",
- "ξύδι": "ξίδι",
- "ξυλοσκίστης": "ξυλοσχίστης",
- "ξυλώνω": "ξηλώνω",
- "ξυνωρίδα": "συνωρίδα",
- "ξώθυρα": "εξώθυρα",
- "ξώπορτα": "εξώπορτα",
- "ξώφυλλο": "εξώφυλλο",
- "οδοντογιατρός": "οδοντίατρος",
- "οδοντόπονος": "πονόδοντος",
- "οικογενειακά": "οικογενειακώς",
- "οικοκυρά": "νοικοκυρά",
- "οκτάς": "οκτάδα",
- "οκταετής": "οχταετής",
- "οκταετές": "οχταετές",
- "οκταετία": "οχταετία",
- "οµοιάζω": "µοιάζω",
- "οµοιώνω": "εξοµοιώνω",
- "οµόµετρο": "ωµόµετρο",
- "οµορφάδα": "οµορφιά",
- "οµπρός": "εµπρός",
- "ονείρεµα": "όνειρο",
- "οξείδιο": "οξίδιο",
- "οξειδοαναγωγή": "οξιδοαναγωγή",
- "οξειδώνω": "οξιδώνω",
- "οξείδωση": "οξίδωση",
- "οξειδωτής": "οξιδωτής",
- "οξιζενέ": "οξυζενέ",
- "οπίσω": "πίσω",
- "οργιά": "οργυιά",
- "όρνεο": "όρνιο",
- "όρνις": "όρνιθα",
- "ορρός": "ορός",
- "όσµωση": "ώσµωση",
- "οστεΐτιδα": "οστίτιδα",
- "οστεογονία": "οστεογένεση",
- "οφίτσιο": "οφίκιο",
- "οφφίκιο": "οφίκιο",
- "οχτάβα": "οκτάβα",
- "οχτάδα": "οκτάδα",
- "οχταετία": "οκταετία",
- "οχτακόσια": "οκτακόσια",
- "οχτακόσιοι": "οκτακόσιοι",
- "οχτακόσιες": "οκτακόσιες",
- "οχτακόσια": "οκτακόσια",
- "όχτρητα": "έχθρητα",
- "οχτώ": "οκτώ",
- "Οχτώβρης": "Οκτώβριος",
- "οψιανός": "οψιδιανός",
- "παγαίνω": "πηγαίνω",
- "παγόνι": "παγώνι",
- "παιγνίδι": "παιχνίδι",
- "παίδαρος": "παίδαρος",
- "παίχτης": "παίκτης",
- "παλικαράς": "παλληκαράς",
- "παλικάρι": "παλληκάρι",
- "παλικαριά": "παλληκαριά",
- "παλικαροσύνη": "παλληκαροσύνη",
- "παλληκαρίστίκος": "παλληκαρήσιος",
- "παλληκαρίστικη": "παλληκαρήσια",
- "παλληκαρίστικο": "παλληκαρήσιο",
- "παλληκαροσύνη": "παλληκαριά",
- "πανταλόνι": "παντελόνι",
- "παντατίφ": "πανταντίφ",
- "πανταχούσα": "απανταχούσα",
- "Πάντοβα": "Πάδοβα",
- "παντούφλα": "παντόφλα",
- "παντοχή": "απαντοχή",
- "πανψυχισµός": "παµψυχισµός",
- "πάνω": "επάνω",
- "παπαδάκι": "παππαδάκι",
- "παπαδαρειό": "παππαδαρειό",
- "παπαδιά": "παππαδιά",
- "παπαδοκόρη": "παππαδοκόρη",
- "παπαδοκρατούµαι": "παππαδοκρατούµαι",
- "παπαδολόι": "παππαδολόι",
- "παπαδοπαίδι": "παππαδοπαίδι",
- "παπαδοπούλα": "παππαδοπούλα",
- "Παπαδόπουλο": "παππαδόπουλο",
- "παπατζής": "παππατζής",
- "παπατρέχας": "παππατρέχας",
- "παραγιάς": "παραγυιός",
- "παρανυχίδα": "παρωνυχίδα",
- "παρεισφρύω": "παρεισφρέω",
- "παρεννοώ": "παρανοώ",
- "παρ' ολίγο": "παραλίγο",
- "πασαβιόλα": "µπασαβιόλα",
- "πασάλειµµα": "πασσάλειµµα",
- "πασαλείφω": "πασσαλείφω",
- "πασκίζω": "πασχίζω",
- "παστρουµάς": "παστουρµάς",
- "πατερµά": "πατερηµά",
- "πατήρ": "πατέρας",
- "πατούνα": "πατούσα",
- "πατριός": "πατρυιός",
- "πάτρονας": "πάτρωνας",
- "πάψη": "παύση",
- "πεθυµώ": "επιθυµώ",
- "πείρος": "πίρος",
- "πελέκι": "πέλεκυς",
- "πελεκίζω": "πελεκώ",
- "πελλόγρα": "πελάγρα",
- "πεντήκοντα": "πενήντα",
- "πεντόβολα": "πεντόβωλα",
- "πεντόδραχµο": "πεντάδραχµο",
- "περβολάρης": "περιβολάρης",
- "περβόλι": "περιβόλι",
- "περδικλώνω": "πεδικλώνω",
- "περηφανεύοµαι": "υπερηφανεύοµαι",
- "περηφάνια": "υπερηφάνεια",
- "περικόβω": "περικόπτω",
- "περιπατώ": "περπατώ",
- "περιστεριώνας": "περιστερώνας",
- "περιτάµω": "περιτέµνω",
- "περιφάνεια": "περηφάνια",
- "περιφράζω": "περιφράσσω",
- "περιχαράζω": "περιχαράσσω",
- "περιχέω": "περιχύνω",
- "περντάχι": "µπερντάχι",
- "πέρπυρο": "υπέρπυρο",
- "πέρσι": "πέρυσι",
- "πετούγια": "µπετούγια",
- "πευκιάς": "πευκώνας",
- "πηγεµός": "πηγαιµός",
- "πηγούνι": "πιγούνι",
- "πήτα": "πίτα",
- "πήχυς": "πήχης",
- "πι": "πει",
- "πιζάµα": "πιτζάµα",
- "πιθαµή": "σπιθαµή",
- "πιθώνω": "απιθώνω",
- "πίκρισµα": "πικρίζω",
- "πιλαλώ": "πηλαλώ",
- "Πιλάτος": "Πόντιος Πιλάτος",
- "πιοτό": "ποτό",
- "πιπίζω": "πιππίζω",
- "πιρέξ": "πυρέξ",
- "πίστοµα": "απίστοµα",
- "πιτσιλάδα": "πιτσυλάδα",
- "πιτσιλιά": "πιτσυλιά",
- "πίττα": "πίτα",
- "πίτυρον": "πίτουρο",
- "πλάγι": "πλάι",
- "πλανάρω": "πλανίζω",
- "πλάσσω": "πλάθω",
- "πλειονοψηφία": "πλειοψηφία",
- "πλείονοψηφώ": "πλειοψηφώ",
- "πλεξίδα": "πλεξούδα",
- "πλερωµή": "πληρωµή",
- "πλερώνω": "πληρώνω",
- "πλέυ µπόυ": "πλεϊµπόι",
- "πλέχτης": "πλέκτης",
- "πληµµύρα": "πληµύρα",
- "πνιγµός": "πνίξιµο",
- "πνευµονόκοκκος": "πνευµονιόκοκκος",
- "ποιµήν": "ποιµένας",
- "πόλις": "πόλη",
- "πόλιτσµαν": "πόλισµαν",
- "πολιτσµάνος": "πόλισµαν",
- "πολύµπριζο": "πολύπριζο",
- "πολυπάω": "πολυπηγαίνω",
- "πολύπους": "πολύποδας",
- "Πόρτο Ρίκο": "Πουέρτο Ρίκο",
- "ποταπαγόρευση": "ποτοαπαγόρευση",
- "πούντρα": "πούδρα",
- "πράµα": "πράγµα",
- "πρεβάζι": "περβάζι",
- "πρέπον": "πρέπων",
- "προαγάγω": "προάγω",
- "προδίνω": "προδίδω",
- "προιξ": "προίκα",
- "προποτζής": "προπατζής",
- "προσαγάγω": "προσάγω",
- "πρόσµιξη": "πρόσµειξη",
- "προσφύγω": "προσφεύγω",
- "προφθάνω": "προφταίνω",
- "προφυλάω": "προφυλάσσω",
- "προψές": "προχθές",
- "πρύµη": "πρύµνη",
- "πταρνίζοµαι": "φταρνίζοµαι",
- "πτελέα": "φτελιά",
- "πτέρνα": "φτέρνα",
- "πτερυγίζω": "φτερουγίζω",
- "πτιφούρ": "πετιφούρ",
- "πτι-φούρ": "πετιφούρ",
- "πτωχαίνω": "φτωχαίνω",
- "πτώχεια": "φτώχια",
- "πυκνά": "πυκνός",
- "πυλωτή": "πιλοτή",
- "πύο": "πύον",
- "πυρογενής": "πυριγενής",
- "πυρογενές": "πυριγενές",
- "πυτζάµα": "πιτζάµα",
- "ραγκλόν": "ρεγκλάν",
- "ραγού": "ραγκού",
- "ραΐζω": "ραγίζω",
- "ραίντνκεν": "ρέντγκεν",
- "ράντζο": "ράντσο",
- "ράπτω": "ράβω",
- "ρεβανί": "ραβανί",
- "ρέγγε": "ρέγκε",
- "Ρεγγίνα": "Ρεγκίνα",
- "ρεµούλκα": "ρυµούλκα",
- "ασκέρι": "ασκέρι",
- "ρεοβάση": "ρευµατοβάση",
- "ρεπανάκι": "ραπανάκι",
- "ρεπάνι": "ραπάνι",
- "ρεύω": "ρέβω",
- "ρήγα": "ρίγα",
- "ρηµοκκλήσι": "ερηµοκκλήσι",
- "ριγκ": "ρινγκ",
- "ριζότο": "ρυζότο",
- "ροβίθι": "ρεβίθι",
- "ροβιθιά": "ρεβιθιά",
- "ροδακινιά": "ρωδακινιά",
- "ροδάκινο": "ρωδάκινο",
- "ρόιδι": "ρόδι",
- "ροϊδιά": "ροδιά",
- "ρόιδο": "ρόδι",
- "ροοστάτης": "ρεοστάτης",
- "ροφώ": "ρουφώ",
- "ρωδιός": "ερωδιός",
- "ρωθωνίζω": "ρουθουνίζω",
- "ρωµαντισµός": "ροµαντισµός",
- "Ρωσσία": "Ρωσία",
- "ρωτώ": "ερωτώ",
- "σάζω": "σιάζω",
- "σαιζλόνγκ": "σεζλόνγκ",
- "σαιζόν": "σεζόν",
- "σαγολαίφα": "σακολαίβα",
- "σάκκα": "σάκα",
- "σακκάκι": "σακάκι",
- "σακκάς": "σακάς",
- "σακκί": "σακί",
- "σακκίδιο": "σακίδιο",
- "σακκοβελόνα": "σακοβελόνα",
- "σακκογκόλιθος": "σακογκόλιθος",
- "σακκοειδής": "σακοειδής",
- "σακκοειδές": "σακοειδες",
- "σακκοράφα": "σακοράφα",
- "σάκκος": "σάκος",
- "σακκουλα": "σακούλα",
- "σακκουλάκι": "σακούλι",
- "σακκουλεύοµαι": "σακουλεύοµαι",
- "σακκούλι": "σακούλι",
- "σακκουλιάζω": "σακουλιάζω",
- "σακχαροδιαβήτης": "ζαχαροδιαβήτης",
- "σάκχαροκαλάµο": "ζαχαροκάλαµο",
- "σακχαροποιία": "ζαχαροποιία",
- "σακχαρότευτλον": "ζαχαρότευτλο",
- "σαλιαρίστρα": "σαλιάρα",
- "σαλπιστής": "σαλπιγκτής",
- "σαντακρούτα": "σατακρούτα",
- "σαντάλι": "σανδάλι",
- "σάνταλο": "σανδάλι",
- "σάρρα": "σάρα",
- "σαφρίδι": "σαυρίδι",
- "σαχάνι": "σαγάνι",
- "σβολιάζω": "σβωλιάζω",
- "σβώλιασμα": "σβόλιασµα",
- "σβόλος": "σβώλος",
- "σβύνω": "σβήνω",
- "σγουρώνω": "σγουραίνω",
- "σενκόντο": "σεκόντο",
- "σεγκούνα": "σιγκούνα",
- "σεγόντο": "σεκόντο",
- "Σειληνός": "Σιληνός",
- "σείρακας": "σείρικας",
- "σειρήτι": "σιρίτι",
- "σεκονταρω": "σιγοντάρω",
- "σεγκοντάρω": "σιγοντάρω",
- "σελιλόιντ": "σελουλόιντ",
- "σέλλα": "σέλα",
- "σεξπιριστής": "σαιξπηριστής",
- "Σεράγεβο": "Σαράγεβο",
- "σεστέτο": "σεξτέτο",
- "σετέτο": "σεπτέτο",
- "σέχτα": "σέκτα",
- "σεχταρισµός": "σεκταρισµός",
- "σηµαφόρος": "σηµατοφόρος",
- "σήριαλ": "σίριαλ",
- "σηψίνη": "σηπτίνη",
- "σιγάρο": "τσιγάρο",
- "σιγαροθήκη": "τσιγαροθήκη",
- "σίγλος": "σίκλος",
- "σιγόντο": "σεκόντο",
- "Σίδνεϊ": "Σύδνεϋ",
- "σίελος": "σίαλος",
- "σινθεσάιζερ": "συνθεσάιζερ",
- "σιντέφι": "σεντέφι",
- "σιορ": "σινιόρ",
- "σιρυΐάνι": "σεργιάνι",
- "σιρµαγιά": "σερµαγιά",
- "σίτα": "σήτα",
- "σταρέµπορος": "σιτέµπορος",
- "σκανδαλιά": "σκανταλιά",
- "σκάνταλο": "σκάνδαλο",
- "σκάπτω": "σκάβω",
- "σκάρα": "σχάρα",
- "σκαρµός": "σκαλµός",
- "σκάφτω": "σκάβω",
- "σκεβρώνω": "σκευρώνω",
- "σκερπάνι": "σκεπάρνι",
- "σκίζα": "σχίζα",
- "σκίζω": "σχίζω",
- "σκίνος": "σχίνος",
- "σκίσιµο": "σχίσιµο",
- "σκισµάδα": "σχισµάδα",
- "σκισµή": "σχισµή",
- "σκλήρωση": "σκλήρυνση",
- "σκοινάκι": "σχοινάκι",
- "σκονί": "σχοινί",
- "σκοινί": "σχοινί",
- "σκοίνος": "σχοίνος",
- "σκολάω": "σχολώ",
- "σκολειαρόπαιδο": "σχολειαρόπαιδο",
- "σκολειαρούδι": "σχολειαρούδι",
- "σκολειό": "σχολείο",
- "σκόλη": "σχόλη",
- "σκολιαρόπαιδο": "σχολειαρόπαιδο",
- "σκολιαρούδι": "σχολειαρούδι",
- "σκολιό": "σχολειό",
- "σκολνώ": "σχολώ",
- "σκολώ": "σχολώ",
- "Σκοτία": "Σκωτία",
- "σκότισµα": "σκοτισµός",
- "Σκοτσέζος": "Σκωτσέζος",
- "σκουντούφληµα": "σκουντούφλα",
- "σκώληξ": "σκουλήκι",
- "σκώτι": "συκώτι",
- "σοβαντεπί": "σοβατεπί",
- "σοβατίζω": "σοβαντίζω",
- "σοροκολεβάντες": "σιροκολεβάντες",
- "σορόκος": "σιρόκος",
- "σοροπιάζω": "σιροπιάζω",
- "σουβατίζω": "σοβαντίζω",
- "σουβαντίζω": "σοβαντίζω",
- "σουβάς": "σοβάς",
- "σουβατεπί": "σοβαντεπί",
- "σοβατεπί": "σοβαντεπί",
- "σουµιέ": "σοµιέ",
- "σούρσιµο": "σύρσιµο",
- "σουσπασιόν": "σισπανσιόν",
- "σοφεράρω": "σοφάρω",
- "σπαής": "σπαχής",
- "σπαράσσω": "σπαράζω",
- "σπερµατσετο": "σπαρµατσέτο",
- "σπερµίνη": "σπερµατίνη",
- "σπερµοβλάστη": "σπερµατοβλάστη",
- "σπερµογονία": "σπερµατογονία",
- "σπερµοδότης": "σπερµατοδότης",
- "σπερµοδόχος": "σπερµατοδόχος",
- "σπερμοδόχο": "σπερµατοδόχο",
- "σπερµοθήκη": "σπερµατοθήκη",
- "σπερµοκτόνος": "σπερµατοκτόνος",
- "σπερμοκτόνο": "σπερµατοκτόνο",
- "σπερµοτοξίνη": "σπερµατοτοξίνη",
- "σπερµοφάγος": "σπερµατοφάγος",
- "σπερμοφάγο": "σπερµατοφάγο",
- "σπερµοφόρος": "σπερµατοφόρος",
- "σπερμοφόρο": "σπερµατοφόρο",
- "σπινάρω": "σπινιάρω",
- "σπιράλ": "σπειράλ",
- "σπλάχνο": "σπλάγχνο",
- "σπογγίζω": "σφουγγίζω",
- "σπω": "σπάζω",
- "Στάθης": "Ευστάθιος",
- "στάλαµα": "στάλαγµα",
- "σταλαµατιά": "σταλαγµατιά",
- "σταλαξιά": "σταλαγµατιά",
- "σταλίτσα": "σταλιά",
- "σταρήθρα": "σιταρήθρα",
- "στάρι": "σιτάρι",
- "σταρότοπος": "σιταρότοπος",
- "σταχολογώ": "σταχυολογώ",
- "στειρεύω": "στερεύω",
- "στειροποιώ": "στειρώνω",
- "Στέλιος": "Στυλιανός",
- "Στέλλα": "Στυλιανή",
- "στεναχώρια": "στενοχώρια",
- "στεναχωρώ": "στενοχωρώ",
- "στένω": "στήνω",
- "στέριωµα": "στερέωµα",
- "στεριώνω": "στερεώνω",
- "στέρξιµο": "στέργω",
- "στιλ": "στυλ",
- "στιλάκι": "στυλάκι",
- "στιλιζάρω": "στυλιζάρω",
- "στιλίστας": "στυλίστας",
- "στιλό": "στυλό",
- "στιφάδο": "στυφάδο",
- "στορίζω": "ιστορώ",
- "στόρισµα": "ιστόρηση",
- "στραβοµάρα": "στραβωµάρα",
- "στραγγουλίζω": "στραγγαλίζω",
- "Στρατής": "Ευστράτιος",
- "στρατί": "στράτα",
- "στρατοποίηση": "στρατιωτικοποίηση",
- "Στράτος": "Ευστράτιος",
- "στρένω": "στέργω",
- "στριµόκωλα": "στρυµόκωλα",
- "στριµωξίδι": "στρυµωξίδι",
- "στριµώχνω": "στρυµώχνω",
- "στύβω": "στείβω",
- "στυπώνω": "στουπώνω",
- "σύγνεφο": "σύννεφο",
- "συγνώµη": "συγγνώµη",
- "συδαυλίζω": "συνδαυλίζω",
- "συµπαρασέρνω": "συµπαρασύρω",
- "συµπεθεριά": "συµπεθεριό",
- "δεκαέξι": "δεκάξι",
- "συνήθιο": "συνήθειο",
- "συντάµω": "συντέµνω",
- "συντριβάνι": "σιντριβάνι",
- "συνυφάδα": "συννυφάδα",
- "συφορά": "συµφορά",
- "συχώρεση": "συγχώρηση",
- "συχωρώ": "συγχωρώ",
- "συχωροχάρτι": "συγχωροχάρτι",
- "σφαλνώ": "σφαλίζω",
- "σφεντάµι": "σφένδαµνος",
- "σφερδούκλι": "σπερδούκλι",
- "σφόνδυλος": "σπόνδυλος",
- "σωβινισµός": "σοβινισµός",
- "σωβινιστής": "σοβινιστής",
- "σώνω": "σώζω",
- "σωρείτης": "σωρίτης",
- "σωτάρω": "σοτάρω",
- "σωτέ": "σοτέ",
- "Σωτήρης": "Σωτήριος",
- "σωφέρ": "σοφέρ",
- "ταβατούρι": "νταβαντούρι",
- "ταβερνούλα": "ταβέρνα",
- "ταβλάς": "ταµπλάς",
- "ταγιαδόρος": "ταλιαδόρος",
- "ταγίζω": "ταΐζω",
- "τάγισµα": "τάισµα",
- "ταγκό": "τανγκό",
- "ταή": "ταγή",
- "τάλαρο": "τάλιρο",
- "τάλληρο": "τάλιρο",
- "ταµίευση": "αποταµίευση",
- "ταµιεύω": "αποταµιεύω",
- "ταµώ": "τέµνω",
- "ταξείδι": "ταξίδι",
- "ταπεραµέντο": "ταµπεραµέντο",
- "ταράσσω": "ταράζω",
- "ταχτοποίηση": "τακτοποίηση",
- "ταχτοποιώ": "τακτοποιώ",
- "τελάλης": "ντελάλης",
- "τελολογία": "τελεολογία",
- "τεριρέµ": "τερερέµ",
- "τερραίν": "τερέν",
- "τέσσαρα": "τέσσερα",
- "τετράς": "τετράδα",
- "τζέντζερης": "τέντζερης",
- "τζετζερέδια": "τεντζερέδια",
- "τζιριτζάντζουλα": "τζυριτζάτζουλα",
- "τζίρος": "τζύρος",
- "τζιτζιµπίρα": "τσιτσιµπίρα",
- "τηκ": "τικ",
- "τηλοµοιοτύπηµα": "τηλεοµοιοτύπηµα",
- "τηλοµοιοτυπία": "τηλεοµοιοτυπία",
- "τηλοµοιοτυπώ": "τηλεοµοιοτυπώ",
- "τιτιβίζω": "τιττυβίζω",
- "τµήθηκα": "τέµνω",
- "τµήσω": "τέµνω",
- "Τόκιο": "Τόκυο",
- "τοµάτα": "ντοµάτα",
- "τοµατιά": "ντοµατιά",
- "τοµατοπολτός": "ντοµατοπολτός",
- "τοµατοσαλάτα": "ντοµατοσαλάτα",
- "τονθορύζω": "υποτονθορύζω",
- "τορβάς": "ντορβάς",
- "τορνάρω": "τορνεύω",
- "τορπίλα": "τορπίλη",
- "τούνδρα": "τούντρα",
- "Τουρκάλα": "Τούρκος",
- "τράβαλα": "ντράβαλα",
- "τραΐ": "τραγί",
- "τραινάρισµα": "τρενάρισµα",
- "τραινάρω": "τρενάρω",
- "τραίνο": "τρένο",
- "τρακόσοι": "τριακόσιοι",
- "τραπεζάκι": "τραπέζι",
- "τρέµουλο": "τρεµούλα",
- "τρέψω": "τρέπω",
- "τριάµισι": "τρεισήµισι",
- "τρικλίζω": "τρεκλίζω",
- "τρίκλισµα": "τρέκλισµα",
- "τρίπλα": "ντρίπλα",
- "τριπλαδόρος": "ντριπλαδόρος",
- "τριπλάρω": "ντριπλάρω",
- "τρίπους": "τρίποδας",
- "τρόπις": "τρόπιδα",
- "τρυκ": "τρικ",
- "τσαγγαράδικο": "τσαγκαράδικο",
- "τσογγάρης": "τσαγκάρης",
- "τσαγγάρικο": "τσαγκάρικο",
- "τσαγγαροδευτέρα": "τσαγκαροδευτέρα",
- "τσάµπα": "τζάµπα",
- "τσαµπατζής": "τζαµπατζής",
- "τσαντίζω": "τσατίζω",
- "τσαντίλα": "τσατίλα",
- "τσαντίλας": "τσατίλας",
- "τσάντισµα": "τσάτισµα",
- "τσίβα": "τζίβα",
- "τσίκλα": "τσίχλα",
- "τσιµεντώνω": "τσιµεντάρω",
- "τσιπούρα": "τσιππούρα",
- "τσιρίζω": "τσυρίζω",
- "τσιριτσάντζουλα": "τζιριτζάντζουλα",
- "τσιρότο": "τσηρώτο",
- "τσίτα": "τσήτα",
- "τσιτσιρίζω": "τσυτσυρίζω",
- "τσιτσίρισµα": "τσυτσυρίζω",
- "τσίτωµα": "τσήτωµα",
- "τσοµπάνος": "τσοµπάνης",
- "τσοπάνης": "τσοµπάνης",
- "τσοπανόπουλο": "τσοµπανόπουλο",
- "τσοπάνος": "τσοµπάνης",
- "τσύνορο": "τσίνορο",
- "τυράγνισµα": "τυράννισµα",
- "τυραγνω": "τυραννώ",
- "τυφεκίζω": "τουφεκίζω",
- "τυφεκισµός": "τουφεκισµός",
- "υαλόχαρτον": "γυαλόχαρτο",
- "υαλόχαρτο": "γυαλόχαρτο",
- "υάρδα": "γιάρδα",
- "ύβρη": "ύβρις",
- "υδατοσκοπια": "υδροσκοπία",
- "υδραέριο": "υδαταέριο",
- "ύελος": "ύαλος",
- "Υόρκη Νέα": "Νέα Υόρκη",
- "υποδείχνω": "υποδεικνύω",
- "υπόδεσις": "υπόδηση",
- "υποκάµισο": "πουκάµισο",
- "φαγκρί": "φαγγρί",
- "φαγοκύτωση": "φαγοκυττάρωση",
- "ψόγουσα": "φαγέδαινα",
- "φαγωµός": "φαγωµάρα",
- "φάδι": "υφάδι",
- "φαινοµεναλισµός": "φαινοµενοκρατία",
- "φαινοµενισµός": "φαινοµενοκρατία",
- "φαίνω": "υφαίνω",
- "φαλακρώνω": "φαλακραίνω",
- "φαµίλια": "φαµελιά",
- "φαµφάρα": "φανφάρα",
- "φαµφαρονισµος": "φανφαρονισµός",
- "φαµφαρόνος": "φανφαρόνος",
- "φαράκλα": "φαλάκρα",
- "φαρµασόνος": "φραµασόνος",
- "φαρµπαλάς": "φραµπαλάς",
- "φασουλάδα": "φασολάδα",
- "φασουλάκια": "φασολάκια",
- "φασουλιά": "φασολιά",
- "φασούλι": "φασόλι",
- "φελόνι": "φαιλόνιο",
- "φελώ": "ωφελώ",
- "φεουδαλισµός": "φεουδαρχισµός",
- "φερµάνι": "φιρµάνι",
- "φέτος": "εφέτος",
- "φθήνια": "φτήνια",
- "Φιλανδία": "Φινλανδία",
- "φιλενάδα": "φιλαινάδα",
- "φιλιστρίνι": "φινιστρίνι",
- "φιλόφρονας": "φιλόφρων",
- "φιντάνι": "φυντάνι",
- "φιορντ": "φιόρδ",
- "φίσκα": "φύσκα",
- "φκειάνω": "φτειάχνω",
- "φκιάνω": "φτειάχνω",
- "φκειασιδι": "φτειασίδι",
- "φκειασίδωµα": "φτειασίδωµα",
- "φκειασιδώνω": "φτειασιδώνω",
- "φκιασιδι": "φτειασίδι",
- "φκιασίδωµα": "φτειασίδωµα",
- "φκιασιδώνω": "φτειασιδώνω",
- "φκυάρι": "φτυάρι",
- "Φλάνδρα": "Φλαµανδία",
- "φλισκούνι": "φλησκούνι",
- "φλοίδα": "φλούδα",
- "φλοµιάζω": "φλοµώνω",
- "φλορίνι": "φιορίνι",
- "φλυτζάνι": "φλιτζάνι",
- "φοβούµαι": "φοβάµαι",
- "φονεύς": "φονιάς",
- "φόντα": "φόντο",
- "φουσέκι": "φισέκι",
- "φούχτα": "χούφτα",
- "φουχτώνω": "χουφτώνω",
- "Φραγκφούρτη": "Φρανκφούρτη",
- "φράσσω": "φράζω",
- "Φρίντα": "Φρειδερίκη",
- "Φροσύνη": "Ευφροσύνη",
- "Φρόσω": "Ευφροσύνη",
- "φροϋδισµος": "φροϊδισµός",
- "φρουµάζω": "φριµάζω",
- "φρούµασµα": "φρίµασµα",
- "φτάνω": "φθάνω",
- "φταρνίζοµαι": "φτερνίζοµαι",
- "φτειάνω": "φτειάχνω",
- "φτηνά": "φθηνά",
- "φτηναίνω": "φθηναίνω",
- "φτιασίδι": "φτειασίδι",
- "φτιασιδώνοµαι": "φτειασιδώνοµαι",
- "φτωχοκοµείο": "πτωχοκοµείο",
- "φυγάδας": "φυγάς",
- "φύγω": "φεύγω",
- "φυλάγω": "φυλάσσω",
- "φυλλαράκι": "φύλλο",
- "φυλλόδεντρο": "φιλόδεντρο",
- "φυλώ": "φυλάσσω",
- "φυσέκι": "φισέκι",
- "φυσεκλίκι": "φισεκλίκι",
- "φυσιοθεραπεία": "φυσικοθεραπεία",
- "φυστίκι": "φιστίκι",
- "φυστικιά": "φιστικιά",
- "φύω": "φύοµαι",
- "φχαριστώ": "ευχαριστώ",
- "φωβισµός": "φοβισµός",
- "φωβιστής": "φοβισµός",
- "Φώτης": "Φώτιος",
- "φωτογραφώ": "φωτογραφίζω",
- "φωτοβολή": ", φωτοβολία",
- "χάβω": "χάφτω",
- "χαΐδεµα": "χαϊδεύω",
- "χάιδι": "χάδι",
- "χαλνώ": "χαλώ",
- "χαλυβώνω": "χαλυβδώνω",
- "χάµου": "χάµω",
- "χαµψίνι": "χαµσίνι",
- "χάνδρα": "χάντρα",
- "χαντζής": "χανιτζής",
- "χαραµατιά": "χαραγµατιά",
- "χάραξ": "χάρακας",
- "χάροντας": "χάρος",
- "χατζάρα": "χαντζάρα",
- "χατζάρι": "χαντζάρι",
- "χεγκελιανισµός": "εγελιανισµός",
- "χειρόβολο": "χερόβολο",
- "χειροµάχηµα": "χεροµαχώ",
- "χειροµάχισσα": "χεροµάχος",
- "χειροµάχος": "χεροµάχος",
- "χειροµαχώ": "χεροµαχώ",
- "χέρα": "χέρι",
- "χερόµυλος": "χειρόµυλος",
- "χεροπόδαρα": "χειροπόδαρα",
- "χηνάρι": "χήνα",
- "χι": "χει",
- "χιµώ": "χυµώ",
- "χιών": "χιόνι",
- "χλεµπάνια": "πλεµπάγια",
- "χλοΐζω": "χλοάζω",
- "χλόισµα": "χλόασµα",
- "χνώτο": "χνότο",
- "χορδίζω": "κουρδίζω",
- "χόρδισµα": "κούρδισμα",
- "χοχλάζω": "κοχλάζω",
- "χοχλακιάζω": "κοχλάζω",
- "χοχλακίζω": "κοχλάζω",
- "χοχλακώ": "κοχλάζω",
- "χρεογραφο": "χρεώγραφο",
- "χρεοκοπία": "χρεωκοπία",
- "χρεοκοπώ": "χρεωκοπώ",
- "χρεολυσία": "χρεωλυσία",
- "χρεολύσιο": "χρεωλύσιο",
- "χρεόλυτρο": "χρεώλυτρο",
- "χρεοπιστώνω": "πιστοχρεώνω",
- "χρεοπίστωση": "πιστοχρεώνω",
- "χρεοστάσιο": "χρεωστάσιο",
- "χρεοφειλέτης": "χρεωφειλέτης",
- "Χρήστος": "Χρίστος",
- "χρωµατόσωµα": "χρωµόσωµα",
- "χρωµογόνος": "χρωµατογόνος",
- "χρωµογόνο": "χρωµατογόνο",
- "χρωµοφόρος": "χρωµατοφόρος",
- "χρωµοφόρο": "χρωµατοφόρο",
- "χτες": "χθες",
- "χτήµα": "κτήµα",
- "χτίζω": "κτίζω",
- "χτίσιµο": "κτίσιµο",
- "χτίσµα": "κτίσµα",
- "χτίστης": "κτίστης",
- "χτύπηµα": "κτύπηµα",
- "χτύπος": "κτύπος",
- "χτυπώ": "κτυπώ",
- "χυµίζω": "χυµώ",
- "χωλ": "χολ",
- "χώνεψη": "χώνευση",
- "χωριατοσύνη": "χωριατιά",
- "ψένω": "ψήνω",
- "ψηλαφώ": "ψηλαφίζω",
- "ψηφιδοθέτης": "ψηφοθέτης",
- "ψιττακίαση": "ψιττάκωση",
- "ψίχαλο": "ψίχουλο",
- "ψυχεδελισµός": "ψυχεδέλεια",
- "ψυχογιός": "ψυχογυιός",
- "ψώριασµα": "ψωριάζω",
- "ωγκρατέν": "ογκρατέν",
- "ωράριο": "οράριο",
- "ώς": "έως",
- "ωτασπίδα": "ωτοασπίδα",
- "ωτοστόπ": "οτοστόπ",
- "ωφελιµοκρατία": "ωφελιµισµός",
- "ωχαδερφισµός": "οχαδερφισµός",
- "ώχου": "όχου",
- "άγυρτος": "άγειρτος",
- "άγυρτη": "άγειρτη",
- "άγυρτο": "άγειρτο",
- "ανηµέρευτος": "ανηµέρωτος",
- "ανηµέρευτη": "ανηµέρωτη",
- "ανηµέρευτο": "ανηµέρωτο",
- "ανοικτός": "ανοιχτός",
- "ανοικτή": "ανοιχτή",
- "ανοικτό": "ανοιχτό",
- "αντιελληνικός": "ανθελληνικός",
- "αντιελληνική": "ανθελληνική",
- "αντιελληνικό": "ανθελληνικό",
- "αντιεπιστηµονικος": "αντεπιστηµονικός",
- "αντιεπιστηµονικη": "αντεπιστηµονική",
- "αντιεπιστηµονικο": "αντεπιστηµονικό",
- "αξόφλητος": "ανεξόφλητος",
- "αξόφλητη": "ανεξόφλητη",
- "αξόφλητο": "ανεξόφλητο",
- "άπαιχτος": "άπαικτος",
- "άπαιχτη": "άπαικτη",
- "άπαιχτο": "άπαικτο",
- "απηρχαιωµένος": "απαρχαιωµένος",
- "απηρχαιωµένη": "απαρχαιωµένη",
- "απηρχαιωµένο": "απαρχαιωµένο",
- "άπιωτος": "άπιοτος",
- "άπιωτη": "άπιοτη",
- "άπιωτο": "άπιοτο",
- "άπραχτος": "άπρακτος",
- "άπραχτη": "άπρακτη",
- "άπραχτο": "άπρακτο",
- "άραχλος": "άραχνος",
- "άραχλη": "άραχνη",
- "άραχλο": "άραχνο",
- "αρήγωτος": "αρίγωτος",
- "αρήγωτη": "αρίγωτη",
- "αρήγωτο": "αρίγωτο",
- "αρµενικός": "αρµενιακός",
- "αρµενική": "αρµενιακή",
- "αρµενικό": "αρµενιακό",
- "αρµυρός": "αλµυρός",
- "αρµυρή": "αλµυρή",
- "αρµυρό": "αλµυρό",
- "άσβεστος": "άσβηστος",
- "άσβεστη": "άσβηστη",
- "άσβεστο": "άσβηστο",
- "άσκηµος": "άσχηµος",
- "άσκηµη": "άσχηµη",
- "άσκηµο": "άσχηµο",
- "άστυφτος": "άστειφτος",
- "άστυφτη": "άστειφτη",
- "άστυφτο": "άστειφτο",
- "ασυχώρετος": "ασυγχώρητος",
- "ασυχώρετη": "ασυγχώρητη",
- "ασυχώρετο": "ασυγχώρητο",
- "άταχτος": "άτακτος",
- "άταχτη": "άτακτη",
- "άταχτο": "άτακτο",
- "άφκιαστος": "άφτειαχτος",
- "άφκιαστη": "άφτειαχτη",
- "άφκιαστο": "άφτειαχτο",
- "άφκειαστος": "άφτειαχτος",
- "άφκειαστη": "άφτειαχτη",
- "άφκειαστο": "άφτειαχτο",
- "άφταστος": "άφθαστος",
- "άφταστη": "άφθαστη",
- "άφταστο": "άφθαστο",
- "άφτερος": "άπτερος",
- "άφτερη": "άπτερη",
- "άφτερο": "άπτερο",
- "αχτιδωτος": "ακτινωτός",
- "αχτιδωτη": "ακτινωτή",
- "αχτιδωτο": "ακτινωτό",
- "άχτιστος": "άκτιστος",
- "άχτιστη": "άκτιστη",
- "άχτιστο": "άκτιστο",
- "βιωτικός": "βιοτικός",
- "βιωτική": "βιοτική",
- "βιωτικό": "βιοτικό",
- "βλάστηµος": "βλάσφηµος",
- "βλάστηµη": "βλάσφηµη",
- "βλάστηµο": "βλάσφηµο",
- "βλογηµένος": "ευλογηµένος",
- "βλογηµένη": "ευλογηµένη",
- "βλογηµένο": "ευλογηµένο",
- "βοϊδινός": "βοδινός",
- "βοϊδινή": "βοδινή",
- "βοϊδινό": "βοδινό",
- "βορινός": "βορεινός",
- "βορινή": "βορεινή",
- "βορινό": "βορεινό",
- "βρωµερός": "βροµερός",
- "βρωµερή": "βροµερή",
- "βρωµερό": "βροµερό",
- "βρώµικος": "βρόµικος",
- "βρώµικη": "βρόµικη",
- "βρώµικο": "βρόµικο",
- "γαλατερός": "γαλακτερός",
- "γαλατερή": "γαλακτερή",
- "γαλατερό": "γαλακτερό",
- "γδυµνός": "γυµνός",
- "γδυµνή": "γυµνή",
- "γδυµνό": "γυµνό",
- "γελαδινός": "αγελαδινός",
- "γελαδινή": "αγελαδινή",
- "γελαδινό": "αγελαδινό",
- "γερτός": "γειρτός",
- "γερτή": "γειρτή",
- "γερτό": "γειρτό",
- "γιοµάτος": "γεµάτος",
- "γιοµάτη": "γεµάτη",
- "γιοµάτο": "γεµάτο",
- "γκεµπελικός": "γκαιµπελικός",
- "γκεµπελική": "γκαιµπελική",
- "γκεµπελικό": "γκαιµπελικό",
- "γλήγορος": "γρήγορος",
- "γλήγορη": "γρήγορη",
- "γλήγορο": "γρήγορο",
- "γρανίτινος": "γρανιτένιος",
- "γρανίτινη": "γρανιτένιη",
- "γρανίτινο": "γρανιτένιο",
- "γραφτός": "γραπτός",
- "γραφτή": "γραπτή",
- "γραφτό": "γραπτό",
- "γυρτός": "γειρτός",
- "γυρτή": "γειρτή",
- "γυρτό": "γειρτό",
- "δαιµονόπληκτος": "δαιµονιόπληκτος",
- "δαιµονόπληκτη": "δαιµονιόπληκτη",
- "δαιµονόπληκτο": "δαιµονιόπληκτο",
- "δερµικός": "δερµατικός",
- "δερµική": "δερµατική",
- "δερµικό": "δερµατικό",
- "δεχτός": "δεκτός",
- "δεχτή": "δεκτή",
- "δεχτό": "δεκτό",
- "διαλεκτός": "διαλεχτός",
- "διαλεκτή": "διαλεχτή",
- "διαλεκτό": "διαλεχτό",
- "διαολεµένος": "διαβολεµένος",
- "διαολεµένη": "διαβολεµένη",
- "διαολεµένο": "διαβολεµένο",
- "δυσέλεγκτος": "δυσεξέλεγκτος",
- "δυσέλεγκτη": "δυσεξέλεγκτη",
- "δυσέλεγκτο": "δυσεξέλεγκτο",
- "δυσλεκτικός": "δυσλεξικός",
- "δυσλεκτική": "δυσλεξική",
- "δυσλεκτικό": "δυσλεξικό",
- "εκδοµένος": "εκδεδοµένος",
- "εκδοµένη": "εκδεδοµένη",
- "εκδοµένο": "εκδεδοµένο",
- "ελεύτερος": "ελεύθερος",
- "ελεύτερη": "ελεύθερη",
- "ελεύτερο": "ελεύθερο",
- "εξώφθαλµος": "εξόφθαλµος",
- "εξώφθαλµη": "εξόφθαλµη",
- "εξώφθαλµο": "εξόφθαλµο",
- "επανωτός": "απανωτός",
- "επανωτή": "απανωτή",
- "επανωτό": "απανωτό",
- "επεξηγητικος": "επεξηγηµατικός",
- "επεξηγητικη": "επεξηγηµατική",
- "επεξηγητικο": "επεξηγηµατικό",
- "έρµος": "έρηµος",
- "έρµη": "έρηµη",
- "έρµο": "έρηµο",
- "ετερόκλητος": "ετερόκλιτος",
- "ετερόκλητη": "ετερόκλιτη",
- "ετερόκλητο": "ετερόκλιτο",
- "ετούτος": "τούτος",
- "ετούτη": "τούτη",
- "ετούτο": "τούτο",
- "εφετεινός": "εφετινός",
- "εφετεινή": "εφετινή",
- "εφετεινό": "εφετινό",
- "εφταήµερος": "επταήµερος",
- "εφταήµερη": "επταήµερη",
- "εφταήµερο": "επταήµερο",
- "ζάµπλουτος": "ζάπλουτος",
- "ζάµπλουτη": "ζάπλουτη",
- "ζάµπλουτο": "ζάπλουτο",
- "ζαχαράτος": "ζαχαρωτός",
- "ζαχαράτη": "ζαχαρωτή",
- "ζαχαράτο": "ζαχαρωτό",
- "θαµβός": "θαµπός",
- "θαµβή": "θαµπή",
- "θαµβό": "θαµπό",
- "θραψερός": "θρεψερός",
- "θραψερή": "θρεψερή",
- "θραψερό": "θρεψερό",
- "ιονικός": "ιοντικός",
- "ιονική": "ιοντική",
- "ιονικό": "ιοντικό",
- "καββαλιστικός": "καβαλιστικός",
- "καββαλιστική": "καβαλιστική",
- "καββαλιστικό": "καβαλιστικό",
- "καλλίτερος": "καλύτερος",
- "καλλίτερη": "καλύτερη",
- "καλλίτερο": "καλύτερο",
- "καταχτητικός": "κατακτητικός",
- "καταχτητική": "κατακτητική",
- "καταχτητικό": "κατακτητικό",
- "καταψυγµένος": "κατεψυγµένος",
- "καταψυγµένη": "κατεψυγµένη",
- "καταψυγµένο": "κατεψυγµένο",
- "καυδιανός": "καβδιανός",
- "καυδιανή": "καβδιανή",
- "καυδιανό": "καβδιανό",
- "καϋµένος": "καηµένος",
- "καϋµένη": "καηµένη",
- "καϋµένο": "καηµένο",
- "κέδρινος": "κέδρος",
- "κέδρινη": "κέδρη",
- "κέδρινο": "κέδρο",
- "κεραµεικος": "κεραµικός",
- "κεραµεικη": "κεραµική",
- "κεραµεικο": "κεραµικό",
- "κλασσικός": "κλασικός",
- "κλασσική": "κλασική",
- "κλασσικό": "κλασικό",
- "κόλαριστός": "κολλαριστός",
- "κόλαριστή": "κολλαριστή",
- "κόλαριστό": "κολλαριστό",
- "κοµµουνιστικός": "κοµουνιστικός",
- "κοµµουνιστική": "κοµουνιστική",
- "κοµµουνιστικό": "κοµουνιστικό",
- "κοράλλινος": "κοραλλένιος",
- "κοράλλινη": "κοραλλένιη",
- "κοράλλινο": "κοραλλένιο",
- "κτυπητός": "χτυπητός",
- "κτυπητή": "χτυπητή",
- "κτυπητό": "χτυπητό",
- "κωφός": "κουφός",
- "κωφή": "κουφή",
- "κωφό": "κουφό",
- "λειπανάβατος": "λειψανάβατος",
- "λειπανάβατη": "λειψανάβατη",
- "λειπανάβατο": "λειψανάβατο",
- "λιανικός": "λειανικός",
- "λιανική": "λειανική",
- "λιανικό": "λειανικό",
- "λιανός": "λειανός",
- "λιανή": "λειανή",
- "λιανό": "λειανό",
- "λιγοήµερος": "ολιγοήµερος",
- "λιγοήµερη": "ολιγοήµερη",
- "λιγοήµερο": "ολιγοήµερο",
- "λιγόκαρδος": "ολιγόκαρδος",
- "λιγόκαρδη": "ολιγόκαρδη",
- "λιγόκαρδο": "ολιγόκαρδο",
- "λιγόλογος": "ολιγόλογος",
- "λιγόλογη": "ολιγόλογη",
- "λιγόλογο": "ολιγόλογο",
- "λιγόπιστος": "ολιγόπιστος",
- "λιγόπιστη": "ολιγόπιστη",
- "λιγόπιστο": "ολιγόπιστο",
- "λιγόψυχος": "ολιγοψυχία",
- "λιγόψυχοςή": "ολιγοψυχίαη",
- "λιγόψυχοςό": "ολιγοψυχίαο",
- "λιόλουστος": "ηλιόλουστος",
- "λιόλουστη": "ηλιόλουστη",
- "λιόλουστο": "ηλιόλουστο",
- "λιόµορφος": "ηλιόµορφος",
- "λιόµορφη": "ηλιόµορφη",
- "λιόµορφο": "ηλιόµορφο",
- "λιόχαρος": "ηλιόχαρος",
- "λιόχαρη": "ηλιόχαρη",
- "λιόχαρο": "ηλιόχαρο",
- "λιπανάβατος": "λειψανάβατος",
- "λιπανάβατη": "λειψανάβατη",
- "λιπανάβατο": "λειψανάβατο",
- "λυµφατικός": "λεµφατικός",
- "λυµφατική": "λεµφατική",
- "λυµφατικό": "λεµφατικό",
- "µαυριδερός": "µαυρειδερός",
- "µαυριδερή": "µαυρειδερή",
- "µαυριδερό": "µαυρειδερό",
- "µεικτός": "µικτός",
- "µεικτή": "µικτή",
- "µεικτό": "µικτό",
- "µελαψός": "µελαµψός",
- "µελαψή": "µελαµψή",
- "µελαψό": "µελαµψό",
- "µετάξινος": "µεταξένιος",
- "µετάξινη": "µεταξένιη",
- "µετάξινο": "µεταξένιο",
- "µιξοβάρβαρος": "µειξοβάρβαρος",
- "µιξοβάρβαρη": "µειξοβάρβαρη",
- "µιξοβάρβαρο": "µειξοβάρβαρο",
- "µοσκαναθρεµµένος": "µοσχαναθρεµµένος",
- "µοσκαναθρεµµένη": "µοσχαναθρεµµένη",
- "µοσκαναθρεµµένο": "µοσχαναθρεµµένο",
- "µουλωχτός": "µουλλωχτός",
- "µουλωχτή": "µουλλωχτή",
- "µουλωχτό": "µουλλωχτό",
- "µπαµπακερός": "βαµβακερός",
- "µπαµπακερή": "βαµβακερή",
- "µπαµπακερό": "βαµβακερό",
- "νεόχτιστος": "νεόκτιστος",
- "νεόχτιστη": "νεόκτιστη",
- "νεόχτιστο": "νεόκτιστο",
- "νηστίσιµος": "νηστήσιµος",
- "νηστίσιµη": "νηστήσιµη",
- "νηστίσιµο": "νηστήσιµο",
- "νιογέννητος": "νεογέννητος",
- "νιογέννητη": "νεογέννητη",
- "νιογέννητο": "νεογέννητο",
- "νυκτερινός": "νυχτερινός",
- "νυκτερινή": "νυχτερινή",
- "νυκτερινό": "νυχτερινό",
- "ξιπόλητος": "ξυπόλυτος",
- "ξιπόλητη": "ξυπόλυτη",
- "ξιπόλητο": "ξυπόλυτο",
- "ξυνός": "ξινός",
- "ξυνή": "ξινή",
- "ξυνό": "ξινό",
- "ξωτικός": "εξωτικός",
- "ξωτική": "εξωτική",
- "ξωτικό": "εξωτικό",
- "οικονοµίστικος": "οικονοµικίστικος",
- "οικονοµίστικη": "οικονοµικίστικη",
- "οικονοµίστικο": "οικονοµικίστικο",
- "οκταγωνικός": "οχταγωνικός",
- "οκταγωνική": "οχταγωνική",
- "οκταγωνικό": "οχταγωνικό",
- "οκτάγωνος": "οχτάγωνος",
- "οκτάγωνη": "οχτάγωνη",
- "οκτάγωνο": "οχτάγωνο",
- "οκτάεδρος": "οχτάεδρος",
- "οκτάεδρη": "οχτάεδρη",
- "οκτάεδρο": "οχτάεδρο",
- "οκτάκιλος": "οχτάκιλος",
- "οκτάκιλη": "οχτάκιλη",
- "οκτάκιλο": "οχτάκιλο",
- "οξειδώσιµος": "οξιδώσιµος",
- "οξειδώσιµη": "οξιδώσιµη",
- "οξειδώσιµο": "οξιδώσιµο",
- "ορεχτικός": "ορεκτικός",
- "ορεχτική": "ορεκτική",
- "ορεχτικό": "ορεκτικό",
- "οχταγωνικός": "οκταγωνικός",
- "οχταγωνική": "οκταγωνική",
- "οχταγωνικό": "οκταγωνικό",
- "οχτάγωνος": "οκτάγωνος",
- "οχτάγωνη": "οκτάγωνη",
- "οχτάγωνο": "οκτάγωνο",
- "οχτάεδρος": "οκτάεδρος",
- "οχτάεδρη": "οκτάεδρη",
- "οχτάεδρο": "οκτάεδρο",
- "οχτακοσιοστός": "οκτακοσιοστός",
- "οχτακοσιοστή": "οκτακοσιοστή",
- "οχτακοσιοστό": "οκτακοσιοστό",
- "οχτάπλευρος": "οκτάπλευρος",
- "οχτάπλευρη": "οκτάπλευρη",
- "οχτάπλευρο": "οκτάπλευρο",
- "οχτάστηλος": "οκτάστηλος",
- "οχτάστηλη": "οκτάστηλη",
- "οχτάστηλο": "οκτάστηλο",
- "οχτάστιχος": "οκτάστιχος",
- "οχτάστιχη": "οκτάστιχη",
- "οχτάστιχο": "οκτάστιχο",
- "οχτάωρος": "οκτάωρος",
- "οχτάωρη": "οκτάωρη",
- "οχτάωρο": "οκτάωρο",
- "οχτωβριανός": "οκτωβριανός",
- "οχτωβριανή": "οκτωβριανή",
- "οχτωβριανό": "οκτωβριανό",
- "παιδιακίστικος": "παιδιάστικος",
- "παιδιακίστικη": "παιδιάστικη",
- "παιδιακίστικο": "παιδιάστικο",
- "πανέρµος": "πανέρηµος",
- "πανέρµη": "πανέρηµη",
- "πανέρµο": "πανέρηµο",
- "παπαδικός": "παππαδικός",
- "παπαδική": "παππαδική",
- "παπαδικό": "παππαδικό",
- "παπαδίστικος": "παππαδίστικος",
- "παπαδίστικη": "παππαδίστικη",
- "παπαδίστικο": "παππαδίστικο",
- "παραεκκλησιαστικός": "παρεκκλησιαστικός",
- "παραεκκλησιαστική": "παρεκκλησιαστική",
- "παραεκκλησιαστικό": "παρεκκλησιαστικό",
- "πειρακτικός": "πειραχτικός",
- "πειρακτική": "πειραχτική",
- "πειρακτικό": "πειραχτικό",
- "περήφανος": "υπερήφανος",
- "περήφανη": "υπερήφανη",
- "περήφανο": "υπερήφανο",
- "περσότερος": "περισσότερος",
- "περσότερη": "περισσότερη",
- "περσότερο": "περισσότερο",
- "πεταγµένος": "πεταµένος",
- "πεταγµένη": "πεταµένη",
- "πεταγµένο": "πεταµένο",
- "πηκτός": "πηχτός",
- "πηκτή": "πηχτή",
- "πηκτό": "πηχτό",
- "πιτσιλιστός": "πιτσυλιστός",
- "πιτσιλιστή": "πιτσυλιστή",
- "πιτσιλιστό": "πιτσυλιστό",
- "πλεχτικός": "πλεκτικός",
- "πλεχτική": "πλεκτική",
- "πλεχτικό": "πλεκτικό",
- "πλεχτός": "πλεκτός",
- "πλεχτή": "πλεκτή",
- "πλεχτό": "πλεκτό",
- "προσεχτικός": "προσεκτικός",
- "προσεχτική": "προσεκτική",
- "προσεχτικό": "προσεκτικό",
- "προψεσινός": "προχθεσινός",
- "προψεσινή": "προχθεσινή",
- "προψεσινό": "προχθεσινό",
- "πτερωτός": "φτερωτός",
- "πτερωτή": "φτερωτή",
- "πτερωτό": "φτερωτό",
- "πτωχικός": "φτωχικός",
- "πτωχική": "φτωχική",
- "πτωχικό": "φτωχικό",
- "ραφτικός": "ραπτικός",
- "ραφτική": "ραπτική",
- "ραφτικό": "ραπτικό",
- "ραφτός": "ραπτός",
- "ραφτή": "ραπτή",
- "ραφτό": "ραπτό",
- "ρούσικος": "ρωσικός",
- "ρούσικη": "ρωσική",
- "ρούσικο": "ρωσικό",
- "ρωµαντικός": "ροµαντικός",
- "ρωµαντική": "ροµαντική",
- "ρωµαντικό": "ροµαντικό",
- "σειληνικός": "σιληνικός",
- "σειληνική": "σιληνική",
- "σειληνικό": "σιληνικό",
- "σειριακός": "σειραϊκός",
- "σειριακή": "σειραϊκή",
- "σειριακό": "σειραϊκό",
- "σεξπιρικός": "σαιξπηρικός",
- "σεξπιρική": "σαιξπηρική",
- "σεξπιρικό": "σαιξπηρικό",
- "σιδηρόφρακτος": "σιδερόφραχτος",
- "σιδηρόφρακτη": "σιδερόφραχτη",
- "σιδηρόφρακτο": "σιδερόφραχτο",
- "σκεβρός": "σκευρός",
- "σκεβρή": "σκευρή",
- "σκεβρό": "σκευρό",
- "σκεφτικός": "σκεπτικός",
- "σκεφτική": "σκεπτική",
- "σκεφτικό": "σκεπτικό",
- "σκιστός": "σχιστός",
- "σκιστή": "σχιστή",
- "σκιστό": "σχιστό",
- "σκολιανός": "σχολιανός",
- "σκολιανή": "σχολιανή",
- "σκολιανό": "σχολιανό",
- "σκοτσέζικος": "σκοτσέζικος",
- "σκοτσέζικη": "σκοτσέζικη",
- "σκοτσέζικο": "σκοτσέζικο",
- "σµυρνιώτικος": "σµυρναίικος",
- "σµυρνιώτικη": "σµυρναίικη",
- "σµυρνιώτικο": "σµυρναίικο",
- "σοροπιαστός": "σιροπιαστός",
- "σοροπιαστή": "σιροπιαστή",
- "σοροπιαστό": "σιροπιαστό",
- "σπερνός": "εσπερινός",
- "σπερνή": "εσπερινή",
- "σπερνό": "εσπερινό",
- "σταρόχρωµος": "σιταρόχρωµος",
- "σταρόχρωµη": "σιταρόχρωµη",
- "σταρόχρωµο": "σιταρόχρωµο",
- "στενάχωρος": "στενόχωρος",
- "στενάχωρη": "στενόχωρη",
- "στενάχωρο": "στενόχωρο",
- "στιλιστικός": "στυλιστικός",
- "στιλιστική": "στυλιστική",
- "στιλιστικό": "στυλιστικό",
- "στριµόκωλος": "στρυµόκωλος",
- "στριµόκωλη": "στρυµόκωλη",
- "στριµόκωλο": "στρυµόκωλο",
- "στριµωχτός": "στρυµωχτός",
- "στριµωχτή": "στρυµωχτή",
- "στριµωχτό": "στρυµωχτό",
- "στριφνός": "στρυφνός",
- "στριφνή": "στρυφνή",
- "στριφνό": "στρυφνό",
- "σύµµεικτος": "σύµµικτος",
- "σύµµεικτη": "σύµµικτη",
- "σύµµεικτο": "σύµµικτο",
- "σύµψυχος": "σύψυχος",
- "σύµψυχη": "σύψυχη",
- "σύµψυχο": "σύψυχο",
- "συντεθειµένος": "συνθέτω",
- "συντεθειµένοςή": "συνθέτωη",
- "συντεθειµένοςό": "συνθέτωο",
- "συφοριασµένος": "συμφοριασμένος",
- "συφοριασµένη": "συμφοριασμένη",
- "συφοριασµένο": "συμφοριασμένο",
- "συχωριανός": "συγχωριανός",
- "συχωριανή": "συγχωριανή",
- "συχωριανό": "συγχωριανό",
- "ταγκός": "ταγγός",
- "ταγκή": "ταγγή",
- "ταµιευτικός": "αποταµιευτικός",
- "ταµιευτική": "αποταµιευτική",
- "ταµιευτικό": "αποταµιευτικό",
- "ταχτικός": "τακτικός",
- "ταχτική": "τακτική",
- "ταχτικό": "τακτικό",
- "τελολογικός": "τελεολογικός",
- "τελολογική": "τελεολογική",
- "τελολογικό": "τελεολογικό",
- "τραγικοκωµικός": "κωµικοτραγικός",
- "τραγικοκωµική": "κωµικοτραγική",
- "τραγικοκωµικό": "κωµικοτραγικό",
- "τρελλός": "τρελός",
- "τρελλή": "τρελή",
- "τρελλό": "τρελό",
- "τσεβδός": "τσευδός",
- "τσεβδή": "τσευδή",
- "τσεβδό": "τσευδό",
- "τσιριχτός": "τσυριχτός",
- "τσιριχτή": "τσυριχτή",
- "τσιριχτό": "τσυριχτό",
- "τσιτωτός": "τσητωτός",
- "τσιτωτή": "τσητωτή",
- "τσιτωτό": "τσητωτό",
- "υποµονητικός": "υποµονετικός",
- "υποµονητική": "υποµονετική",
- "υποµονητικό": "υποµονετικό",
- "φαµφαρονικός": "φανφαρονίστικος",
- "φαµφαρονική": "φανφαρονίστικη",
- "φαµφαρονικό": "φανφαρονίστικο",
- "φαµφαρονίστικος": "φανφαρονίστικος",
- "φαµφαρονίστικη": "φανφαρονίστικη",
- "φαµφαρονίστικο": "φανφαρονίστικο",
- "φαντός": "υφαντός",
- "φαντή": "υφαντή",
- "φαντό": "υφαντό",
- "φανφαρονικός": "φανφαρονιστικός",
- "φανφαρονική": "φανφαρονιστική",
- "φανφαρονικό": "φανφαρονιστικό",
- "φαρακλός": "φαλακρός",
- "φαρακλή": "φαλακρή",
- "φαρακλό": "φαλακρό",
- "φεγγαροφώτιστος": "φεγγαρόφωτος",
- "φεγγαροφώτιστη": "φεγγαρόφωτη",
- "φεγγαροφώτιστο": "φεγγαρόφωτο",
- "φεουδαλικός": "φεουδαρχικός",
- "φεουδαλική": "φεουδαρχική",
- "φεουδαλικό": "φεουδαρχικό",
- "φλοκάτος": "φλοκωτός",
- "φλοκάτη": "φλοκωτή",
- "φλοκάτο": "φλοκωτό",
- "φριχτός": "φρικτός",
- "φριχτή": "φρικτή",
- "φριχτό": "φρικτό",
- "φροϋδικός": "φροϊδικός",
- "φροϋδική": "φροϊδική",
- "φροϋδικό": "φροϊδικό",
- "φτειαστός": "φτειαχτός",
- "φτειαστή": "φτειαχτή",
- "φτειαστό": "φτειαχτό",
- "φτηνός": "φθηνός",
- "φτηνή": "φθηνή",
- "φτηνό": "φθηνό",
- "φυσιοθεραπευτικός": "φυσιοθεραπευτικός",
- "φυσιοθεραπευτική": "φυσιοθεραπευτική",
- "φυσιοθεραπευτικό": "φυσιοθεραπευτικό",
- "φωβιστικός": "φοβιστικός",
- "φωβιστική": "φοβιστική",
- "φωβιστικό": "φοβιστικό",
- "χαδεµένος": "χαϊδεµένος",
- "χαδεµένη": "χαϊδεµένη",
- "χαδεµένο": "χαϊδεµένο",
- "χειλόφωνος": "χειλεόφωνος",
- "χειλόφωνη": "χειλεόφωνη",
- "χειλόφωνο": "χειλεόφωνο",
- "χειροδύναµος": "χεροδύναµος",
- "χειροδύναµη": "χεροδύναµη",
- "χειροδύναµο": "χεροδύναµο",
- "χηράµενος": "χηρευάµενος",
- "χηράµενη": "χηρευάµενη",
- "χηράµενο": "χηρευάµενο",
- "χλωµός": "χλοµός",
- "χλωµή": "χλοµή",
- "χλωµό": "χλοµό",
- "χνουδάτος": "χνουδωτός",
- "χνουδάτη": "χνουδωτή",
- "χνουδάτο": "χνουδωτό",
- "χονδρός": "χοντρός",
- "χονδρή": "χοντρή",
- "χονδρό": "χοντρό",
- "χουβαρντάδικος": "χουβαρντάς",
- "χουβαρντάδικοςή": "χουβαρντάςη",
- "χουβαρντάδικοςό": "χουβαρντάςο",
- "χρεολυτικός": "χρεωλυτικός",
- "χρεολυτική": "χρεωλυτική",
- "χρεολυτικό": "χρεωλυτικό",
- "χρησµοδοτικός": "χρησµοδοσία",
- "χρησµοδοτική": "χρησµοδοσίαη",
- "χρησµοδοτικό": "χρησµοδοσίαο",
- "χρυσόπλεχτος": "χρυσόπλεκτος",
- "χρυσόπλεχτη": "χρυσόπλεκτη",
- "χρυσόπλεχτο": "χρυσόπλεκτο",
- "χτεσινός": "χθεσινός",
- "χτεσινή": "χθεσινή",
- "χτεσινό": "χθεσινό",
- "χτιστός": "κτιστός",
- "χτιστή": "κτιστή",
- "χτιστό": "κτιστό",
- "αντρείος": "ανδρείος",
- "αντρεία": "ανδρεία",
- "αντρείο": "ανδρείο",
- "αποποµπαίος": "αποδιοποµπαίος",
- "αποποµπαία": "αποδιοποµπαία",
- "αποποµπαίο": "αποδιοποµπαίο",
- "γεραλεος": "γηραλέος",
- "γεραλεα": "γηραλέα",
- "γεραλεο": "γηραλέο",
- "εντόπιος": "ντόπιος",
- "εντόπια": "ντόπια",
- "εντόπιο": "ντόπιο",
- "εφταπλάσιος": "επταπλάσιος",
- "εφταπλάσια": "επταπλάσια",
- "εφταπλάσιο": "επταπλάσιο",
- "ζούφιος": "τζούφιος",
- "ζούφια": "τζούφια",
- "ζούφιο": "τζούφιο",
- "καθάριος": "καθάρειος",
- "καθάρια": "καθάρεια",
- "καθάριο": "καθάρειο",
- "λαφήσιος": "ελαφήσιος",
- "λαφήσια": "ελαφήσια",
- "λαφήσιο": "ελαφήσιο",
- "οκταθέσιος": "οχταθέσιος",
- "οκταθέσια": "οχταθέσια",
- "οκταθέσιο": "οχταθέσιο",
- "ονυχαίος": "ονυχιαίος",
- "ονυχαία": "ονυχιαία",
- "ονυχαίο": "ονυχιαίο",
- "οχταπλάσιος": "οκταπλάσιος",
- "οχταπλάσια": "οκταπλάσια",
- "οχταπλάσιο": "οκταπλάσιο",
- "βοϊδήσιος": "βοδινός",
- "βοϊδήσια": "βοδινή",
- "βοϊδήσιο": "βοδινό",
- "καλαµποκίσιος": "καλαµποκήσιος",
- "καλαµποκίσια": "καλαµποκήσια",
- "καλαµποκίσιο": "καλαµποκήσιο",
- "κεφαλίσιος": "κεφαλήσιος",
- "κεφαλίσια": "κεφαλήσια",
- "κεφαλίσιο": "κεφαλήσιο",
- "κρουσταλλένιος": "κρυσταλλένιος",
- "κρουσταλλένια": "κρυσταλλένια",
- "κρουσταλλένιο": "κρυσταλλένιο",
- "µοσκαρήσιος": "µοσχαρήσιος",
- "µοσκαρήσια": "µοσχαρήσια",
- "µοσκαρήσιο": "µοσχαρήσιο",
- "παλικαρήσιος": "παλληκαρήσιος",
- "παλικαρήσια": "παλληκαρήσια",
- "παλικαρήσιο": "παλληκαρήσιο",
- "πετρένιος": "πέτρινος",
- "πετρένια": "πέτρινη",
- "πετρένιο": "πέτρινο",
- "σιταρένιος": "σταρένιος",
- "σιταρένια": "σταρένια",
- "σιταρένιο": "σταρένιο",
- "σκυλίσιος": "σκυλήσιος",
- "σκυλίσια": "σκυλήσια",
- "σκυλίσιο": "σκυλήσιο",
- "χελίσιος": "χελήσιος",
- "χελίσια": "χελήσια",
- "χελίσιο": "χελήσιο",
- "χελωνίσιος": "χελωνήσιος",
- "χελωνίσια": "χελωνήσια",
- "χελωνίσιο": "χελωνήσιο",
- "γουρσούζης": "γρουσούζης",
- "γουρσούζα": "γρουσούζα",
- "γουρσούζικο": "γρουσούζικο",
- "γρινιάρης": "γκρινιάρης",
- "γρινιάρα": "γκρινιάρα",
- "γρινιάρικο": "γκρινιάρικο",
- "λιχούδης": "λειχούδης",
- "λιχούδα": "λειχούδα",
- "λιχούδικο": "λειχούδικο",
- "µαργιόλής": "µαριόλης",
- "µαργιόλήςα": "µαριόλα",
- "µαργιόλήςικο": "µαριόλικο",
- "ξεκουτιάρης": "ξεκούτης",
- "ξεκουτιάρα": "ξεκούτα",
- "ξεκουτιάρικο": "ξεκούτικο",
- "σκανδαλιάρης": "σκανταλιάρης",
- "σκανδαλιάρα": "σκανταλιάρα",
- "σκανδαλιάρικο": "σκανταλιάρικο",
- "τσιγκούνης": "τσιγγούνης",
- "τσιγκούνα": "τσιγγούνα",
- "τσιγκούνικο": "τσιγγούνικο",
-}
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
- NORM_EXCEPTIONS[string] = norm
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index fca4e01e7..4304b3c6a 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
@@ -10,10 +9,9 @@ from .morph_rules import MORPH_RULES
from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc


def _return_en(_):
@@ -24,9 +22,6 @@ class EnglishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = _return_en
- lex_attr_getters[NORM] = add_lookups(
- Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
- )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
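For reference, the three removed lines in the hunk above are what folded the shared BASE_NORMS table and the per-language NORM_EXCEPTIONS dict into the NORM lexical attribute getter. A minimal sketch of that old wiring, assuming the v2-style helpers referenced in the hunk and using a two-entry excerpt of the English table deleted below:

    from spacy.attrs import NORM
    from spacy.lang.norm_exceptions import BASE_NORMS
    from spacy.language import Language
    from spacy.util import add_lookups

    # Two entries excerpted from the exception table removed below;
    # the deleted file maps far more strings.
    NORM_EXCEPTIONS = {"cos": "because", "thx": "thanks"}

    # add_lookups wraps the default NORM getter: strings found in one of the
    # lookup tables are normalized from the table, everything else falls
    # through to the default getter.
    norm_getter = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
    )
    norm_getter("cos")  # -> "because", served from the exception table
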
diff --git a/spacy/lang/en/norm_exceptions.py b/spacy/lang/en/norm_exceptions.py
deleted file mode 100644
index a2cf58b8a..000000000
--- a/spacy/lang/en/norm_exceptions.py
+++ /dev/null
@@ -1,1768 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-_exc = {
- # Slang and abbreviations
- "cos": "because",
- "cuz": "because",
- "fav": "favorite",
- "fave": "favorite",
- "misc": "miscellaneous",
- "plz": "please",
- "pls": "please",
- "thx": "thanks",
- # US vs. UK spelling
- "accessorise": "accessorize",
- "accessorised": "accessorized",
- "accessorises": "accessorizes",
- "accessorising": "accessorizing",
- "acclimatisation": "acclimatization",
- "acclimatise": "acclimatize",
- "acclimatised": "acclimatized",
- "acclimatises": "acclimatizes",
- "acclimatising": "acclimatizing",
- "accoutrements": "accouterments",
- "aeon": "eon",
- "aeons": "eons",
- "aerogramme": "aerogram",
- "aerogrammes": "aerograms",
- "aeroplane": "airplane",
- "aeroplanes ": "airplanes ",
- "aesthete": "esthete",
- "aesthetes": "esthetes",
- "aesthetic": "esthetic",
- "aesthetically": "esthetically",
- "aesthetics": "esthetics",
- "aetiology": "etiology",
- "ageing": "aging",
- "aggrandisement": "aggrandizement",
- "agonise": "agonize",
- "agonised": "agonized",
- "agonises": "agonizes",
- "agonising": "agonizing",
- "agonisingly": "agonizingly",
- "almanack": "almanac",
- "almanacks": "almanacs",
- "aluminium": "aluminum",
- "amortisable": "amortizable",
- "amortisation": "amortization",
- "amortisations": "amortizations",
- "amortise": "amortize",
- "amortised": "amortized",
- "amortises": "amortizes",
- "amortising": "amortizing",
- "amphitheatre": "amphitheater",
- "amphitheatres": "amphitheaters",
- "anaemia": "anemia",
- "anaemic": "anemic",
- "anaesthesia": "anesthesia",
- "anaesthetic": "anesthetic",
- "anaesthetics": "anesthetics",
- "anaesthetise": "anesthetize",
- "anaesthetised": "anesthetized",
- "anaesthetises": "anesthetizes",
- "anaesthetising": "anesthetizing",
- "anaesthetist": "anesthetist",
- "anaesthetists": "anesthetists",
- "anaesthetize": "anesthetize",
- "anaesthetized": "anesthetized",
- "anaesthetizes": "anesthetizes",
- "anaesthetizing": "anesthetizing",
- "analogue": "analog",
- "analogues": "analogs",
- "analyse": "analyze",
- "analysed": "analyzed",
- "analyses": "analyzes",
- "analysing": "analyzing",
- "anglicise": "anglicize",
- "anglicised": "anglicized",
- "anglicises": "anglicizes",
- "anglicising": "anglicizing",
- "annualised": "annualized",
- "antagonise": "antagonize",
- "antagonised": "antagonized",
- "antagonises": "antagonizes",
- "antagonising": "antagonizing",
- "apologise": "apologize",
- "apologised": "apologized",
- "apologises": "apologizes",
- "apologising": "apologizing",
- "appal": "appall",
- "appals": "appalls",
- "appetiser": "appetizer",
- "appetisers": "appetizers",
- "appetising": "appetizing",
- "appetisingly": "appetizingly",
- "arbour": "arbor",
- "arbours": "arbors",
- "archaeological": "archeological",
- "archaeologically": "archeologically",
- "archaeologist": "archeologist",
- "archaeologists": "archeologists",
- "archaeology": "archeology",
- "ardour": "ardor",
- "armour": "armor",
- "armoured": "armored",
- "armourer": "armorer",
- "armourers": "armorers",
- "armouries": "armories",
- "armoury": "armory",
- "artefact": "artifact",
- "artefacts": "artifacts",
- "authorise": "authorize",
- "authorised": "authorized",
- "authorises": "authorizes",
- "authorising": "authorizing",
- "axe": "ax",
- "backpedalled": "backpedaled",
- "backpedalling": "backpedaling",
- "bannister": "banister",
- "bannisters": "banisters",
- "baptise": "baptize",
- "baptised": "baptized",
- "baptises": "baptizes",
- "baptising": "baptizing",
- "bastardise": "bastardize",
- "bastardised": "bastardized",
- "bastardises": "bastardizes",
- "bastardising": "bastardizing",
- "battleaxe": "battleax",
- "baulk": "balk",
- "baulked": "balked",
- "baulking": "balking",
- "baulks": "balks",
- "bedevilled": "bedeviled",
- "bedevilling": "bedeviling",
- "behaviour": "behavior",
- "behavioural": "behavioral",
- "behaviourism": "behaviorism",
- "behaviourist": "behaviorist",
- "behaviourists": "behaviorists",
- "behaviours": "behaviors",
- "behove": "behoove",
- "behoved": "behooved",
- "behoves": "behooves",
- "bejewelled": "bejeweled",
- "belabour": "belabor",
- "belaboured": "belabored",
- "belabouring": "belaboring",
- "belabours": "belabors",
- "bevelled": "beveled",
- "bevvies": "bevies",
- "bevvy": "bevy",
- "biassed": "biased",
- "biassing": "biasing",
- "bingeing": "binging",
- "bougainvillaea": "bougainvillea",
- "bougainvillaeas": "bougainvilleas",
- "bowdlerise": "bowdlerize",
- "bowdlerised": "bowdlerized",
- "bowdlerises": "bowdlerizes",
- "bowdlerising": "bowdlerizing",
- "breathalyse": "breathalyze",
- "breathalysed": "breathalyzed",
- "breathalyser": "breathalyzer",
- "breathalysers": "breathalyzers",
- "breathalyses": "breathalyzes",
- "breathalysing": "breathalyzing",
- "brutalise": "brutalize",
- "brutalised": "brutalized",
- "brutalises": "brutalizes",
- "brutalising": "brutalizing",
- "buses": "busses",
- "busing": "bussing",
- "caesarean": "cesarean",
- "caesareans": "cesareans",
- "calibre": "caliber",
- "calibres": "calibers",
- "calliper": "caliper",
- "callipers": "calipers",
- "callisthenics": "calisthenics",
- "canalise": "canalize",
- "canalised": "canalized",
- "canalises": "canalizes",
- "canalising": "canalizing",
- "cancellation": "cancelation",
- "cancellations": "cancelations",
- "cancelled": "canceled",
- "cancelling": "canceling",
- "candour": "candor",
- "cannibalise": "cannibalize",
- "cannibalised": "cannibalized",
- "cannibalises": "cannibalizes",
- "cannibalising": "cannibalizing",
- "canonise": "canonize",
- "canonised": "canonized",
- "canonises": "canonizes",
- "canonising": "canonizing",
- "capitalise": "capitalize",
- "capitalised": "capitalized",
- "capitalises": "capitalizes",
- "capitalising": "capitalizing",
- "caramelise": "caramelize",
- "caramelised": "caramelized",
- "caramelises": "caramelizes",
- "caramelising": "caramelizing",
- "carbonise": "carbonize",
- "carbonised": "carbonized",
- "carbonises": "carbonizes",
- "carbonising": "carbonizing",
- "carolled": "caroled",
- "carolling": "caroling",
- "catalogue": "catalog",
- "catalogued": "cataloged",
- "catalogues": "catalogs",
- "cataloguing": "cataloging",
- "catalyse": "catalyze",
- "catalysed": "catalyzed",
- "catalyses": "catalyzes",
- "catalysing": "catalyzing",
- "categorise": "categorize",
- "categorised": "categorized",
- "categorises": "categorizes",
- "categorising": "categorizing",
- "cauterise": "cauterize",
- "cauterised": "cauterized",
- "cauterises": "cauterizes",
- "cauterising": "cauterizing",
- "cavilled": "caviled",
- "cavilling": "caviling",
- "centigramme": "centigram",
- "centigrammes": "centigrams",
- "centilitre": "centiliter",
- "centilitres": "centiliters",
- "centimetre": "centimeter",
- "centimetres": "centimeters",
- "centralise": "centralize",
- "centralised": "centralized",
- "centralises": "centralizes",
- "centralising": "centralizing",
- "centre": "center",
- "centred": "centered",
- "centrefold": "centerfold",
- "centrefolds": "centerfolds",
- "centrepiece": "centerpiece",
- "centrepieces": "centerpieces",
- "centres": "centers",
- "channelled": "channeled",
- "channelling": "channeling",
- "characterise": "characterize",
- "characterised": "characterized",
- "characterises": "characterizes",
- "characterising": "characterizing",
- "cheque": "check",
- "chequebook": "checkbook",
- "chequebooks": "checkbooks",
- "chequered": "checkered",
- "cheques": "checks",
- "chilli": "chili",
- "chimaera": "chimera",
- "chimaeras": "chimeras",
- "chiselled": "chiseled",
- "chiselling": "chiseling",
- "circularise": "circularize",
- "circularised": "circularized",
- "circularises": "circularizes",
- "circularising": "circularizing",
- "civilise": "civilize",
- "civilised": "civilized",
- "civilises": "civilizes",
- "civilising": "civilizing",
- "clamour": "clamor",
- "clamoured": "clamored",
- "clamouring": "clamoring",
- "clamours": "clamors",
- "clangour": "clangor",
- "clarinettist": "clarinetist",
- "clarinettists": "clarinetists",
- "collectivise": "collectivize",
- "collectivised": "collectivized",
- "collectivises": "collectivizes",
- "collectivising": "collectivizing",
- "colonisation": "colonization",
- "colonise": "colonize",
- "colonised": "colonized",
- "coloniser": "colonizer",
- "colonisers": "colonizers",
- "colonises": "colonizes",
- "colonising": "colonizing",
- "colour": "color",
- "colourant": "colorant",
- "colourants": "colorants",
- "coloured": "colored",
- "coloureds": "coloreds",
- "colourful": "colorful",
- "colourfully": "colorfully",
- "colouring": "coloring",
- "colourize": "colorize",
- "colourized": "colorized",
- "colourizes": "colorizes",
- "colourizing": "colorizing",
- "colourless": "colorless",
- "colours": "colors",
- "commercialise": "commercialize",
- "commercialised": "commercialized",
- "commercialises": "commercializes",
- "commercialising": "commercializing",
- "compartmentalise": "compartmentalize",
- "compartmentalised": "compartmentalized",
- "compartmentalises": "compartmentalizes",
- "compartmentalising": "compartmentalizing",
- "computerise": "computerize",
- "computerised": "computerized",
- "computerises": "computerizes",
- "computerising": "computerizing",
- "conceptualise": "conceptualize",
- "conceptualised": "conceptualized",
- "conceptualises": "conceptualizes",
- "conceptualising": "conceptualizing",
- "connexion": "connection",
- "connexions": "connections",
- "contextualise": "contextualize",
- "contextualised": "contextualized",
- "contextualises": "contextualizes",
- "contextualising": "contextualizing",
- "cosier": "cozier",
- "cosies": "cozies",
- "cosiest": "coziest",
- "cosily": "cozily",
- "cosiness": "coziness",
- "cosy": "cozy",
- "councillor": "councilor",
- "councillors": "councilors",
- "counselled": "counseled",
- "counselling": "counseling",
- "counsellor": "counselor",
- "counsellors": "counselors",
- "crenellated": "crenelated",
- "criminalise": "criminalize",
- "criminalised": "criminalized",
- "criminalises": "criminalizes",
- "criminalising": "criminalizing",
- "criticise": "criticize",
- "criticised": "criticized",
- "criticises": "criticizes",
- "criticising": "criticizing",
- "crueller": "crueler",
- "cruellest": "cruelest",
- "crystallisation": "crystallization",
- "crystallise": "crystallize",
- "crystallised": "crystallized",
- "crystallises": "crystallizes",
- "crystallising": "crystallizing",
- "cudgelled": "cudgeled",
- "cudgelling": "cudgeling",
- "customise": "customize",
- "customised": "customized",
- "customises": "customizes",
- "customising": "customizing",
- "cypher": "cipher",
- "cyphers": "ciphers",
- "decentralisation": "decentralization",
- "decentralise": "decentralize",
- "decentralised": "decentralized",
- "decentralises": "decentralizes",
- "decentralising": "decentralizing",
- "decriminalisation": "decriminalization",
- "decriminalise": "decriminalize",
- "decriminalised": "decriminalized",
- "decriminalises": "decriminalizes",
- "decriminalising": "decriminalizing",
- "defence": "defense",
- "defenceless": "defenseless",
- "defences": "defenses",
- "dehumanisation": "dehumanization",
- "dehumanise": "dehumanize",
- "dehumanised": "dehumanized",
- "dehumanises": "dehumanizes",
- "dehumanising": "dehumanizing",
- "demeanour": "demeanor",
- "demilitarisation": "demilitarization",
- "demilitarise": "demilitarize",
- "demilitarised": "demilitarized",
- "demilitarises": "demilitarizes",
- "demilitarising": "demilitarizing",
- "demobilisation": "demobilization",
- "demobilise": "demobilize",
- "demobilised": "demobilized",
- "demobilises": "demobilizes",
- "demobilising": "demobilizing",
- "democratisation": "democratization",
- "democratise": "democratize",
- "democratised": "democratized",
- "democratises": "democratizes",
- "democratising": "democratizing",
- "demonise": "demonize",
- "demonised": "demonized",
- "demonises": "demonizes",
- "demonising": "demonizing",
- "demoralisation": "demoralization",
- "demoralise": "demoralize",
- "demoralised": "demoralized",
- "demoralises": "demoralizes",
- "demoralising": "demoralizing",
- "denationalisation": "denationalization",
- "denationalise": "denationalize",
- "denationalised": "denationalized",
- "denationalises": "denationalizes",
- "denationalising": "denationalizing",
- "deodorise": "deodorize",
- "deodorised": "deodorized",
- "deodorises": "deodorizes",
- "deodorising": "deodorizing",
- "depersonalise": "depersonalize",
- "depersonalised": "depersonalized",
- "depersonalises": "depersonalizes",
- "depersonalising": "depersonalizing",
- "deputise": "deputize",
- "deputised": "deputized",
- "deputises": "deputizes",
- "deputising": "deputizing",
- "desensitisation": "desensitization",
- "desensitise": "desensitize",
- "desensitised": "desensitized",
- "desensitises": "desensitizes",
- "desensitising": "desensitizing",
- "destabilisation": "destabilization",
- "destabilise": "destabilize",
- "destabilised": "destabilized",
- "destabilises": "destabilizes",
- "destabilising": "destabilizing",
- "dialled": "dialed",
- "dialling": "dialing",
- "dialogue": "dialog",
- "dialogues": "dialogs",
- "diarrhoea": "diarrhea",
- "digitise": "digitize",
- "digitised": "digitized",
- "digitises": "digitizes",
- "digitising": "digitizing",
- "disc": "disk",
- "discolour": "discolor",
- "discoloured": "discolored",
- "discolouring": "discoloring",
- "discolours": "discolors",
- "discs": "disks",
- "disembowelled": "disemboweled",
- "disembowelling": "disemboweling",
- "disfavour": "disfavor",
- "dishevelled": "disheveled",
- "dishonour": "dishonor",
- "dishonourable": "dishonorable",
- "dishonourably": "dishonorably",
- "dishonoured": "dishonored",
- "dishonouring": "dishonoring",
- "dishonours": "dishonors",
- "disorganisation": "disorganization",
- "disorganised": "disorganized",
- "distil": "distill",
- "distils": "distills",
- "doin": "doing",
- "doin'": "doing",
- "dramatisation": "dramatization",
- "dramatisations": "dramatizations",
- "dramatise": "dramatize",
- "dramatised": "dramatized",
- "dramatises": "dramatizes",
- "dramatising": "dramatizing",
- "draught": "draft",
- "draughtboard": "draftboard",
- "draughtboards": "draftboards",
- "draughtier": "draftier",
- "draughtiest": "draftiest",
- "draughts": "drafts",
- "draughtsman": "draftsman",
- "draughtsmanship": "draftsmanship",
- "draughtsmen": "draftsmen",
- "draughtswoman": "draftswoman",
- "draughtswomen": "draftswomen",
- "draughty": "drafty",
- "drivelled": "driveled",
- "drivelling": "driveling",
- "duelled": "dueled",
- "duelling": "dueling",
- "economise": "economize",
- "economised": "economized",
- "economises": "economizes",
- "economising": "economizing",
- "edoema": "edema ",
- "editorialise": "editorialize",
- "editorialised": "editorialized",
- "editorialises": "editorializes",
- "editorialising": "editorializing",
- "empathise": "empathize",
- "empathised": "empathized",
- "empathises": "empathizes",
- "empathising": "empathizing",
- "emphasise": "emphasize",
- "emphasised": "emphasized",
- "emphasises": "emphasizes",
- "emphasising": "emphasizing",
- "enamelled": "enameled",
- "enamelling": "enameling",
- "enamoured": "enamored",
- "encyclopaedia": "encyclopedia",
- "encyclopaedias": "encyclopedias",
- "encyclopaedic": "encyclopedic",
- "endeavour": "endeavor",
- "endeavoured": "endeavored",
- "endeavouring": "endeavoring",
- "endeavours": "endeavors",
- "energise": "energize",
- "energised": "energized",
- "energises": "energizes",
- "energising": "energizing",
- "enrol": "enroll",
- "enrols": "enrolls",
- "enthral": "enthrall",
- "enthrals": "enthralls",
- "epaulette": "epaulet",
- "epaulettes": "epaulets",
- "epicentre": "epicenter",
- "epicentres": "epicenters",
- "epilogue": "epilog",
- "epilogues": "epilogs",
- "epitomise": "epitomize",
- "epitomised": "epitomized",
- "epitomises": "epitomizes",
- "epitomising": "epitomizing",
- "equalisation": "equalization",
- "equalise": "equalize",
- "equalised": "equalized",
- "equaliser": "equalizer",
- "equalisers": "equalizers",
- "equalises": "equalizes",
- "equalising": "equalizing",
- "eulogise": "eulogize",
- "eulogised": "eulogized",
- "eulogises": "eulogizes",
- "eulogising": "eulogizing",
- "evangelise": "evangelize",
- "evangelised": "evangelized",
- "evangelises": "evangelizes",
- "evangelising": "evangelizing",
- "exorcise": "exorcize",
- "exorcised": "exorcized",
- "exorcises": "exorcizes",
- "exorcising": "exorcizing",
- "extemporisation": "extemporization",
- "extemporise": "extemporize",
- "extemporised": "extemporized",
- "extemporises": "extemporizes",
- "extemporising": "extemporizing",
- "externalisation": "externalization",
- "externalisations": "externalizations",
- "externalise": "externalize",
- "externalised": "externalized",
- "externalises": "externalizes",
- "externalising": "externalizing",
- "factorise": "factorize",
- "factorised": "factorized",
- "factorises": "factorizes",
- "factorising": "factorizing",
- "faecal": "fecal",
- "faeces": "feces",
- "familiarisation": "familiarization",
- "familiarise": "familiarize",
- "familiarised": "familiarized",
- "familiarises": "familiarizes",
- "familiarising": "familiarizing",
- "fantasise": "fantasize",
- "fantasised": "fantasized",
- "fantasises": "fantasizes",
- "fantasising": "fantasizing",
- "favour": "favor",
- "favourable": "favorable",
- "favourably": "favorably",
- "favoured": "favored",
- "favouring": "favoring",
- "favourite": "favorite",
- "favourites": "favorites",
- "favouritism": "favoritism",
- "favours": "favors",
- "feminise": "feminize",
- "feminised": "feminized",
- "feminises": "feminizes",
- "feminising": "feminizing",
- "fertilisation": "fertilization",
- "fertilise": "fertilize",
- "fertilised": "fertilized",
- "fertiliser": "fertilizer",
- "fertilisers": "fertilizers",
- "fertilises": "fertilizes",
- "fertilising": "fertilizing",
- "fervour": "fervor",
- "fibre": "fiber",
- "fibreglass": "fiberglass",
- "fibres": "fibers",
- "fictionalisation": "fictionalization",
- "fictionalisations": "fictionalizations",
- "fictionalise": "fictionalize",
- "fictionalised": "fictionalized",
- "fictionalises": "fictionalizes",
- "fictionalising": "fictionalizing",
- "fillet": "filet",
- "filleted ": "fileted ",
- "filleting": "fileting",
- "fillets ": "filets ",
- "finalisation": "finalization",
- "finalise": "finalize",
- "finalised": "finalized",
- "finalises": "finalizes",
- "finalising": "finalizing",
- "flautist": "flutist",
- "flautists": "flutists",
- "flavour": "flavor",
- "flavoured": "flavored",
- "flavouring": "flavoring",
- "flavourings": "flavorings",
- "flavourless": "flavorless",
- "flavours": "flavors",
- "flavoursome": "flavorsome",
- "flyer / flier ": "flier / flyer ",
- "foetal": "fetal",
- "foetid": "fetid",
- "foetus": "fetus",
- "foetuses": "fetuses",
- "formalisation": "formalization",
- "formalise": "formalize",
- "formalised": "formalized",
- "formalises": "formalizes",
- "formalising": "formalizing",
- "fossilisation": "fossilization",
- "fossilise": "fossilize",
- "fossilised": "fossilized",
- "fossilises": "fossilizes",
- "fossilising": "fossilizing",
- "fraternisation": "fraternization",
- "fraternise": "fraternize",
- "fraternised": "fraternized",
- "fraternises": "fraternizes",
- "fraternising": "fraternizing",
- "fulfil": "fulfill",
- "fulfilment": "fulfillment",
- "fulfils": "fulfills",
- "funnelled": "funneled",
- "funnelling": "funneling",
- "galvanise": "galvanize",
- "galvanised": "galvanized",
- "galvanises": "galvanizes",
- "galvanising": "galvanizing",
- "gambolled": "gamboled",
- "gambolling": "gamboling",
- "gaol": "jail",
- "gaolbird": "jailbird",
- "gaolbirds": "jailbirds",
- "gaolbreak": "jailbreak",
- "gaolbreaks": "jailbreaks",
- "gaoled": "jailed",
- "gaoler": "jailer",
- "gaolers": "jailers",
- "gaoling": "jailing",
- "gaols": "jails",
- "gases": "gasses",
- "gauge": "gage",
- "gauged": "gaged",
- "gauges": "gages",
- "gauging": "gaging",
- "generalisation": "generalization",
- "generalisations": "generalizations",
- "generalise": "generalize",
- "generalised": "generalized",
- "generalises": "generalizes",
- "generalising": "generalizing",
- "ghettoise": "ghettoize",
- "ghettoised": "ghettoized",
- "ghettoises": "ghettoizes",
- "ghettoising": "ghettoizing",
- "gipsies": "gypsies",
- "glamorise": "glamorize",
- "glamorised": "glamorized",
- "glamorises": "glamorizes",
- "glamorising": "glamorizing",
- "glamour": "glamor",
- "globalisation": "globalization",
- "globalise": "globalize",
- "globalised": "globalized",
- "globalises": "globalizes",
- "globalising": "globalizing",
- "glueing ": "gluing ",
- "goin": "going",
- "goin'": "going",
- "goitre": "goiter",
- "goitres": "goiters",
- "gonorrhoea": "gonorrhea",
- "gramme": "gram",
- "grammes": "grams",
- "gravelled": "graveled",
- "grey": "gray",
- "greyed": "grayed",
- "greying": "graying",
- "greyish": "grayish",
- "greyness": "grayness",
- "greys": "grays",
- "grovelled": "groveled",
- "grovelling": "groveling",
- "groyne": "groin",
- "groynes ": "groins",
- "gruelling": "grueling",
- "gruellingly": "gruelingly",
- "gryphon": "griffin",
- "gryphons": "griffins",
- "gynaecological": "gynecological",
- "gynaecologist": "gynecologist",
- "gynaecologists": "gynecologists",
- "gynaecology": "gynecology",
- "haematological": "hematological",
- "haematologist": "hematologist",
- "haematologists": "hematologists",
- "haematology": "hematology",
- "haemoglobin": "hemoglobin",
- "haemophilia": "hemophilia",
- "haemophiliac": "hemophiliac",
- "haemophiliacs": "hemophiliacs",
- "haemorrhage": "hemorrhage",
- "haemorrhaged": "hemorrhaged",
- "haemorrhages": "hemorrhages",
- "haemorrhaging": "hemorrhaging",
- "haemorrhoids": "hemorrhoids",
- "harbour": "harbor",
- "harboured": "harbored",
- "harbouring": "harboring",
- "harbours": "harbors",
- "harmonisation": "harmonization",
- "harmonise": "harmonize",
- "harmonised": "harmonized",
- "harmonises": "harmonizes",
- "harmonising": "harmonizing",
- "havin": "having",
- "havin'": "having",
- "homoeopath": "homeopath",
- "homoeopathic": "homeopathic",
- "homoeopaths": "homeopaths",
- "homoeopathy": "homeopathy",
- "homogenise": "homogenize",
- "homogenised": "homogenized",
- "homogenises": "homogenizes",
- "homogenising": "homogenizing",
- "honour": "honor",
- "honourable": "honorable",
- "honourably": "honorably",
- "honoured": "honored",
- "honouring": "honoring",
- "honours": "honors",
- "hospitalisation": "hospitalization",
- "hospitalise": "hospitalize",
- "hospitalised": "hospitalized",
- "hospitalises": "hospitalizes",
- "hospitalising": "hospitalizing",
- "humanise": "humanize",
- "humanised": "humanized",
- "humanises": "humanizes",
- "humanising": "humanizing",
- "humour": "humor",
- "humoured": "humored",
- "humouring": "humoring",
- "humourless": "humorless",
- "humours": "humors",
- "hybridise": "hybridize",
- "hybridised": "hybridized",
- "hybridises": "hybridizes",
- "hybridising": "hybridizing",
- "hypnotise": "hypnotize",
- "hypnotised": "hypnotized",
- "hypnotises": "hypnotizes",
- "hypnotising": "hypnotizing",
- "hypothesise": "hypothesize",
- "hypothesised": "hypothesized",
- "hypothesises": "hypothesizes",
- "hypothesising": "hypothesizing",
- "idealisation": "idealization",
- "idealise": "idealize",
- "idealised": "idealized",
- "idealises": "idealizes",
- "idealising": "idealizing",
- "idolise": "idolize",
- "idolised": "idolized",
- "idolises": "idolizes",
- "idolising": "idolizing",
- "immobilisation": "immobilization",
- "immobilise": "immobilize",
- "immobilised": "immobilized",
- "immobiliser": "immobilizer",
- "immobilisers": "immobilizers",
- "immobilises": "immobilizes",
- "immobilising": "immobilizing",
- "immortalise": "immortalize",
- "immortalised": "immortalized",
- "immortalises": "immortalizes",
- "immortalising": "immortalizing",
- "immunisation": "immunization",
- "immunise": "immunize",
- "immunised": "immunized",
- "immunises": "immunizes",
- "immunising": "immunizing",
- "impanelled": "impaneled",
- "impanelling": "impaneling",
- "imperilled": "imperiled",
- "imperilling": "imperiling",
- "individualise": "individualize",
- "individualised": "individualized",
- "individualises": "individualizes",
- "individualising": "individualizing",
- "industrialise": "industrialize",
- "industrialised": "industrialized",
- "industrialises": "industrializes",
- "industrialising": "industrializing",
- "inflexion": "inflection",
- "inflexions": "inflections",
- "initialise": "initialize",
- "initialised": "initialized",
- "initialises": "initializes",
- "initialising": "initializing",
- "initialled": "initialed",
- "initialling": "initialing",
- "instal": "install",
- "instalment": "installment",
- "instalments": "installments",
- "instals": "installs",
- "instil": "instill",
- "instils": "instills",
- "institutionalisation": "institutionalization",
- "institutionalise": "institutionalize",
- "institutionalised": "institutionalized",
- "institutionalises": "institutionalizes",
- "institutionalising": "institutionalizing",
- "intellectualise": "intellectualize",
- "intellectualised": "intellectualized",
- "intellectualises": "intellectualizes",
- "intellectualising": "intellectualizing",
- "internalisation": "internalization",
- "internalise": "internalize",
- "internalised": "internalized",
- "internalises": "internalizes",
- "internalising": "internalizing",
- "internationalisation": "internationalization",
- "internationalise": "internationalize",
- "internationalised": "internationalized",
- "internationalises": "internationalizes",
- "internationalising": "internationalizing",
- "ionisation": "ionization",
- "ionise": "ionize",
- "ionised": "ionized",
- "ioniser": "ionizer",
- "ionisers": "ionizers",
- "ionises": "ionizes",
- "ionising": "ionizing",
- "italicise": "italicize",
- "italicised": "italicized",
- "italicises": "italicizes",
- "italicising": "italicizing",
- "itemise": "itemize",
- "itemised": "itemized",
- "itemises": "itemizes",
- "itemising": "itemizing",
- "jeopardise": "jeopardize",
- "jeopardised": "jeopardized",
- "jeopardises": "jeopardizes",
- "jeopardising": "jeopardizing",
- "jewelled": "jeweled",
- "jeweller": "jeweler",
- "jewellers": "jewelers",
- "jewellery": "jewelry",
- "judgement ": "judgment",
- "kilogramme": "kilogram",
- "kilogrammes": "kilograms",
- "kilometre": "kilometer",
- "kilometres": "kilometers",
- "labelled": "labeled",
- "labelling": "labeling",
- "labour": "labor",
- "laboured": "labored",
- "labourer": "laborer",
- "labourers": "laborers",
- "labouring": "laboring",
- "labours": "labors",
- "lacklustre": "lackluster",
- "legalisation": "legalization",
- "legalise": "legalize",
- "legalised": "legalized",
- "legalises": "legalizes",
- "legalising": "legalizing",
- "legitimise": "legitimize",
- "legitimised": "legitimized",
- "legitimises": "legitimizes",
- "legitimising": "legitimizing",
- "leukaemia": "leukemia",
- "levelled": "leveled",
- "leveller": "leveler",
- "levellers": "levelers",
- "levelling": "leveling",
- "libelled": "libeled",
- "libelling": "libeling",
- "libellous": "libelous",
- "liberalisation": "liberalization",
- "liberalise": "liberalize",
- "liberalised": "liberalized",
- "liberalises": "liberalizes",
- "liberalising": "liberalizing",
- "licence": "license",
- "licenced": "licensed",
- "licences": "licenses",
- "licencing": "licensing",
- "likeable": "likable ",
- "lionisation": "lionization",
- "lionise": "lionize",
- "lionised": "lionized",
- "lionises": "lionizes",
- "lionising": "lionizing",
- "liquidise": "liquidize",
- "liquidised": "liquidized",
- "liquidiser": "liquidizer",
- "liquidisers": "liquidizers",
- "liquidises": "liquidizes",
- "liquidising": "liquidizing",
- "litre": "liter",
- "litres": "liters",
- "localise": "localize",
- "localised": "localized",
- "localises": "localizes",
- "localising": "localizing",
- "lovin": "loving",
- "lovin'": "loving",
- "louvre": "louver",
- "louvred": "louvered",
- "louvres": "louvers ",
- "lustre": "luster",
- "magnetise": "magnetize",
- "magnetised": "magnetized",
- "magnetises": "magnetizes",
- "magnetising": "magnetizing",
- "manoeuvrability": "maneuverability",
- "manoeuvrable": "maneuverable",
- "manoeuvre": "maneuver",
- "manoeuvred": "maneuvered",
- "manoeuvres": "maneuvers",
- "manoeuvring": "maneuvering",
- "manoeuvrings": "maneuverings",
- "marginalisation": "marginalization",
- "marginalise": "marginalize",
- "marginalised": "marginalized",
- "marginalises": "marginalizes",
- "marginalising": "marginalizing",
- "marshalled": "marshaled",
- "marshalling": "marshaling",
- "marvelled": "marveled",
- "marvelling": "marveling",
- "marvellous": "marvelous",
- "marvellously": "marvelously",
- "materialisation": "materialization",
- "materialise": "materialize",
- "materialised": "materialized",
- "materialises": "materializes",
- "materialising": "materializing",
- "maximisation": "maximization",
- "maximise": "maximize",
- "maximised": "maximized",
- "maximises": "maximizes",
- "maximising": "maximizing",
- "meagre": "meager",
- "mechanisation": "mechanization",
- "mechanise": "mechanize",
- "mechanised": "mechanized",
- "mechanises": "mechanizes",
- "mechanising": "mechanizing",
- "mediaeval": "medieval",
- "memorialise": "memorialize",
- "memorialised": "memorialized",
- "memorialises": "memorializes",
- "memorialising": "memorializing",
- "memorise": "memorize",
- "memorised": "memorized",
- "memorises": "memorizes",
- "memorising": "memorizing",
- "mesmerise": "mesmerize",
- "mesmerised": "mesmerized",
- "mesmerises": "mesmerizes",
- "mesmerising": "mesmerizing",
- "metabolise": "metabolize",
- "metabolised": "metabolized",
- "metabolises": "metabolizes",
- "metabolising": "metabolizing",
- "metre": "meter",
- "metres": "meters",
- "micrometre": "micrometer",
- "micrometres": "micrometers",
- "militarise": "militarize",
- "militarised": "militarized",
- "militarises": "militarizes",
- "militarising": "militarizing",
- "milligramme": "milligram",
- "milligrammes": "milligrams",
- "millilitre": "milliliter",
- "millilitres": "milliliters",
- "millimetre": "millimeter",
- "millimetres": "millimeters",
- "miniaturisation": "miniaturization",
- "miniaturise": "miniaturize",
- "miniaturised": "miniaturized",
- "miniaturises": "miniaturizes",
- "miniaturising": "miniaturizing",
- "minibuses": "minibusses ",
- "minimise": "minimize",
- "minimised": "minimized",
- "minimises": "minimizes",
- "minimising": "minimizing",
- "misbehaviour": "misbehavior",
- "misdemeanour": "misdemeanor",
- "misdemeanours": "misdemeanors",
- "misspelt": "misspelled ",
- "mitre": "miter",
- "mitres": "miters",
- "mobilisation": "mobilization",
- "mobilise": "mobilize",
- "mobilised": "mobilized",
- "mobilises": "mobilizes",
- "mobilising": "mobilizing",
- "modelled": "modeled",
- "modeller": "modeler",
- "modellers": "modelers",
- "modelling": "modeling",
- "modernise": "modernize",
- "modernised": "modernized",
- "modernises": "modernizes",
- "modernising": "modernizing",
- "moisturise": "moisturize",
- "moisturised": "moisturized",
- "moisturiser": "moisturizer",
- "moisturisers": "moisturizers",
- "moisturises": "moisturizes",
- "moisturising": "moisturizing",
- "monologue": "monolog",
- "monologues": "monologs",
- "monopolisation": "monopolization",
- "monopolise": "monopolize",
- "monopolised": "monopolized",
- "monopolises": "monopolizes",
- "monopolising": "monopolizing",
- "moralise": "moralize",
- "moralised": "moralized",
- "moralises": "moralizes",
- "moralising": "moralizing",
- "motorised": "motorized",
- "mould": "mold",
- "moulded": "molded",
- "moulder": "molder",
- "mouldered": "moldered",
- "mouldering": "moldering",
- "moulders": "molders",
- "mouldier": "moldier",
- "mouldiest": "moldiest",
- "moulding": "molding",
- "mouldings": "moldings",
- "moulds": "molds",
- "mouldy": "moldy",
- "moult": "molt",
- "moulted": "molted",
- "moulting": "molting",
- "moults": "molts",
- "moustache": "mustache",
- "moustached": "mustached",
- "moustaches": "mustaches",
- "moustachioed": "mustachioed",
- "multicoloured": "multicolored",
- "nationalisation": "nationalization",
- "nationalisations": "nationalizations",
- "nationalise": "nationalize",
- "nationalised": "nationalized",
- "nationalises": "nationalizes",
- "nationalising": "nationalizing",
- "naturalisation": "naturalization",
- "naturalise": "naturalize",
- "naturalised": "naturalized",
- "naturalises": "naturalizes",
- "naturalising": "naturalizing",
- "neighbour": "neighbor",
- "neighbourhood": "neighborhood",
- "neighbourhoods": "neighborhoods",
- "neighbouring": "neighboring",
- "neighbourliness": "neighborliness",
- "neighbourly": "neighborly",
- "neighbours": "neighbors",
- "neutralisation": "neutralization",
- "neutralise": "neutralize",
- "neutralised": "neutralized",
- "neutralises": "neutralizes",
- "neutralising": "neutralizing",
- "normalisation": "normalization",
- "normalise": "normalize",
- "normalised": "normalized",
- "normalises": "normalizes",
- "normalising": "normalizing",
- "odour": "odor",
- "odourless": "odorless",
- "odours": "odors",
- "oesophagus": "esophagus",
- "oesophaguses": "esophaguses",
- "oestrogen": "estrogen",
- "offence": "offense",
- "offences": "offenses",
- "omelette": "omelet",
- "omelettes": "omelets",
- "optimise": "optimize",
- "optimised": "optimized",
- "optimises": "optimizes",
- "optimising": "optimizing",
- "organisation": "organization",
- "organisational": "organizational",
- "organisations": "organizations",
- "organise": "organize",
- "organised": "organized",
- "organiser": "organizer",
- "organisers": "organizers",
- "organises": "organizes",
- "organising": "organizing",
- "orthopaedic": "orthopedic",
- "orthopaedics": "orthopedics",
- "ostracise": "ostracize",
- "ostracised": "ostracized",
- "ostracises": "ostracizes",
- "ostracising": "ostracizing",
- "outmanoeuvre": "outmaneuver",
- "outmanoeuvred": "outmaneuvered",
- "outmanoeuvres": "outmaneuvers",
- "outmanoeuvring": "outmaneuvering",
- "overemphasise": "overemphasize",
- "overemphasised": "overemphasized",
- "overemphasises": "overemphasizes",
- "overemphasising": "overemphasizing",
- "oxidisation": "oxidization",
- "oxidise": "oxidize",
- "oxidised": "oxidized",
- "oxidises": "oxidizes",
- "oxidising": "oxidizing",
- "paederast": "pederast",
- "paederasts": "pederasts",
- "paediatric": "pediatric",
- "paediatrician": "pediatrician",
- "paediatricians": "pediatricians",
- "paediatrics": "pediatrics",
- "paedophile": "pedophile",
- "paedophiles": "pedophiles",
- "paedophilia": "pedophilia",
- "palaeolithic": "paleolithic",
- "palaeontologist": "paleontologist",
- "palaeontologists": "paleontologists",
- "palaeontology": "paleontology",
- "panelled": "paneled",
- "panelling": "paneling",
- "panellist": "panelist",
- "panellists": "panelists",
- "paralyse": "paralyze",
- "paralysed": "paralyzed",
- "paralyses": "paralyzes",
- "paralysing": "paralyzing",
- "parcelled": "parceled",
- "parcelling": "parceling",
- "parlour": "parlor",
- "parlours": "parlors",
- "particularise": "particularize",
- "particularised": "particularized",
- "particularises": "particularizes",
- "particularising": "particularizing",
- "passivisation": "passivization",
- "passivise": "passivize",
- "passivised": "passivized",
- "passivises": "passivizes",
- "passivising": "passivizing",
- "pasteurisation": "pasteurization",
- "pasteurise": "pasteurize",
- "pasteurised": "pasteurized",
- "pasteurises": "pasteurizes",
- "pasteurising": "pasteurizing",
- "patronise": "patronize",
- "patronised": "patronized",
- "patronises": "patronizes",
- "patronising": "patronizing",
- "patronisingly": "patronizingly",
- "pedalled": "pedaled",
- "pedalling": "pedaling",
- "pedestrianisation": "pedestrianization",
- "pedestrianise": "pedestrianize",
- "pedestrianised": "pedestrianized",
- "pedestrianises": "pedestrianizes",
- "pedestrianising": "pedestrianizing",
- "penalise": "penalize",
- "penalised": "penalized",
- "penalises": "penalizes",
- "penalising": "penalizing",
- "pencilled": "penciled",
- "pencilling": "penciling",
- "personalise": "personalize",
- "personalised": "personalized",
- "personalises": "personalizes",
- "personalising": "personalizing",
- "pharmacopoeia": "pharmacopeia",
- "pharmacopoeias": "pharmacopeias",
- "philosophise": "philosophize",
- "philosophised": "philosophized",
- "philosophises": "philosophizes",
- "philosophising": "philosophizing",
- "philtre": "filter",
- "philtres": "filters",
- "phoney ": "phony ",
- "plagiarise": "plagiarize",
- "plagiarised": "plagiarized",
- "plagiarises": "plagiarizes",
- "plagiarising": "plagiarizing",
- "plough": "plow",
- "ploughed": "plowed",
- "ploughing": "plowing",
- "ploughman": "plowman",
- "ploughmen": "plowmen",
- "ploughs": "plows",
- "ploughshare": "plowshare",
- "ploughshares": "plowshares",
- "polarisation": "polarization",
- "polarise": "polarize",
- "polarised": "polarized",
- "polarises": "polarizes",
- "polarising": "polarizing",
- "politicisation": "politicization",
- "politicise": "politicize",
- "politicised": "politicized",
- "politicises": "politicizes",
- "politicising": "politicizing",
- "popularisation": "popularization",
- "popularise": "popularize",
- "popularised": "popularized",
- "popularises": "popularizes",
- "popularising": "popularizing",
- "pouffe": "pouf",
- "pouffes": "poufs",
- "practise": "practice",
- "practised": "practiced",
- "practises": "practices",
- "practising ": "practicing ",
- "praesidium": "presidium",
- "praesidiums ": "presidiums ",
- "pressurisation": "pressurization",
- "pressurise": "pressurize",
- "pressurised": "pressurized",
- "pressurises": "pressurizes",
- "pressurising": "pressurizing",
- "pretence": "pretense",
- "pretences": "pretenses",
- "primaeval": "primeval",
- "prioritisation": "prioritization",
- "prioritise": "prioritize",
- "prioritised": "prioritized",
- "prioritises": "prioritizes",
- "prioritising": "prioritizing",
- "privatisation": "privatization",
- "privatisations": "privatizations",
- "privatise": "privatize",
- "privatised": "privatized",
- "privatises": "privatizes",
- "privatising": "privatizing",
- "professionalisation": "professionalization",
- "professionalise": "professionalize",
- "professionalised": "professionalized",
- "professionalises": "professionalizes",
- "professionalising": "professionalizing",
- "programme": "program",
- "programmes": "programs",
- "prologue": "prolog",
- "prologues": "prologs",
- "propagandise": "propagandize",
- "propagandised": "propagandized",
- "propagandises": "propagandizes",
- "propagandising": "propagandizing",
- "proselytise": "proselytize",
- "proselytised": "proselytized",
- "proselytiser": "proselytizer",
- "proselytisers": "proselytizers",
- "proselytises": "proselytizes",
- "proselytising": "proselytizing",
- "psychoanalyse": "psychoanalyze",
- "psychoanalysed": "psychoanalyzed",
- "psychoanalyses": "psychoanalyzes",
- "psychoanalysing": "psychoanalyzing",
- "publicise": "publicize",
- "publicised": "publicized",
- "publicises": "publicizes",
- "publicising": "publicizing",
- "pulverisation": "pulverization",
- "pulverise": "pulverize",
- "pulverised": "pulverized",
- "pulverises": "pulverizes",
- "pulverising": "pulverizing",
- "pummelled": "pummel",
- "pummelling": "pummeled",
- "pyjama": "pajama",
- "pyjamas": "pajamas",
- "pzazz": "pizzazz",
- "quarrelled": "quarreled",
- "quarrelling": "quarreling",
- "radicalise": "radicalize",
- "radicalised": "radicalized",
- "radicalises": "radicalizes",
- "radicalising": "radicalizing",
- "rancour": "rancor",
- "randomise": "randomize",
- "randomised": "randomized",
- "randomises": "randomizes",
- "randomising": "randomizing",
- "rationalisation": "rationalization",
- "rationalisations": "rationalizations",
- "rationalise": "rationalize",
- "rationalised": "rationalized",
- "rationalises": "rationalizes",
- "rationalising": "rationalizing",
- "ravelled": "raveled",
- "ravelling": "raveling",
- "realisable": "realizable",
- "realisation": "realization",
- "realisations": "realizations",
- "realise": "realize",
- "realised": "realized",
- "realises": "realizes",
- "realising": "realizing",
- "recognisable": "recognizable",
- "recognisably": "recognizably",
- "recognisance": "recognizance",
- "recognise": "recognize",
- "recognised": "recognized",
- "recognises": "recognizes",
- "recognising": "recognizing",
- "reconnoitre": "reconnoiter",
- "reconnoitred": "reconnoitered",
- "reconnoitres": "reconnoiters",
- "reconnoitring": "reconnoitering",
- "refuelled": "refueled",
- "refuelling": "refueling",
- "regularisation": "regularization",
- "regularise": "regularize",
- "regularised": "regularized",
- "regularises": "regularizes",
- "regularising": "regularizing",
- "remodelled": "remodeled",
- "remodelling": "remodeling",
- "remould": "remold",
- "remoulded": "remolded",
- "remoulding": "remolding",
- "remoulds": "remolds",
- "reorganisation": "reorganization",
- "reorganisations": "reorganizations",
- "reorganise": "reorganize",
- "reorganised": "reorganized",
- "reorganises": "reorganizes",
- "reorganising": "reorganizing",
- "revelled": "reveled",
- "reveller": "reveler",
- "revellers": "revelers",
- "revelling": "reveling",
- "revitalise": "revitalize",
- "revitalised": "revitalized",
- "revitalises": "revitalizes",
- "revitalising": "revitalizing",
- "revolutionise": "revolutionize",
- "revolutionised": "revolutionized",
- "revolutionises": "revolutionizes",
- "revolutionising": "revolutionizing",
- "rhapsodise": "rhapsodize",
- "rhapsodised": "rhapsodized",
- "rhapsodises": "rhapsodizes",
- "rhapsodising": "rhapsodizing",
- "rigour": "rigor",
- "rigours": "rigors",
- "ritualised": "ritualized",
- "rivalled": "rivaled",
- "rivalling": "rivaling",
- "romanticise": "romanticize",
- "romanticised": "romanticized",
- "romanticises": "romanticizes",
- "romanticising": "romanticizing",
- "rumour": "rumor",
- "rumoured": "rumored",
- "rumours": "rumors",
- "sabre": "saber",
- "sabres": "sabers",
- "saltpetre": "saltpeter",
- "sanitise": "sanitize",
- "sanitised": "sanitized",
- "sanitises": "sanitizes",
- "sanitising": "sanitizing",
- "satirise": "satirize",
- "satirised": "satirized",
- "satirises": "satirizes",
- "satirising": "satirizing",
- "saviour": "savior",
- "saviours": "saviors",
- "savour": "savor",
- "savoured": "savored",
- "savouries": "savories",
- "savouring": "savoring",
- "savours": "savors",
- "savoury": "savory",
- "scandalise": "scandalize",
- "scandalised": "scandalized",
- "scandalises": "scandalizes",
- "scandalising": "scandalizing",
- "sceptic": "skeptic",
- "sceptical": "skeptical",
- "sceptically": "skeptically",
- "scepticism": "skepticism",
- "sceptics": "skeptics",
- "sceptre": "scepter",
- "sceptres": "scepters",
- "scrutinise": "scrutinize",
- "scrutinised": "scrutinized",
- "scrutinises": "scrutinizes",
- "scrutinising": "scrutinizing",
- "secularisation": "secularization",
- "secularise": "secularize",
- "secularised": "secularized",
- "secularises": "secularizes",
- "secularising": "secularizing",
- "sensationalise": "sensationalize",
- "sensationalised": "sensationalized",
- "sensationalises": "sensationalizes",
- "sensationalising": "sensationalizing",
- "sensitise": "sensitize",
- "sensitised": "sensitized",
- "sensitises": "sensitizes",
- "sensitising": "sensitizing",
- "sentimentalise": "sentimentalize",
- "sentimentalised": "sentimentalized",
- "sentimentalises": "sentimentalizes",
- "sentimentalising": "sentimentalizing",
- "sepulchre": "sepulcher",
- "sepulchres": "sepulchers ",
- "serialisation": "serialization",
- "serialisations": "serializations",
- "serialise": "serialize",
- "serialised": "serialized",
- "serialises": "serializes",
- "serialising": "serializing",
- "sermonise": "sermonize",
- "sermonised": "sermonized",
- "sermonises": "sermonizes",
- "sermonising": "sermonizing",
- "sheikh ": "sheik ",
- "shovelled": "shoveled",
- "shovelling": "shoveling",
- "shrivelled": "shriveled",
- "shrivelling": "shriveling",
- "signalise": "signalize",
- "signalised": "signalized",
- "signalises": "signalizes",
- "signalising": "signalizing",
- "signalled": "signaled",
- "signalling": "signaling",
- "smoulder": "smolder",
- "smouldered": "smoldered",
- "smouldering": "smoldering",
- "smoulders": "smolders",
- "snivelled": "sniveled",
- "snivelling": "sniveling",
- "snorkelled": "snorkeled",
- "snorkelling": "snorkeling",
- "snowplough": "snowplow",
- "snowploughs": "snowplow",
- "socialisation": "socialization",
- "socialise": "socialize",
- "socialised": "socialized",
- "socialises": "socializes",
- "socialising": "socializing",
- "sodomise": "sodomize",
- "sodomised": "sodomized",
- "sodomises": "sodomizes",
- "sodomising": "sodomizing",
- "solemnise": "solemnize",
- "solemnised": "solemnized",
- "solemnises": "solemnizes",
- "solemnising": "solemnizing",
- "sombre": "somber",
- "specialisation": "specialization",
- "specialisations": "specializations",
- "specialise": "specialize",
- "specialised": "specialized",
- "specialises": "specializes",
- "specialising": "specializing",
- "spectre": "specter",
- "spectres": "specters",
- "spiralled": "spiraled",
- "spiralling": "spiraling",
- "splendour": "splendor",
- "splendours": "splendors",
- "squirrelled": "squirreled",
- "squirrelling": "squirreling",
- "stabilisation": "stabilization",
- "stabilise": "stabilize",
- "stabilised": "stabilized",
- "stabiliser": "stabilizer",
- "stabilisers": "stabilizers",
- "stabilises": "stabilizes",
- "stabilising": "stabilizing",
- "standardisation": "standardization",
- "standardise": "standardize",
- "standardised": "standardized",
- "standardises": "standardizes",
- "standardising": "standardizing",
- "stencilled": "stenciled",
- "stencilling": "stenciling",
- "sterilisation": "sterilization",
- "sterilisations": "sterilizations",
- "sterilise": "sterilize",
- "sterilised": "sterilized",
- "steriliser": "sterilizer",
- "sterilisers": "sterilizers",
- "sterilises": "sterilizes",
- "sterilising": "sterilizing",
- "stigmatisation": "stigmatization",
- "stigmatise": "stigmatize",
- "stigmatised": "stigmatized",
- "stigmatises": "stigmatizes",
- "stigmatising": "stigmatizing",
- "storey": "story",
- "storeys": "stories",
- "subsidisation": "subsidization",
- "subsidise": "subsidize",
- "subsidised": "subsidized",
- "subsidiser": "subsidizer",
- "subsidisers": "subsidizers",
- "subsidises": "subsidizes",
- "subsidising": "subsidizing",
- "succour": "succor",
- "succoured": "succored",
- "succouring": "succoring",
- "succours": "succors",
- "sulphate": "sulfate",
- "sulphates": "sulfates",
- "sulphide": "sulfide",
- "sulphides": "sulfides",
- "sulphur": "sulfur",
- "sulphurous": "sulfurous",
- "summarise": "summarize",
- "summarised": "summarized",
- "summarises": "summarizes",
- "summarising": "summarizing",
- "swivelled": "swiveled",
- "swivelling": "swiveling",
- "symbolise": "symbolize",
- "symbolised": "symbolized",
- "symbolises": "symbolizes",
- "symbolising": "symbolizing",
- "sympathise": "sympathize",
- "sympathised": "sympathized",
- "sympathiser": "sympathizer",
- "sympathisers": "sympathizers",
- "sympathises": "sympathizes",
- "sympathising": "sympathizing",
- "synchronisation": "synchronization",
- "synchronise": "synchronize",
- "synchronised": "synchronized",
- "synchronises": "synchronizes",
- "synchronising": "synchronizing",
- "synthesise": "synthesize",
- "synthesised": "synthesized",
- "synthesiser": "synthesizer",
- "synthesisers": "synthesizers",
- "synthesises": "synthesizes",
- "synthesising": "synthesizing",
- "syphon": "siphon",
- "syphoned": "siphoned",
- "syphoning": "siphoning",
- "syphons": "siphons",
- "systematisation": "systematization",
- "systematise": "systematize",
- "systematised": "systematized",
- "systematises": "systematizes",
- "systematising": "systematizing",
- "tantalise": "tantalize",
- "tantalised": "tantalized",
- "tantalises": "tantalizes",
- "tantalising": "tantalizing",
- "tantalisingly": "tantalizingly",
- "tasselled": "tasseled",
- "technicolour": "technicolor",
- "temporise": "temporize",
- "temporised": "temporized",
- "temporises": "temporizes",
- "temporising": "temporizing",
- "tenderise": "tenderize",
- "tenderised": "tenderized",
- "tenderises": "tenderizes",
- "tenderising": "tenderizing",
- "terrorise": "terrorize",
- "terrorised": "terrorized",
- "terrorises": "terrorizes",
- "terrorising": "terrorizing",
- "theatre": "theater",
- "theatregoer": "theatergoer",
- "theatregoers": "theatergoers",
- "theatres": "theaters",
- "theorise": "theorize",
- "theorised": "theorized",
- "theorises": "theorizes",
- "theorising": "theorizing",
- "tonne": "ton",
- "tonnes": "tons",
- "towelled": "toweled",
- "towelling": "toweling",
- "toxaemia": "toxemia",
- "tranquillise": "tranquilize",
- "tranquillised": "tranquilized",
- "tranquilliser": "tranquilizer",
- "tranquillisers": "tranquilizers",
- "tranquillises": "tranquilizes",
- "tranquillising": "tranquilizing",
- "tranquillity": "tranquility",
- "tranquillize": "tranquilize",
- "tranquillized": "tranquilized",
- "tranquillizer": "tranquilizer",
- "tranquillizers": "tranquilizers",
- "tranquillizes": "tranquilizes",
- "tranquillizing": "tranquilizing",
- "tranquilly": "tranquility",
- "transistorised": "transistorized",
- "traumatise": "traumatize",
- "traumatised": "traumatized",
- "traumatises": "traumatizes",
- "traumatising": "traumatizing",
- "travelled": "traveled",
- "traveller": "traveler",
- "travellers": "travelers",
- "travelling": "traveling",
- "travelogue": "travelog",
- "travelogues ": "travelogs ",
- "trialled": "trialed",
- "trialling": "trialing",
- "tricolour": "tricolor",
- "tricolours": "tricolors",
- "trivialise": "trivialize",
- "trivialised": "trivialized",
- "trivialises": "trivializes",
- "trivialising": "trivializing",
- "tumour": "tumor",
- "tumours": "tumors",
- "tunnelled": "tunneled",
- "tunnelling": "tunneling",
- "tyrannise": "tyrannize",
- "tyrannised": "tyrannized",
- "tyrannises": "tyrannizes",
- "tyrannising": "tyrannizing",
- "tyre": "tire",
- "tyres": "tires",
- "unauthorised": "unauthorized",
- "uncivilised": "uncivilized",
- "underutilised": "underutilized",
- "unequalled": "unequaled",
- "unfavourable": "unfavorable",
- "unfavourably": "unfavorably",
- "unionisation": "unionization",
- "unionise": "unionize",
- "unionised": "unionized",
- "unionises": "unionizes",
- "unionising": "unionizing",
- "unorganised": "unorganized",
- "unravelled": "unraveled",
- "unravelling": "unraveling",
- "unrecognisable": "unrecognizable",
- "unrecognised": "unrecognized",
- "unrivalled": "unrivaled",
- "unsavoury": "unsavory",
- "untrammelled": "untrammeled",
- "urbanisation": "urbanization",
- "urbanise": "urbanize",
- "urbanised": "urbanized",
- "urbanises": "urbanizes",
- "urbanising": "urbanizing",
- "utilisable": "utilizable",
- "utilisation": "utilization",
- "utilise": "utilize",
- "utilised": "utilized",
- "utilises": "utilizes",
- "utilising": "utilizing",
- "valour": "valor",
- "vandalise": "vandalize",
- "vandalised": "vandalized",
- "vandalises": "vandalizes",
- "vandalising": "vandalizing",
- "vaporisation": "vaporization",
- "vaporise": "vaporize",
- "vaporised": "vaporized",
- "vaporises": "vaporizes",
- "vaporising": "vaporizing",
- "vapour": "vapor",
- "vapours": "vapors",
- "verbalise": "verbalize",
- "verbalised": "verbalized",
- "verbalises": "verbalizes",
- "verbalising": "verbalizing",
- "victimisation": "victimization",
- "victimise": "victimize",
- "victimised": "victimized",
- "victimises": "victimizes",
- "victimising": "victimizing",
- "videodisc": "videodisk",
- "videodiscs": "videodisks",
- "vigour": "vigor",
- "visualisation": "visualization",
- "visualisations": "visualizations",
- "visualise": "visualize",
- "visualised": "visualized",
- "visualises": "visualizes",
- "visualising": "visualizing",
- "vocalisation": "vocalization",
- "vocalisations": "vocalizations",
- "vocalise": "vocalize",
- "vocalised": "vocalized",
- "vocalises": "vocalizes",
- "vocalising": "vocalizing",
- "vulcanised": "vulcanized",
- "vulgarisation": "vulgarization",
- "vulgarise": "vulgarize",
- "vulgarised": "vulgarized",
- "vulgarises": "vulgarizes",
- "vulgarising": "vulgarizing",
- "waggon": "wagon",
- "waggons": "wagons",
- "watercolour": "watercolor",
- "watercolours": "watercolors",
- "weaselled": "weaseled",
- "weaselling": "weaseling",
- "westernisation": "westernization",
- "westernise": "westernize",
- "westernised": "westernized",
- "westernises": "westernizes",
- "westernising": "westernizing",
- "womanise": "womanize",
- "womanised": "womanized",
- "womaniser": "womanizer",
- "womanisers": "womanizers",
- "womanises": "womanizes",
- "womanising": "womanizing",
- "woollen": "woolen",
- "woollens": "woolens",
- "woollies": "woolies",
- "woolly": "wooly",
- "worshipped ": "worshiped",
- "worshipping ": "worshiping ",
- "worshipper": "worshiper",
- "yodelled": "yodeled",
- "yodelling": "yodeling",
- "yoghourt": "yogurt",
- "yoghourts": "yogurts",
- "yoghurt": "yogurt",
- "yoghurts": "yogurts",
-}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
- NORM_EXCEPTIONS[string] = norm
- NORM_EXCEPTIONS[string.title()] = norm
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index ea8e355ac..8e2266a40 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -4,25 +4,20 @@ from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
class IndonesianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "id"
lex_attr_getters.update(LEX_ATTRS)
- lex_attr_getters[NORM] = add_lookups(
- Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
- )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
diff --git a/spacy/lang/id/norm_exceptions.py b/spacy/lang/id/norm_exceptions.py
deleted file mode 100644
index 09ac6a6d3..000000000
--- a/spacy/lang/id/norm_exceptions.py
+++ /dev/null
@@ -1,532 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# List of Indonesian words that are often misspelled
-# https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja
-_exc = {
- # Slang and abbreviations
- "silahkan": "silakan",
- "yg": "yang",
- "kalo": "kalau",
- "cawu": "caturwulan",
- "ok": "oke",
- "gak": "tidak",
- "enggak": "tidak",
- "nggak": "tidak",
- "ndak": "tidak",
- "ngga": "tidak",
- "dgn": "dengan",
- "tdk": "tidak",
- "jg": "juga",
- "klo": "kalau",
- "denger": "dengar",
- "pinter": "pintar",
- "krn": "karena",
- "nemuin": "menemukan",
- "jgn": "jangan",
- "udah": "sudah",
- "sy": "saya",
- "udh": "sudah",
- "dapetin": "mendapatkan",
- "ngelakuin": "melakukan",
- "ngebuat": "membuat",
- "membikin": "membuat",
- "bikin": "buat",
- # Frequently misspelled words
- "malpraktik": "malapraktik",
- "malfungsi": "malafungsi",
- "malserap": "malaserap",
- "maladaptasi": "malaadaptasi",
- "malsuai": "malasuai",
- "maldistribusi": "maladistribusi",
- "malgizi": "malagizi",
- "malsikap": "malasikap",
- "memperhatikan": "memerhatikan",
- "akte": "akta",
- "cemilan": "camilan",
- "esei": "esai",
- "frase": "frasa",
- "kafeteria": "kafetaria",
- "ketapel": "katapel",
- "kenderaan": "kendaraan",
- "menejemen": "manajemen",
- "menejer": "manajer",
- "mesjid": "masjid",
- "rebo": "rabu",
- "seksama": "saksama",
- "senggama": "sanggama",
- "sekedar": "sekadar",
- "seprei": "seprai",
- "semedi": "semadi",
- "samadi": "semadi",
- "amandemen": "amendemen",
- "algoritma": "algoritme",
- "aritmatika": "aritmetika",
- "metoda": "metode",
- "materai": "meterai",
- "meterei": "meterai",
- "kalendar": "kalender",
- "kadaluwarsa": "kedaluwarsa",
- "katagori": "kategori",
- "parlamen": "parlemen",
- "sekular": "sekuler",
- "selular": "seluler",
- "sirkular": "sirkuler",
- "survai": "survei",
- "survey": "survei",
- "aktuil": "aktual",
- "formil": "formal",
- "trotoir": "trotoar",
- "komersiil": "komersial",
- "komersil": "komersial",
- "tradisionil": "tradisionial",
- "orisinil": "orisinal",
- "orijinil": "orisinal",
- "afdol": "afdal",
- "antri": "antre",
- "apotik": "apotek",
- "atlit": "atlet",
- "atmosfir": "atmosfer",
- "cidera": "cedera",
- "cendikiawan": "cendekiawan",
- "cepet": "cepat",
- "cinderamata": "cenderamata",
- "debet": "debit",
- "difinisi": "definisi",
- "dekrit": "dekret",
- "disain": "desain",
- "diskripsi": "deskripsi",
- "diskotik": "diskotek",
- "eksim": "eksem",
- "exim": "eksem",
- "faidah": "faedah",
- "ekstrim": "ekstrem",
- "ekstrimis": "ekstremis",
- "komplit": "komplet",
- "konkrit": "konkret",
- "kongkrit": "konkret",
- "kongkret": "konkret",
- "kridit": "kredit",
- "musium": "museum",
- "pinalti": "penalti",
- "piranti": "peranti",
- "pinsil": "pensil",
- "personil": "personel",
- "sistim": "sistem",
- "teoritis": "teoretis",
- "vidio": "video",
- "cengkeh": "cengkih",
- "desertasi": "disertasi",
- "hakekat": "hakikat",
- "intelejen": "intelijen",
- "kaedah": "kaidah",
- "kempes": "kempis",
- "kementrian": "kementerian",
- "ledeng": "leding",
- "nasehat": "nasihat",
- "penasehat": "penasihat",
- "praktek": "praktik",
- "praktekum": "praktikum",
- "resiko": "risiko",
- "retsleting": "ritsleting",
- "senen": "senin",
- "amuba": "ameba",
- "punggawa": "penggawa",
- "surban": "serban",
- "nomer": "nomor",
- "sorban": "serban",
- "bis": "bus",
- "agribisnis": "agrobisnis",
- "kantung": "kantong",
- "khutbah": "khotbah",
- "mandur": "mandor",
- "rubuh": "roboh",
- "pastur": "pastor",
- "supir": "sopir",
- "goncang": "guncang",
- "goa": "gua",
- "kaos": "kaus",
- "kokoh": "kukuh",
- "komulatif": "kumulatif",
- "kolomnis": "kolumnis",
- "korma": "kurma",
- "lobang": "lubang",
- "limo": "limusin",
- "limosin": "limusin",
- "mangkok": "mangkuk",
- "saos": "saus",
- "sop": "sup",
- "sorga": "surga",
- "tegor": "tegur",
- "telor": "telur",
- "obrak-abrik": "ubrak-abrik",
- "ekwivalen": "ekuivalen",
- "frekwensi": "frekuensi",
- "konsekwensi": "konsekuensi",
- "kwadran": "kuadran",
- "kwadrat": "kuadrat",
- "kwalifikasi": "kualifikasi",
- "kwalitas": "kualitas",
- "kwalitet": "kualitas",
- "kwalitatif": "kualitatif",
- "kwantitas": "kuantitas",
- "kwantitatif": "kuantitatif",
- "kwantum": "kuantum",
- "kwartal": "kuartal",
- "kwintal": "kuintal",
- "kwitansi": "kuitansi",
- "kwatir": "khawatir",
- "kuatir": "khawatir",
- "jadual": "jadwal",
- "hirarki": "hierarki",
- "karir": "karier",
- "aktip": "aktif",
- "daptar": "daftar",
- "efektip": "efektif",
- "epektif": "efektif",
- "epektip": "efektif",
- "Pebruari": "Februari",
- "pisik": "fisik",
- "pondasi": "fondasi",
- "photo": "foto",
- "photokopi": "fotokopi",
- "hapal": "hafal",
- "insap": "insaf",
- "insyaf": "insaf",
- "konperensi": "konferensi",
- "kreatip": "kreatif",
- "kreativ": "kreatif",
- "maap": "maaf",
- "napsu": "nafsu",
- "negatip": "negatif",
- "negativ": "negatif",
- "objektip": "objektif",
- "obyektip": "objektif",
- "obyektif": "objektif",
- "pasip": "pasif",
- "pasiv": "pasif",
- "positip": "positif",
- "positiv": "positif",
- "produktip": "produktif",
- "produktiv": "produktif",
- "sarap": "saraf",
- "sertipikat": "sertifikat",
- "subjektip": "subjektif",
- "subyektip": "subjektif",
- "subyektif": "subjektif",
- "tarip": "tarif",
- "transitip": "transitif",
- "transitiv": "transitif",
- "faham": "paham",
- "fikir": "pikir",
- "berfikir": "berpikir",
- "telefon": "telepon",
- "telfon": "telepon",
- "telpon": "telepon",
- "tilpon": "telepon",
- "nafas": "napas",
- "bernafas": "bernapas",
- "pernafasan": "pernapasan",
- "vermak": "permak",
- "vulpen": "pulpen",
- "aktifis": "aktivis",
- "konfeksi": "konveksi",
- "motifasi": "motivasi",
- "Nopember": "November",
- "propinsi": "provinsi",
- "babtis": "baptis",
- "jerembab": "jerembap",
- "lembab": "lembap",
- "sembab": "sembap",
- "saptu": "sabtu",
- "tekat": "tekad",
- "bejad": "bejat",
- "nekad": "nekat",
- "otoped": "otopet",
- "skuad": "skuat",
- "jenius": "genius",
- "marjin": "margin",
- "marjinal": "marginal",
- "obyek": "objek",
- "subyek": "subjek",
- "projek": "proyek",
- "azas": "asas",
- "ijasah": "ijazah",
- "jenasah": "jenazah",
- "plasa": "plaza",
- "bathin": "batin",
- "Katholik": "Katolik",
- "orthografi": "ortografi",
- "pathogen": "patogen",
- "theologi": "teologi",
- "ijin": "izin",
- "rejeki": "rezeki",
- "rejim": "rezim",
- "jaman": "zaman",
- "jamrud": "zamrud",
- "jinah": "zina",
- "perjinahan": "perzinaan",
- "anugrah": "anugerah",
- "cendrawasih": "cenderawasih",
- "jendral": "jenderal",
- "kripik": "keripik",
- "krupuk": "kerupuk",
- "ksatria": "kesatria",
- "mentri": "menteri",
- "negri": "negeri",
- "Prancis": "Perancis",
- "sebrang": "seberang",
- "menyebrang": "menyeberang",
- "Sumatra": "Sumatera",
- "trampil": "terampil",
- "isteri": "istri",
- "justeru": "justru",
- "perajurit": "prajurit",
- "putera": "putra",
- "puteri": "putri",
- "samudera": "samudra",
- "sastera": "sastra",
- "sutera": "sutra",
- "terompet": "trompet",
- "iklas": "ikhlas",
- "iktisar": "ikhtisar",
- "kafilah": "khafilah",
- "kawatir": "khawatir",
- "kotbah": "khotbah",
- "kusyuk": "khusyuk",
- "makluk": "makhluk",
- "mahluk": "makhluk",
- "mahkluk": "makhluk",
- "nahkoda": "nakhoda",
- "nakoda": "nakhoda",
- "tahta": "takhta",
- "takhyul": "takhayul",
- "tahyul": "takhayul",
- "tahayul": "takhayul",
- "akhli": "ahli",
- "anarkhi": "anarki",
- "kharisma": "karisma",
- "kharismatik": "karismatik",
- "mahsud": "maksud",
- "makhsud": "maksud",
- "rakhmat": "rahmat",
- "tekhnik": "teknik",
- "tehnik": "teknik",
- "tehnologi": "teknologi",
- "ikhwal": "ihwal",
- "expor": "ekspor",
- "extra": "ekstra",
- "komplex": "komplek",
- "sex": "seks",
- "taxi": "taksi",
- "extasi": "ekstasi",
- "syaraf": "saraf",
- "syurga": "surga",
- "mashur": "masyhur",
- "masyur": "masyhur",
- "mahsyur": "masyhur",
- "mashyur": "masyhur",
- "muadzin": "muazin",
- "adzan": "azan",
- "ustadz": "ustaz",
- "ustad": "ustaz",
- "ustadzah": "ustaz",
- "dzikir": "zikir",
- "dzuhur": "zuhur",
- "dhuhur": "zuhur",
- "zhuhur": "zuhur",
- "analisa": "analisis",
- "diagnosa": "diagnosis",
- "hipotesa": "hipotesis",
- "sintesa": "sintesis",
- "aktiviti": "aktivitas",
- "aktifitas": "aktivitas",
- "efektifitas": "efektivitas",
- "komuniti": "komunitas",
- "kreatifitas": "kreativitas",
- "produktifitas": "produktivitas",
- "realiti": "realitas",
- "realita": "realitas",
- "selebriti": "selebritas",
- "spotifitas": "sportivitas",
- "universiti": "universitas",
- "utiliti": "utilitas",
- "validiti": "validitas",
- "dilokalisir": "dilokalisasi",
- "didramatisir": "didramatisasi",
- "dipolitisir": "dipolitisasi",
- "dinetralisir": "dinetralisasi",
- "dikonfrontir": "dikonfrontasi",
- "mendominir": "mendominasi",
- "koordinir": "koordinasi",
- "proklamir": "proklamasi",
- "terorganisir": "terorganisasi",
- "terealisir": "terealisasi",
- "robah": "ubah",
- "dirubah": "diubah",
- "merubah": "mengubah",
- "terlanjur": "telanjur",
- "terlantar": "telantar",
- "penglepasan": "pelepasan",
- "pelihatan": "penglihatan",
- "pemukiman": "permukiman",
- "pengrumahan": "perumahan",
- "penyewaan": "persewaan",
- "menyintai": "mencintai",
- "menyolok": "mencolok",
- "contek": "sontek",
- "mencontek": "menyontek",
- "pungkir": "mungkir",
- "dipungkiri": "dimungkiri",
- "kupungkiri": "kumungkiri",
- "kaupungkiri": "kaumungkiri",
- "nampak": "tampak",
- "nampaknya": "tampaknya",
- "nongkrong": "tongkrong",
- "berternak": "beternak",
- "berterbangan": "beterbangan",
- "berserta": "beserta",
- "berperkara": "beperkara",
- "berpergian": "bepergian",
- "berkerja": "bekerja",
- "berberapa": "beberapa",
- "terbersit": "tebersit",
- "terpercaya": "tepercaya",
- "terperdaya": "teperdaya",
- "terpercik": "tepercik",
- "terpergok": "tepergok",
- "aksesoris": "aksesori",
- "handal": "andal",
- "hantar": "antar",
- "panutan": "anutan",
- "atsiri": "asiri",
- "bhakti": "bakti",
- "china": "cina",
- "dharma": "darma",
- "diktaktor": "diktator",
- "eksport": "ekspor",
- "hembus": "embus",
- "hadits": "hadis",
- "hadist": "hadits",
- "harafiah": "harfiah",
- "himbau": "imbau",
- "import": "impor",
- "inget": "ingat",
- "hisap": "isap",
- "interprestasi": "interpretasi",
- "kangker": "kanker",
- "konggres": "kongres",
- "lansekap": "lanskap",
- "maghrib": "magrib",
- "emak": "mak",
- "moderen": "modern",
- "pasport": "paspor",
- "perduli": "peduli",
- "ramadhan": "ramadan",
- "rapih": "rapi",
- "Sansekerta": "Sanskerta",
- "shalat": "salat",
- "sholat": "salat",
- "silahkan": "silakan",
- "standard": "standar",
- "hutang": "utang",
- "zinah": "zina",
- "ambulan": "ambulans",
- "antartika": "sntarktika",
- "arteri": "arteria",
- "asik": "asyik",
- "australi": "australia",
- "denga": "dengan",
- "depo": "depot",
- "detil": "detail",
- "ensiklopedi": "ensiklopedia",
- "elit": "elite",
- "frustasi": "frustrasi",
- "gladi": "geladi",
- "greget": "gereget",
- "itali": "italia",
- "karna": "karena",
- "klenteng": "kelenteng",
- "erling": "kerling",
- "kontruksi": "konstruksi",
- "masal": "massal",
- "merk": "merek",
- "respon": "respons",
- "diresponi": "direspons",
- "skak": "sekak",
- "stir": "setir",
- "singapur": "singapura",
- "standarisasi": "standardisasi",
- "varitas": "varietas",
- "amphibi": "amfibi",
- "anjlog": "anjlok",
- "alpukat": "avokad",
- "alpokat": "avokad",
- "bolpen": "pulpen",
- "cabe": "cabai",
- "cabay": "cabai",
- "ceret": "cerek",
- "differensial": "diferensial",
- "duren": "durian",
- "faksimili": "faksimile",
- "faksimil": "faksimile",
- "graha": "gerha",
- "goblog": "goblok",
- "gombrong": "gombroh",
- "horden": "gorden",
- "korden": "gorden",
- "gubug": "gubuk",
- "imaginasi": "imajinasi",
- "jerigen": "jeriken",
- "jirigen": "jeriken",
- "carut-marut": "karut-marut",
- "kwota": "kuota",
- "mahzab": "mazhab",
- "mempesona": "memesona",
- "milyar": "miliar",
- "missi": "misi",
- "nenas": "nanas",
- "negoisasi": "negosiasi",
- "automotif": "otomotif",
- "pararel": "paralel",
- "paska": "pasca",
- "prosen": "persen",
- "pete": "petai",
- "petay": "petai",
- "proffesor": "profesor",
- "rame": "ramai",
- "rapot": "rapor",
- "rileks": "relaks",
- "rileksasi": "relaksasi",
- "renumerasi": "remunerasi",
- "seketaris": "sekretaris",
- "sekertaris": "sekretaris",
- "sensorik": "sensoris",
- "sentausa": "sentosa",
- "strawberi": "stroberi",
- "strawbery": "stroberi",
- "taqwa": "takwa",
- "tauco": "taoco",
- "tauge": "taoge",
- "toge": "taoge",
- "tauladan": "teladan",
- "taubat": "tobat",
- "trilyun": "triliun",
- "vissi": "visi",
- "coklat": "cokelat",
- "narkotika": "narkotik",
- "oase": "oasis",
- "politisi": "politikus",
- "terong": "terung",
- "wool": "wol",
- "himpit": "impit",
- "mujizat": "mukjizat",
- "mujijat": "mukjizat",
- "yag": "yang",
-}
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
- NORM_EXCEPTIONS[string] = norm
- NORM_EXCEPTIONS[string.title()] = norm
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index 4fcfaddb4..8d85b8fc7 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -2,26 +2,21 @@
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
class LuxembourgishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "lb"
- lex_attr_getters[NORM] = add_lookups(
- Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
- )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
tag_map = TAG_MAP
diff --git a/spacy/lang/lb/norm_exceptions.py b/spacy/lang/lb/norm_exceptions.py
deleted file mode 100644
index 7063e6863..000000000
--- a/spacy/lang/lb/norm_exceptions.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# TODO
-# norm exceptions: find a way to deal with the zillions of spelling
-# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
-# here one could include the most common spelling mistakes
-
-_exc = {"dass": "datt", "viläicht": "vläicht"}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
- NORM_EXCEPTIONS[string] = norm
- NORM_EXCEPTIONS[string.title()] = norm
diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index 7c0ed8a04..c9cd82d7b 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -186,10 +186,6 @@ def suffix(string):
return string[-3:]
-def cluster(string):
- return 0
-
-
def is_alpha(string):
return string.isalpha()
@@ -218,20 +214,11 @@ def is_stop(string, stops=set()):
return string.lower() in stops
-def is_oov(string):
- return True
-
-
-def get_prob(string):
- return -20.0
-
-
LEX_ATTRS = {
attrs.LOWER: lower,
attrs.NORM: lower,
attrs.PREFIX: prefix,
attrs.SUFFIX: suffix,
- attrs.CLUSTER: cluster,
attrs.IS_ALPHA: is_alpha,
attrs.IS_DIGIT: is_digit,
attrs.IS_LOWER: is_lower,
@@ -239,8 +226,6 @@ LEX_ATTRS = {
attrs.IS_TITLE: is_title,
attrs.IS_UPPER: is_upper,
attrs.IS_STOP: is_stop,
- attrs.IS_OOV: is_oov,
- attrs.PROB: get_prob,
attrs.LIKE_EMAIL: like_email,
attrs.LIKE_NUM: like_num,
attrs.IS_PUNCT: is_punct,
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index f786d6542..c09996126 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -5,22 +5,17 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tag_map import TAG_MAP
-from .norm_exceptions import NORM_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
-from ..norm_exceptions import BASE_NORMS
from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
class PortugueseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "pt"
- lex_attr_getters[NORM] = add_lookups(
- Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
- )
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
diff --git a/spacy/lang/pt/norm_exceptions.py b/spacy/lang/pt/norm_exceptions.py
deleted file mode 100644
index ea650cb31..000000000
--- a/spacy/lang/pt/norm_exceptions.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# These exceptions are used to add NORM values based on a token's ORTH value.
-# Individual languages can also add their own exceptions and overwrite them -
-# for example, British vs. American spelling in English.
-
-# Norms are only set if no alternative is provided in the tokenizer exceptions.
-# Note that this does not change any other token attributes. Its main purpose
-# is to normalise the word representations so that equivalent tokens receive
-# similar representations. For example: $ and € are very different, but they're
-# both currency symbols. By normalising currency symbols to $, all symbols are
-# seen as similar, no matter how common they are in the training data.
-
-
-NORM_EXCEPTIONS = {
- "R$": "$", # Real
- "r$": "$", # Real
- "Cz$": "$", # Cruzado
- "cz$": "$", # Cruzado
- "NCz$": "$", # Cruzado Novo
- "ncz$": "$", # Cruzado Novo
-}
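
For context: the comment block in the deleted pt/norm_exceptions.py above describes how these tables were consumed. Before this change, each language chained its NORM_EXCEPTIONS (plus the shared BASE_NORMS from spacy/lang/norm_exceptions.py) in front of the default NORM getter, which per spacy/lang/lex_attrs.py is plain lowercasing, via spacy.util.add_lookups, as the removed lex_attr_getters[NORM] lines in this patch show. A minimal sketch of that lookup chain, assuming only what the removed code shows (the add_lookups below is a simplified stand-in, not the real implementation, and the tables are tiny illustrative subsets):

    # Illustrative sketch only; tables from two languages are combined here
    # purely to demonstrate the precedence of the lookup chain.

    def build_norm_exceptions(exc):
        # Mirrors the build loop in the deleted modules: register each
        # misspelling as-is and in title case, both mapped to the norm.
        norm_exceptions = {}
        for string, norm in exc.items():
            norm_exceptions[string] = norm
            norm_exceptions[string.title()] = norm
        return norm_exceptions

    def add_lookups(default_getter, *lookups):
        # Hypothetical stand-in for spacy.util.add_lookups: check the lookup
        # tables in order and fall back to the default getter on a miss.
        def get_attr(string):
            for table in lookups:
                if string in table:
                    return table[string]
            return default_getter(string)
        return get_attr

    BASE_NORMS = {"’": "'"}  # placeholder for the shared table in spacy/lang/norm_exceptions.py
    PT_NORM_EXCEPTIONS = {"R$": "$", "Cz$": "$"}  # subset of the deleted pt table
    ID_NORM_EXCEPTIONS = build_norm_exceptions({"silahkan": "silakan"})  # subset of the id table

    get_norm = add_lookups(lambda string: string.lower(),
                           BASE_NORMS, PT_NORM_EXCEPTIONS, ID_NORM_EXCEPTIONS)
    print(get_norm("R$"))        # "$"       -> hit in the currency table
    print(get_norm("Silahkan"))  # "silakan" -> hit via the title-cased variant
    print(get_norm("Lisboa"))    # "lisboa"  -> falls back to default lowercasing

After this patch each language's __init__.py simply drops the NORM override, as the removed lex_attr_getters[NORM] lines show; the exception data itself presumably moves to the separately distributed lookups tables, but that is outside these hunks.
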
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index f34fc5435..f0e77d811 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -3,26 +3,21 @@ from __future__ import unicode_literals, print_function
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .tag_map import TAG_MAP
from .lemmatizer import RussianLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
-from ...util import update_exc, add_lookups
+from ...util import update_exc
from ...language import Language
from ...lookups import Lookups
-from ...attrs import LANG, NORM
+from ...attrs import LANG
class RussianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "ru"
- lex_attr_getters[NORM] = add_lookups(
- Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
- )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
tag_map = TAG_MAP
diff --git a/spacy/lang/ru/norm_exceptions.py b/spacy/lang/ru/norm_exceptions.py
deleted file mode 100644
index 43e08948c..000000000
--- a/spacy/lang/ru/norm_exceptions.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-_exc = {
- # Slang
- "прив": "привет",
- "дарова": "привет",
- "дак": "так",
- "дык": "так",
- "здарова": "привет",
- "пакедава": "пока",
- "пакедаво": "пока",
- "ща": "сейчас",
- "спс": "спасибо",
- "пжлст": "пожалуйста",
- "плиз": "пожалуйста",
- "ладненько": "ладно",
- "лады": "ладно",
- "лан": "ладно",
- "ясн": "ясно",
- "всм": "всмысле",
- "хош": "хочешь",
- "хаюшки": "привет",
- "оч": "очень",
- "че": "что",
- "чо": "что",
- "шо": "что",
-}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
- NORM_EXCEPTIONS[string] = norm
- NORM_EXCEPTIONS[string.title()] = norm
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index f27b87102..286d6693b 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -3,22 +3,17 @@ from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .norm_exceptions import NORM_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...attrs import LANG
+from ...util import update_exc
class SerbianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "sr"
- lex_attr_getters[NORM] = add_lookups(
- Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
- )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
diff --git a/spacy/lang/sr/norm_exceptions.py b/spacy/lang/sr/norm_exceptions.py
deleted file mode 100644
index 69f2c3173..000000000
--- a/spacy/lang/sr/norm_exceptions.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-_exc = {
- # Slang
- "ћале": "отац",
- "кева": "мајка",
- "смор": "досада",
- "кец": "јединица",
- "тебра": "брат",
- "штребер": "ученик",
- "факс": "факултет",
- "профа": "професор",
- "бус": "аутобус",
- "пискарало": "службеник",
- "бакутанер": "бака",
- "џибер": "простак",
-}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
- NORM_EXCEPTIONS[string] = norm
- NORM_EXCEPTIONS[string.title()] = norm
diff --git a/spacy/lang/ta/norm_exceptions.py b/spacy/lang/ta/norm_exceptions.py
deleted file mode 100644
index fbdceb98c..000000000
--- a/spacy/lang/ta/norm_exceptions.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-_exc = {
- # Regional words and their standard forms
- # Sri Lanka - Wikipedia
- "இங்க": "இங்கே",
- "வாங்க": "வாருங்கள்",
- "ஒண்டு": "ஒன்று",
- "கண்டு": "கன்று",
- "கொண்டு": "கொன்று",
- "பண்டி": "பன்றி",
- "பச்ச": "பச்சை",
- "அம்பது": "ஐம்பது",
- "வெச்ச": "வைத்து",
- "வச்ச": "வைத்து",
- "வச்சி": "வைத்து",
- "வாளைப்பழம்": "வாழைப்பழம்",
- "மண்ணு": "மண்",
- "பொன்னு": "பொன்",
- "சாவல்": "சேவல்",
- "அங்கால": "அங்கு ",
- "அசுப்பு": "நடமாட்டம்",
- "எழுவான் கரை": "எழுவான்கரை",
- "ஓய்யாரம்": "எழில் ",
- "ஒளும்பு": "எழும்பு",
- "ஓர்மை": "துணிவு",
- "கச்சை": "கோவணம்",
- "கடப்பு": "தெருவாசல்",
- "சுள்ளி": "காய்ந்த குச்சி",
- "திறாவுதல்": "தடவுதல்",
- "நாசமறுப்பு": "தொல்லை",
- "பரிசாரி": "வைத்தியன்",
- "பறவாதி": "பேராசைக்காரன்",
- "பிசினி": "உலோபி ",
- "விசர்": "பைத்தியம்",
- "ஏனம்": "பாத்திரம்",
- "ஏலா": "இயலாது",
- "ஒசில்": "அழகு",
- "ஒள்ளுப்பம்": "கொஞ்சம்",
- # Sri Lankan and Indian
- "குத்துமதிப்பு": "",
- "நூனாயம்": "நூல்நயம்",
- "பைய": "மெதுவாக",
- "மண்டை": "தலை",
- "வெள்ளனே": "சீக்கிரம்",
- "உசுப்பு": "எழுப்பு",
- "ஆணம்": "குழம்பு",
- "உறக்கம்": "தூக்கம்",
- "பஸ்": "பேருந்து",
- "களவு": "திருட்டு ",
- # relationship
- "புருசன்": "கணவன்",
- "பொஞ்சாதி": "மனைவி",
- "புள்ள": "பிள்ளை",
- "பிள்ள": "பிள்ளை",
- "ஆம்பிளப்புள்ள": "ஆண் பிள்ளை",
- "பொம்பிளப்புள்ள": "பெண் பிள்ளை",
- "அண்ணாச்சி": "அண்ணா",
- "அக்காச்சி": "அக்கா",
- "தங்கச்சி": "தங்கை",
- # difference words
- "பொடியன்": "சிறுவன்",
- "பொட்டை": "சிறுமி",
- "பிறகு": "பின்பு",
- "டக்கென்டு": "விரைவாக",
- "கெதியா": "விரைவாக",
- "கிறுகி": "திரும்பி",
- "போயித்து வாறன்": "போய் வருகிறேன்",
- "வருவாங்களா": "வருவார்களா",
- # Common spoken forms
- "சொல்லு": "சொல்",
- "கேளு": "கேள்",
- "சொல்லுங்க": "சொல்லுங்கள்",
- "கேளுங்க": "கேளுங்கள்",
- "நீங்கள்": "நீ",
- "உன்": "உன்னுடைய",
- # Portuguese formal words
- "அலவாங்கு": "கடப்பாரை",
- "ஆசுப்பத்திரி": "மருத்துவமனை",
- "உரோதை": "சில்லு",
- "கடுதாசி": "கடிதம்",
- "கதிரை": "நாற்காலி",
- "குசினி": "அடுக்களை",
- "கோப்பை": "கிண்ணம்",
- "சப்பாத்து": "காலணி",
- "தாச்சி": "இரும்புச் சட்டி",
- "துவாய்": "துவாலை",
- "தவறணை": "மதுக்கடை",
- "பீப்பா": "மரத்தாழி",
- "யன்னல்": "சாளரம்",
- "வாங்கு": "மரஇருக்கை",
- # Dutch formal words
- "இறாக்கை": "பற்சட்டம்",
- "இலாட்சி": "இழுப்பறை",
- "கந்தோர்": "பணிமனை",
- "நொத்தாரிசு": "ஆவண எழுத்துபதிவாளர்",
- # English formal words
- "இஞ்சினியர்": "பொறியியலாளர்",
- "சூப்பு": "ரசம்",
- "செக்": "காசோலை",
- "சேட்டு": "மேற்ச்சட்டை",
- "மார்க்கட்டு": "சந்தை",
- "விண்ணன்": "கெட்டிக்காரன்",
- # Arabic formal words
- "ஈமான்": "நம்பிக்கை",
- "சுன்னத்து": "விருத்தசேதனம்",
- "செய்த்தான்": "பிசாசு",
- "மவுத்து": "இறப்பு",
- "ஹலால்": "அங்கீகரிக்கப்பட்டது",
- "கறாம்": "நிராகரிக்கப்பட்டது",
- # Persian, Hindustani, and Hindi formal words
- "சுமார்": "கிட்டத்தட்ட",
- "சிப்பாய்": "போர்வீரன்",
- "சிபார்சு": "சிபாரிசு",
- "ஜமீன்": "பணக்காரா்",
- "அசல்": "மெய்யான",
- "அந்தஸ்து": "கௌரவம்",
- "ஆஜர்": "சமா்ப்பித்தல்",
- "உசார்": "எச்சரிக்கை",
- "அச்சா": "நல்ல",
- # English words used in text conversations
- "bcoz": "ஏனெனில்",
- "bcuz": "ஏனெனில்",
- "fav": "விருப்பமான",
- "morning": "காலை வணக்கம்",
- "gdeveng": "மாலை வணக்கம்",
- "gdnyt": "இரவு வணக்கம்",
- "gdnit": "இரவு வணக்கம்",
- "plz": "தயவு செய்து",
- "pls": "தயவு செய்து",
- "thx": "நன்றி",
- "thanx": "நன்றி",
-}
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
- NORM_EXCEPTIONS[string] = norm
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 06970fbd7..512be0c59 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -4,14 +4,12 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
-from .norm_exceptions import NORM_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
-from ..norm_exceptions import BASE_NORMS
-from ...attrs import LANG, NORM
+from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
-from ...util import DummyTokenizer, add_lookups
+from ...util import DummyTokenizer
class ThaiTokenizer(DummyTokenizer):
@@ -37,9 +35,6 @@ class ThaiDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda _text: "th"
- lex_attr_getters[NORM] = add_lookups(
- Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
- )
tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py
deleted file mode 100644
index ed1b3e760..000000000
--- a/spacy/lang/th/norm_exceptions.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-_exc = {
- # Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
- "สนุ๊กเกอร์": "สนุกเกอร์",
- "โน้ต": "โน้ต",
- # Misspelled because of being lazy or hustle (สะกดผิดเพราะขี้เกียจพิมพ์ หรือเร่งรีบ)
- "โทสับ": "โทรศัพท์",
- "พุ่งนี้": "พรุ่งนี้",
- # Strange (ให้ดูแปลกตา)
- "ชะมะ": "ใช่ไหม",
- "ชิมิ": "ใช่ไหม",
- "ชะ": "ใช่ไหม",
- "ช่ายมะ": "ใช่ไหม",
- "ป่าว": "เปล่า",
- "ป่ะ": "เปล่า",
- "ปล่าว": "เปล่า",
- "คัย": "ใคร",
- "ไค": "ใคร",
- "คราย": "ใคร",
- "เตง": "ตัวเอง",
- "ตะเอง": "ตัวเอง",
- "รึ": "หรือ",
- "เหรอ": "หรือ",
- "หรา": "หรือ",
- "หรอ": "หรือ",
- "ชั้น": "ฉัน",
- "ชั้ล": "ฉัน",
- "ช้าน": "ฉัน",
- "เทอ": "เธอ",
- "เทอร์": "เธอ",
- "เทอว์": "เธอ",
- "แกร": "แก",
- "ป๋ม": "ผม",
- "บ่องตง": "บอกตรงๆ",
- "ถ่ามตง": "ถามตรงๆ",
- "ต่อมตง": "ตอบตรงๆ",
- "เพิ่ล": "เพื่อน",
- "จอบอ": "จอบอ",
- "ดั้ย": "ได้",
- "ขอบคุง": "ขอบคุณ",
- "ยังงัย": "ยังไง",
- "Inw": "เทพ",
- "uou": "นอน",
- "Lกรีeu": "เกรียน",
- # Misspelled to express emotions (คำที่สะกดผิดเพื่อแสดงอารมณ์)
- "เปงราย": "เป็นอะไร",
- "เปนรัย": "เป็นอะไร",
- "เปงรัย": "เป็นอะไร",
- "เป็นอัลไล": "เป็นอะไร",
- "ทามมาย": "ทำไม",
- "ทามมัย": "ทำไม",
- "จังรุย": "จังเลย",
- "จังเยย": "จังเลย",
- "จุงเบย": "จังเลย",
- "ไม่รู้": "มะรุ",
- "เฮ่ย": "เฮ้ย",
- "เห้ย": "เฮ้ย",
- "น่าร็อค": "น่ารัก",
- "น่าร๊าก": "น่ารัก",
- "ตั้ลล๊าก": "น่ารัก",
- "คือร๊ะ": "คืออะไร",
- "โอป่ะ": "โอเคหรือเปล่า",
- "น่ามคาน": "น่ารำคาญ",
- "น่ามสาร": "น่าสงสาร",
- "วงวาร": "สงสาร",
- "บับว่า": "แบบว่า",
- "อัลไล": "อะไร",
- "อิจ": "อิจฉา",
- # Reduce rough words or Avoid to software filter (คำที่สะกดผิดเพื่อลดความหยาบของคำ หรืออาจใช้หลีกเลี่ยงการกรองคำหยาบของซอฟต์แวร์)
- "กรู": "กู",
- "กุ": "กู",
- "กรุ": "กู",
- "ตู": "กู",
- "ตรู": "กู",
- "มรึง": "มึง",
- "เมิง": "มึง",
- "มืง": "มึง",
- "มุง": "มึง",
- "สาด": "สัตว์",
- "สัส": "สัตว์",
- "สัก": "สัตว์",
- "แสรด": "สัตว์",
- "โคโตะ": "โคตร",
- "โคด": "โคตร",
- "โครต": "โคตร",
- "โคตะระ": "โคตร",
- "พ่อง": "พ่อมึง",
- "แม่เมิง": "แม่มึง",
- "เชี่ย": "เหี้ย",
- # Imitate words (คำเลียนเสียง โดยส่วนใหญ่จะเพิ่มทัณฑฆาต หรือซ้ำตัวอักษร)
- "แอร๊ยย": "อ๊าย",
- "อร๊ายยย": "อ๊าย",
- "มันส์": "มัน",
- "วู๊วววววววว์": "วู้",
- # Acronym (แบบคำย่อ)
- "หมาลัย": "มหาวิทยาลัย",
- "วิดวะ": "วิศวะ",
- "สินสาด ": "ศิลปศาสตร์",
- "สินกำ ": "ศิลปกรรมศาสตร์",
- "เสารีย์ ": "อนุเสาวรีย์ชัยสมรภูมิ",
- "เมกา ": "อเมริกา",
- "มอไซค์ ": "มอเตอร์ไซค์",
-}
-
-
-NORM_EXCEPTIONS = {}
-
-for string, norm in _exc.items():
- NORM_EXCEPTIONS[string] = norm
- NORM_EXCEPTIONS[string.title()] = norm
diff --git a/spacy/language.py b/spacy/language.py
index f23776def..703806627 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -28,10 +28,11 @@ from .compat import izip, basestring_, is_python2, class_types
from .gold import GoldParse
from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer
-from .attrs import IS_STOP, LANG
+from .attrs import IS_STOP, LANG, NORM
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
+from .lang.norm_exceptions import BASE_NORMS
from .lang.tag_map import TAG_MAP
from .tokens import Doc
from .lang.lex_attrs import LEX_ATTRS, is_stop
@@ -77,6 +78,9 @@ class BaseDefaults(object):
lemmatizer=lemmatizer,
lookups=lookups,
)
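+        # NORM exceptions now live in the vocab: consult BASE_NORMS and the
+        # "lexeme_norm" lookups table before falling back to the default NORM getter.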
+ vocab.lex_attr_getters[NORM] = util.add_lookups(
+ vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
+ )
for tag_str, exc in cls.morph_rules.items():
for orth_str, attrs in exc.items():
vocab.morphology.add_special_case(tag_str, orth_str, attrs)
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index f31733374..167f57462 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,8 +1,8 @@
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
-from .structs cimport LexemeC, SerializedLexemeC
+from .structs cimport LexemeC
from .strings cimport StringStore
from .vocab cimport Vocab
@@ -24,22 +24,6 @@ cdef class Lexeme:
self.vocab = vocab
self.orth = lex.orth
- @staticmethod
- cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
- cdef SerializedLexemeC lex_data
- buff = &lex.flags
- end = &lex.sentiment + sizeof(lex.sentiment)
- for i in range(sizeof(lex_data.data)):
- lex_data.data[i] = buff[i]
- return lex_data
-
- @staticmethod
- cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
- buff = &lex.flags
- end = &lex.sentiment + sizeof(lex.sentiment)
- for i in range(sizeof(lex_data.data)):
- buff[i] = lex_data.data[i]
-
@staticmethod
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
if name < (sizeof(flags_t) * 8):
@@ -56,8 +40,6 @@ cdef class Lexeme:
lex.prefix = value
elif name == SUFFIX:
lex.suffix = value
- elif name == CLUSTER:
- lex.cluster = value
elif name == LANG:
lex.lang = value
@@ -84,8 +66,6 @@ cdef class Lexeme:
return lex.suffix
elif feat_name == LENGTH:
return lex.length
- elif feat_name == CLUSTER:
- return lex.cluster
elif feat_name == LANG:
return lex.lang
else:
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index a081ffe42..dec2993fa 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -17,7 +17,7 @@ from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
-from .attrs cimport IS_CURRENCY, IS_OOV, PROB
+from .attrs cimport IS_CURRENCY
from .attrs import intify_attrs
from .errors import Errors, Warnings
@@ -89,12 +89,11 @@ cdef class Lexeme:
cdef attr_id_t attr
attrs = intify_attrs(attrs)
for attr, value in attrs.items():
- if attr == PROB:
- self.c.prob = value
- elif attr == CLUSTER:
- self.c.cluster = int(value)
- elif isinstance(value, int) or isinstance(value, long):
- Lexeme.set_struct_attr(self.c, attr, value)
+ # skip PROB, e.g. from lexemes.jsonl
+ if isinstance(value, float):
+ continue
+ elif isinstance(value, (int, long)):
+ Lexeme.set_struct_attr(self.c, attr, value)
else:
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
@@ -137,34 +136,6 @@ cdef class Lexeme:
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
- def to_bytes(self):
- lex_data = Lexeme.c_to_bytes(self.c)
- start = &self.c.flags
- end = &self.c.sentiment + sizeof(self.c.sentiment)
- if (end-start) != sizeof(lex_data.data):
- raise ValueError(Errors.E072.format(length=end-start,
- bad_length=sizeof(lex_data.data)))
- byte_string = b"\0" * sizeof(lex_data.data)
- byte_chars = byte_string
- for i in range(sizeof(lex_data.data)):
- byte_chars[i] = lex_data.data[i]
- if len(byte_string) != sizeof(lex_data.data):
- raise ValueError(Errors.E072.format(length=len(byte_string),
- bad_length=sizeof(lex_data.data)))
- return byte_string
-
- def from_bytes(self, bytes byte_string):
- # This method doesn't really have a use-case --- wrote it for testing.
- # Possibly delete? It puts the Lexeme out of synch with the vocab.
- cdef SerializedLexemeC lex_data
- if len(byte_string) != sizeof(lex_data.data):
- raise ValueError(Errors.E072.format(length=len(byte_string),
- bad_length=sizeof(lex_data.data)))
- for i in range(len(byte_string)):
- lex_data.data[i] = byte_string[i]
- Lexeme.c_from_bytes(self.c, lex_data)
- self.orth = self.c.orth
-
@property
def has_vector(self):
"""RETURNS (bool): Whether a word vector is associated with the object.
@@ -208,10 +179,14 @@ cdef class Lexeme:
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the lexeme."""
def __get__(self):
- return self.c.sentiment
+ sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
+ return sentiment_table.get(self.c.orth, 0.0)
- def __set__(self, float sentiment):
- self.c.sentiment = sentiment
+ def __set__(self, float x):
+ if "lexeme_sentiment" not in self.vocab.lookups:
+ self.vocab.lookups.add_table("lexeme_sentiment")
+ sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
+ sentiment_table[self.c.orth] = x
@property
def orth_(self):
@@ -238,9 +213,13 @@ cdef class Lexeme:
lexeme text.
"""
def __get__(self):
- return self.c.norm
+ return self.c.norm
def __set__(self, attr_t x):
+ if "lexeme_norm" not in self.vocab.lookups:
+ self.vocab.lookups.add_table("lexeme_norm")
+ norm_table = self.vocab.lookups.get_table("lexeme_norm")
+ norm_table[self.c.orth] = self.vocab.strings[x]
self.c.norm = x
property shape:
@@ -276,10 +255,12 @@ cdef class Lexeme:
property cluster:
"""RETURNS (int): Brown cluster ID."""
def __get__(self):
- return self.c.cluster
+ cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
+ return cluster_table.get(self.c.orth, 0)
- def __set__(self, attr_t x):
- self.c.cluster = x
+ def __set__(self, int x):
+ cluster_table = self.vocab.load_extra_lookups("lexeme_cluster")
+ cluster_table[self.c.orth] = x
property lang:
"""RETURNS (uint64): Language of the parent vocabulary."""
@@ -293,10 +274,14 @@ cdef class Lexeme:
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
type."""
def __get__(self):
- return self.c.prob
+ prob_table = self.vocab.load_extra_lookups("lexeme_prob")
+ settings_table = self.vocab.load_extra_lookups("lexeme_settings")
+ default_oov_prob = settings_table.get("oov_prob", -20.0)
+ return prob_table.get(self.c.orth, default_oov_prob)
def __set__(self, float x):
- self.c.prob = x
+ prob_table = self.vocab.load_extra_lookups("lexeme_prob")
+ prob_table[self.c.orth] = x
property lower_:
"""RETURNS (unicode): Lowercase form of the word."""
@@ -314,7 +299,7 @@ cdef class Lexeme:
return self.vocab.strings[self.c.norm]
def __set__(self, unicode x):
- self.c.norm = self.vocab.strings.add(x)
+ self.norm = self.vocab.strings.add(x)
property shape_:
"""RETURNS (unicode): Transform of the word's string, to show
@@ -362,13 +347,10 @@ cdef class Lexeme:
def __set__(self, flags_t x):
self.c.flags = x
- property is_oov:
+ @property
+ def is_oov(self):
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
- def __get__(self):
- return Lexeme.c_check_flag(self.c, IS_OOV)
-
- def __set__(self, attr_t x):
- Lexeme.c_set_flag(self.c, IS_OOV, x)
+ return self.orth in self.vocab.vectors
property is_stop:
"""RETURNS (bool): Whether the lexeme is a stop word."""
diff --git a/spacy/lookups.py b/spacy/lookups.py
index bf250b4b4..1fa29bdfe 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -124,7 +124,7 @@ class Lookups(object):
self._tables[key].update(value)
return self
- def to_disk(self, path, **kwargs):
+ def to_disk(self, path, filename="lookups.bin", **kwargs):
"""Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist.
@@ -136,11 +136,11 @@ class Lookups(object):
path = ensure_path(path)
if not path.exists():
path.mkdir()
- filepath = path / "lookups.bin"
+ filepath = path / filename
with filepath.open("wb") as file_:
file_.write(self.to_bytes())
- def from_disk(self, path, **kwargs):
+ def from_disk(self, path, filename="lookups.bin", **kwargs):
"""Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist.
@@ -150,7 +150,7 @@ class Lookups(object):
DOCS: https://spacy.io/api/lookups#from_disk
"""
path = ensure_path(path)
- filepath = path / "lookups.bin"
+ filepath = path / filename
if filepath.exists():
with filepath.open("rb") as file_:
data = file_.read()
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index b8e63a725..1f5f32675 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -23,29 +23,6 @@ cdef struct LexemeC:
attr_t prefix
attr_t suffix
- attr_t cluster
-
- float prob
- float sentiment
-
-
-cdef struct SerializedLexemeC:
- unsigned char[8 + 8*10 + 4 + 4] data
- # sizeof(flags_t) # flags
- # + sizeof(attr_t) # lang
- # + sizeof(attr_t) # id
- # + sizeof(attr_t) # length
- # + sizeof(attr_t) # orth
- # + sizeof(attr_t) # lower
- # + sizeof(attr_t) # norm
- # + sizeof(attr_t) # shape
- # + sizeof(attr_t) # prefix
- # + sizeof(attr_t) # suffix
- # + sizeof(attr_t) # cluster
- # + sizeof(float) # prob
- # + sizeof(float) # cluster
- # + sizeof(float) # l2_norm
-
cdef struct SpanC:
hash_t id
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index 9229c9970..ebb87c8d2 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -12,7 +12,7 @@ cdef enum symbol_t:
LIKE_NUM
LIKE_EMAIL
IS_STOP
- IS_OOV
+ IS_OOV_DEPRECATED
IS_BRACKET
IS_QUOTE
IS_LEFT_PUNCT
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index e438caba5..83a9d0482 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -17,7 +17,7 @@ IDS = {
"LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
- "IS_OOV": IS_OOV,
+ "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
"IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py
index 837ceb323..503399ee4 100644
--- a/spacy/tests/lang/da/test_exceptions.py
+++ b/spacy/tests/lang/da/test_exceptions.py
@@ -37,14 +37,6 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
assert tokens[7].text == "."
-@pytest.mark.parametrize(
- "text,norm", [("akvarium", "akvarie"), ("bedstemoder", "bedstemor")]
-)
-def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
- tokens = da_tokenizer(text)
- assert tokens[0].norm_ == norm
-
-
@pytest.mark.parametrize(
"text,n_tokens",
[
diff --git a/spacy/tests/lang/de/test_exceptions.py b/spacy/tests/lang/de/test_exceptions.py
index 2e065870e..3b464e1ae 100644
--- a/spacy/tests/lang/de/test_exceptions.py
+++ b/spacy/tests/lang/de/test_exceptions.py
@@ -22,17 +22,3 @@ def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
assert len(tokens) == 6
assert tokens[2].text == "z.Zt."
assert tokens[2].lemma_ == "zur Zeit"
-
-
-@pytest.mark.parametrize(
- "text,norms", [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])]
-)
-def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
- tokens = de_tokenizer(text)
- assert [token.norm_ for token in tokens] == norms
-
-
-@pytest.mark.parametrize("text,norm", [("daß", "dass")])
-def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
- tokens = de_tokenizer(text)
- assert tokens[0].norm_ == norm
diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py
index 6285a9408..a78e1815f 100644
--- a/spacy/tests/lang/en/test_exceptions.py
+++ b/spacy/tests/lang/en/test_exceptions.py
@@ -118,6 +118,7 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
assert [token.norm_ for token in tokens] == norms
+@pytest.mark.skip
@pytest.mark.parametrize(
"text,norm", [("radicalised", "radicalized"), ("cuz", "because")]
)
diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py
index 7ca2394b7..ebfab75cf 100644
--- a/spacy/tests/lang/lb/test_exceptions.py
+++ b/spacy/tests/lang/lb/test_exceptions.py
@@ -22,9 +22,3 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
assert len(tokens) == 9
assert tokens[1].text == "'t"
assert tokens[1].lemma_ == "et"
-
-
-@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
-def test_lb_norm_exceptions(lb_tokenizer, text, norm):
- tokens = lb_tokenizer(text)
- assert tokens[0].norm_ == norm
diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py
index 1671845ee..63faf44fc 100644
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import pytest
+import pickle
from spacy.vocab import Vocab
from spacy.strings import StringStore
@@ -36,8 +37,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
assert vocab1.to_bytes() == vocab1_b
new_vocab1 = Vocab().from_bytes(vocab1_b)
assert new_vocab1.to_bytes() == vocab1_b
- assert len(new_vocab1) == len(strings1)
- assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1)
+ assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP
+ assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])
@pytest.mark.parametrize("strings1,strings2", test_strings)
@@ -51,12 +52,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
vocab2.to_disk(file_path2)
vocab1_d = Vocab().from_disk(file_path1)
vocab2_d = Vocab().from_disk(file_path2)
- assert list(vocab1_d) == list(vocab1)
- assert list(vocab2_d) == list(vocab2)
+ # check strings rather than lexemes, which are only reloaded on demand
+ assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
+ assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
if strings1 == strings2:
- assert list(vocab1_d) == list(vocab2_d)
+ assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
else:
- assert list(vocab1_d) != list(vocab2_d)
+ assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@@ -76,7 +78,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
vocab = Vocab(strings=strings)
length = len(vocab)
vocab.from_bytes(vocab.to_bytes())
- assert len(vocab) == length
+ assert len(vocab.strings) == len(strings) + 1 # adds _SP
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@@ -127,3 +129,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
assert list(sstore1_d) == list(sstore2_d)
else:
assert list(sstore1_d) != list(sstore2_d)
+
+@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
+def test_pickle_vocab(strings, lex_attr):
+ vocab = Vocab(strings=strings)
+ vocab[strings[0]].norm_ = lex_attr
+ vocab_pickled = pickle.dumps(vocab)
+ vocab_unpickled = pickle.loads(vocab_pickled)
+ assert vocab.to_bytes() == vocab_unpickled.to_bytes()
diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py
index 701222afc..bcda2999a 100644
--- a/spacy/tests/test_lemmatizer.py
+++ b/spacy/tests/test_lemmatizer.py
@@ -26,7 +26,7 @@ def test_lemmatizer_reflects_lookups_changes():
nlp_bytes = nlp.to_bytes()
new_nlp.from_bytes(nlp_bytes)
# Make sure we have the previously saved lookup table
- assert len(new_nlp.vocab.lookups) == 1
+ assert "lemma_lookup" in new_nlp.vocab.lookups
assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2
assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world"
assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar"
diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py
index b57c6705a..af73a79bf 100644
--- a/spacy/tests/vocab_vectors/test_lexeme.py
+++ b/spacy/tests/vocab_vectors/test_lexeme.py
@@ -60,19 +60,6 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab):
assert en_vocab["dogs"].check_flag(is_len4) is True
-def test_lexeme_bytes_roundtrip(en_vocab):
- one = en_vocab["one"]
- alpha = en_vocab["alpha"]
- assert one.orth != alpha.orth
- assert one.lower != alpha.lower
- alpha.from_bytes(one.to_bytes())
-
- assert one.orth_ == alpha.orth_
- assert one.orth == alpha.orth
- assert one.lower == alpha.lower
- assert one.lower_ == alpha.lower_
-
-
def test_vocab_lexeme_oov_rank(en_vocab):
"""Test that default rank is OOV_RANK."""
lex = en_vocab["word"]
diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py
index f78dd33c4..af15e9e91 100644
--- a/spacy/tests/vocab_vectors/test_lookups.py
+++ b/spacy/tests/vocab_vectors/test_lookups.py
@@ -119,12 +119,11 @@ def test_lookups_to_from_bytes_via_vocab():
table_name = "test"
vocab = Vocab()
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
- assert len(vocab.lookups) == 1
assert table_name in vocab.lookups
vocab_bytes = vocab.to_bytes()
new_vocab = Vocab()
new_vocab.from_bytes(vocab_bytes)
- assert len(new_vocab.lookups) == 1
+ assert len(new_vocab.lookups) == len(vocab.lookups)
assert table_name in new_vocab.lookups
table = new_vocab.lookups.get_table(table_name)
assert len(table) == 2
@@ -137,13 +136,12 @@ def test_lookups_to_from_disk_via_vocab():
table_name = "test"
vocab = Vocab()
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
- assert len(vocab.lookups) == 1
assert table_name in vocab.lookups
with make_tempdir() as tmpdir:
vocab.to_disk(tmpdir)
new_vocab = Vocab()
new_vocab.from_disk(tmpdir)
- assert len(new_vocab.lookups) == 1
+ assert len(new_vocab.lookups) == len(vocab.lookups)
assert table_name in new_vocab.lookups
table = new_vocab.lookups.get_table(table_name)
assert len(table) == 2
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 322ef462a..16d9801ab 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -329,3 +329,15 @@ def test_vocab_prune_vectors():
neighbour, similarity = list(remap.values())[0]
assert neighbour == "cat", remap
assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
+
+
+def test_vector_is_oov():
+ vocab = Vocab(vectors_name="test_vocab_is_oov")
+ data = numpy.ndarray((5, 3), dtype="f")
+ data[0] = 1.0
+ data[1] = 2.0
+ vocab.set_vector("cat", data[0])
+ vocab.set_vector("dog", data[1])
+ assert vocab["cat"].is_oov is True
+ assert vocab["dog"].is_oov is True
+ assert vocab["hamster"].is_oov is False
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index b79d2d805..45deebc93 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -17,7 +17,7 @@ from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
-from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
+from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..symbols cimport conj
@@ -259,7 +259,7 @@ cdef class Token:
@property
def prob(self):
"""RETURNS (float): Smoothed log probability estimate of token type."""
- return self.c.lex.prob
+ return self.vocab[self.c.lex.orth].prob
@property
def sentiment(self):
@@ -267,7 +267,7 @@ cdef class Token:
negativity of the token."""
if "sentiment" in self.doc.user_token_hooks:
return self.doc.user_token_hooks["sentiment"](self)
- return self.c.lex.sentiment
+ return self.vocab[self.c.lex.orth].sentiment
@property
def lang(self):
@@ -286,7 +286,7 @@ cdef class Token:
@property
def cluster(self):
"""RETURNS (int): Brown cluster ID."""
- return self.c.lex.cluster
+ return self.vocab[self.c.lex.orth].cluster
@property
def orth(self):
@@ -923,7 +923,7 @@ cdef class Token:
@property
def is_oov(self):
"""RETURNS (bool): Whether the token is out-of-vocabulary."""
- return Lexeme.c_check_flag(self.c.lex, IS_OOV)
+ return self.c.lex.orth in self.vocab.vectors
@property
def is_stop(self):
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index d989d6c40..73754eb02 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -30,6 +30,7 @@ cdef class Vocab:
cpdef public Morphology morphology
cpdef public object vectors
cpdef public object lookups
+ cpdef public object lookups_extra
cdef readonly int length
cdef public object data_dir
cdef public object lex_attr_getters
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ef2e86bcc..68f0ac0db 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -11,8 +11,7 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK
from .lexeme cimport Lexeme
from .typedefs cimport attr_t
from .tokens.token cimport Token
-from .attrs cimport PROB, LANG, ORTH, TAG, POS
-from .structs cimport SerializedLexemeC
+from .attrs cimport LANG, ORTH, TAG, POS
from .compat import copy_reg, basestring_
from .errors import Errors
@@ -22,6 +21,8 @@ from .vectors import Vectors
from ._ml import link_vectors_to_models
from .lookups import Lookups
from . import util
+from .lang.norm_exceptions import BASE_NORMS
+from .lang.lex_attrs import LEX_ATTRS
cdef class Vocab:
@@ -32,8 +33,8 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab
"""
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
- strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None,
- **deprecated_kwargs):
+ strings=tuple(), lookups=None, lookups_extra=None,
+ oov_prob=-20., vectors_name=None, **deprecated_kwargs):
"""Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -44,6 +45,7 @@ cdef class Vocab:
strings (StringStore): StringStore that maps strings to integers, and
vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries.
+ lookups_extra (Lookups): Container for optional lookup tables and dictionaries.
name (unicode): Optional name to identify the vectors table.
RETURNS (Vocab): The newly constructed object.
"""
@@ -51,8 +53,12 @@ cdef class Vocab:
tag_map = tag_map if tag_map is not None else {}
if lookups in (None, True, False):
lookups = Lookups()
+ if "lexeme_norm" not in lookups:
+ lookups.add_table("lexeme_norm")
if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer(lookups)
+ if lookups_extra in (None, True, False):
+ lookups_extra = Lookups()
self.cfg = {'oov_prob': oov_prob}
self.mem = Pool()
self._by_orth = PreshMap()
@@ -65,6 +71,7 @@ cdef class Vocab:
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.vectors = Vectors(name=vectors_name)
self.lookups = lookups
+ self.lookups_extra = lookups_extra
@property
def lang(self):
@@ -173,9 +180,7 @@ cdef class Vocab:
value = func(string)
if isinstance(value, unicode):
value = self.strings.add(value)
- if attr == PROB:
- lex.prob = value
- elif value is not None:
+ if value is not None:
Lexeme.set_struct_attr(lex, attr, value)
if not is_oov:
self._add_lex_to_vocab(lex.orth, lex)
@@ -435,17 +440,16 @@ cdef class Vocab:
path = util.ensure_path(path)
if not path.exists():
path.mkdir()
- setters = ["strings", "lexemes", "vectors"]
+ setters = ["strings", "vectors"]
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
if "strings" not in exclude:
self.strings.to_disk(path / "strings.json")
- if "lexemes" not in exclude:
- with (path / "lexemes.bin").open("wb") as file_:
- file_.write(self.lexemes_to_bytes())
if "vectors" not in "exclude" and self.vectors is not None:
self.vectors.to_disk(path)
if "lookups" not in "exclude" and self.lookups is not None:
self.lookups.to_disk(path)
+        if "lookups_extra" not in exclude and self.lookups_extra is not None:
+ self.lookups_extra.to_disk(path, filename="lookups_extra.bin")
def from_disk(self, path, exclude=tuple(), **kwargs):
"""Loads state from a directory. Modifies the object in place and
@@ -458,13 +462,10 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#to_disk
"""
path = util.ensure_path(path)
- getters = ["strings", "lexemes", "vectors"]
+ getters = ["strings", "vectors"]
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
if "strings" not in exclude:
self.strings.from_disk(path / "strings.json") # TODO: add exclude?
- if "lexemes" not in exclude:
- with (path / "lexemes.bin").open("rb") as file_:
- self.lexemes_from_bytes(file_.read())
if "vectors" not in exclude:
if self.vectors is not None:
self.vectors.from_disk(path, exclude=["strings"])
@@ -472,6 +473,14 @@ cdef class Vocab:
link_vectors_to_models(self)
if "lookups" not in exclude:
self.lookups.from_disk(path)
+ if "lookups_extra" not in exclude:
+ self.lookups_extra.from_disk(path, filename="lookups_extra.bin")
+ if "lexeme_norm" in self.lookups:
+ self.lex_attr_getters[NORM] = util.add_lookups(
+ self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
+ )
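+        # Reset the lexeme cache so lexemes are recreated on demand with the
+        # freshly loaded lookups.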
+ self.length = 0
+ self._by_orth = PreshMap()
return self
def to_bytes(self, exclude=tuple(), **kwargs):
@@ -490,9 +499,9 @@ cdef class Vocab:
getters = OrderedDict((
("strings", lambda: self.strings.to_bytes()),
- ("lexemes", lambda: self.lexemes_to_bytes()),
("vectors", deserialize_vectors),
- ("lookups", lambda: self.lookups.to_bytes())
+ ("lookups", lambda: self.lookups.to_bytes()),
+ ("lookups_extra", lambda: self.lookups_extra.to_bytes())
))
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
return util.to_bytes(getters, exclude)
@@ -514,99 +523,62 @@ cdef class Vocab:
setters = OrderedDict((
("strings", lambda b: self.strings.from_bytes(b)),
- ("lexemes", lambda b: self.lexemes_from_bytes(b)),
("vectors", lambda b: serialize_vectors(b)),
- ("lookups", lambda b: self.lookups.from_bytes(b))
+ ("lookups", lambda b: self.lookups.from_bytes(b)),
+ ("lookups_extra", lambda b: self.lookups_extra.from_bytes(b))
))
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
util.from_bytes(bytes_data, setters, exclude)
+ if "lexeme_norm" in self.lookups:
+ self.lex_attr_getters[NORM] = util.add_lookups(
+ self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
+ )
+ self.length = 0
+ self._by_orth = PreshMap()
if self.vectors.name is not None:
link_vectors_to_models(self)
return self
- def lexemes_to_bytes(self):
- cdef hash_t key
- cdef size_t addr
- cdef LexemeC* lexeme = NULL
- cdef SerializedLexemeC lex_data
- cdef int size = 0
- for key, addr in self._by_orth.items():
- if addr == 0:
- continue
- size += sizeof(lex_data.data)
- byte_string = b"\0" * size
- byte_ptr = byte_string
- cdef int j
- cdef int i = 0
- for key, addr in self._by_orth.items():
- if addr == 0:
- continue
- lexeme = addr
- lex_data = Lexeme.c_to_bytes(lexeme)
- for j in range(sizeof(lex_data.data)):
- byte_ptr[i] = lex_data.data[j]
- i += 1
- return byte_string
-
- def lexemes_from_bytes(self, bytes bytes_data):
- """Load the binary vocabulary data from the given string."""
- cdef LexemeC* lexeme
- cdef hash_t key
- cdef unicode py_str
- cdef int i = 0
- cdef int j = 0
- cdef SerializedLexemeC lex_data
- chunk_size = sizeof(lex_data.data)
- cdef void* ptr
- cdef unsigned char* bytes_ptr = bytes_data
- for i in range(0, len(bytes_data), chunk_size):
- lexeme = self.mem.alloc(1, sizeof(LexemeC))
- for j in range(sizeof(lex_data.data)):
- lex_data.data[j] = bytes_ptr[i+j]
- Lexeme.c_from_bytes(lexeme, lex_data)
- prev_entry = self._by_orth.get(lexeme.orth)
- if prev_entry != NULL:
- memcpy(prev_entry, lexeme, sizeof(LexemeC))
- continue
- ptr = self.strings._map.get(lexeme.orth)
- if ptr == NULL:
- continue
- py_str = self.strings[lexeme.orth]
- if self.strings[py_str] != lexeme.orth:
- raise ValueError(Errors.E086.format(string=py_str,
- orth_id=lexeme.orth,
- hash_id=self.strings[py_str]))
- self._by_orth.set(lexeme.orth, lexeme)
- self.length += 1
-
def _reset_cache(self, keys, strings):
# I'm not sure this made sense. Disable it for now.
raise NotImplementedError
+ def load_extra_lookups(self, table_name):
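+        # Lazily load an optional table (e.g. lexeme_cluster, lexeme_prob,
+        # lexeme_sentiment) from the registered "<lang>_extra" lookups entry,
+        # adding an empty table as a fallback.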
+ if table_name not in self.lookups_extra:
+ if self.lang + "_extra" in util.registry.lookups:
+ tables = util.registry.lookups.get(self.lang + "_extra")
+ for name, filename in tables.items():
+ if table_name == name:
+ data = util.load_language_data(filename)
+ self.lookups_extra.add_table(name, data)
+ if table_name not in self.lookups_extra:
+ self.lookups_extra.add_table(table_name)
+ return self.lookups_extra.get_table(table_name)
+
+
def pickle_vocab(vocab):
sstore = vocab.strings
vectors = vocab.vectors
morph = vocab.morphology
- length = vocab.length
data_dir = vocab.data_dir
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
- lexemes_data = vocab.lexemes_to_bytes()
+ lookups = vocab.lookups
+ lookups_extra = vocab.lookups_extra
return (unpickle_vocab,
- (sstore, vectors, morph, data_dir, lex_attr_getters, lexemes_data, length))
+ (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, lookups_extra))
def unpickle_vocab(sstore, vectors, morphology, data_dir,
- lex_attr_getters, bytes lexemes_data, int length):
+ lex_attr_getters, lookups, lookups_extra):
cdef Vocab vocab = Vocab()
- vocab.length = length
vocab.vectors = vectors
vocab.strings = sstore
vocab.morphology = morphology
vocab.data_dir = data_dir
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
- vocab.lexemes_from_bytes(lexemes_data)
- vocab.length = length
+ vocab.lookups = lookups
+ vocab.lookups_extra = lookups_extra
return vocab
From 0061992d958ca56b2e5ba1ebd1fadee402fd65d7 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 19 May 2020 15:59:55 +0200
Subject: [PATCH 38/69] Update Polish tokenizer for UD_Polish-PDB (#5432)
Update the Polish tokenizer for UD_Polish-PDB; this is a relatively
major change from the existing tokenizer. Unused exception files and
conflicting test cases are removed.
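A minimal sketch of the updated behaviour, mirroring two of the revised
test cases (the spacy.blank("pl") pipeline is only used here for
illustration):

    import spacy

    nlp = spacy.blank("pl")
    # "tel." is no longer a tokenizer exception, so the trailing period splits off
    assert [t.text for t in nlp("tel.")] == ["tel", "."]
    # standalone productive prefixes such as "cztero-" remain single tokens
    assert [t.text for t in nlp("cztero-")] == ["cztero-"]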
Co-authored-by: Matthew Honnibal
---
spacy/lang/pl/__init__.py | 11 +-
spacy/lang/pl/_tokenizer_exceptions_list.py | 1443 -------------------
spacy/lang/pl/polish_srx_rules_LICENSE.txt | 23 -
spacy/lang/pl/punctuation.py | 36 +-
spacy/lang/pl/tokenizer_exceptions.py | 26 -
spacy/tests/lang/pl/test_tokenizer.py | 36 +-
6 files changed, 39 insertions(+), 1536 deletions(-)
delete mode 100644 spacy/lang/pl/_tokenizer_exceptions_list.py
delete mode 100644 spacy/lang/pl/polish_srx_rules_LICENSE.txt
delete mode 100644 spacy/lang/pl/tokenizer_exceptions.py
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 0540bf535..61608a3d9 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_SUFFIXES
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
@@ -23,10 +23,15 @@ class PolishDefaults(Language.Defaults):
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
- tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+ mod_base_exceptions = {
+ exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
+ }
+ tokenizer_exceptions = mod_base_exceptions
stop_words = STOP_WORDS
tag_map = TAG_MAP
+ prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
+ suffixes = TOKENIZER_SUFFIXES
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
diff --git a/spacy/lang/pl/_tokenizer_exceptions_list.py b/spacy/lang/pl/_tokenizer_exceptions_list.py
deleted file mode 100644
index 839eccb83..000000000
--- a/spacy/lang/pl/_tokenizer_exceptions_list.py
+++ /dev/null
@@ -1,1443 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
-# The following list consists of:
-# - exceptions generated from polish_srx_rules [1]
-# (https://github.com/milekpl/polish_srx_rules)
-# - abbreviations parsed from Wikipedia
-# - some manually added exceptions
-#
-# [1] M. Miłkowski and J. Lipski,
-# "Using SRX Standard for Sentence Segmentation," in LTC 2009,
-# Lecture Notes in Artificial Intelligence 6562,
-# Z. Vetulani, Ed. Berlin Heidelberg: Springer-Verlag, 2011, pp. 172–182.
-PL_BASE_EXCEPTIONS = [
- "0.",
- "1.",
- "10.",
- "2.",
- "3.",
- "4.",
- "5.",
- "6.",
- "7.",
- "8.",
- "9.",
- "A.A.",
- "A.B.",
- "A.C.",
- "A.D.",
- "A.E.",
- "A.F.",
- "A.G.",
- "A.H.",
- "A.I.",
- "A.J.",
- "A.K.",
- "A.L.",
- "A.M.",
- "A.N.",
- "A.O.",
- "A.P.",
- "A.R.",
- "A.S.",
- "A.T.",
- "A.U.",
- "A.W.",
- "A.Y.",
- "A.Z.",
- "A.Ó.",
- "A.Ą.",
- "A.Ć.",
- "A.Ę.",
- "A.Ł.",
- "A.Ń.",
- "A.Ś.",
- "A.Ź.",
- "A.Ż.",
- "Ad.",
- "Adw.",
- "Al.",
- "Art.",
- "B.A.",
- "B.B.",
- "B.C.",
- "B.D.",
- "B.E.",
- "B.F.",
- "B.G.",
- "B.H.",
- "B.I.",
- "B.J.",
- "B.K.",
- "B.L.",
- "B.M.",
- "B.N.",
- "B.O.",
- "B.P.",
- "B.R.",
- "B.S.",
- "B.T.",
- "B.U.",
- "B.W.",
- "B.Y.",
- "B.Z.",
- "B.Ó.",
- "B.Ą.",
- "B.Ć.",
- "B.Ę.",
- "B.Ł.",
- "B.Ń.",
- "B.Ś.",
- "B.Ź.",
- "B.Ż.",
- "D.A.",
- "D.B.",
- "D.C.",
- "D.D.",
- "D.E.",
- "D.F.",
- "D.G.",
- "D.H.",
- "D.I.",
- "D.J.",
- "D.K.",
- "D.L.",
- "D.M.",
- "D.N.",
- "D.O.",
- "D.P.",
- "D.R.",
- "D.S.",
- "D.T.",
- "D.U.",
- "D.W.",
- "D.Y.",
- "D.Z.",
- "D.Ó.",
- "D.Ą.",
- "D.Ć.",
- "D.Ę.",
- "D.Ł.",
- "D.Ń.",
- "D.Ś.",
- "D.Ź.",
- "D.Ż.",
- "Dh.",
- "Doc.",
- "Dr.",
- "Dyr.",
- "Dyw.",
- "Dz.U.",
- "E.A.",
- "E.B.",
- "E.C.",
- "E.D.",
- "E.E.",
- "E.F.",
- "E.G.",
- "E.H.",
- "E.I.",
- "E.J.",
- "E.K.",
- "E.L.",
- "E.M.",
- "E.N.",
- "E.O.",
- "E.P.",
- "E.R.",
- "E.S.",
- "E.T.",
- "E.U.",
- "E.W.",
- "E.Y.",
- "E.Z.",
- "E.Ó.",
- "E.Ą.",
- "E.Ć.",
- "E.Ę.",
- "E.Ł.",
- "E.Ń.",
- "E.Ś.",
- "E.Ź.",
- "E.Ż.",
- "F.A.",
- "F.B.",
- "F.C.",
- "F.D.",
- "F.E.",
- "F.F.",
- "F.G.",
- "F.H.",
- "F.I.",
- "F.J.",
- "F.K.",
- "F.L.",
- "F.M.",
- "F.N.",
- "F.O.",
- "F.P.",
- "F.R.",
- "F.S.",
- "F.T.",
- "F.U.",
- "F.W.",
- "F.Y.",
- "F.Z.",
- "F.Ó.",
- "F.Ą.",
- "F.Ć.",
- "F.Ę.",
- "F.Ł.",
- "F.Ń.",
- "F.Ś.",
- "F.Ź.",
- "F.Ż.",
- "G.A.",
- "G.B.",
- "G.C.",
- "G.D.",
- "G.E.",
- "G.F.",
- "G.G.",
- "G.H.",
- "G.I.",
- "G.J.",
- "G.K.",
- "G.L.",
- "G.M.",
- "G.N.",
- "G.O.",
- "G.P.",
- "G.R.",
- "G.S.",
- "G.T.",
- "G.U.",
- "G.W.",
- "G.Y.",
- "G.Z.",
- "G.Ó.",
- "G.Ą.",
- "G.Ć.",
- "G.Ę.",
- "G.Ł.",
- "G.Ń.",
- "G.Ś.",
- "G.Ź.",
- "G.Ż.",
- "H.A.",
- "H.B.",
- "H.C.",
- "H.D.",
- "H.E.",
- "H.F.",
- "H.G.",
- "H.H.",
- "H.I.",
- "H.J.",
- "H.K.",
- "H.L.",
- "H.M.",
- "H.N.",
- "H.O.",
- "H.P.",
- "H.R.",
- "H.S.",
- "H.T.",
- "H.U.",
- "H.W.",
- "H.Y.",
- "H.Z.",
- "H.Ó.",
- "H.Ą.",
- "H.Ć.",
- "H.Ę.",
- "H.Ł.",
- "H.Ń.",
- "H.Ś.",
- "H.Ź.",
- "H.Ż.",
- "Hr.",
- "I.A.",
- "I.B.",
- "I.C.",
- "I.D.",
- "I.E.",
- "I.F.",
- "I.G.",
- "I.H.",
- "I.I.",
- "I.J.",
- "I.K.",
- "I.L.",
- "I.M.",
- "I.N.",
- "I.O.",
- "I.P.",
- "I.R.",
- "I.S.",
- "I.T.",
- "I.U.",
- "I.W.",
- "I.Y.",
- "I.Z.",
- "I.Ó.",
- "I.Ą.",
- "I.Ć.",
- "I.Ę.",
- "I.Ł.",
- "I.Ń.",
- "I.Ś.",
- "I.Ź.",
- "I.Ż.",
- "Inż.",
- "J.A.",
- "J.B.",
- "J.C.",
- "J.D.",
- "J.E.",
- "J.F.",
- "J.G.",
- "J.H.",
- "J.I.",
- "J.J.",
- "J.K.",
- "J.L.",
- "J.M.",
- "J.N.",
- "J.O.",
- "J.P.",
- "J.R.",
- "J.S.",
- "J.T.",
- "J.U.",
- "J.W.",
- "J.Y.",
- "J.Z.",
- "J.Ó.",
- "J.Ą.",
- "J.Ć.",
- "J.Ę.",
- "J.Ł.",
- "J.Ń.",
- "J.Ś.",
- "J.Ź.",
- "J.Ż.",
- "K.A.",
- "K.B.",
- "K.C.",
- "K.D.",
- "K.E.",
- "K.F.",
- "K.G.",
- "K.H.",
- "K.I.",
- "K.J.",
- "K.K.",
- "K.L.",
- "K.M.",
- "K.N.",
- "K.O.",
- "K.P.",
- "K.R.",
- "K.S.",
- "K.T.",
- "K.U.",
- "K.W.",
- "K.Y.",
- "K.Z.",
- "K.Ó.",
- "K.Ą.",
- "K.Ć.",
- "K.Ę.",
- "K.Ł.",
- "K.Ń.",
- "K.Ś.",
- "K.Ź.",
- "K.Ż.",
- "Ks.",
- "L.A.",
- "L.B.",
- "L.C.",
- "L.D.",
- "L.E.",
- "L.F.",
- "L.G.",
- "L.H.",
- "L.I.",
- "L.J.",
- "L.K.",
- "L.L.",
- "L.M.",
- "L.N.",
- "L.O.",
- "L.P.",
- "L.R.",
- "L.S.",
- "L.T.",
- "L.U.",
- "L.W.",
- "L.Y.",
- "L.Z.",
- "L.Ó.",
- "L.Ą.",
- "L.Ć.",
- "L.Ę.",
- "L.Ł.",
- "L.Ń.",
- "L.Ś.",
- "L.Ź.",
- "L.Ż.",
- "Lek.",
- "M.A.",
- "M.B.",
- "M.C.",
- "M.D.",
- "M.E.",
- "M.F.",
- "M.G.",
- "M.H.",
- "M.I.",
- "M.J.",
- "M.K.",
- "M.L.",
- "M.M.",
- "M.N.",
- "M.O.",
- "M.P.",
- "M.R.",
- "M.S.",
- "M.T.",
- "M.U.",
- "M.W.",
- "M.Y.",
- "M.Z.",
- "M.Ó.",
- "M.Ą.",
- "M.Ć.",
- "M.Ę.",
- "M.Ł.",
- "M.Ń.",
- "M.Ś.",
- "M.Ź.",
- "M.Ż.",
- "Mat.",
- "Mec.",
- "Mojż.",
- "N.A.",
- "N.B.",
- "N.C.",
- "N.D.",
- "N.E.",
- "N.F.",
- "N.G.",
- "N.H.",
- "N.I.",
- "N.J.",
- "N.K.",
- "N.L.",
- "N.M.",
- "N.N.",
- "N.O.",
- "N.P.",
- "N.R.",
- "N.S.",
- "N.T.",
- "N.U.",
- "N.W.",
- "N.Y.",
- "N.Z.",
- "N.Ó.",
- "N.Ą.",
- "N.Ć.",
- "N.Ę.",
- "N.Ł.",
- "N.Ń.",
- "N.Ś.",
- "N.Ź.",
- "N.Ż.",
- "Na os.",
- "Nadkom.",
- "Najśw.",
- "Nb.",
- "Np.",
- "O.A.",
- "O.B.",
- "O.C.",
- "O.D.",
- "O.E.",
- "O.F.",
- "O.G.",
- "O.H.",
- "O.I.",
- "O.J.",
- "O.K.",
- "O.L.",
- "O.M.",
- "O.N.",
- "O.O.",
- "O.P.",
- "O.R.",
- "O.S.",
- "O.T.",
- "O.U.",
- "O.W.",
- "O.Y.",
- "O.Z.",
- "O.Ó.",
- "O.Ą.",
- "O.Ć.",
- "O.Ę.",
- "O.Ł.",
- "O.Ń.",
- "O.Ś.",
- "O.Ź.",
- "O.Ż.",
- "OO.",
- "Oo.",
- "P.A.",
- "P.B.",
- "P.C.",
- "P.D.",
- "P.E.",
- "P.F.",
- "P.G.",
- "P.H.",
- "P.I.",
- "P.J.",
- "P.K.",
- "P.L.",
- "P.M.",
- "P.N.",
- "P.O.",
- "P.P.",
- "P.R.",
- "P.S.",
- "P.T.",
- "P.U.",
- "P.W.",
- "P.Y.",
- "P.Z.",
- "P.Ó.",
- "P.Ą.",
- "P.Ć.",
- "P.Ę.",
- "P.Ł.",
- "P.Ń.",
- "P.Ś.",
- "P.Ź.",
- "P.Ż.",
- "Podkom.",
- "Przyp.",
- "Ps.",
- "Pt.",
- "Płk.",
- "R.A.",
- "R.B.",
- "R.C.",
- "R.D.",
- "R.E.",
- "R.F.",
- "R.G.",
- "R.H.",
- "R.I.",
- "R.J.",
- "R.K.",
- "R.L.",
- "R.M.",
- "R.N.",
- "R.O.",
- "R.P.",
- "R.R.",
- "R.S.",
- "R.T.",
- "R.U.",
- "R.W.",
- "R.Y.",
- "R.Z.",
- "R.Ó.",
- "R.Ą.",
- "R.Ć.",
- "R.Ę.",
- "R.Ł.",
- "R.Ń.",
- "R.Ś.",
- "R.Ź.",
- "R.Ż.",
- "Red.",
- "Reż.",
- "Ryc.",
- "Rys.",
- "S.A.",
- "S.B.",
- "S.C.",
- "S.D.",
- "S.E.",
- "S.F.",
- "S.G.",
- "S.H.",
- "S.I.",
- "S.J.",
- "S.K.",
- "S.L.",
- "S.M.",
- "S.N.",
- "S.O.",
- "S.P.",
- "S.R.",
- "S.S.",
- "S.T.",
- "S.U.",
- "S.W.",
- "S.Y.",
- "S.Z.",
- "S.Ó.",
- "S.Ą.",
- "S.Ć.",
- "S.Ę.",
- "S.Ł.",
- "S.Ń.",
- "S.Ś.",
- "S.Ź.",
- "S.Ż.",
- "Sp.",
- "Spółdz.",
- "Stow.",
- "Stoł.",
- "Sz.P.",
- "Szer.",
- "T.A.",
- "T.B.",
- "T.C.",
- "T.D.",
- "T.E.",
- "T.F.",
- "T.G.",
- "T.H.",
- "T.I.",
- "T.J.",
- "T.K.",
- "T.L.",
- "T.M.",
- "T.N.",
- "T.O.",
- "T.P.",
- "T.R.",
- "T.S.",
- "T.T.",
- "T.U.",
- "T.W.",
- "T.Y.",
- "T.Z.",
- "T.Ó.",
- "T.Ą.",
- "T.Ć.",
- "T.Ę.",
- "T.Ł.",
- "T.Ń.",
- "T.Ś.",
- "T.Ź.",
- "T.Ż.",
- "Tow.",
- "Tzw.",
- "U.A.",
- "U.B.",
- "U.C.",
- "U.D.",
- "U.E.",
- "U.F.",
- "U.G.",
- "U.H.",
- "U.I.",
- "U.J.",
- "U.K.",
- "U.L.",
- "U.M.",
- "U.N.",
- "U.O.",
- "U.P.",
- "U.R.",
- "U.S.",
- "U.T.",
- "U.U.",
- "U.W.",
- "U.Y.",
- "U.Z.",
- "U.Ó.",
- "U.Ą.",
- "U.Ć.",
- "U.Ę.",
- "U.Ł.",
- "U.Ń.",
- "U.Ś.",
- "U.Ź.",
- "U.Ż.",
- "W.A.",
- "W.B.",
- "W.C.",
- "W.D.",
- "W.E.",
- "W.F.",
- "W.G.",
- "W.H.",
- "W.I.",
- "W.J.",
- "W.K.",
- "W.L.",
- "W.M.",
- "W.N.",
- "W.O.",
- "W.P.",
- "W.R.",
- "W.S.",
- "W.T.",
- "W.U.",
- "W.W.",
- "W.Y.",
- "W.Z.",
- "W.Ó.",
- "W.Ą.",
- "W.Ć.",
- "W.Ę.",
- "W.Ł.",
- "W.Ń.",
- "W.Ś.",
- "W.Ź.",
- "W.Ż.",
- "Y.A.",
- "Y.B.",
- "Y.C.",
- "Y.D.",
- "Y.E.",
- "Y.F.",
- "Y.G.",
- "Y.H.",
- "Y.I.",
- "Y.J.",
- "Y.K.",
- "Y.L.",
- "Y.M.",
- "Y.N.",
- "Y.O.",
- "Y.P.",
- "Y.R.",
- "Y.S.",
- "Y.T.",
- "Y.U.",
- "Y.W.",
- "Y.Y.",
- "Y.Z.",
- "Y.Ó.",
- "Y.Ą.",
- "Y.Ć.",
- "Y.Ę.",
- "Y.Ł.",
- "Y.Ń.",
- "Y.Ś.",
- "Y.Ź.",
- "Y.Ż.",
- "Z.A.",
- "Z.B.",
- "Z.C.",
- "Z.D.",
- "Z.E.",
- "Z.F.",
- "Z.G.",
- "Z.H.",
- "Z.I.",
- "Z.J.",
- "Z.K.",
- "Z.L.",
- "Z.M.",
- "Z.N.",
- "Z.O.",
- "Z.P.",
- "Z.R.",
- "Z.S.",
- "Z.T.",
- "Z.U.",
- "Z.W.",
- "Z.Y.",
- "Z.Z.",
- "Z.Ó.",
- "Z.Ą.",
- "Z.Ć.",
- "Z.Ę.",
- "Z.Ł.",
- "Z.Ń.",
- "Z.Ś.",
- "Z.Ź.",
- "Z.Ż.",
- "Zob.",
- "a.",
- "ad.",
- "adw.",
- "afr.",
- "ags.",
- "akad.",
- "al.",
- "alb.",
- "am.",
- "amer.",
- "ang.",
- "aor.",
- "ap.",
- "apost.",
- "arch.",
- "arcyks.",
- "art.",
- "artyst.",
- "asp.",
- "astr.",
- "aust.",
- "austr.",
- "austral.",
- "b.",
- "bałt.",
- "bdb.",
- "belg.",
- "białorus.",
- "białost.",
- "bm.",
- "bot.",
- "bp.",
- "br.",
- "bryg.",
- "bryt.",
- "bułg.",
- "bł.",
- "c.b.d.o.",
- "c.k.",
- "c.o.",
- "cbdu.",
- "cd.",
- "cdn.",
- "centr.",
- "ces.",
- "chem.",
- "chir.",
- "chiń.",
- "chor.",
- "chorw.",
- "cieśn.",
- "cnd.",
- "cyg.",
- "cyt.",
- "cyw.",
- "cz.",
- "czes.",
- "czw.",
- "czyt.",
- "d.",
- "daw.",
- "dcn.",
- "dekl.",
- "demokr.",
- "det.",
- "dh.",
- "diec.",
- "dk.",
- "dn.",
- "doc.",
- "doktor h.c.",
- "dol.",
- "dolnośl.",
- "dost.",
- "dosł.",
- "dot.",
- "dr h.c.",
- "dr hab.",
- "dr.",
- "ds.",
- "dst.",
- "duszp.",
- "dypl.",
- "dyr.",
- "dyw.",
- "dł.",
- "egz.",
- "ekol.",
- "ekon.",
- "elektr.",
- "em.",
- "ent.",
- "est.",
- "europ.",
- "ew.",
- "fab.",
- "farm.",
- "fot.",
- "fr.",
- "franc.",
- "g.",
- "gastr.",
- "gat.",
- "gd.",
- "gen.",
- "geogr.",
- "geol.",
- "gimn.",
- "gm.",
- "godz.",
- "gorz.",
- "gosp.",
- "gosp.-polit.",
- "gr.",
- "gram.",
- "grub.",
- "górn.",
- "głęb.",
- "h.c.",
- "hab.",
- "hist.",
- "hiszp.",
- "hitl.",
- "hm.",
- "hot.",
- "hr.",
- "i in.",
- "i s.",
- "id.",
- "ie.",
- "im.",
- "in.",
- "inż.",
- "iron.",
- "itd.",
- "itp.",
- "j.",
- "j.a.",
- "jez.",
- "jn.",
- "jw.",
- "jwt.",
- "k.",
- "k.k.",
- "k.o.",
- "k.p.a.",
- "k.p.c.",
- "k.r.",
- "k.r.o.",
- "kard.",
- "kark.",
- "kasz.",
- "kat.",
- "katol.",
- "kier.",
- "kk.",
- "kl.",
- "kol.",
- "kpc.",
- "kpt.",
- "kr.",
- "krak.",
- "kryt.",
- "ks.",
- "książk.",
- "kuj.",
- "kult.",
- "kł.",
- "l.",
- "laic.",
- "lek.",
- "lit.",
- "lp.",
- "lub.",
- "m.",
- "m.b.",
- "m.in.",
- "m.p.",
- "m.st.",
- "mar.",
- "maz.",
- "małop.",
- "mec.",
- "med.",
- "mgr.",
- "min.",
- "mn.",
- "mn.w.",
- "muz.",
- "mł.",
- "n.",
- "n.e.",
- "n.p.m.",
- "n.p.u.",
- "na os.",
- "nadkom.",
- "najśw.",
- "nb.",
- "niedz.",
- "niem.",
- "norw.",
- "np.",
- "nt.",
- "nż.",
- "o s.",
- "o.",
- "oO.",
- "ob.",
- "odc.",
- "odp.",
- "ok.",
- "oo.",
- "op.",
- "os.",
- "p.",
- "p.a.",
- "p.f.",
- "p.f.v.",
- "p.n.e.",
- "p.o.",
- "p.p.",
- "p.p.m.",
- "p.r.",
- "p.r.v.",
- "phm.",
- "pie.",
- "pl.",
- "pn.",
- "pocz.",
- "pod.",
- "podgat.",
- "podkarp.",
- "podkom.",
- "poet.",
- "poj.",
- "pok.",
- "pol.",
- "pom.",
- "pon.",
- "poprz.",
- "por.",
- "port.",
- "posp.",
- "pow.",
- "poz.",
- "poł.",
- "pp.",
- "ppanc.",
- "ppor.",
- "ppoż.",
- "prawdop.",
- "proc.",
- "prof.",
- "prok.",
- "przed Chr.",
- "przyp.",
- "ps.",
- "pseud.",
- "pt.",
- "pw.",
- "półn.",
- "płd.",
- "płk.",
- "płn.",
- "r.",
- "r.ż.",
- "red.",
- "reż.",
- "ros.",
- "rozdz.",
- "rtg.",
- "rtm.",
- "rub.",
- "rum.",
- "ryc.",
- "rys.",
- "rz.",
- "s.",
- "serb.",
- "sierż.",
- "skr.",
- "sob.",
- "sp.",
- "społ.",
- "spółdz.",
- "spółgł.",
- "st.",
- "st.rus.",
- "stow.",
- "stoł.",
- "str.",
- "sud.",
- "szczec.",
- "szer.",
- "szt.",
- "szw.",
- "szwajc.",
- "słow.",
- "t.",
- "t.j.",
- "tatrz.",
- "tel.",
- "tj.",
- "tow.",
- "trl.",
- "tryb.",
- "ts.",
- "tur.",
- "tys.",
- "tzn.",
- "tzw.",
- "tłum.",
- "u s.",
- "ub.",
- "ukr.",
- "ul.",
- "up.",
- "ur.",
- "v.v.",
- "vs.",
- "w.",
- "warm.",
- "wlk.",
- "wlkp.",
- "woj.",
- "wroc.",
- "ws.",
- "wsch.",
- "wt.",
- "ww.",
- "wyb.",
- "wyd.",
- "wyj.",
- "wym.",
- "wyst.",
- "wył.",
- "wyż.",
- "wzgl.",
- "wędr.",
- "węg.",
- "wł.",
- "x.",
- "xx.",
- "zach.",
- "zagr.",
- "zak.",
- "zakł.",
- "zal.",
- "zam.",
- "zast.",
- "zaw.",
- "zazw.",
- "zał.",
- "zdr.",
- "zew.",
- "zewn.",
- "ziel.",
- "zm.",
- "zn.",
- "zob.",
- "zool.",
- "zw.",
- "ząbk.",
- "Ó.A.",
- "Ó.B.",
- "Ó.C.",
- "Ó.D.",
- "Ó.E.",
- "Ó.F.",
- "Ó.G.",
- "Ó.H.",
- "Ó.I.",
- "Ó.J.",
- "Ó.K.",
- "Ó.L.",
- "Ó.M.",
- "Ó.N.",
- "Ó.O.",
- "Ó.P.",
- "Ó.R.",
- "Ó.S.",
- "Ó.T.",
- "Ó.U.",
- "Ó.W.",
- "Ó.Y.",
- "Ó.Z.",
- "Ó.Ó.",
- "Ó.Ą.",
- "Ó.Ć.",
- "Ó.Ę.",
- "Ó.Ł.",
- "Ó.Ń.",
- "Ó.Ś.",
- "Ó.Ź.",
- "Ó.Ż.",
- "Ą.A.",
- "Ą.B.",
- "Ą.C.",
- "Ą.D.",
- "Ą.E.",
- "Ą.F.",
- "Ą.G.",
- "Ą.H.",
- "Ą.I.",
- "Ą.J.",
- "Ą.K.",
- "Ą.L.",
- "Ą.M.",
- "Ą.N.",
- "Ą.O.",
- "Ą.P.",
- "Ą.R.",
- "Ą.S.",
- "Ą.T.",
- "Ą.U.",
- "Ą.W.",
- "Ą.Y.",
- "Ą.Z.",
- "Ą.Ó.",
- "Ą.Ą.",
- "Ą.Ć.",
- "Ą.Ę.",
- "Ą.Ł.",
- "Ą.Ń.",
- "Ą.Ś.",
- "Ą.Ź.",
- "Ą.Ż.",
- "Ć.A.",
- "Ć.B.",
- "Ć.C.",
- "Ć.D.",
- "Ć.E.",
- "Ć.F.",
- "Ć.G.",
- "Ć.H.",
- "Ć.I.",
- "Ć.J.",
- "Ć.K.",
- "Ć.L.",
- "Ć.M.",
- "Ć.N.",
- "Ć.O.",
- "Ć.P.",
- "Ć.R.",
- "Ć.S.",
- "Ć.T.",
- "Ć.U.",
- "Ć.W.",
- "Ć.Y.",
- "Ć.Z.",
- "Ć.Ó.",
- "Ć.Ą.",
- "Ć.Ć.",
- "Ć.Ę.",
- "Ć.Ł.",
- "Ć.Ń.",
- "Ć.Ś.",
- "Ć.Ź.",
- "Ć.Ż.",
- "ćw.",
- "ćwicz.",
- "Ę.A.",
- "Ę.B.",
- "Ę.C.",
- "Ę.D.",
- "Ę.E.",
- "Ę.F.",
- "Ę.G.",
- "Ę.H.",
- "Ę.I.",
- "Ę.J.",
- "Ę.K.",
- "Ę.L.",
- "Ę.M.",
- "Ę.N.",
- "Ę.O.",
- "Ę.P.",
- "Ę.R.",
- "Ę.S.",
- "Ę.T.",
- "Ę.U.",
- "Ę.W.",
- "Ę.Y.",
- "Ę.Z.",
- "Ę.Ó.",
- "Ę.Ą.",
- "Ę.Ć.",
- "Ę.Ę.",
- "Ę.Ł.",
- "Ę.Ń.",
- "Ę.Ś.",
- "Ę.Ź.",
- "Ę.Ż.",
- "Ł.A.",
- "Ł.B.",
- "Ł.C.",
- "Ł.D.",
- "Ł.E.",
- "Ł.F.",
- "Ł.G.",
- "Ł.H.",
- "Ł.I.",
- "Ł.J.",
- "Ł.K.",
- "Ł.L.",
- "Ł.M.",
- "Ł.N.",
- "Ł.O.",
- "Ł.P.",
- "Ł.R.",
- "Ł.S.",
- "Ł.T.",
- "Ł.U.",
- "Ł.W.",
- "Ł.Y.",
- "Ł.Z.",
- "Ł.Ó.",
- "Ł.Ą.",
- "Ł.Ć.",
- "Ł.Ę.",
- "Ł.Ł.",
- "Ł.Ń.",
- "Ł.Ś.",
- "Ł.Ź.",
- "Ł.Ż.",
- "Łuk.",
- "łac.",
- "łot.",
- "łow.",
- "Ń.A.",
- "Ń.B.",
- "Ń.C.",
- "Ń.D.",
- "Ń.E.",
- "Ń.F.",
- "Ń.G.",
- "Ń.H.",
- "Ń.I.",
- "Ń.J.",
- "Ń.K.",
- "Ń.L.",
- "Ń.M.",
- "Ń.N.",
- "Ń.O.",
- "Ń.P.",
- "Ń.R.",
- "Ń.S.",
- "Ń.T.",
- "Ń.U.",
- "Ń.W.",
- "Ń.Y.",
- "Ń.Z.",
- "Ń.Ó.",
- "Ń.Ą.",
- "Ń.Ć.",
- "Ń.Ę.",
- "Ń.Ł.",
- "Ń.Ń.",
- "Ń.Ś.",
- "Ń.Ź.",
- "Ń.Ż.",
- "Ś.A.",
- "Ś.B.",
- "Ś.C.",
- "Ś.D.",
- "Ś.E.",
- "Ś.F.",
- "Ś.G.",
- "Ś.H.",
- "Ś.I.",
- "Ś.J.",
- "Ś.K.",
- "Ś.L.",
- "Ś.M.",
- "Ś.N.",
- "Ś.O.",
- "Ś.P.",
- "Ś.R.",
- "Ś.S.",
- "Ś.T.",
- "Ś.U.",
- "Ś.W.",
- "Ś.Y.",
- "Ś.Z.",
- "Ś.Ó.",
- "Ś.Ą.",
- "Ś.Ć.",
- "Ś.Ę.",
- "Ś.Ł.",
- "Ś.Ń.",
- "Ś.Ś.",
- "Ś.Ź.",
- "Ś.Ż.",
- "ŚW.",
- "Śp.",
- "Św.",
- "śW.",
- "śl.",
- "śp.",
- "śr.",
- "św.",
- "Ź.A.",
- "Ź.B.",
- "Ź.C.",
- "Ź.D.",
- "Ź.E.",
- "Ź.F.",
- "Ź.G.",
- "Ź.H.",
- "Ź.I.",
- "Ź.J.",
- "Ź.K.",
- "Ź.L.",
- "Ź.M.",
- "Ź.N.",
- "Ź.O.",
- "Ź.P.",
- "Ź.R.",
- "Ź.S.",
- "Ź.T.",
- "Ź.U.",
- "Ź.W.",
- "Ź.Y.",
- "Ź.Z.",
- "Ź.Ó.",
- "Ź.Ą.",
- "Ź.Ć.",
- "Ź.Ę.",
- "Ź.Ł.",
- "Ź.Ń.",
- "Ź.Ś.",
- "Ź.Ź.",
- "Ź.Ż.",
- "Ż.A.",
- "Ż.B.",
- "Ż.C.",
- "Ż.D.",
- "Ż.E.",
- "Ż.F.",
- "Ż.G.",
- "Ż.H.",
- "Ż.I.",
- "Ż.J.",
- "Ż.K.",
- "Ż.L.",
- "Ż.M.",
- "Ż.N.",
- "Ż.O.",
- "Ż.P.",
- "Ż.R.",
- "Ż.S.",
- "Ż.T.",
- "Ż.U.",
- "Ż.W.",
- "Ż.Y.",
- "Ż.Z.",
- "Ż.Ó.",
- "Ż.Ą.",
- "Ż.Ć.",
- "Ż.Ę.",
- "Ż.Ł.",
- "Ż.Ń.",
- "Ż.Ś.",
- "Ż.Ź.",
- "Ż.Ż.",
- "ż.",
- "żarg.",
- "żart.",
- "żyd.",
- "żyw.",
-]
diff --git a/spacy/lang/pl/polish_srx_rules_LICENSE.txt b/spacy/lang/pl/polish_srx_rules_LICENSE.txt
deleted file mode 100644
index 995a1b0f7..000000000
--- a/spacy/lang/pl/polish_srx_rules_LICENSE.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-
-Copyright (c) 2019, Marcin Miłkowski
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py
index 4e69a3912..aa8adac29 100644
--- a/spacy/lang/pl/punctuation.py
+++ b/spacy/lang/pl/punctuation.py
@@ -1,22 +1,46 @@
# coding: utf8
from __future__ import unicode_literals
-from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_PUNCT, LIST_HYPHENS
+from ..char_classes import LIST_ICONS, LIST_QUOTES, CURRENCY, UNITS, PUNCT
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
_quotes = CONCAT_QUOTES.replace("'", "")
+_prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
+
_infixes = (
LIST_ELLIPSES
- + [CONCAT_ICONS]
+ + LIST_ICONS
+ + LIST_HYPHENS
+ [
- r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+ r"(?<=[0-9{al}])\.(?=[0-9{au}])".format(al=ALPHA, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
- r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
- r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}])[:<>=\/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
- r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
+ r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),
]
)
+_suffixes = (
+ ["''", "’’", r"\.", "…"]
+ + LIST_PUNCT
+ + LIST_QUOTES
+ + LIST_ICONS
+ + [
+ r"(?<=[0-9])\+",
+ r"(?<=°[FfCcKk])\.",
+ r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+ r"(?<=[0-9])(?:{u})".format(u=UNITS),
+ r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+ al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+ ),
+ r"(?<=[{au}])\.".format(au=ALPHA_UPPER),
+ ]
+)
+
+
+TOKENIZER_PREFIXES = _prefixes
TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py
deleted file mode 100644
index 9e4814b0f..000000000
--- a/spacy/lang/pl/tokenizer_exceptions.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
-from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ
-
-
-_exc = {}
-
-for exc_data in [
- {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
- {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
- {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
- {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
- {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
- {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
-]:
- _exc[exc_data[ORTH]] = [exc_data]
-
-for orth in ["w.", "r."]:
- _exc[orth] = [{ORTH: orth}]
-
-for orth in PL_BASE_EXCEPTIONS:
- _exc[orth] = [{ORTH: orth}]
-
-TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/tests/lang/pl/test_tokenizer.py b/spacy/tests/lang/pl/test_tokenizer.py
index 9d0034589..9f4f5a38d 100644
--- a/spacy/tests/lang/pl/test_tokenizer.py
+++ b/spacy/tests/lang/pl/test_tokenizer.py
@@ -4,49 +4,15 @@ from __future__ import unicode_literals
import pytest
DOT_TESTS = [
- ("tel.", ["tel."]),
- ("np.", ["np."]),
- ("godz. 21:37", ["godz.", "21:37"]),
- ("inż.", ["inż."]),
- ("gosp.-polit.", ["gosp.-polit."]),
- ("ppoż", ["ppoż"]),
- ("płn", ["płn"]),
- ("ul.", ["ul."]),
- ("jw.", ["jw."]),
- ("itd.", ["itd."]),
- ("cdn.", ["cdn."]),
- ("itp.", ["itp."]),
- ("10,- zł", ["10,-", "zł"]),
+ ("tel.", ["tel", "."]),
("0 zł 99 gr", ["0", "zł", "99", "gr"]),
- ("0,99 rub.", ["0,99", "rub."]),
- ("dol.", ["dol."]),
- ("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
- ("m.in.", ["m.in."]),
- ("p.n.e.", ["p.n.e."]),
- ("Sz.P.", ["Sz.P."]),
- ("p.o.", ["p.o."]),
- ("k.o.", ["k.o."]),
- ("m.st.", ["m.st."]),
- ("dra.", ["dra", "."]),
- ("pp.", ["pp."]),
- ("oo.", ["oo."]),
]
HYPHEN_TESTS = [
- ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
- ("NESS-040C5", ["NESS-040C5"]),
- ("JTE-7-31", ["JTE-7-31"]),
- ("BAY-59-3074", ["BAY-59-3074"]),
- ("BAY-38-7271", ["BAY-38-7271"]),
- ("STS-135", ["STS-135"]),
- ("5F-PB-22", ["5F-PB-22"]),
("cztero-", ["cztero-"]),
("jedno-", ["jedno-"]),
("dwu-", ["dwu-"]),
("trzy-", ["trzy-"]),
- ("b-adoratorzy", ["b-adoratorzy"]),
- ("2-3-4 drzewa", ["2-3-4", "drzewa"]),
- ("b-drzewa", ["b-drzewa"]),
]
From 70da1fd2d6e96256ad863f1e625091c46dac4835 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 19 May 2020 16:01:18 +0200
Subject: [PATCH 39/69] Add warning for misaligned character offset spans
(#5007)
* Add warning for misaligned character offset spans
* Resolve conflict
* Filter warnings in example scripts
Filter warnings in example scripts to show warnings once, in particular
warnings about misaligned entities.
Co-authored-by: Ines Montani
---
examples/training/rehearsal.py | 6 +++++-
examples/training/train_ner.py | 9 +++++++--
examples/training/train_new_entity_type.py | 9 +++++++--
spacy/errors.py | 6 +++++-
spacy/gold.pyx | 6 ++++++
spacy/tests/test_gold.py | 3 ++-
6 files changed, 32 insertions(+), 7 deletions(-)
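To see the new warning in context, here is a minimal sketch (not part of the patch; the text and offsets are made up) of how misaligned character offsets now surface as W030 and "-" tags:

    import warnings
    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.blank("en")
    doc = nlp.make_doc("I flew to San Francisco Valley.")
    # (10, 20) ends in the middle of "Francisco", so it cannot be aligned
    entities = [(10, 20, "LOC")]
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        tags = biluo_tags_from_offsets(doc, entities)
    print(tags)               # misaligned tokens come back as "-" and are ignored in training
    print(caught[0].message)  # the new W030 message, including the text and entities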
diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py
index 9ece91427..24b1cea00 100644
--- a/examples/training/rehearsal.py
+++ b/examples/training/rehearsal.py
@@ -1,6 +1,7 @@
"""Prevent catastrophic forgetting with rehearsal updates."""
import plac
import random
+import warnings
import srsly
import spacy
from spacy.gold import GoldParse
@@ -66,7 +67,10 @@ def main(model_name, unlabelled_loc):
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
sizes = compounding(1.0, 4.0, 1.001)
- with nlp.disable_pipes(*other_pipes):
+ with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
+ # show warnings for misaligned entity spans once
+ warnings.filterwarnings("once", category=UserWarning, module='spacy')
+
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
random.shuffle(raw_docs)
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index 01bb6a67b..ff6029567 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -8,12 +8,13 @@ For more details, see the documentation:
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.0.0+
-Last tested with: v2.1.0
+Last tested with: v2.2.4
"""
from __future__ import unicode_literals, print_function
import plac
import random
+import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
@@ -57,7 +58,11 @@ def main(model=None, output_dir=None, n_iter=100):
# get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
- with nlp.disable_pipes(*other_pipes): # only train NER
+ # only train NER
+ with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
+ # show warnings for misaligned entity spans once
+ warnings.filterwarnings("once", category=UserWarning, module='spacy')
+
# reset and initialize the weights randomly – but only if we're
# training a new model
if model is None:
diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index 72d33ad50..e8ff6802a 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -24,12 +24,13 @@ For more details, see the documentation:
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.1.0+
-Last tested with: v2.1.0
+Last tested with: v2.2.4
"""
from __future__ import unicode_literals, print_function
import plac
import random
+import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
@@ -97,7 +98,11 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
# get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
- with nlp.disable_pipes(*other_pipes): # only train NER
+ # only train NER
+ with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
+ # show warnings for misaligned entity spans once
+ warnings.filterwarnings("once", category=UserWarning, module='spacy')
+
sizes = compounding(1.0, 4.0, 1.001)
# batch up the examples using spaCy's minibatch
for itn in range(n_iter):
diff --git a/spacy/errors.py b/spacy/errors.py
index d99c96922..1b268d5ab 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -110,7 +110,11 @@ class Warnings(object):
"in problems with the vocab further on in the pipeline.")
W029 = ("Unable to align tokens with entities from character offsets. "
"Discarding entity annotation for the text: {text}.")
-
+ W030 = ("Some entities could not be aligned in the text \"{text}\" with "
+ "entities \"{entities}\". Use "
+ "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+ " to check the alignment. Misaligned entities ('-') will be "
+ "ignored during training.")
@add_codes
class Errors(object):
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 4b8a4f52d..cf67a2ac7 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -957,6 +957,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
break
else:
biluo[token.i] = missing
+ if "-" in biluo:
+ ent_str = str(entities)
+ warnings.warn(Warnings.W030.format(
+ text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
+ entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str
+ ))
return biluo
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index fc9e624eb..37b877561 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -56,7 +56,8 @@ def test_gold_biluo_misalign(en_vocab):
spaces = [True, True, True, True, True, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
- tags = biluo_tags_from_offsets(doc, entities)
+ with pytest.warns(UserWarning):
+ tags = biluo_tags_from_offsets(doc, entities)
assert tags == ["O", "O", "O", "-", "-", "-"]
From 40e65d6f6349a55f20f109eb4fbae91489ec54b0 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 19 May 2020 16:41:26 +0200
Subject: [PATCH 40/69] Fix most_similar for vectors with unused rows (#5348)
* Fix most_similar for vectors with unused rows
Address issues related to the unused rows in the vector table and
`most_similar`:
* Update `most_similar()` to search only through rows that are in use
according to `key2row`.
* Raise an error when `most_similar(n=n)` is larger than the number of
vectors in the table.
* Set and restore `_unset` correctly when vectors are added or
deserialized so that new vectors are added in the correct row.
* Set data and keys to the same length in `Vocab.prune_vectors()` to
avoid spurious entries in `key2row`.
* Fix regression test using `most_similar`
Co-authored-by: Matthew Honnibal
---
spacy/errors.py | 2 +
spacy/tests/regression/test_issue3001-3500.py | 2 +-
spacy/tests/vocab_vectors/test_vectors.py | 45 ++++++++++++++++---
spacy/vectors.pyx | 23 +++++++---
4 files changed, 59 insertions(+), 13 deletions(-)
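A small sketch of the changed behaviour (illustrative values, assuming the v2.x `Vectors` API): `most_similar` now only searches rows that are mapped in `key2row`, and asking for more neighbours than there are vectors in use raises E198:

    import numpy
    from spacy.vectors import Vectors

    data = numpy.asarray([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype="f")
    vectors = Vectors(data=data, keys=["a", "b", "c"])
    query = numpy.asarray([[1, 0, 0]], dtype="f")
    keys, best_rows, scores = vectors.most_similar(query, n=2)
    print(keys, best_rows, scores)
    # n larger than the number of vectors in use now raises a ValueError (E198):
    # vectors.most_similar(query, n=10)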
diff --git a/spacy/errors.py b/spacy/errors.py
index 1b268d5ab..f0b8592df 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -564,6 +564,8 @@ class Errors(object):
E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can "
"only be fixed with token.is_sent_start.")
E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
+ E198 = ("Unable to return {n} most similar vectors for the current vectors "
+ "table, which contains {n_rows} vectors.")
@add_codes
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index d05759c31..effbebb92 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -295,7 +295,7 @@ def test_issue3410():
def test_issue3412():
data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
- vectors = Vectors(data=data)
+ vectors = Vectors(data=data, keys=["A", "B", "C"])
keys, best_rows, scores = vectors.most_similar(
numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
)
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 16d9801ab..24eb3a1af 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import pytest
import numpy
-from numpy.testing import assert_allclose
+from numpy.testing import assert_allclose, assert_equal
from spacy._ml import cosine
from spacy.vocab import Vocab
from spacy.vectors import Vectors
@@ -11,7 +11,7 @@ from spacy.tokenizer import Tokenizer
from spacy.strings import hash_string
from spacy.tokens import Doc
-from ..util import add_vecs_to_vocab
+from ..util import add_vecs_to_vocab, make_tempdir
@pytest.fixture
@@ -59,6 +59,11 @@ def most_similar_vectors_data():
)
+@pytest.fixture
+def most_similar_vectors_keys():
+ return ["a", "b", "c", "d"]
+
+
@pytest.fixture
def resize_data():
return numpy.asarray([[0.0, 1.0], [2.0, 3.0]], dtype="f")
@@ -146,11 +151,14 @@ def test_set_vector(strings, data):
assert list(v[strings[0]]) != list(orig[0])
-def test_vectors_most_similar(most_similar_vectors_data):
- v = Vectors(data=most_similar_vectors_data)
+def test_vectors_most_similar(most_similar_vectors_data, most_similar_vectors_keys):
+ v = Vectors(data=most_similar_vectors_data, keys=most_similar_vectors_keys)
_, best_rows, _ = v.most_similar(v.data, batch_size=2, n=2, sort=True)
assert all(row[0] == i for i, row in enumerate(best_rows))
+ with pytest.raises(ValueError):
+ v.most_similar(v.data, batch_size=2, n=10, sort=True)
+
def test_vectors_most_similar_identical():
"""Test that most similar identical vectors are assigned a score of 1.0."""
@@ -331,6 +339,33 @@ def test_vocab_prune_vectors():
assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
+def test_vectors_serialize():
+ data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
+ v = Vectors(data=data, keys=["A", "B", "C"])
+ b = v.to_bytes()
+ v_r = Vectors()
+ v_r.from_bytes(b)
+ assert_equal(v.data, v_r.data)
+ assert v.key2row == v_r.key2row
+ v.resize((5, 4))
+ v_r.resize((5, 4))
+ row = v.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f"))
+ row_r = v_r.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f"))
+ assert row == row_r
+ assert_equal(v.data, v_r.data)
+ assert v.is_full == v_r.is_full
+ with make_tempdir() as d:
+ v.to_disk(d)
+ v_r.from_disk(d)
+ assert_equal(v.data, v_r.data)
+ assert v.key2row == v_r.key2row
+ v.resize((5, 4))
+ v_r.resize((5, 4))
+ row = v.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f"))
+ row_r = v_r.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f"))
+ assert row == row_r
+ assert_equal(v.data, v_r.data)
+
def test_vector_is_oov():
vocab = Vocab(vectors_name="test_vocab_is_oov")
data = numpy.ndarray((5, 3), dtype="f")
@@ -340,4 +375,4 @@ def test_vector_is_oov():
vocab.set_vector("dog", data[1])
assert vocab["cat"].is_oov is True
assert vocab["dog"].is_oov is True
- assert vocab["hamster"].is_oov is False
+ assert vocab["hamster"].is_oov is False
\ No newline at end of file
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 2973ddb5b..3da3b01d7 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -212,8 +212,7 @@ cdef class Vectors:
copy_shape = (min(shape[0], self.data.shape[0]), min(shape[1], self.data.shape[1]))
resized_array[:copy_shape[0], :copy_shape[1]] = self.data[:copy_shape[0], :copy_shape[1]]
self.data = resized_array
- filled = {row for row in self.key2row.values()}
- self._unset = cppset[int]({row for row in range(shape[0]) if row not in filled})
+ self._sync_unset()
removed_items = []
for key, row in list(self.key2row.items()):
if row >= shape[0]:
@@ -310,8 +309,8 @@ cdef class Vectors:
raise ValueError(Errors.E197.format(row=row, key=key))
if vector is not None:
self.data[row] = vector
- if self._unset.count(row):
- self._unset.erase(self._unset.find(row))
+ if self._unset.count(row):
+ self._unset.erase(self._unset.find(row))
return row
def most_similar(self, queries, *, batch_size=1024, n=1, sort=True):
@@ -330,11 +329,14 @@ cdef class Vectors:
RETURNS (tuple): The most similar entries as a `(keys, best_rows, scores)`
tuple.
"""
+ filled = sorted(list({row for row in self.key2row.values()}))
+ if len(filled) < n:
+ raise ValueError(Errors.E198.format(n=n, n_rows=len(filled)))
xp = get_array_module(self.data)
- norms = xp.linalg.norm(self.data, axis=1, keepdims=True)
+ norms = xp.linalg.norm(self.data[filled], axis=1, keepdims=True)
norms[norms == 0] = 1
- vectors = self.data / norms
+ vectors = self.data[filled] / norms
best_rows = xp.zeros((queries.shape[0], n), dtype='i')
scores = xp.zeros((queries.shape[0], n), dtype='f')
@@ -356,7 +358,8 @@ cdef class Vectors:
scores[i:i+batch_size] = scores[sorted_index]
best_rows[i:i+batch_size] = best_rows[sorted_index]
- xp = get_array_module(self.data)
+ for i, j in numpy.ndindex(best_rows.shape):
+ best_rows[i, j] = filled[best_rows[i, j]]
# Round values really close to 1 or -1
scores = xp.around(scores, decimals=4, out=scores)
# Account for numerical error we want to return in range -1, 1
@@ -419,6 +422,7 @@ cdef class Vectors:
("vectors", load_vectors),
))
util.from_disk(path, serializers, [])
+ self._sync_unset()
return self
def to_bytes(self, **kwargs):
@@ -461,4 +465,9 @@ cdef class Vectors:
("vectors", deserialize_weights)
))
util.from_bytes(data, deserializers, [])
+ self._sync_unset()
return self
+
+ def _sync_unset(self):
+ filled = {row for row in self.key2row.values()}
+ self._unset = cppset[int]({row for row in range(self.data.shape[0]) if row not in filled})
From 0a5b140235bb6a8cfdb35bcd5fdd68d14128733c Mon Sep 17 00:00:00 2001
From: Kevin Lu
Date: Tue, 19 May 2020 20:12:21 -0700
Subject: [PATCH 41/69] Update universe.json
---
website/meta/universe.json | 33 +++++++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index cf587f5f0..724dc3d07 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2172,6 +2172,39 @@
"model_uri = f'runs:/{my_run_id}/model'",
"nlp2 = mlflow.spacy.load_model(model_uri=model_uri)"
]
+ },
+ {
+ "id": "pyate",
+ "title": "PyATE",
+ "slogan": "Python Automated Term Extraction",
+ "description": "PyATE is a term extraction library written in Python using Spacy POS tagging with Basic, Combo Basic, C-Value, TermExtractor, and Weirdness.",
+ "github": "kevinlu1248/pyate",
+ "pip": "pyate",
+ "code_example": [
+ "from pyate import combo_basic",
+ "",
+ "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/",
+ "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'",
+ "",
+ "print(combo_basic(string).sort_values(ascending=False).head(5))",
+ "\"\"\"\"\"\"",
+ "dysfunctional tumor 1.443147",
+ "tumor suppressors 1.443147",
+ "genetic changes 1.386294",
+ "cancer cells 1.386294",
+ "dysfunctional tumor suppressors 1.298612",
+ "\"\"\"\"\"\""
+ ],
+ "code_language": "python",
+ "url": "https://github.com/kevinlu1248/pyate",
+ "author": "Kevin Lu",
+ "author_links": {
+ "twitter": "kevinlu1248",
+ "github": "kevinlu1248",
+ "website": "https://github.com/kevinlu1248/pyate"
+ },
+ "category": ["pipeline", "research"],
+ "tags": ["term_extraction"]
}
],
From a23b3a5a5042ed99cfd0c9988d1956adb85601c0 Mon Sep 17 00:00:00 2001
From: Kevin Lu
Date: Tue, 19 May 2020 20:24:24 -0700
Subject: [PATCH 42/69] Update CONTRIBUTOR_AGREEMENT.md
---
.github/CONTRIBUTOR_AGREEMENT.md | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md
index da9f244eb..fc974ec95 100644
--- a/.github/CONTRIBUTOR_AGREEMENT.md
+++ b/.github/CONTRIBUTOR_AGREEMENT.md
@@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:
- * [ ] I am signing on behalf of myself as an individual and no other person
+ * [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
@@ -98,9 +98,9 @@ mark both statements:
| Field | Entry |
|------------------------------- | -------------------- |
-| Name | |
+| Name | Kevin Lu|
| Company name (if applicable) | |
-| Title or role (if applicable) | |
+| Title or role (if applicable) | Student|
| Date | |
-| GitHub username | |
+| GitHub username | kevinlu1248|
| Website (optional) | |
From 9a1a5352154a58a83278de3be77aa564af05b40f Mon Sep 17 00:00:00 2001
From: Kevin Lu
Date: Tue, 19 May 2020 20:25:45 -0700
Subject: [PATCH 43/69] Create kevinlu1248.md
---
.github/contributors/kevinlu1248.md | 106 ++++++++++++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 .github/contributors/kevinlu1248.md
diff --git a/.github/contributors/kevinlu1248.md b/.github/contributors/kevinlu1248.md
new file mode 100644
index 000000000..fc974ec95
--- /dev/null
+++ b/.github/contributors/kevinlu1248.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Kevin Lu|
+| Company name (if applicable) | |
+| Title or role (if applicable) | Student|
+| Date | |
+| GitHub username | kevinlu1248|
+| Website (optional) | |
From 291b9ad7b902edd945cc8430550a6633440c582a Mon Sep 17 00:00:00 2001
From: Kevin Lu
Date: Tue, 19 May 2020 20:29:53 -0700
Subject: [PATCH 44/69] Update CONTRIBUTOR_AGREEMENT.md
---
.github/CONTRIBUTOR_AGREEMENT.md | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md
index fc974ec95..da9f244eb 100644
--- a/.github/CONTRIBUTOR_AGREEMENT.md
+++ b/.github/CONTRIBUTOR_AGREEMENT.md
@@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:
- * [x] I am signing on behalf of myself as an individual and no other person
+ * [ ] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
@@ -98,9 +98,9 @@ mark both statements:
| Field | Entry |
|------------------------------- | -------------------- |
-| Name | Kevin Lu|
+| Name | |
| Company name (if applicable) | |
-| Title or role (if applicable) | Student|
+| Title or role (if applicable) | |
| Date | |
-| GitHub username | kevinlu1248|
+| GitHub username | |
| Website (optional) | |
From 4fa96705379b10b761a7097b1adb12145402cb1f Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 20 May 2020 09:56:56 +0200
Subject: [PATCH 45/69] Extend lemmatizer rules for all UPOS tags
---
spacy/lemmatizer.py | 17 ++++++-----------
1 file changed, 6 insertions(+), 11 deletions(-)
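The effect of the change, as a rough sketch (the rule table below is made up): both integer UPOS IDs and string tags now reach the rule tables, instead of only NOUN/VERB/ADJ/PUNCT/PROPN:

    from spacy.lemmatizer import Lemmatizer
    from spacy.lookups import Lookups
    from spacy.symbols import VERB

    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["s", ""]]})  # toy rule table
    lemmatizer = Lemmatizer(lookups)
    print(lemmatizer("runs", VERB))    # integer ID mapped via parts_of_speech.NAMES
    print(lemmatizer("runs", "VERB"))  # string tags are simply lowercased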
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 33908eecf..a070574bb 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -6,6 +6,7 @@ from collections import OrderedDict
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
from .errors import Errors
from .lookups import Lookups
+from .parts_of_speech import NAMES as UPOS_NAMES
class Lemmatizer(object):
@@ -43,17 +44,11 @@ class Lemmatizer(object):
lookup_table = self.lookups.get_table("lemma_lookup", {})
if "lemma_rules" not in self.lookups:
return [lookup_table.get(string, string)]
- if univ_pos in (NOUN, "NOUN", "noun"):
- univ_pos = "noun"
- elif univ_pos in (VERB, "VERB", "verb"):
- univ_pos = "verb"
- elif univ_pos in (ADJ, "ADJ", "adj"):
- univ_pos = "adj"
- elif univ_pos in (PUNCT, "PUNCT", "punct"):
- univ_pos = "punct"
- elif univ_pos in (PROPN, "PROPN"):
- return [string]
- else:
+ if isinstance(univ_pos, int):
+ univ_pos = UPOS_NAMES.get(univ_pos, "X")
+ univ_pos = univ_pos.lower()
+
+ if univ_pos in ("", "eol", "space"):
return [string.lower()]
# See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology):
From 78bb9ff5e0e4adc01bd30e227657118d87546f83 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 20 May 2020 14:56:52 +0200
Subject: [PATCH 46/69] doc_or_span -> obj
---
spacy/matcher/matcher.pyx | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
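For reference, the renamed argument still covers both input types; a minimal sketch (pattern and text are made up):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    matcher.add("HELLO_WORLD", None, [{"LOWER": "hello"}, {"LOWER": "world"}])
    doc = nlp("They said hello world to everyone.")
    print(matcher(doc))       # match over a full Doc
    print(matcher(doc[1:5]))  # the same call also accepts a Span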
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 4cfab915f..3d99f117a 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -213,28 +213,28 @@ cdef class Matcher:
else:
yield doc
- def __call__(self, object doc_or_span):
+ def __call__(self, object obj):
"""Find all token sequences matching the supplied pattern.
- doc_or_span (Doc or Span): The document to match over.
+ obj (Doc / Span): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers.
"""
- if isinstance(doc_or_span, Doc):
- doc = doc_or_span
+ if isinstance(obj, Doc):
+ doc = obj
length = len(doc)
- elif isinstance(doc_or_span, Span):
- doc = doc_or_span.doc
- length = doc_or_span.end - doc_or_span.start
+ elif isinstance(obj, Span):
+ doc = obj.doc
+ length = obj.end - obj.start
else:
- raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__))
+ raise ValueError(Errors.E195.format(good="Doc or Span", got=type(obj).__name__))
if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
and not doc.is_tagged:
raise ValueError(Errors.E155.format())
if DEP in self._seen_attrs and not doc.is_parsed:
raise ValueError(Errors.E156.format())
- matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length,
+ matches = find_matches(&self.patterns[0], self.patterns.size(), obj, length,
extensions=self._extensions, predicates=self._extra_predicates)
for i, (key, start, end) in enumerate(matches):
on_match = self._callbacks.get(key, None)
@@ -257,7 +257,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
return matcher
-cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()):
+cdef find_matches(TokenPatternC** patterns, int n, object obj, int length, extensions=None, predicates=tuple()):
"""Find matches in a doc, with a compiled array of patterns. Matches are
returned as a list of (id, start, end) tuples.
@@ -286,7 +286,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt
else:
nr_extra_attr = 0
extra_attr_values = mem.alloc(length, sizeof(attr_t))
- for i, token in enumerate(doc_or_span):
+ for i, token in enumerate(obj):
for name, index in extensions.items():
value = token._.get(name)
if isinstance(value, basestring):
@@ -298,7 +298,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt
for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0))
transition_states(states, matches, predicate_cache,
- doc_or_span[i], extra_attr_values, predicates)
+ obj[i], extra_attr_values, predicates)
extra_attr_values += nr_extra_attr
predicate_cache += len(predicates)
# Handle matches that end in 0-width patterns
From 9393253b66b5f9fc6c5e58806cf261da5afd1778 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 20 May 2020 15:18:06 +0200
Subject: [PATCH 47/69] Remove peeking from Parser.begin_training (#5456)
Inspect all instances in `Parser.begin_training` rather than only the
first 1000.
---
spacy/syntax/nn_parser.pyx | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index d5c6bf2a8..fafa492c6 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -9,7 +9,6 @@ import numpy
cimport cython.parallel
import numpy.random
cimport numpy as np
-from itertools import islice
from cpython.ref cimport PyObject, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from libc.math cimport exp
@@ -621,15 +620,15 @@ cdef class Parser:
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
if sgd is None:
sgd = self.create_optimizer()
- doc_sample = []
- gold_sample = []
- for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
+ docs = []
+ golds = []
+ for raw_text, annots_brackets in get_gold_tuples():
for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots
- doc_sample.append(Doc(self.vocab, words=words))
- gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags,
- heads=heads, deps=deps, entities=ents))
- self.model.begin_training(doc_sample, gold_sample)
+ docs.append(Doc(self.vocab, words=words))
+ golds.append(GoldParse(docs[-1], words=words, tags=tags,
+ heads=heads, deps=deps, entities=ents))
+ self.model.begin_training(docs, golds)
if pipeline is not None:
self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
link_vectors_to_models(self.vocab)
From 8cba0e41d8e2797763110e8dd1b3b2ec8a29e719 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 20 May 2020 15:35:08 +0200
Subject: [PATCH 48/69] Return lowercase form as default except for PROPN
---
spacy/lemmatizer.py | 5 +++++
1 file changed, 5 insertions(+)
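Sketch of the new fallback (toy tables, not from the patch): when none of the lookup tables have an entry for a tag, everything except proper nouns is lowercased:

    from spacy.lemmatizer import Lemmatizer
    from spacy.lookups import Lookups

    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["s", ""]]})  # nothing for "adv"/"propn"
    lemmatizer = Lemmatizer(lookups)
    print(lemmatizer("Quickly", "ADV"))   # ["quickly"], lowercased fallback
    print(lemmatizer("Berlin", "PROPN"))  # ["Berlin"], surface form is kept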
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index a070574bb..1f0f0da3f 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -56,6 +56,11 @@ class Lemmatizer(object):
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
+ if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))):
+ if univ_pos == "propn":
+ return [string]
+ else:
+ return [string.lower()]
lemmas = self.lemmatize(
string,
index_table.get(univ_pos, {}),
From daaa7bf45111cd7d033868f875442b494a9dfead Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 20 May 2020 15:51:44 +0200
Subject: [PATCH 49/69] Add option to omit extra lexeme tables in CLI
---
spacy/cli/init_model.py | 12 ++++++++++++
spacy/cli/train.py | 11 +++++++++++
2 files changed, 23 insertions(+)
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 3311a5120..18589a954 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -18,6 +18,7 @@ from wasabi import msg
from ..vectors import Vectors
from ..errors import Errors, Warnings
from ..util import ensure_path, get_lang_class, OOV_RANK
+from ..lookups import Lookups
try:
import ftfy
@@ -49,6 +50,7 @@ DEFAULT_OOV_PROB = -20
str,
),
model_name=("Optional name for the model meta", "option", "mn", str),
+ omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
)
def init_model(
lang,
@@ -61,6 +63,7 @@ def init_model(
prune_vectors=-1,
vectors_name=None,
model_name=None,
+ omit_extra_lookups=False,
):
"""
Create a new model from raw data, like word frequencies, Brown clusters
@@ -93,6 +96,15 @@ def init_model(
with msg.loading("Creating model..."):
nlp = create_model(lang, lex_attrs, name=model_name)
+
+ # Create empty extra lexeme tables so the data from spacy-lookups-data
+ # isn't loaded if these features are accessed
+ if omit_extra_lookups:
+ nlp.vocab.lookups_extra = Lookups()
+ nlp.vocab.lookups_extra.add_table("lexeme_cluster")
+ nlp.vocab.lookups_extra.add_table("lexeme_prob")
+ nlp.vocab.lookups_extra.add_table("lexeme_settings")
+
msg.good("Successfully created model")
if vectors_loc is not None:
add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 7cb2d9745..6ce095c15 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -17,6 +17,7 @@ from .._ml import create_default_optimizer
from ..util import use_gpu as set_gpu
from ..gold import GoldCorpus
from ..compat import path2str
+from ..lookups import Lookups
from .. import util
from .. import about
@@ -57,6 +58,7 @@ from .. import about
textcat_arch=("Textcat model architecture", "option", "ta", str),
textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
+ omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
verbose=("Display more information for debug", "flag", "VV", bool),
debug=("Run data diagnostics before training", "flag", "D", bool),
# fmt: on
@@ -96,6 +98,7 @@ def train(
textcat_arch="bow",
textcat_positive_label=None,
tag_map_path=None,
+ omit_extra_lookups=False,
verbose=False,
debug=False,
):
@@ -247,6 +250,14 @@ def train(
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
+ # Create empty extra lexeme tables so the data from spacy-lookups-data
+ # isn't loaded if these features are accessed
+ if omit_extra_lookups:
+ nlp.vocab.lookups_extra = Lookups()
+ nlp.vocab.lookups_extra.add_table("lexeme_cluster")
+ nlp.vocab.lookups_extra.add_table("lexeme_prob")
+ nlp.vocab.lookups_extra.add_table("lexeme_settings")
+
if vectors:
msg.text("Loading vector from model '{}'".format(vectors))
_load_vectors(nlp, vectors)
From c7c4cd5fe13ccae97a4cb9ee211226dfd129a941 Mon Sep 17 00:00:00 2001
From: Kevin Lu
Date: Wed, 20 May 2020 09:11:32 -0700
Subject: [PATCH 50/69] Changed pyate code example in universe.json
---
website/meta/universe.json | 30 +++++++++++++++++-------------
1 file changed, 17 insertions(+), 13 deletions(-)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 724dc3d07..857e26813 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2181,19 +2181,23 @@
"github": "kevinlu1248/pyate",
"pip": "pyate",
"code_example": [
- "from pyate import combo_basic",
- "",
- "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/",
- "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'",
- "",
- "print(combo_basic(string).sort_values(ascending=False).head(5))",
- "\"\"\"\"\"\"",
- "dysfunctional tumor 1.443147",
- "tumor suppressors 1.443147",
- "genetic changes 1.386294",
- "cancer cells 1.386294",
- "dysfunctional tumor suppressors 1.298612",
- "\"\"\"\"\"\""
+ "import spacy",
+ "from pyate.term_extraction_pipeline import TermExtractionPipeline",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "nlp.add_pipe(TermExtractionPipeline())",
+ "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/",
+ "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'",
+ "",
+ "doc = nlp(string)",
+ "print(doc._.combo_basic.sort_values(ascending=False).head(5))",
+ "\"\"\"\"\"\"",
+ "dysfunctional tumor 1.443147",
+ "tumor suppressors 1.443147",
+ "genetic changes 1.386294",
+ "cancer cells 1.386294",
+ "dysfunctional tumor suppressors 1.298612",
+ "\"\"\"\"\"\""
],
"code_language": "python",
"url": "https://github.com/kevinlu1248/pyate",
From 49ef06d793b885c3bd634ac72f38be067246822a Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 20 May 2020 18:49:11 +0200
Subject: [PATCH 51/69] Add option for base model in init-model CLI (#5467)
Intended for languages like Chinese with a custom tokenizer.
---
spacy/cli/init_model.py | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
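What the new base model option does, roughly (the model name below is a placeholder for any installed model with a custom tokenizer):

    import spacy

    nlp = spacy.load("zh_model_with_custom_tokenizer")  # hypothetical name
    # keep the tokenizer, but drop existing pipeline components so their
    # weights don't conflict with the freshly initialized vectors
    for pipe in list(nlp.pipe_names):
        nlp.remove_pipe(pipe)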
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 3311a5120..537afd10f 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -17,7 +17,7 @@ from wasabi import msg
from ..vectors import Vectors
from ..errors import Errors, Warnings
-from ..util import ensure_path, get_lang_class, OOV_RANK
+from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
try:
import ftfy
@@ -49,6 +49,7 @@ DEFAULT_OOV_PROB = -20
str,
),
model_name=("Optional name for the model meta", "option", "mn", str),
+ base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
)
def init_model(
lang,
@@ -61,6 +62,7 @@ def init_model(
prune_vectors=-1,
vectors_name=None,
model_name=None,
+ base_model=None,
):
"""
Create a new model from raw data, like word frequencies, Brown clusters
@@ -92,7 +94,7 @@ def init_model(
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
with msg.loading("Creating model..."):
- nlp = create_model(lang, lex_attrs, name=model_name)
+ nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
msg.good("Successfully created model")
if vectors_loc is not None:
add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
@@ -152,9 +154,16 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
return lex_attrs
-def create_model(lang, lex_attrs, name=None):
- lang_class = get_lang_class(lang)
- nlp = lang_class()
+def create_model(lang, lex_attrs, name=None, base_model=None):
+ if base_model:
+ nlp = load_model(base_model)
+ # keep the tokenizer but remove any existing pipeline components due to
+ # potentially conflicting vectors
+ for pipe in nlp.pipe_names:
+ nlp.remove_pipe(pipe)
+ else:
+ lang_class = get_lang_class(lang)
+ nlp = lang_class()
for lexeme in nlp.vocab:
lexeme.rank = OOV_RANK
for attrs in lex_attrs:
From 36a94c409a50e3d815924197d668e0ae315d4352 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 20 May 2020 23:06:03 +0200
Subject: [PATCH 52/69] failing test to reproduce overlapping spans problem
---
spacy/tests/regression/test_issue5458.py | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
create mode 100644 spacy/tests/regression/test_issue5458.py
diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py
new file mode 100644
index 000000000..33281c858
--- /dev/null
+++ b/spacy/tests/regression/test_issue5458.py
@@ -0,0 +1,21 @@
+from spacy.lang.en import English
+from spacy.lang.en.syntax_iterators import noun_chunks
+from spacy.tests.util import get_doc
+from spacy.vocab import Vocab
+
+
+def test_issue5458():
+ # Test that the noun chunker does not generate overlapping spans
+ words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
+ vocab = Vocab(strings=words)
+ dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
+ pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
+ heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10]
+
+ en_doc = get_doc(vocab, words, pos_tags, heads, dependencies)
+ en_doc.noun_chunks_iterator = noun_chunks
+
+ # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
+ nlp = English()
+ merge_nps = nlp.create_pipe("merge_noun_chunks")
+ merge_nps(en_doc)
From b509a3e7fcadf84c257c1e5168b6dc926b8b2f3d Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 20 May 2020 23:06:39 +0200
Subject: [PATCH 53/69] fix: use actual range in 'seen' instead of subtree
---
spacy/lang/en/syntax_iterators.py | 4 ++--
spacy/language.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index 5ff848124..22f7fcf81 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -36,7 +36,7 @@ def noun_chunks(obj):
if word.i in seen:
continue
if word.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ if any(j in seen for j in range(word.left_edge.i, word.i + 1)):
continue
seen.update(j for j in range(word.left_edge.i, word.i + 1))
yield word.left_edge.i, word.i + 1, np_label
@@ -46,7 +46,7 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ if any(j in seen for j in range(word.left_edge.i, word.i + 1)):
continue
seen.update(j for j in range(word.left_edge.i, word.i + 1))
yield word.left_edge.i, word.i + 1, np_label
diff --git a/spacy/language.py b/spacy/language.py
index 703806627..c4eb26bad 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -418,7 +418,7 @@ class Language(object):
def __call__(self, text, disable=[], component_cfg=None):
"""Apply the pipeline to some text. The text can span multiple sentences,
- and can contain arbtrary whitespace. Alignment into the original string
+ and can contain arbitrary whitespace. Alignment into the original string
is preserved.
text (unicode): The text to be processed.
From b221bcf1ba3907552d4c3b660d1902b0a1c26b2e Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 21 May 2020 00:17:28 +0200
Subject: [PATCH 54/69] fixing all languages
---
spacy/lang/el/syntax_iterators.py | 14 +++++++-------
spacy/lang/en/syntax_iterators.py | 10 ++++++----
spacy/lang/fa/syntax_iterators.py | 10 ++++++----
spacy/lang/fr/syntax_iterators.py | 10 ++++++----
spacy/lang/id/syntax_iterators.py | 10 ++++++----
spacy/lang/nb/syntax_iterators.py | 10 ++++++----
spacy/lang/sv/syntax_iterators.py | 10 ++++++----
7 files changed, 43 insertions(+), 31 deletions(-)
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index f02619ac9..5d6398aad 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -31,16 +31,15 @@ def noun_chunks(obj):
if word.i in seen:
continue
if word.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
- continue
flag = False
if word.pos == NOUN:
# check for patterns such as γραμμή παραγωγής
for potential_nmod in word.rights:
if potential_nmod.dep == nmod:
- seen.update(
- j for j in range(word.left_edge.i, potential_nmod.i + 1)
- )
+ w_range = range(word.left_edge.i, potential_nmod.i + 1)
+ if any(j in seen for j in w_range):
+ continue
+ seen.update(j for j in w_range)
yield word.left_edge.i, potential_nmod.i + 1, np_label
flag = True
break
@@ -54,9 +53,10 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ w_range = range(word.left_edge.i, word.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.i + 1, np_label
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index 22f7fcf81..0d43ebf37 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -36,9 +36,10 @@ def noun_chunks(obj):
if word.i in seen:
continue
if word.dep in np_deps:
- if any(j in seen for j in range(word.left_edge.i, word.i + 1)):
+ w_range = range(word.left_edge.i, word.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -46,9 +47,10 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- if any(j in seen for j in range(word.left_edge.i, word.i + 1)):
+ w_range = range(word.left_edge.i, word.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.i + 1, np_label
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index 5ff848124..0d43ebf37 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -36,9 +36,10 @@ def noun_chunks(obj):
if word.i in seen:
continue
if word.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ w_range = range(word.left_edge.i, word.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -46,9 +47,10 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ w_range = range(word.left_edge.i, word.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.i + 1, np_label
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index 9495dcf1e..91b338eb3 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -35,9 +35,10 @@ def noun_chunks(obj):
if word.i in seen:
continue
if word.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ w_range = range(word.left_edge.i, word.right_edge.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -45,9 +46,10 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ w_range = range(word.left_edge.i, word.right_edge.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.right_edge.i + 1, np_label
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index 9495dcf1e..91b338eb3 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -35,9 +35,10 @@ def noun_chunks(obj):
if word.i in seen:
continue
if word.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ w_range = range(word.left_edge.i, word.right_edge.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -45,9 +46,10 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ w_range = range(word.left_edge.i, word.right_edge.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.right_edge.i + 1, np_label
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index 9495dcf1e..91b338eb3 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -35,9 +35,10 @@ def noun_chunks(obj):
if word.i in seen:
continue
if word.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ w_range = range(word.left_edge.i, word.right_edge.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -45,9 +46,10 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ w_range = range(word.left_edge.i, word.right_edge.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.right_edge.i + 1, np_label
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index 148884efe..31e3302e9 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -36,9 +36,10 @@ def noun_chunks(obj):
if word.i in seen:
continue
if word.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ w_range = range(word.left_edge.i, word.right_edge.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -46,9 +47,10 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- if any(w.i in seen for w in word.subtree):
+ w_range = range(word.left_edge.i, word.right_edge.i + 1)
+ if any(j in seen for j in w_range):
continue
- seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+ seen.update(j for j in w_range)
yield word.left_edge.i, word.right_edge.i + 1, np_label
From 56de520afd2276e80f634ceb01e8c5a51ea64bb5 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 14:04:57 +0200
Subject: [PATCH 55/69] Try to fix tests on Travis (2.7)
---
spacy/lang/hy/examples.py | 1 +
spacy/lang/hy/lex_attrs.py | 1 +
spacy/lang/hy/stop_words.py | 3 ++-
spacy/lang/zh/__init__.py | 36 ++++++++++++++------------------
spacy/tests/lang/hy/test_text.py | 1 +
5 files changed, 21 insertions(+), 21 deletions(-)
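The `ChineseTokenizer.from_bytes` change in this patch replaces `nonlocal` closures with a mutable dict, since Python 2.7 has no `nonlocal`; as a standalone sketch of the pattern:

    def make_accumulator():
        state = {"total": 0}         # mutable container instead of `nonlocal`
        def add(value):
            state["total"] += value  # works the same on Python 2.7 and 3.x
        add(2)
        add(3)
        return state["total"]

    print(make_accumulator())  # 5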
diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py
index b0df31aae..d04204c55 100644
--- a/spacy/lang/hy/examples.py
+++ b/spacy/lang/hy/examples.py
@@ -1,3 +1,4 @@
+# coding: utf8
from __future__ import unicode_literals
diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py
index 7c1b9592f..910625fb8 100644
--- a/spacy/lang/hy/lex_attrs.py
+++ b/spacy/lang/hy/lex_attrs.py
@@ -1,3 +1,4 @@
+# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py
index c671956a4..3f2f7bb15 100644
--- a/spacy/lang/hy/stop_words.py
+++ b/spacy/lang/hy/stop_words.py
@@ -1,3 +1,4 @@
+# coding: utf8
from __future__ import unicode_literals
@@ -105,6 +106,6 @@ STOP_WORDS = set(
յուրաքանչյուր
այս
մեջ
-թ
+թ
""".split()
)
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index ed0b3eb74..508c5a03f 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -109,6 +109,7 @@ class ChineseTokenizer(DummyTokenizer):
if reset:
try:
import pkuseg
+
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
except ImportError:
if self.use_pkuseg:
@@ -118,7 +119,7 @@ class ChineseTokenizer(DummyTokenizer):
)
raise ImportError(msg)
for word in words:
- self.pkuseg_seg.preprocesser.insert(word.strip(), '')
+ self.pkuseg_seg.preprocesser.insert(word.strip(), "")
def _get_config(self):
config = OrderedDict(
@@ -168,21 +169,19 @@ class ChineseTokenizer(DummyTokenizer):
return util.to_bytes(serializers, [])
def from_bytes(self, data, **kwargs):
- pkuseg_features_b = b""
- pkuseg_weights_b = b""
- pkuseg_processors_data = None
+ data = {"features_b": b"", "weights_b": b"", "processors_data": None}
+ # pkuseg_features_b = b""
+ # pkuseg_weights_b = b""
+ # pkuseg_processors_data = None
def deserialize_pkuseg_features(b):
- nonlocal pkuseg_features_b
- pkuseg_features_b = b
+ data["features_b"] = b
def deserialize_pkuseg_weights(b):
- nonlocal pkuseg_weights_b
- pkuseg_weights_b = b
+ data["weights_b"] = b
def deserialize_pkuseg_processors(b):
- nonlocal pkuseg_processors_data
- pkuseg_processors_data = srsly.msgpack_loads(b)
+ data["processors_data"] = srsly.msgpack_loads(b)
deserializers = OrderedDict(
(
@@ -194,13 +193,13 @@ class ChineseTokenizer(DummyTokenizer):
)
util.from_bytes(data, deserializers, [])
- if pkuseg_features_b and pkuseg_weights_b:
+ if data["features_b"] and data["weights_b"]:
with tempfile.TemporaryDirectory() as tempdir:
tempdir = Path(tempdir)
with open(tempdir / "features.pkl", "wb") as fileh:
- fileh.write(pkuseg_features_b)
+ fileh.write(data["features_b"])
with open(tempdir / "weights.npz", "wb") as fileh:
- fileh.write(pkuseg_weights_b)
+ fileh.write(data["weights_b"])
try:
import pkuseg
except ImportError:
@@ -209,13 +208,10 @@ class ChineseTokenizer(DummyTokenizer):
+ _PKUSEG_INSTALL_MSG
)
self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
- if pkuseg_processors_data:
- (
- user_dict,
- do_process,
- common_words,
- other_words,
- ) = pkuseg_processors_data
+ if data["processors_data"]:
+ (user_dict, do_process, common_words, other_words) = data[
+ "processors_data"
+ ]
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.postprocesser.do_process = do_process
self.pkuseg_seg.postprocesser.common_words = set(common_words)
diff --git a/spacy/tests/lang/hy/test_text.py b/spacy/tests/lang/hy/test_text.py
index 6b785bdfc..cbdb77e4e 100644
--- a/spacy/tests/lang/hy/test_text.py
+++ b/spacy/tests/lang/hy/test_text.py
@@ -1,3 +1,4 @@
+# coding: utf8
from __future__ import unicode_literals
import pytest
From d8f3190c0a265033ca367097e00cbf085b34615a Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 14:14:01 +0200
Subject: [PATCH 56/69] Tidy up and auto-format
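Most hunks in this commit are line-length reflows of the same shape: a long call or argument list wrapped onto multiple lines with trailing commas. The zh tokenizer fixture from spacy/tests/conftest.py below is a representative before/after of that reflow:

    from spacy.util import get_lang_class

    # before: a single call that runs past the line-length limit
    zh_tokenizer_char = get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False})

    # after: the same call with the keyword argument wrapped onto its own line
    zh_tokenizer_char = get_lang_class("zh").Defaults.create_tokenizer(
        config={"use_jieba": False, "use_pkuseg": False}
    )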
---
spacy/cli/debug_data.py | 11 ++++++++---
spacy/cli/init_model.py | 7 ++++++-
spacy/errors.py | 3 ++-
spacy/lang/da/__init__.py | 1 -
spacy/lang/de/stop_words.py | 2 +-
spacy/lang/en/tokenizer_exceptions.py | 2 +-
spacy/lang/es/punctuation.py | 1 -
spacy/lang/fr/tokenizer_exceptions.py | 2 +-
spacy/lang/gu/stop_words.py | 14 +++++++-------
spacy/lang/hy/__init__.py | 5 +++--
spacy/lang/hy/examples.py | 2 +-
spacy/lang/hy/lex_attrs.py | 1 +
spacy/lang/hy/stop_words.py | 4 ++--
spacy/lang/hy/tag_map.py | 12 ++++++------
spacy/lang/ml/lex_attrs.py | 2 +-
spacy/lang/ml/stop_words.py | 1 -
spacy/lang/pl/__init__.py | 2 +-
spacy/lang/pl/lemmatizer.py | 1 -
spacy/lang/pl/punctuation.py | 4 +++-
spacy/lang/sv/lex_attrs.py | 2 +-
spacy/lang/ur/tag_map.py | 1 -
spacy/lang/zh/__init__.py | 3 ++-
spacy/language.py | 8 ++++++--
spacy/tests/conftest.py | 9 +++++++--
spacy/tests/doc/test_creation.py | 12 +++++++++---
spacy/tests/doc/test_token_api.py | 2 ++
spacy/tests/lang/de/test_noun_chunks.py | 4 ++--
spacy/tests/lang/el/test_noun_chunks.py | 4 ++--
spacy/tests/lang/en/test_noun_chunks.py | 4 ++--
spacy/tests/lang/es/test_noun_chunks.py | 4 ++--
spacy/tests/lang/es/test_text.py | 2 +-
spacy/tests/lang/fr/test_noun_chunks.py | 4 ++--
spacy/tests/lang/gu/test_text.py | 7 +++----
spacy/tests/lang/id/test_noun_chunks.py | 4 ++--
spacy/tests/lang/ml/test_text.py | 11 ++++++++++-
spacy/tests/lang/nb/test_noun_chunks.py | 4 ++--
spacy/tests/lang/sv/test_noun_chunks.py | 4 ++--
spacy/tests/lang/zh/test_serialize.py | 12 +++++++++++-
spacy/tests/lang/zh/test_tokenizer.py | 8 ++++++--
spacy/tests/matcher/test_matcher_api.py | 6 +++---
spacy/tests/pipeline/test_sentencizer.py | 4 +++-
.../serialize/test_serialize_vocab_strings.py | 14 +++++++++-----
spacy/tests/test_gold.py | 4 ++--
spacy/tests/vocab_vectors/test_vectors.py | 3 ++-
spacy/util.py | 2 +-
45 files changed, 138 insertions(+), 81 deletions(-)
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 279f34f16..7a4a093e2 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -187,12 +187,17 @@ def debug_data(
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
msg.warn(
"{} words in training data without vectors ({:0.2f}%)".format(
- n_missing_vectors,
- n_missing_vectors / gold_train_data["n_words"],
+ n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
),
)
msg.text(
- "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+ "10 most common words without vectors: {}".format(
+ _format_labels(
+ gold_train_data["words_missing_vectors"].most_common(10),
+ counts=True,
+ )
+ ),
+ show=verbose,
)
else:
msg.info("No word vectors present in the model")
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 537afd10f..edbd5dff7 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -49,7 +49,12 @@ DEFAULT_OOV_PROB = -20
str,
),
model_name=("Optional name for the model meta", "option", "mn", str),
- base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
+ base_model=(
+ "Base model (for languages with custom tokenizers)",
+ "option",
+ "b",
+ str,
+ ),
)
def init_model(
lang,
diff --git a/spacy/errors.py b/spacy/errors.py
index f0b8592df..0750ab616 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -8,7 +8,7 @@ def add_codes(err_cls):
class ErrorsWithCodes(err_cls):
def __getattribute__(self, code):
msg = super().__getattribute__(code)
- if code.startswith('__'): # python system attributes like __class__
+ if code.startswith("__"): # python system attributes like __class__
return msg
else:
return "[{code}] {msg}".format(code=code, msg=msg)
@@ -116,6 +116,7 @@ class Warnings(object):
" to check the alignment. Misaligned entities ('-') will be "
"ignored during training.")
+
@add_codes
class Errors(object):
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index 92eec44b2..0190656e5 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -9,7 +9,6 @@ from .morph_rules import MORPH_RULES
from ..tag_map import TAG_MAP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py
index 69134124f..0c8b375e0 100644
--- a/spacy/lang/de/stop_words.py
+++ b/spacy/lang/de/stop_words.py
@@ -47,7 +47,7 @@ kleines kommen kommt können könnt konnte könnte konnten kurz
lang lange leicht leider lieber los
machen macht machte mag magst man manche manchem manchen mancher manches mehr
-mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten
+mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten
mögen möglich mögt morgen muss muß müssen musst müsst musste mussten
na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index 62de81912..6a553052b 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -197,7 +197,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: word, NORM: word},
- {ORTH: "d", NORM: "'d"}
+ {ORTH: "d", NORM: "'d"},
]
_exc[orth + "'d've"] = [
diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py
index 42335237c..f989221c2 100644
--- a/spacy/lang/es/punctuation.py
+++ b/spacy/lang/es/punctuation.py
@@ -5,7 +5,6 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
from ..char_classes import merge_chars
-from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
_list_units = [u for u in LIST_UNITS if u != "%"]
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index cb1702300..4eb4c1568 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -461,5 +461,5 @@ _regular_exp.append(URL_PATTERN)
TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile(
- "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
+ "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
).match
diff --git a/spacy/lang/gu/stop_words.py b/spacy/lang/gu/stop_words.py
index f641b5720..85d33763d 100644
--- a/spacy/lang/gu/stop_words.py
+++ b/spacy/lang/gu/stop_words.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
STOP_WORDS = set(
"""
-એમ
+એમ
આ
એ
રહી
@@ -24,7 +24,7 @@ STOP_WORDS = set(
તેમને
તેમના
તેમણે
-તેમનું
+તેમનું
તેમાં
અને
અહીં
@@ -33,12 +33,12 @@ STOP_WORDS = set(
થાય
જે
ને
-કે
+કે
ના
ની
નો
ને
-નું
+નું
શું
માં
પણ
@@ -69,12 +69,12 @@ STOP_WORDS = set(
કોઈ
કેમ
કર્યો
-કર્યુ
+કર્યુ
કરે
સૌથી
-ત્યારબાદ
+ત્યારબાદ
તથા
-દ્વારા
+દ્વારા
જુઓ
જાઓ
જ્યારે
diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py
index 3320edb6c..6aaa965bb 100644
--- a/spacy/lang/hy/__init__.py
+++ b/spacy/lang/hy/__init__.py
@@ -1,11 +1,12 @@
+# coding: utf8
+from __future__ import unicode_literals
+
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tag_map import TAG_MAP
-
from ...attrs import LANG
from ...language import Language
-from ...tokens import Doc
class ArmenianDefaults(Language.Defaults):
diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py
index b0df31aae..323f77b1c 100644
--- a/spacy/lang/hy/examples.py
+++ b/spacy/lang/hy/examples.py
@@ -1,6 +1,6 @@
+# coding: utf8
from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.hy.examples import sentences
diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py
index 7c1b9592f..910625fb8 100644
--- a/spacy/lang/hy/lex_attrs.py
+++ b/spacy/lang/hy/lex_attrs.py
@@ -1,3 +1,4 @@
+# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py
index c671956a4..d75aad6e2 100644
--- a/spacy/lang/hy/stop_words.py
+++ b/spacy/lang/hy/stop_words.py
@@ -1,6 +1,6 @@
+# coding: utf8
from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
նա
@@ -105,6 +105,6 @@ STOP_WORDS = set(
յուրաքանչյուր
այս
մեջ
-թ
+թ
""".split()
)
diff --git a/spacy/lang/hy/tag_map.py b/spacy/lang/hy/tag_map.py
index 90690c22e..722270110 100644
--- a/spacy/lang/hy/tag_map.py
+++ b/spacy/lang/hy/tag_map.py
@@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
-from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
+from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
TAG_MAP = {
@@ -716,7 +716,7 @@ TAG_MAP = {
POS: NOUN,
"Animacy": "Nhum",
"Case": "Dat",
- "Number": "Coll",
+ # "Number": "Coll",
"Number": "Sing",
"Person": "1",
},
@@ -815,7 +815,7 @@ TAG_MAP = {
"Animacy": "Nhum",
"Case": "Nom",
"Definite": "Def",
- "Number": "Plur",
+ # "Number": "Plur",
"Number": "Sing",
"Poss": "Yes",
},
@@ -880,7 +880,7 @@ TAG_MAP = {
POS: NOUN,
"Animacy": "Nhum",
"Case": "Nom",
- "Number": "Plur",
+ # "Number": "Plur",
"Number": "Sing",
"Person": "2",
},
@@ -1223,9 +1223,9 @@ TAG_MAP = {
"PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": {
POS: PRON,
"Case": "Nom",
- "Number": "Sing",
+ # "Number": "Sing",
"Number": "Plur",
- "Person": "3",
+ # "Person": "3",
"Person": "1",
"PronType": "Emp",
},
diff --git a/spacy/lang/ml/lex_attrs.py b/spacy/lang/ml/lex_attrs.py
index 345da8126..468ad88f8 100644
--- a/spacy/lang/ml/lex_attrs.py
+++ b/spacy/lang/ml/lex_attrs.py
@@ -55,7 +55,7 @@ _num_words = [
"തൊണ്ണൂറ് ",
"നുറ് ",
"ആയിരം ",
- "പത്തുലക്ഷം"
+ "പത്തുലക്ഷം",
]
diff --git a/spacy/lang/ml/stop_words.py b/spacy/lang/ml/stop_words.py
index 4012571bc..8bd6a7e02 100644
--- a/spacy/lang/ml/stop_words.py
+++ b/spacy/lang/ml/stop_words.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
STOP_WORDS = set(
-
"""
അത്
ഇത്
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 61608a3d9..52b662a90 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -12,7 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import add_lookups
from ...lookups import Lookups
diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py
index 2be4b0fb7..cd555b9c2 100644
--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
from ...lemmatizer import Lemmatizer
from ...parts_of_speech import NAMES
-from ...errors import Errors
class PolishLemmatizer(Lemmatizer):
diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py
index aa8adac29..c87464b1b 100644
--- a/spacy/lang/pl/punctuation.py
+++ b/spacy/lang/pl/punctuation.py
@@ -8,7 +8,9 @@ from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
_quotes = CONCAT_QUOTES.replace("'", "")
-_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
+_prefixes = _prefixes = [
+ r"(długo|krótko|jedno|dwu|trzy|cztero)-"
+] + BASE_TOKENIZER_PREFIXES
_infixes = (
LIST_ELLIPSES
diff --git a/spacy/lang/sv/lex_attrs.py b/spacy/lang/sv/lex_attrs.py
index 4b5278c7b..24d06a97a 100644
--- a/spacy/lang/sv/lex_attrs.py
+++ b/spacy/lang/sv/lex_attrs.py
@@ -40,7 +40,7 @@ _num_words = [
"miljard",
"biljon",
"biljard",
- "kvadriljon"
+ "kvadriljon",
]
diff --git a/spacy/lang/ur/tag_map.py b/spacy/lang/ur/tag_map.py
index eebd3a14a..aad548e9b 100644
--- a/spacy/lang/ur/tag_map.py
+++ b/spacy/lang/ur/tag_map.py
@@ -38,7 +38,6 @@ TAG_MAP = {
"NNPC": {POS: PROPN},
"NNC": {POS: NOUN},
"PSP": {POS: ADP},
-
".": {POS: PUNCT},
",": {POS: PUNCT},
"-LRB-": {POS: PUNCT},
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index ed0b3eb74..a877169a2 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -109,6 +109,7 @@ class ChineseTokenizer(DummyTokenizer):
if reset:
try:
import pkuseg
+
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
except ImportError:
if self.use_pkuseg:
@@ -118,7 +119,7 @@ class ChineseTokenizer(DummyTokenizer):
)
raise ImportError(msg)
for word in words:
- self.pkuseg_seg.preprocesser.insert(word.strip(), '')
+ self.pkuseg_seg.preprocesser.insert(word.strip(), "")
def _get_config(self):
config = OrderedDict(
diff --git a/spacy/language.py b/spacy/language.py
index 703806627..0e5c46459 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -79,7 +79,9 @@ class BaseDefaults(object):
lookups=lookups,
)
vocab.lex_attr_getters[NORM] = util.add_lookups(
- vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, vocab.lookups.get_table("lexeme_norm")
+ vocab.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+ BASE_NORMS,
+ vocab.lookups.get_table("lexeme_norm"),
)
for tag_str, exc in cls.morph_rules.items():
for orth_str, attrs in exc.items():
@@ -974,7 +976,9 @@ class Language(object):
serializers = OrderedDict()
serializers["vocab"] = lambda: self.vocab.to_bytes()
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
- serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items())))
+ serializers["meta.json"] = lambda: srsly.json_dumps(
+ OrderedDict(sorted(self.meta.items()))
+ )
for name, proc in self.pipeline:
if name in exclude:
continue
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index d26f0ce5c..63bbf2e0a 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -112,6 +112,7 @@ def ga_tokenizer():
def gu_tokenizer():
return get_lang_class("gu").Defaults.create_tokenizer()
+
@pytest.fixture(scope="session")
def he_tokenizer():
return get_lang_class("he").Defaults.create_tokenizer()
@@ -246,7 +247,9 @@ def yo_tokenizer():
@pytest.fixture(scope="session")
def zh_tokenizer_char():
- return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False})
+ return get_lang_class("zh").Defaults.create_tokenizer(
+ config={"use_jieba": False, "use_pkuseg": False}
+ )
@pytest.fixture(scope="session")
@@ -258,7 +261,9 @@ def zh_tokenizer_jieba():
@pytest.fixture(scope="session")
def zh_tokenizer_pkuseg():
pytest.importorskip("pkuseg")
- return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True})
+ return get_lang_class("zh").Defaults.create_tokenizer(
+ config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
+ )
@pytest.fixture(scope="session")
diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py
index 8f543e86a..863a7c210 100644
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@@ -50,7 +50,9 @@ def test_create_from_words_and_text(vocab):
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
assert doc.text == text
- assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+ assert [t.text for t in doc if not t.text.isspace()] == [
+ word for word in words if not word.isspace()
+ ]
# partial whitespace in words
words = [" ", "'", "dogs", "'", "\n\n", "run", " "]
@@ -60,7 +62,9 @@ def test_create_from_words_and_text(vocab):
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
assert doc.text == text
- assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+ assert [t.text for t in doc if not t.text.isspace()] == [
+ word for word in words if not word.isspace()
+ ]
# non-standard whitespace tokens
words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
@@ -70,7 +74,9 @@ def test_create_from_words_and_text(vocab):
assert [t.text for t in doc] == [" ", "'", "dogs", "'", "\n\n", "run", " "]
assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
assert doc.text == text
- assert [t.text for t in doc if not t.text.isspace()] == [word for word in words if not word.isspace()]
+ assert [t.text for t in doc if not t.text.isspace()] == [
+ word for word in words if not word.isspace()
+ ]
# mismatch between words and text
with pytest.raises(ValueError):
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 1c2253dfa..4dcd07ad9 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -181,6 +181,7 @@ def test_is_sent_start(en_tokenizer):
doc.is_parsed = True
assert len(list(doc.sents)) == 2
+
def test_is_sent_end(en_tokenizer):
doc = en_tokenizer("This is a sentence. This is another.")
assert doc[4].is_sent_end is None
@@ -213,6 +214,7 @@ def test_token0_has_sent_start_true():
assert doc[1].is_sent_start is None
assert not doc.is_sentenced
+
def test_tokenlast_has_sent_end_true():
doc = Doc(Vocab(), words=["hello", "world"])
assert doc[0].is_sent_end is None
diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py
index 12ece84b5..8d76ddd79 100644
--- a/spacy/tests/lang/de/test_noun_chunks.py
+++ b/spacy/tests/lang/de/test_noun_chunks.py
@@ -5,9 +5,9 @@ import pytest
def test_noun_chunks_is_parsed_de(de_tokenizer):
- """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
+ """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
To check this test, we're constructing a Doc
- with a new Vocab here and forcing is_parsed to 'False'
+ with a new Vocab here and forcing is_parsed to 'False'
to make sure the noun chunks don't run.
"""
doc = de_tokenizer("Er lag auf seinem")
diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py
index be14acc81..4f24865d0 100644
--- a/spacy/tests/lang/el/test_noun_chunks.py
+++ b/spacy/tests/lang/el/test_noun_chunks.py
@@ -5,9 +5,9 @@ import pytest
def test_noun_chunks_is_parsed_el(el_tokenizer):
- """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
+ """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
To check this test, we're constructing a Doc
- with a new Vocab here and forcing is_parsed to 'False'
+ with a new Vocab here and forcing is_parsed to 'False'
to make sure the noun chunks don't run.
"""
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py
index 1109af150..ff67986a5 100644
--- a/spacy/tests/lang/en/test_noun_chunks.py
+++ b/spacy/tests/lang/en/test_noun_chunks.py
@@ -13,9 +13,9 @@ from ...util import get_doc
def test_noun_chunks_is_parsed(en_tokenizer):
- """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
+ """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
To check this test, we're constructing a Doc
- with a new Vocab here and forcing is_parsed to 'False'
+ with a new Vocab here and forcing is_parsed to 'False'
to make sure the noun chunks don't run.
"""
doc = en_tokenizer("This is a sentence")
diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py
index 71069d313..66bbd8c3a 100644
--- a/spacy/tests/lang/es/test_noun_chunks.py
+++ b/spacy/tests/lang/es/test_noun_chunks.py
@@ -5,9 +5,9 @@ import pytest
def test_noun_chunks_is_parsed_es(es_tokenizer):
- """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
+ """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
To check this test, we're constructing a Doc
- with a new Vocab here and forcing is_parsed to 'False'
+ with a new Vocab here and forcing is_parsed to 'False'
to make sure the noun chunks don't run.
"""
doc = es_tokenizer("en Oxford este verano")
diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py
index e237f922d..999e788dd 100644
--- a/spacy/tests/lang/es/test_text.py
+++ b/spacy/tests/lang/es/test_text.py
@@ -62,4 +62,4 @@ def test_lex_attrs_like_number(es_tokenizer, text, match):
@pytest.mark.parametrize("word", ["once"])
def test_es_lex_attrs_capitals(word):
assert like_num(word)
- assert like_num(word.upper())
\ No newline at end of file
+ assert like_num(word.upper())
diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py
index 876bc0ea4..ea93a5a35 100644
--- a/spacy/tests/lang/fr/test_noun_chunks.py
+++ b/spacy/tests/lang/fr/test_noun_chunks.py
@@ -5,9 +5,9 @@ import pytest
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
- """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
+ """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
To check this test, we're constructing a Doc
- with a new Vocab here and forcing is_parsed to 'False'
+ with a new Vocab here and forcing is_parsed to 'False'
to make sure the noun chunks don't run.
"""
doc = fr_tokenizer("trouver des travaux antérieurs")
diff --git a/spacy/tests/lang/gu/test_text.py b/spacy/tests/lang/gu/test_text.py
index 9f3ae45a4..aa8d442a2 100644
--- a/spacy/tests/lang/gu/test_text.py
+++ b/spacy/tests/lang/gu/test_text.py
@@ -3,17 +3,16 @@ from __future__ import unicode_literals
import pytest
+
def test_gu_tokenizer_handlers_long_text(gu_tokenizer):
text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે"""
tokens = gu_tokenizer(text)
assert len(tokens) == 9
+
@pytest.mark.parametrize(
"text,length",
- [
- ("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6),
- ("ખેતરની ખેડ કરવામાં આવે છે.", 5),
- ],
+ [("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), ("ખેતરની ખેડ કરવામાં આવે છે.", 5)],
)
def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length):
tokens = gu_tokenizer(text)
diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py
index 7bac808b3..add76f9b9 100644
--- a/spacy/tests/lang/id/test_noun_chunks.py
+++ b/spacy/tests/lang/id/test_noun_chunks.py
@@ -5,9 +5,9 @@ import pytest
def test_noun_chunks_is_parsed_id(id_tokenizer):
- """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
+ """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
To check this test, we're constructing a Doc
- with a new Vocab here and forcing is_parsed to 'False'
+ with a new Vocab here and forcing is_parsed to 'False'
to make sure the noun chunks don't run.
"""
doc = id_tokenizer("sebelas")
diff --git a/spacy/tests/lang/ml/test_text.py b/spacy/tests/lang/ml/test_text.py
index 92eca6b21..2883cf5bb 100644
--- a/spacy/tests/lang/ml/test_text.py
+++ b/spacy/tests/lang/ml/test_text.py
@@ -10,7 +10,16 @@ def test_ml_tokenizer_handles_long_text(ml_tokenizer):
assert len(tokens) == 5
-@pytest.mark.parametrize("text,length", [("എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 10), ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5)])
+@pytest.mark.parametrize(
+ "text,length",
+ [
+ (
+ "എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു",
+ 10,
+ ),
+ ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5),
+ ],
+)
def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length):
tokens = ml_tokenizer(text)
assert len(tokens) == length
diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py
index 17ec6cfda..653491a64 100644
--- a/spacy/tests/lang/nb/test_noun_chunks.py
+++ b/spacy/tests/lang/nb/test_noun_chunks.py
@@ -5,9 +5,9 @@ import pytest
def test_noun_chunks_is_parsed_nb(nb_tokenizer):
- """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
+ """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
To check this test, we're constructing a Doc
- with a new Vocab here and forcing is_parsed to 'False'
+ with a new Vocab here and forcing is_parsed to 'False'
to make sure the noun chunks don't run.
"""
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py
index 38086c255..a6283b65e 100644
--- a/spacy/tests/lang/sv/test_noun_chunks.py
+++ b/spacy/tests/lang/sv/test_noun_chunks.py
@@ -7,9 +7,9 @@ from ...util import get_doc
def test_noun_chunks_is_parsed_sv(sv_tokenizer):
- """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
+ """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
To check this test, we're constructing a Doc
- with a new Vocab here and forcing is_parsed to 'False'
+ with a new Vocab here and forcing is_parsed to 'False'
to make sure the noun chunks don't run.
"""
doc = sv_tokenizer("Studenten läste den bästa boken")
diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py
index 58133a88e..56f092ed8 100644
--- a/spacy/tests/lang/zh/test_serialize.py
+++ b/spacy/tests/lang/zh/test_serialize.py
@@ -34,5 +34,15 @@ def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
@pytest.mark.slow
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
- nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}})
+ nlp = Chinese(
+ meta={
+ "tokenizer": {
+ "config": {
+ "use_jieba": False,
+ "use_pkuseg": True,
+ "pkuseg_model": "medicine",
+ }
+ }
+ }
+ )
zh_tokenizer_serialize(nlp.tokenizer)
diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py
index 035798aa1..28240b6a9 100644
--- a/spacy/tests/lang/zh/test_tokenizer.py
+++ b/spacy/tests/lang/zh/test_tokenizer.py
@@ -43,12 +43,16 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
- updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+ updated_user_dict = _get_pkuseg_trie_data(
+ zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+ )
assert len(user_dict) == len(updated_user_dict) - 1
# reset user dict
zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
- reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+ reset_user_dict = _get_pkuseg_trie_data(
+ zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
+ )
assert len(reset_user_dict) == 0
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 0295ada82..1112195da 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -265,15 +265,15 @@ def test_matcher_regex_shape(en_vocab):
@pytest.mark.parametrize(
- "cmp, bad",
+ "cmp, bad",
[
("==", ["a", "aaa"]),
("!=", ["aa"]),
(">=", ["a"]),
("<=", ["aaa"]),
(">", ["a", "aa"]),
- ("<", ["aa", "aaa"])
- ]
+ ("<", ["aa", "aaa"]),
+ ],
)
def test_matcher_compare_length(en_vocab, cmp, bad):
matcher = Matcher(en_vocab)
diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py
index 7e58b3e98..ee9220a29 100644
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@@ -106,7 +106,9 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
),
],
)
-def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents):
+def test_sentencizer_custom_punct(
+ en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents
+):
doc = Doc(en_vocab, words=words)
sentencizer = Sentencizer(punct_chars=punct_chars)
doc = sentencizer(doc)
diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py
index 63faf44fc..3be0a75b3 100644
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@@ -37,7 +37,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
assert vocab1.to_bytes() == vocab1_b
new_vocab1 = Vocab().from_bytes(vocab1_b)
assert new_vocab1.to_bytes() == vocab1_b
- assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP
+ assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP
assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])
@@ -56,9 +56,13 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
if strings1 == strings2:
- assert [s for s in vocab1_d.strings if s != "_SP"] == [s for s in vocab2_d.strings if s != "_SP"]
+ assert [s for s in vocab1_d.strings if s != "_SP"] == [
+ s for s in vocab2_d.strings if s != "_SP"
+ ]
else:
- assert [s for s in vocab1_d.strings if s != "_SP"] != [s for s in vocab2_d.strings if s != "_SP"]
+ assert [s for s in vocab1_d.strings if s != "_SP"] != [
+ s for s in vocab2_d.strings if s != "_SP"
+ ]
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@@ -76,9 +80,8 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
def test_deserialize_vocab_seen_entries(strings, lex_attr):
# Reported in #2153
vocab = Vocab(strings=strings)
- length = len(vocab)
vocab.from_bytes(vocab.to_bytes())
- assert len(vocab.strings) == len(strings) + 1 # adds _SP
+ assert len(vocab.strings) == len(strings) + 1 # adds _SP
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@@ -130,6 +133,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
else:
assert list(sstore1_d) != list(sstore2_d)
+
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_pickle_vocab(strings, lex_attr):
vocab = Vocab(strings=strings)
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 37b877561..53665d852 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -112,7 +112,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
data = (
"I'll return the ₹54 amount",
{
- "words": ["I", "'ll", "return", "the", "₹", "54", "amount",],
+ "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
"entities": [(16, 19, "MONEY")],
},
)
@@ -122,7 +122,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
data = (
"I'll return the $54 amount",
{
- "words": ["I", "'ll", "return", "the", "$", "54", "amount",],
+ "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
"entities": [(16, 19, "MONEY")],
},
)
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 24eb3a1af..1821f8abc 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -366,6 +366,7 @@ def test_vectors_serialize():
assert row == row_r
assert_equal(v.data, v_r.data)
+
def test_vector_is_oov():
vocab = Vocab(vectors_name="test_vocab_is_oov")
data = numpy.ndarray((5, 3), dtype="f")
@@ -375,4 +376,4 @@ def test_vector_is_oov():
vocab.set_vector("dog", data[1])
assert vocab["cat"].is_oov is True
assert vocab["dog"].is_oov is True
- assert vocab["hamster"].is_oov is False
\ No newline at end of file
+ assert vocab["hamster"].is_oov is False
diff --git a/spacy/util.py b/spacy/util.py
index d4cdca4e0..419c99bc0 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -774,7 +774,7 @@ def get_words_and_spaces(words, text):
except ValueError:
raise ValueError(Errors.E194.format(text=text, words=words))
if word_start > 0:
- text_words.append(text[text_pos:text_pos+word_start])
+ text_words.append(text[text_pos : text_pos + word_start])
text_spaces.append(False)
text_pos += word_start
text_words.append(word)
From 69fb4bedf20384b475779ee58521e7aa94cf4852 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 14:14:28 +0200
Subject: [PATCH 57/69] Revert "doc_or_span -> obj"
This reverts commit 78bb9ff5e0e4adc01bd30e227657118d87546f83.
---
spacy/matcher/matcher.pyx | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 3d99f117a..4cfab915f 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -213,28 +213,28 @@ cdef class Matcher:
else:
yield doc
- def __call__(self, object obj):
+ def __call__(self, object doc_or_span):
"""Find all token sequences matching the supplied pattern.
- obj (Doc / Span): The document to match over.
+ doc_or_span (Doc or Span): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers.
"""
- if isinstance(obj, Doc):
- doc = obj
+ if isinstance(doc_or_span, Doc):
+ doc = doc_or_span
length = len(doc)
- elif isinstance(obj, Span):
- doc = obj.doc
- length = obj.end - obj.start
+ elif isinstance(doc_or_span, Span):
+ doc = doc_or_span.doc
+ length = doc_or_span.end - doc_or_span.start
else:
- raise ValueError(Errors.E195.format(good="Doc or Span", got=type(obj).__name__))
+ raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__))
if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
and not doc.is_tagged:
raise ValueError(Errors.E155.format())
if DEP in self._seen_attrs and not doc.is_parsed:
raise ValueError(Errors.E156.format())
- matches = find_matches(&self.patterns[0], self.patterns.size(), obj, length,
+ matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length,
extensions=self._extensions, predicates=self._extra_predicates)
for i, (key, start, end) in enumerate(matches):
on_match = self._callbacks.get(key, None)
@@ -257,7 +257,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
return matcher
-cdef find_matches(TokenPatternC** patterns, int n, object obj, int length, extensions=None, predicates=tuple()):
+cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()):
"""Find matches in a doc, with a compiled array of patterns. Matches are
returned as a list of (id, start, end) tuples.
@@ -286,7 +286,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object obj, int length, exten
else:
nr_extra_attr = 0
extra_attr_values = mem.alloc(length, sizeof(attr_t))
- for i, token in enumerate(obj):
+ for i, token in enumerate(doc_or_span):
for name, index in extensions.items():
value = token._.get(name)
if isinstance(value, basestring):
@@ -298,7 +298,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object obj, int length, exten
for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0))
transition_states(states, matches, predicate_cache,
- obj[i], extra_attr_values, predicates)
+ doc_or_span[i], extra_attr_values, predicates)
extra_attr_values += nr_extra_attr
predicate_cache += len(predicates)
# Handle matches that end in 0-width patterns
From b1f45c9da3631d7d18002b8a939cccc6c24dd90b Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 14:19:58 +0200
Subject: [PATCH 58/69] obj -> doclike
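Here "doclike" means an object that iterates like a Doc and exposes a .doc attribute, which both Doc and Span satisfy: Span.doc is the parent Doc and Doc.doc is the Doc itself. A minimal sketch of that contract (the helper below is illustrative, not part of this commit):

    import spacy

    def describe(doclike):
        # Span.doc is the parent Doc and Doc.doc is the Doc itself,
        # so the same code path handles both kinds of input
        doc = doclike.doc
        return len(doc), len(doclike)

    nlp = spacy.blank("en")
    doc = nlp("the matcher accepts docs and spans")
    assert describe(doc) == (6, 6)       # full Doc
    assert describe(doc[2:4]) == (6, 2)  # Span over two tokens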
---
spacy/lang/de/syntax_iterators.py | 6 +++---
spacy/lang/el/syntax_iterators.py | 6 +++---
spacy/lang/en/syntax_iterators.py | 6 +++---
spacy/lang/es/syntax_iterators.py | 4 ++--
spacy/lang/fa/syntax_iterators.py | 6 +++---
spacy/lang/fr/syntax_iterators.py | 6 +++---
spacy/lang/id/syntax_iterators.py | 6 +++---
spacy/lang/nb/syntax_iterators.py | 6 +++---
spacy/lang/sv/syntax_iterators.py | 6 +++---
spacy/matcher/matcher.pyx | 24 ++++++++++++------------
10 files changed, 38 insertions(+), 38 deletions(-)
diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py
index 13bb857ca..73c1b1a6e 100644
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -28,7 +28,7 @@ def noun_chunks(obj):
"og",
"app",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -38,7 +38,7 @@ def noun_chunks(obj):
close_app = doc.vocab.strings.add("nk")
rbracket = 0
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if i < rbracket:
continue
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index f02619ac9..4317bdeb4 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases. Works on both Doc and Span.
"""
@@ -14,7 +14,7 @@ def noun_chunks(obj):
# obj tag corrects some DEP tagger mistakes.
# Further improvement of the models will eliminate the need for this tag.
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -24,7 +24,7 @@ def noun_chunks(obj):
nmod = doc.vocab.strings.add("nmod")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index 5ff848124..6d366ec90 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -20,7 +20,7 @@ def noun_chunks(obj):
"attr",
"ROOT",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -29,7 +29,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index 0badddca1..d403183ff 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -5,8 +5,8 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors
-def noun_chunks(obj):
- doc = obj.doc
+def noun_chunks(doclike):
+ doc = doclike.doc
if not doc.is_parsed:
raise ValueError(Errors.E029)
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index 5ff848124..6d366ec90 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -20,7 +20,7 @@ def noun_chunks(obj):
"attr",
"ROOT",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -29,7 +29,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index 9495dcf1e..2ed2c1b35 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -19,7 +19,7 @@ def noun_chunks(obj):
"nmod",
"nmod:poss",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -28,7 +28,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index 9495dcf1e..2ed2c1b35 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -19,7 +19,7 @@ def noun_chunks(obj):
"nmod",
"nmod:poss",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -28,7 +28,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index 9495dcf1e..2ed2c1b35 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -19,7 +19,7 @@ def noun_chunks(obj):
"nmod",
"nmod:poss",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -28,7 +28,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index 148884efe..84493ae79 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -20,7 +20,7 @@ def noun_chunks(obj):
"nmod",
"nmod:poss",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -29,7 +29,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 4cfab915f..0c1a56187 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -213,28 +213,28 @@ cdef class Matcher:
else:
yield doc
- def __call__(self, object doc_or_span):
+ def __call__(self, object doclike):
"""Find all token sequences matching the supplied pattern.
- doc_or_span (Doc or Span): The document to match over.
+ doclike (Doc or Span): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers.
"""
- if isinstance(doc_or_span, Doc):
- doc = doc_or_span
+ if isinstance(doclike, Doc):
+ doc = doclike
length = len(doc)
- elif isinstance(doc_or_span, Span):
- doc = doc_or_span.doc
- length = doc_or_span.end - doc_or_span.start
+ elif isinstance(doclike, Span):
+ doc = doclike.doc
+ length = doclike.end - doclike.start
else:
- raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__))
+ raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
and not doc.is_tagged:
raise ValueError(Errors.E155.format())
if DEP in self._seen_attrs and not doc.is_parsed:
raise ValueError(Errors.E156.format())
- matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length,
+ matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
extensions=self._extensions, predicates=self._extra_predicates)
for i, (key, start, end) in enumerate(matches):
on_match = self._callbacks.get(key, None)
@@ -257,7 +257,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
return matcher
-cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()):
+cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple()):
"""Find matches in a doc, with a compiled array of patterns. Matches are
returned as a list of (id, start, end) tuples.
@@ -286,7 +286,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt
else:
nr_extra_attr = 0
extra_attr_values = mem.alloc(length, sizeof(attr_t))
- for i, token in enumerate(doc_or_span):
+ for i, token in enumerate(doclike):
for name, index in extensions.items():
value = token._.get(name)
if isinstance(value, basestring):
@@ -298,7 +298,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt
for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0))
transition_states(states, matches, predicate_cache,
- doc_or_span[i], extra_attr_values, predicates)
+ doclike[i], extra_attr_values, predicates)
extra_attr_values += nr_extra_attr
predicate_cache += len(predicates)
# Handle matches that end in 0-width patterns
From e2fe83e35d21afed9e12e9810921228b551e628a Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 14:20:29 +0200
Subject: [PATCH 59/69] Refer to correct object
---
spacy/lang/es/syntax_iterators.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index d403183ff..5fda35211 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -21,7 +21,7 @@ def noun_chunks(doclike):
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
token = doc[0]
- while token and token.i < len(doc):
+ while token and token.i < len(doclike):
if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps
From bea863acd255407887806d1089c1f63896cdf084 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 14:24:38 +0200
Subject: [PATCH 60/69] Fix naming conflict and formatting
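The conflict: ChineseTokenizer.from_bytes(self, data, **kwargs) had gained a local dict also named data, shadowing the serialized input that is later passed to util.from_bytes. Renaming the dict to pkuseg_data keeps both values reachable. A minimal sketch of the shadowing pattern (names simplified, not the actual tokenizer code):

    def from_bytes_broken(data):
        data = {"features_b": b"", "weights_b": b""}  # rebinding the parameter
        return data  # the caller's serialized input is no longer reachable here

    def from_bytes_fixed(data):
        pkuseg_data = {"features_b": b"", "weights_b": b""}  # collector gets its own name
        return data, pkuseg_data  # both the input and the collector stay available

    assert from_bytes_broken(b"serialized") == {"features_b": b"", "weights_b": b""}
    assert from_bytes_fixed(b"serialized")[0] == b"serialized"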
---
spacy/lang/zh/__init__.py | 24 ++++++++++--------------
1 file changed, 10 insertions(+), 14 deletions(-)
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 508c5a03f..9d1cb71a7 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -169,19 +169,16 @@ class ChineseTokenizer(DummyTokenizer):
return util.to_bytes(serializers, [])
def from_bytes(self, data, **kwargs):
- data = {"features_b": b"", "weights_b": b"", "processors_data": None}
- # pkuseg_features_b = b""
- # pkuseg_weights_b = b""
- # pkuseg_processors_data = None
+ pkuseg_data = {"features_b": b"", "weights_b": b"", "processors_data": None}
def deserialize_pkuseg_features(b):
- data["features_b"] = b
+ pkuseg_data["features_b"] = b
def deserialize_pkuseg_weights(b):
- data["weights_b"] = b
+ pkuseg_data["weights_b"] = b
def deserialize_pkuseg_processors(b):
- data["processors_data"] = srsly.msgpack_loads(b)
+ pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
deserializers = OrderedDict(
(
@@ -193,13 +190,13 @@ class ChineseTokenizer(DummyTokenizer):
)
util.from_bytes(data, deserializers, [])
- if data["features_b"] and data["weights_b"]:
+ if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
with tempfile.TemporaryDirectory() as tempdir:
tempdir = Path(tempdir)
with open(tempdir / "features.pkl", "wb") as fileh:
- fileh.write(data["features_b"])
+ fileh.write(pkuseg_data["features_b"])
with open(tempdir / "weights.npz", "wb") as fileh:
- fileh.write(data["weights_b"])
+ fileh.write(pkuseg_data["weights_b"])
try:
import pkuseg
except ImportError:
@@ -208,10 +205,9 @@ class ChineseTokenizer(DummyTokenizer):
+ _PKUSEG_INSTALL_MSG
)
self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
- if data["processors_data"]:
- (user_dict, do_process, common_words, other_words) = data[
- "processors_data"
- ]
+ if pkuseg_data["processors_data"]:
+ processors_data = pkuseg_data["processors_data"]
+ (user_dict, do_process, common_words, other_words) = processors_data
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.postprocesser.do_process = do_process
self.pkuseg_seg.postprocesser.common_words = set(common_words)
From a9cb2882cb98674614e72232c4bc5133b92fa501 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 15:17:39 +0200
Subject: [PATCH 61/69] Rename argument: doc_or_span/obj -> doclike (#5463)
* doc_or_span -> obj
* Revert "doc_or_span -> obj"
This reverts commit 78bb9ff5e0e4adc01bd30e227657118d87546f83.
* obj -> doclike
* Refer to correct object
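The renamed doclike argument reflects that Matcher.__call__ accepts either a Doc or a Span. A minimal usage sketch of that behaviour (the pattern and example text are illustrative, not taken from the patches above):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    # the pattern only uses LOWER, so no tagger or parser is required
    matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])

    doc = nlp("hello world says hello world again")
    matches_on_doc = matcher(doc)       # called on a Doc
    matches_on_span = matcher(doc[3:])  # called on a Span, i.e. a "doclike"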
---
spacy/lang/de/syntax_iterators.py | 6 +++---
spacy/lang/el/syntax_iterators.py | 6 +++---
spacy/lang/en/syntax_iterators.py | 6 +++---
spacy/lang/es/syntax_iterators.py | 6 +++---
spacy/lang/fa/syntax_iterators.py | 6 +++---
spacy/lang/fr/syntax_iterators.py | 6 +++---
spacy/lang/id/syntax_iterators.py | 6 +++---
spacy/lang/nb/syntax_iterators.py | 6 +++---
spacy/lang/sv/syntax_iterators.py | 6 +++---
spacy/matcher/matcher.pyx | 24 ++++++++++++------------
10 files changed, 39 insertions(+), 39 deletions(-)
diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py
index 13bb857ca..73c1b1a6e 100644
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -28,7 +28,7 @@ def noun_chunks(obj):
"og",
"app",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -38,7 +38,7 @@ def noun_chunks(obj):
close_app = doc.vocab.strings.add("nk")
rbracket = 0
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if i < rbracket:
continue
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index f02619ac9..4317bdeb4 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases. Works on both Doc and Span.
"""
@@ -14,7 +14,7 @@ def noun_chunks(obj):
# obj tag corrects some DEP tagger mistakes.
# Further improvement of the models will eliminate the need for this tag.
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -24,7 +24,7 @@ def noun_chunks(obj):
nmod = doc.vocab.strings.add("nmod")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index 5ff848124..6d366ec90 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -20,7 +20,7 @@ def noun_chunks(obj):
"attr",
"ROOT",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -29,7 +29,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index 0badddca1..5fda35211 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -5,8 +5,8 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors
-def noun_chunks(obj):
- doc = obj.doc
+def noun_chunks(doclike):
+ doc = doclike.doc
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -21,7 +21,7 @@ def noun_chunks(obj):
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
token = doc[0]
- while token and token.i < len(doc):
+ while token and token.i < len(doclike):
if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index 5ff848124..6d366ec90 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -20,7 +20,7 @@ def noun_chunks(obj):
"attr",
"ROOT",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -29,7 +29,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index 9495dcf1e..2ed2c1b35 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -19,7 +19,7 @@ def noun_chunks(obj):
"nmod",
"nmod:poss",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -28,7 +28,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index 9495dcf1e..2ed2c1b35 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -19,7 +19,7 @@ def noun_chunks(obj):
"nmod",
"nmod:poss",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -28,7 +28,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index 9495dcf1e..2ed2c1b35 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -19,7 +19,7 @@ def noun_chunks(obj):
"nmod",
"nmod:poss",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -28,7 +28,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index 148884efe..84493ae79 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
-def noun_chunks(obj):
+def noun_chunks(doclike):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
@@ -20,7 +20,7 @@ def noun_chunks(obj):
"nmod",
"nmod:poss",
]
- doc = obj.doc # Ensure works on both Doc and Span.
+ doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed:
raise ValueError(Errors.E029)
@@ -29,7 +29,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
- for i, word in enumerate(obj):
+ for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
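The renamed doclike argument makes explicit that these noun chunk iterators accept either a Doc or a Span. A minimal usage sketch, assuming a trained English model such as en_core_web_sm is installed (model name is illustrative only):

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
    # Both a full Doc and a Span slice expose the same noun chunk iterator.
    print([chunk.text for chunk in doc.noun_chunks])
    print([chunk.text for chunk in doc[0:4].noun_chunks])
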
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 4cfab915f..0c1a56187 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -213,28 +213,28 @@ cdef class Matcher:
else:
yield doc
- def __call__(self, object doc_or_span):
+ def __call__(self, object doclike):
"""Find all token sequences matching the supplied pattern.
- doc_or_span (Doc or Span): The document to match over.
+ doclike (Doc or Span): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers.
"""
- if isinstance(doc_or_span, Doc):
- doc = doc_or_span
+ if isinstance(doclike, Doc):
+ doc = doclike
length = len(doc)
- elif isinstance(doc_or_span, Span):
- doc = doc_or_span.doc
- length = doc_or_span.end - doc_or_span.start
+ elif isinstance(doclike, Span):
+ doc = doclike.doc
+ length = doclike.end - doclike.start
else:
- raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__))
+ raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
and not doc.is_tagged:
raise ValueError(Errors.E155.format())
if DEP in self._seen_attrs and not doc.is_parsed:
raise ValueError(Errors.E156.format())
- matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length,
+ matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
extensions=self._extensions, predicates=self._extra_predicates)
for i, (key, start, end) in enumerate(matches):
on_match = self._callbacks.get(key, None)
@@ -257,7 +257,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
return matcher
-cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()):
+cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple()):
"""Find matches in a doc, with a compiled array of patterns. Matches are
returned as a list of (id, start, end) tuples.
@@ -286,7 +286,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt
else:
nr_extra_attr = 0
extra_attr_values = mem.alloc(length, sizeof(attr_t))
- for i, token in enumerate(doc_or_span):
+ for i, token in enumerate(doclike):
for name, index in extensions.items():
value = token._.get(name)
if isinstance(value, basestring):
@@ -298,7 +298,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt
for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0))
transition_states(states, matches, predicate_cache,
- doc_or_span[i], extra_attr_values, predicates)
+ doclike[i], extra_attr_values, predicates)
extra_attr_values += nr_extra_attr
predicate_cache += len(predicates)
# Handle matches that end in 0-width patterns
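The same rename applies to Matcher.__call__, which likewise accepts a Doc or a Span. A hedged sketch using the v2-style Matcher API; the model name and pattern are illustrative only:

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.load("en_core_web_sm")
    matcher = Matcher(nlp.vocab)
    # v2-style add: (key, on_match callback, *patterns)
    matcher.add("NOUN_NOUN", None, [{"POS": "NOUN"}, {"POS": "NOUN"}])

    doc = nlp("The data science team built a noun chunk iterator.")
    print(matcher(doc))        # match over the whole Doc
    print(matcher(doc[1:6]))   # match over a Span, i.e. any "doclike" object
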
From c6ec19c844ac8325b40a5e6be9a058882b617915 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 17:30:05 +0200
Subject: [PATCH 62/69] Add missing declaration
---
spacy/tests/lang/hy/test_tokenizer.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/spacy/tests/lang/hy/test_tokenizer.py b/spacy/tests/lang/hy/test_tokenizer.py
index 424fb886f..3eeb8b54e 100644
--- a/spacy/tests/lang/hy/test_tokenizer.py
+++ b/spacy/tests/lang/hy/test_tokenizer.py
@@ -1,3 +1,4 @@
+# coding: utf8
from __future__ import unicode_literals
import pytest
From f7d10da555c089a2015fd0101b6198db395d82fc Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 21 May 2020 19:15:57 +0200
Subject: [PATCH 63/69] avoid unnecessary loop to check overlapping noun chunks
---
spacy/lang/el/syntax_iterators.py | 16 +++++-----------
spacy/lang/en/syntax_iterators.py | 14 ++++----------
spacy/lang/fa/syntax_iterators.py | 14 ++++----------
spacy/lang/fr/syntax_iterators.py | 14 ++++----------
spacy/lang/id/syntax_iterators.py | 14 ++++----------
spacy/lang/nb/syntax_iterators.py | 14 ++++----------
spacy/lang/sv/syntax_iterators.py | 14 ++++----------
7 files changed, 29 insertions(+), 71 deletions(-)
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index 5d6398aad..b5811c337 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -23,12 +23,12 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj")
nmod = doc.vocab.strings.add("nmod")
np_label = doc.vocab.strings.add("NP")
- seen = set()
+ prev_end = -1
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
- if word.i in seen:
+ if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
flag = False
@@ -36,15 +36,12 @@ def noun_chunks(obj):
# check for patterns such as γραμμή παραγωγής
for potential_nmod in word.rights:
if potential_nmod.dep == nmod:
- w_range = range(word.left_edge.i, potential_nmod.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = potential_nmod.i + 1
yield word.left_edge.i, potential_nmod.i + 1, np_label
flag = True
break
if flag is False:
- seen.update(j for j in range(word.left_edge.i, word.i + 1))
+ prev_end = word.i + 1
yield word.left_edge.i, word.i + 1, np_label
elif word.dep == conj:
# covers the case: έχει όμορφα και έξυπνα παιδιά
@@ -53,10 +50,7 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- w_range = range(word.left_edge.i, word.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.i + 1
yield word.left_edge.i, word.i + 1, np_label
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index 0d43ebf37..dbb2d6c9f 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -28,18 +28,15 @@ def noun_chunks(obj):
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
- seen = set()
+ prev_end = -1
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
- if word.i in seen:
+ if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- w_range = range(word.left_edge.i, word.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.i + 1
yield word.left_edge.i, word.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -47,10 +44,7 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- w_range = range(word.left_edge.i, word.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.i + 1
yield word.left_edge.i, word.i + 1, np_label
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index 0d43ebf37..dbb2d6c9f 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -28,18 +28,15 @@ def noun_chunks(obj):
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
- seen = set()
+ prev_end = -1
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
- if word.i in seen:
+ if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- w_range = range(word.left_edge.i, word.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.i + 1
yield word.left_edge.i, word.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -47,10 +44,7 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- w_range = range(word.left_edge.i, word.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.i + 1
yield word.left_edge.i, word.i + 1, np_label
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index 91b338eb3..b38be57fc 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -27,18 +27,15 @@ def noun_chunks(obj):
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
- seen = set()
+ prev_end = -1
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
- if word.i in seen:
+ if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- w_range = range(word.left_edge.i, word.right_edge.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.right_edge.i + 1
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -46,10 +43,7 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- w_range = range(word.left_edge.i, word.right_edge.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.right_edge.i + 1
yield word.left_edge.i, word.right_edge.i + 1, np_label
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index 91b338eb3..b38be57fc 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -27,18 +27,15 @@ def noun_chunks(obj):
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
- seen = set()
+ prev_end = -1
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
- if word.i in seen:
+ if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- w_range = range(word.left_edge.i, word.right_edge.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.right_edge.i + 1
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -46,10 +43,7 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- w_range = range(word.left_edge.i, word.right_edge.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.right_edge.i + 1
yield word.left_edge.i, word.right_edge.i + 1, np_label
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index 91b338eb3..b38be57fc 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -27,18 +27,15 @@ def noun_chunks(obj):
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
- seen = set()
+ prev_end = -1
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
- if word.i in seen:
+ if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- w_range = range(word.left_edge.i, word.right_edge.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.right_edge.i + 1
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -46,10 +43,7 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- w_range = range(word.left_edge.i, word.right_edge.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.right_edge.i + 1
yield word.left_edge.i, word.right_edge.i + 1, np_label
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index 31e3302e9..12d351148 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -28,18 +28,15 @@ def noun_chunks(obj):
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
- seen = set()
+ prev_end = -1
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
- if word.i in seen:
+ if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- w_range = range(word.left_edge.i, word.right_edge.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.right_edge.i + 1
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -47,10 +44,7 @@ def noun_chunks(obj):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- w_range = range(word.left_edge.i, word.right_edge.i + 1)
- if any(j in seen for j in w_range):
- continue
- seen.update(j for j in w_range)
+ prev_end = word.right_edge.i + 1
yield word.left_edge.i, word.right_edge.i + 1, np_label
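This patch replaces the per-token seen set with a single integer, relying on the fact that candidate chunks are produced left to right. A stand-alone sketch of the idea, deliberately simplified and not the actual spaCy code:

    def non_nested(chunks):
        # chunks: (start, end) pairs in left-to-right order, end exclusive
        prev_end = -1
        for start, end in chunks:
            if start <= prev_end:   # starts inside the previous chunk: skip it
                continue
            prev_end = end          # as written in this patch; adjusted in the next one
            yield start, end

    print(list(non_nested([(0, 3), (1, 2), (4, 6)])))  # [(0, 3), (4, 6)]

Note that storing the exclusive end makes the check off by one for a chunk that starts exactly where the previous one stopped; the follow-up patch below corrects this.
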
From 51715b9f720e115fe91f4684c589c3e5666cec5b Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 21 May 2020 19:56:56 +0200
Subject: [PATCH 64/69] span / noun chunk has +1 because end is exclusive
---
spacy/lang/el/syntax_iterators.py | 6 +++---
spacy/lang/en/syntax_iterators.py | 4 ++--
spacy/lang/fa/syntax_iterators.py | 4 ++--
spacy/lang/fr/syntax_iterators.py | 4 ++--
spacy/lang/id/syntax_iterators.py | 4 ++--
spacy/lang/nb/syntax_iterators.py | 4 ++--
spacy/lang/sv/syntax_iterators.py | 4 ++--
7 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index 10fa94f8c..4a40e28c2 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -36,12 +36,12 @@ def noun_chunks(doclike):
# check for patterns such as γραμμή παραγωγής
for potential_nmod in word.rights:
if potential_nmod.dep == nmod:
- prev_end = potential_nmod.i + 1
+ prev_end = potential_nmod.i
yield word.left_edge.i, potential_nmod.i + 1, np_label
flag = True
break
if flag is False:
- prev_end = word.i + 1
+ prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label
elif word.dep == conj:
# covers the case: έχει όμορφα και έξυπνα παιδιά
@@ -50,7 +50,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- prev_end = word.i + 1
+ prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index 91152bd50..0f2b28b58 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -36,7 +36,7 @@ def noun_chunks(doclike):
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- prev_end = word.i + 1
+ prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -44,7 +44,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- prev_end = word.i + 1
+ prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index 91152bd50..0f2b28b58 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -36,7 +36,7 @@ def noun_chunks(doclike):
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- prev_end = word.i + 1
+ prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -44,7 +44,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- prev_end = word.i + 1
+ prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index 3523e2f02..d6c12e69f 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -35,7 +35,7 @@ def noun_chunks(doclike):
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- prev_end = word.right_edge.i + 1
+ prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -43,7 +43,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- prev_end = word.right_edge.i + 1
+ prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index 3523e2f02..d6c12e69f 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -35,7 +35,7 @@ def noun_chunks(doclike):
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- prev_end = word.right_edge.i + 1
+ prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -43,7 +43,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- prev_end = word.right_edge.i + 1
+ prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index 3523e2f02..d6c12e69f 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -35,7 +35,7 @@ def noun_chunks(doclike):
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- prev_end = word.right_edge.i + 1
+ prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -43,7 +43,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- prev_end = word.right_edge.i + 1
+ prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index 99621e6a9..84d295f96 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -36,7 +36,7 @@ def noun_chunks(doclike):
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
- prev_end = word.right_edge.i + 1
+ prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@@ -44,7 +44,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
- prev_end = word.right_edge.i + 1
+ prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
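Dropping the + 1 records the last token index actually covered by the chunk, which matters because the yielded end follows Python slice semantics and is exclusive. A small illustration of why the previous value was too large:

    # doc[2:5] covers tokens 2, 3 and 4, so the last covered index is 4, not 5.
    start, end = 2, 5
    prev_end = end - 1            # what the corrected code records (word.i / right_edge.i)
    next_chunk_start = 5          # an adjacent, non-overlapping chunk
    assert not (next_chunk_start <= prev_end)   # adjacent chunk is kept, not skipped
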
From 0f1beb5ff27bf19e14ddc3a8b80e2521a782c03c Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 20:05:03 +0200
Subject: [PATCH 65/69] Tidy up and avoid absolute spacy imports in core
---
spacy/cli/evaluate.py | 3 +--
spacy/kb.pxd | 5 ++---
spacy/kb.pyx | 17 ++++++-----------
spacy/language.py | 5 +----
4 files changed, 10 insertions(+), 20 deletions(-)
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 8a84684e5..be994de73 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals, division, print_function
import plac
-import spacy
from timeit import default_timer as timer
from wasabi import msg
@@ -45,7 +44,7 @@ def evaluate(
msg.fail("Visualization output directory not found", displacy_path, exits=1)
corpus = GoldCorpus(data_path, data_path)
if model.startswith("blank:"):
- nlp = spacy.blank(model.replace("blank:", ""))
+ nlp = util.get_lang_class(model.replace("blank:", ""))()
else:
nlp = util.load_model(model)
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index d5aa382b1..518ce0f4e 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -6,7 +6,7 @@ from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
-from spacy.vocab cimport Vocab
+from .vocab cimport Vocab
from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC
@@ -113,7 +113,7 @@ cdef class KnowledgeBase:
return new_index
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
- """
+ """
Initializing the vectors and making sure the first element of each vector is a dummy,
because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
cf. https://github.com/explosion/preshed/issues/17
@@ -169,4 +169,3 @@ cdef class Reader:
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
cdef int _read(self, void* value, size_t size) except -1
-
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 36a6dbd93..076f25267 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,23 +1,20 @@
# cython: infer_types=True
# cython: profile=True
# coding: utf8
-import warnings
-
-from spacy.errors import Errors, Warnings
-
-from pathlib import Path
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
-
from cpython.exc cimport PyErr_SetFromErrno
-
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
+from libcpp.vector cimport vector
+
+import warnings
+from os import path
+from pathlib import Path
from .typedefs cimport hash_t
-from os import path
-from libcpp.vector cimport vector
+from .errors import Errors, Warnings
cdef class Candidate:
@@ -586,5 +583,3 @@ cdef class Reader:
cdef int _read(self, void* value, size_t size) except -1:
status = fread(value, size, 1, self._fp)
return status
-
-
diff --git a/spacy/language.py b/spacy/language.py
index 0e5c46459..dae7d96a2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -4,10 +4,7 @@ from __future__ import absolute_import, unicode_literals
import random
import itertools
import warnings
-
from thinc.extra import load_nlp
-
-from spacy.util import minibatch
import weakref
import functools
from collections import OrderedDict
@@ -852,7 +849,7 @@ class Language(object):
*[mp.Pipe(False) for _ in range(n_process)]
)
- batch_texts = minibatch(texts, batch_size)
+ batch_texts = util.minibatch(texts, batch_size)
# Sender sends texts to the workers.
# This is necessary to properly handle infinite length of texts.
# (In this case, all data cannot be sent to the workers at once)
From cb02bff0ebe31ab0d3b13fad9fcd2424c09f6c4b Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 20:24:07 +0200
Subject: [PATCH 66/69] Add blank:{lang} shortcut to util.load_model
---
spacy/tests/test_misc.py | 11 +++++++++++
spacy/util.py | 2 ++
2 files changed, 13 insertions(+)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 4075ccf64..3ac621649 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -135,3 +135,14 @@ def test_ascii_filenames():
root = Path(__file__).parent.parent
for path in root.glob("**/*"):
assert all(ord(c) < 128 for c in path.name), path.name
+
+
+def test_load_model_blank_shortcut():
+ """Test that using a model name like "blank:en" works as a shortcut for
+ spacy.blank("en").
+ """
+ nlp = util.load_model("blank:en")
+ assert nlp.lang == "en"
+ assert nlp.pipeline == []
+ with pytest.raises(ImportError):
+ util.load_model("blank:fjsfijsdof")
diff --git a/spacy/util.py b/spacy/util.py
index 419c99bc0..5fd296404 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -161,6 +161,8 @@ def load_model(name, **overrides):
if not data_path or not data_path.exists():
raise IOError(Errors.E049.format(path=path2str(data_path)))
if isinstance(name, basestring_): # in data dir / shortcut
+ if name.startswith("blank:"): # shortcut for blank model
+ return get_lang_class(name.replace("blank:", ""))()
if name in set([d.name for d in data_path.iterdir()]):
return load_model_from_link(name, **overrides)
if is_package(name): # installed as package
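With this change, util.load_model resolves names of the form blank:{lang} before looking the name up in the data directory or as an installed package. A hedged usage sketch mirroring the test added above:

    from spacy import util

    nlp = util.load_model("blank:en")   # behaves like spacy.blank("en")
    assert nlp.lang == "en"
    assert nlp.pipeline == []
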
From 53da6bd6724d5ab26da597faa275816fa3e1093e Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 20:45:33 +0200
Subject: [PATCH 67/69] Add course to landing [ci skip]
---
website/src/styles/landing.module.sass | 1 +
website/src/widgets/landing.js | 47 ++++++++++++++------------
2 files changed, 26 insertions(+), 22 deletions(-)
diff --git a/website/src/styles/landing.module.sass b/website/src/styles/landing.module.sass
index e36e36c0a..c29c0fffb 100644
--- a/website/src/styles/landing.module.sass
+++ b/website/src/styles/landing.module.sass
@@ -86,6 +86,7 @@
.banner-content-small
display: block
+ margin-bottom: 0 !important
.banner-title
display: block
diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index 9aeec0cdc..c96905733 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -9,7 +9,6 @@ import {
LandingGrid,
LandingCard,
LandingCol,
- LandingButton,
LandingDemo,
LandingBannerGrid,
LandingBanner,
@@ -19,7 +18,8 @@ import { H2 } from '../components/typography'
import { Ul, Li } from '../components/list'
import Button from '../components/button'
import Link from '../components/link'
-import irlBackground from '../images/spacy-irl.jpg'
+
+import courseImage from '../../docs/images/course.jpg'
import BenchmarksChoi from 'usage/_benchmarks-choi.md'
@@ -148,13 +148,35 @@ const Landing = ({ data }) => {
+
+
+
+
+
+
+ In this free and interactive online course you’ll learn how to
+ use spaCy to build advanced natural language understanding systems, using both
+ rule-based and machine learning approaches. It includes{' '}
+ 55 exercises featuring videos, slide decks, multiple-choice
+ questions and interactive coding practice in the browser.
+
+
Prodigy is an annotation tool so efficient that data scientists
@@ -165,25 +187,6 @@ const Landing = ({ data }) => {
update your model in real-time and chain models together to build more complex
systems.
-
-
- We were pleased to invite the spaCy community and other folks working on Natural
- Language Processing to Berlin this summer for a small and intimate event{' '}
- July 6, 2019. We booked a beautiful venue, hand-picked an
- awesome lineup of speakers and scheduled plenty of social time to get to know
- each other and exchange ideas. The YouTube playlist includes 12 talks about NLP
- research, development and applications, with keynotes by Sebastian Ruder
- (DeepMind) and Yoav Goldberg (Allen AI).
-
From 891fa590096ef1d1d9dbef013ebc9b9b34986aee Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 20:52:48 +0200
Subject: [PATCH 68/69] Use backwards-compatible super()
---
spacy/errors.py | 2 +-
spacy/lang/pl/lemmatizer.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/spacy/errors.py b/spacy/errors.py
index 0750ab616..aca94d64e 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -7,7 +7,7 @@ def add_codes(err_cls):
class ErrorsWithCodes(err_cls):
def __getattribute__(self, code):
- msg = super().__getattribute__(code)
+ msg = super(ErrorsWithCodes, self).__getattribute__(code)
if code.startswith("__"): # python system attributes like __class__
return msg
else:
diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py
index cd555b9c2..d0d843b2a 100644
--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@@ -13,7 +13,7 @@ class PolishLemmatizer(Lemmatizer):
# lemmatization for nouns
def __init__(self, lookups, *args, **kwargs):
# this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules
- super().__init__(lookups)
+ super(PolishLemmatizer, self).__init__(lookups)
self.lemma_lookups = {}
for tag in [
"ADJ",
From ee027de032ffb30abacabbb410ed66b0877e95b2 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 21:54:23 +0200
Subject: [PATCH 69/69] Update universe and display of videos [ci skip]
---
website/meta/universe.json | 128 +++++++++++++++++++++++++-----
website/src/templates/universe.js | 14 +++-
2 files changed, 118 insertions(+), 24 deletions(-)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 857e26813..58f4cc2aa 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -115,11 +115,11 @@
"print(text)"
],
"category": ["scientific", "biomedical"],
- "author": "Travis Hoppe",
+ "author": "Travis Hoppe",
"author_links": {
"github": "thoppe",
- "twitter":"metasemantic",
- "website" : "http://thoppe.github.io/"
+ "twitter": "metasemantic",
+ "website": "http://thoppe.github.io/"
}
},
{
@@ -1132,7 +1132,7 @@
"type": "education",
"id": "spacy-course",
"title": "Advanced NLP with spaCy",
- "slogan": "spaCy, 2019",
+ "slogan": "A free online course",
"description": "In this free interactive course, you'll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches.",
"url": "https://course.spacy.io",
"image": "https://i.imgur.com/JC00pHW.jpg",
@@ -1185,10 +1185,38 @@
"youtube": "6zm9NC9uRkk",
"category": ["videos"]
},
+ {
+ "type": "education",
+ "id": "video-spacy-course",
+ "title": "Advanced NLP with spaCy · A free online course",
+ "description": "spaCy is a modern Python library for industrial-strength Natural Language Processing. In this free and interactive online course, you'll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches.",
+ "url": "https://course.spacy.io/en",
+ "author": "Ines Montani",
+ "author_links": {
+ "twitter": "_inesmontani",
+ "github": "ines"
+ },
+ "youtube": "THduWAnG97k",
+ "category": ["videos"]
+ },
+ {
+ "type": "education",
+ "id": "video-spacy-course-de",
+ "title": "Modernes NLP mit spaCy · Ein Gratis-Onlinekurs",
+ "description": "spaCy ist eine moderne Python-Bibliothek für industriestarkes Natural Language Processing. In diesem kostenlosen und interaktiven Onlinekurs lernst du, mithilfe von spaCy fortgeschrittene Systeme für die Analyse natürlicher Sprache zu entwickeln und dabei sowohl regelbasierte Verfahren, als auch moderne Machine-Learning-Technologie einzusetzen.",
+ "url": "https://course.spacy.io/de",
+ "author": "Ines Montani",
+ "author_links": {
+ "twitter": "_inesmontani",
+ "github": "ines"
+ },
+ "youtube": "K1elwpgDdls",
+ "category": ["videos"]
+ },
{
"type": "education",
"id": "video-intro-to-nlp-episode-1",
- "title": "Intro to NLP with spaCy",
+ "title": "Intro to NLP with spaCy (1)",
"slogan": "Episode 1: Data exploration",
"description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.",
"author": "Vincent Warmerdam",
@@ -1202,7 +1230,7 @@
{
"type": "education",
"id": "video-intro-to-nlp-episode-2",
- "title": "Intro to NLP with spaCy",
+ "title": "Intro to NLP with spaCy (2)",
"slogan": "Episode 2: Rule-based Matching",
"description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.",
"author": "Vincent Warmerdam",
@@ -1213,6 +1241,34 @@
"youtube": "KL4-Mpgbahw",
"category": ["videos"]
},
+ {
+ "type": "education",
+ "id": "video-intro-to-nlp-episode-3",
+ "title": "Intro to NLP with spaCy (3)",
+    "slogan": "Episode 3: Evaluation",
+    "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.",
+ "author": "Vincent Warmerdam",
+ "author_links": {
+ "twitter": "fishnets88",
+ "github": "koaning"
+ },
+ "youtube": "4V0JDdohxAk",
+ "category": ["videos"]
+ },
+ {
+ "type": "education",
+ "id": "video-intro-to-nlp-episode-4",
+ "title": "Intro to NLP with spaCy (4)",
+ "slogan": "Episode 4: Named Entity Recognition",
+    "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.",
+ "author": "Vincent Warmerdam",
+ "author_links": {
+ "twitter": "fishnets88",
+ "github": "koaning"
+ },
+ "youtube": "IqOJU1-_Fi0",
+ "category": ["videos"]
+ },
{
"type": "education",
"id": "video-spacy-irl-entity-linking",
@@ -1286,6 +1342,22 @@
},
"category": ["podcasts"]
},
+ {
+ "type": "education",
+ "id": "podcast-init2",
+ "title": "Podcast.__init__ #256: An Open Source Toolchain For NLP From Explosion AI",
+ "slogan": "March 2020",
+    "description": "The state of the art in natural language processing is a constantly moving target. With the rise of deep learning, previously cutting edge techniques have given way to robust language models. Through it all the team at Explosion AI have built a strong presence with the trifecta of SpaCy, Thinc, and Prodigy to support fast and flexible data labeling to feed deep learning models and performant and scalable text processing. In this episode founder and open source author Matthew Honnibal shares his experience growing a business around cutting edge open source libraries for the machine learning development process.",
+ "iframe": "https://cdn.podlove.org/web-player/share.html?episode=https%3A%2F%2Fwww.pythonpodcast.com%2F%3Fpodlove_player4%3D614",
+ "iframe_height": 200,
+ "thumb": "https://i.imgur.com/rpo6BuY.png",
+ "url": "https://www.pythonpodcast.com/explosion-ai-natural-language-processing-episode-256/",
+ "author": "Tobias Macey",
+ "author_links": {
+ "website": "https://www.podcastinit.com"
+ },
+ "category": ["podcasts"]
+ },
{
"type": "education",
"id": "talk-python-podcast",
@@ -1348,6 +1420,18 @@
},
"category": ["podcasts"]
},
+ {
+ "type": "education",
+ "id": "video-entity-linking",
+    "title": "Training a custom entity linking model with spaCy",
+ "author": "Sofie Van Landeghem",
+ "author_links": {
+ "twitter": "OxyKodit",
+ "github": "svlandeg"
+ },
+ "youtube": "8u57WSXVpmw",
+ "category": ["videos"]
+ },
{
"id": "adam_qas",
"title": "ADAM: Question Answering System",
@@ -2182,22 +2266,22 @@
"pip": "pyate",
"code_example": [
"import spacy",
- "from pyate.term_extraction_pipeline import TermExtractionPipeline",
- "",
- "nlp = spacy.load('en_core_web_sm')",
- "nlp.add_pipe(TermExtractionPipeline())",
- "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/",
- "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'",
- "",
- "doc = nlp(string)",
- "print(doc._.combo_basic.sort_values(ascending=False).head(5))",
- "\"\"\"\"\"\"",
- "dysfunctional tumor 1.443147",
- "tumor suppressors 1.443147",
- "genetic changes 1.386294",
- "cancer cells 1.386294",
- "dysfunctional tumor suppressors 1.298612",
- "\"\"\"\"\"\""
+ "from pyate.term_extraction_pipeline import TermExtractionPipeline",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "nlp.add_pipe(TermExtractionPipeline())",
+ "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/",
+ "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'",
+ "",
+ "doc = nlp(string)",
+ "print(doc._.combo_basic.sort_values(ascending=False).head(5))",
+ "\"\"\"\"\"\"",
+ "dysfunctional tumor 1.443147",
+ "tumor suppressors 1.443147",
+ "genetic changes 1.386294",
+ "cancer cells 1.386294",
+ "dysfunctional tumor suppressors 1.298612",
+ "\"\"\"\"\"\""
],
"code_language": "python",
"url": "https://github.com/kevinlu1248/pyate",
diff --git a/website/src/templates/universe.js b/website/src/templates/universe.js
index e49e81b01..4a4e13bec 100644
--- a/website/src/templates/universe.js
+++ b/website/src/templates/universe.js
@@ -14,7 +14,7 @@ import Sidebar from '../components/sidebar'
import Section from '../components/section'
import Main from '../components/main'
import Footer from '../components/footer'
-import { H3, Label, InlineList } from '../components/typography'
+import { H3, H5, Label, InlineList } from '../components/typography'
import { YouTube, SoundCloud, Iframe } from '../components/embed'
import { github, markdownToReact } from '../components/util'
@@ -86,7 +86,10 @@ const UniverseContent = ({ content = [], categories, pageContext, location, mdxC
)
return cover ? (
@@ -95,6 +98,13 @@ const UniverseContent = ({ content = [], categories, pageContext, location, mdxC
+ ) : data.id === 'videos' ? (
+
+
+ {header}
+
{title}
+
+
) : (