From 1139247532d42ccc16e2e1c548924d83d7615637 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 9 Mar 2020 12:09:41 +0100 Subject: [PATCH 001/119] Revert changes to token_match priority from #4374 * Revert changes to priority of `token_match` so that it has priority over all other tokenizer patterns * Add lookahead and potentially slow lookbehind back to the default URL pattern * Expand character classes in URL pattern to improve matching around lookaheads and lookbehinds related to #4882 * Revert changes to Hungarian tokenizer * Revert (xfail) several URL tests to their status before #4374 * Update `tokenizer.explain()` and docs accordingly --- spacy/lang/hu/punctuation.py | 6 +++--- spacy/lang/tokenizer_exceptions.py | 6 +++++- spacy/tests/tokenizer/test_urls.py | 8 ++++++-- spacy/tokenizer.pyx | 14 +++++++++---- website/docs/usage/linguistic-features.md | 24 +++++++++++++---------- 5 files changed, 38 insertions(+), 20 deletions(-) diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index bc043486f..a010bb7ae 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "") _currency = r"\$¢£€¥฿" _quotes = CONCAT_QUOTES.replace("'", "") -_units = UNITS.replace("%", "") _prefixes = ( LIST_PUNCT @@ -21,7 +20,8 @@ _prefixes = ( ) _suffixes = ( - LIST_PUNCT + [r"\+"] + + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + [_concat_icons] @@ -29,7 +29,7 @@ _suffixes = ( r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:[{c}])".format(c=_currency), - r"(?<=[0-9])(?:{u})".format(u=_units), + r"(?<=[0-9])(?:{u})".format(u=UNITS), r"(?<=[{al}{e}{q}(?:{c})])\.".format( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency ), diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 2c0fc9cf7..42dbc7bac 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .char_classes import ALPHA_LOWER +from .char_classes import ALPHA_LOWER, ALPHA from ..symbols import ORTH, POS, TAG, LEMMA, SPACE @@ -13,6 +13,8 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE URL_PATTERN = ( # fmt: off r"^" + # in order to support the prefix tokenization (see prefix test cases in test_urls). + r"(?=[" + ALPHA + "\w])" # protocol identifier (mods: make optional and expand schemes) # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) r"(?:(?:[\w\+\-\.]{2,})://)?" @@ -54,6 +56,8 @@ URL_PATTERN = ( r"(?::\d{2,5})?" # resource path r"(?:[/?#]\S*)?" 
+ # in order to support the suffix tokenization (see suffix test cases in test_urls), + r"(?<=[" + ALPHA + "\w/])" r"$" # fmt: on ).strip() diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 58e9d73f3..2d82e213c 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -56,8 +56,12 @@ URLS_SHOULD_MATCH = [ pytest.param( "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail() ), - "http://foo.com/blah_blah_(wikipedia)", - "http://foo.com/blah_blah_(wikipedia)_(again)", + pytest.param( + "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail() + ), + pytest.param( + "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail() + ), "http://www.foo.co.uk", "http://www.foo.co.uk/", "http://www.foo.co.uk/blah/blah", diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4da081259..6f7e44061 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -239,6 +239,8 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: + if self.token_match and self.token_match(string): + break if self._specials.get(hash_string(string)) != NULL: has_special[0] = 1 break @@ -455,6 +457,10 @@ cdef class Tokenizer: suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(("TOKEN_MATCH", substring)) + substring = '' + break if substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' @@ -475,12 +481,12 @@ cdef class Tokenizer: break suffixes.append(("SUFFIX", substring[split:])) substring = substring[:split] - if substring in special_cases: - tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' + elif substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 685619c88..60a6699a9 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -740,6 +740,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(substring) + substring = '' + break if substring in special_cases: tokens.extend(special_cases[substring]) substring = '' @@ -754,12 +758,12 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, split = suffix_search(substring).start() suffixes.append(substring[split:]) substring = substring[:split] - if substring in special_cases: - tokens.extend(special_cases[substring]) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(substring) substring = '' + elif substring in special_cases: + tokens.extend(special_cases[substring]) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 @@ -780,14 +784,14 @@ def tokenizer_pseudo_code(self, special_cases, 
prefix_search, suffix_search, The algorithm can be summarized as follows: 1. Iterate over whitespace-separated substrings. -2. Check whether we have an explicitly defined rule for this substring. If we +2. Look for a token match. If there is a match, stop processing and keep this token. +3. Check whether we have an explicitly defined rule for this substring. If we do, use it. -3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, - so that special cases always get priority. -4. If we didn't consume a prefix, try to consume a suffix and then go back to +4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, + so that the token match and special cases always get priority. +5. If we didn't consume a prefix, try to consume a suffix and then go back to #2. -5. If we can't consume a prefix or a suffix, look for a special case. -6. Next, look for a token match. +6. If we can't consume a prefix or a suffix, look for a special case. 7. Look for "infixes" — stuff like hyphens etc. and split the substring into tokens on all infixes. 8. Once we can't consume any more of the string, handle it as a single token. From 0c31f03ec5525cd33224a880b6d678c69019727d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 9 Mar 2020 13:41:01 +0100 Subject: [PATCH 002/119] Update docs [ci skip] --- website/docs/usage/linguistic-features.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 60a6699a9..0ceae4c4f 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -838,8 +838,6 @@ domain. There are five things you would need to define: hyphens etc. 5. An optional boolean function `token_match` matching strings that should never be split, overriding the infix rules. Useful for things like URLs or numbers. - Note that prefixes and suffixes will be split off before `token_match` is - applied. You shouldn't usually need to create a `Tokenizer` subclass. 
Standard usage is to use `re.compile()` to build a regular expression object, and pass its From 493c77462a236fae204920e8a3fa22d70833d2fc Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Mon, 6 Apr 2020 18:46:51 +0200 Subject: [PATCH 003/119] issue5230: test cases covering known sources of resource warnings --- spacy/tests/regression/test_issue5230.py | 112 +++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 spacy/tests/regression/test_issue5230.py diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py new file mode 100644 index 000000000..e3d7c7e82 --- /dev/null +++ b/spacy/tests/regression/test_issue5230.py @@ -0,0 +1,112 @@ +import warnings + +import numpy +import pytest +import srsly + +from spacy.kb import KnowledgeBase +from spacy.vectors import Vectors +from spacy.language import Language +from spacy.pipeline import Pipe +from spacy.tests.util import make_tempdir + + +@pytest.mark.xfail +def test_language_to_disk_resource_warning(): + nlp = Language() + with make_tempdir() as d: + with warnings.catch_warnings(record=True) as w: + # catch only warnings raised in spacy.language since there may be others from other components or pipelines + warnings.filterwarnings( + "always", module="spacy.language", category=ResourceWarning + ) + nlp.to_disk(d) + assert len(w) == 0 + + +@pytest.mark.xfail +def test_vectors_to_disk_resource_warning(): + data = numpy.zeros((3, 300), dtype="f") + keys = ["cat", "dog", "rat"] + vectors = Vectors(data=data, keys=keys) + with make_tempdir() as d: + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings("always", category=ResourceWarning) + vectors.to_disk(d) + assert len(w) == 0 + + +@pytest.mark.xfail +def test_custom_pipes_to_disk_resource_warning(): + # create dummy pipe partially implementing interface -- only want to test to_disk + class SerializableDummy(object): + def __init__(self, **cfg): + if cfg: + self.cfg = cfg + else: + self.cfg = None + super(SerializableDummy, self).__init__() + + def to_bytes(self, exclude=tuple(), disable=None, **kwargs): + return srsly.msgpack_dumps({"dummy": srsly.json_dumps(None)}) + + def from_bytes(self, bytes_data, exclude): + return self + + def to_disk(self, path, exclude=tuple(), **kwargs): + pass + + def from_disk(self, path, exclude=tuple(), **kwargs): + return self + + class MyPipe(Pipe): + def __init__(self, vocab, model=True, **cfg): + if cfg: + self.cfg = cfg + else: + self.cfg = None + self.model = SerializableDummy() + self.vocab = SerializableDummy() + + pipe = MyPipe(None) + with make_tempdir() as d: + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings("always", category=ResourceWarning) + pipe.to_disk(d) + assert len(w) == 0 + + +@pytest.mark.xfail +def test_tagger_to_disk_resource_warning(): + nlp = Language() + nlp.add_pipe(nlp.create_pipe("tagger")) + tagger = nlp.get_pipe("tagger") + # need to add model for two reasons: + # 1. no model leads to error in serialization, + # 2. the affected line is the one for model serialization + tagger.begin_training(pipeline=nlp.pipeline) + + with make_tempdir() as d: + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings("always", category=ResourceWarning) + tagger.to_disk(d) + assert len(w) == 0 + + +@pytest.mark.xfail +def test_entity_linker_to_disk_resource_warning(): + nlp = Language() + nlp.add_pipe(nlp.create_pipe("entity_linker")) + entity_linker = nlp.get_pipe("entity_linker") + # need to add model for two reasons: + # 1. 
no model leads to error in serialization, + # 2. the affected line is the one for model serialization + kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + entity_linker.set_kb(kb) + entity_linker.begin_training(pipeline=nlp.pipeline) + + with make_tempdir() as d: + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings("always", category=ResourceWarning) + entity_linker.to_disk(d) + assert len(w) == 0 From 1cd975d4a5cf50eb5a2b16a30e8b520c7778af40 Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Mon, 6 Apr 2020 18:54:32 +0200 Subject: [PATCH 004/119] issue5230: fixed resource warnings in language --- spacy/language.py | 5 ++--- spacy/tests/regression/test_issue5230.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 56619080d..0eb062eae 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -903,9 +903,8 @@ class Language(object): serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( p, exclude=["vocab"] ) - serializers["meta.json"] = lambda p: p.open("w").write( - srsly.json_dumps(self.meta) - ) + serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta) + for name, proc in self.pipeline: if not hasattr(proc, "name"): continue diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index e3d7c7e82..be84875e7 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -11,7 +11,6 @@ from spacy.pipeline import Pipe from spacy.tests.util import make_tempdir -@pytest.mark.xfail def test_language_to_disk_resource_warning(): nlp = Language() with make_tempdir() as d: From 273ed452bb4ba148d491dcec4b321a6293bdcd30 Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Mon, 6 Apr 2020 19:22:32 +0200 Subject: [PATCH 005/119] issue5230: added unicode declaration at top of the file --- spacy/tests/regression/test_issue5230.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index be84875e7..9cfa3fc05 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,3 +1,4 @@ +# coding: utf8 import warnings import numpy From 71cc903d65b8946a4c6cd04cb2ca38b8a19eb5c4 Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Mon, 6 Apr 2020 20:30:41 +0200 Subject: [PATCH 006/119] issue5230: replaced open statements on path objects so that serialization still works an files are closed --- spacy/pipeline/pipes.pyx | 6 +++--- spacy/tests/regression/test_issue5230.py | 4 ---- spacy/vectors.pyx | 10 +++++++++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a20c9b6df..ce95b2752 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -202,7 +202,7 @@ class Pipe(object): serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) if self.model not in (None, True, False): - serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) + serialize["model"] = self.model.to_disk exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) @@ -625,7 +625,7 @@ class Tagger(Pipe): serialize = OrderedDict(( ("vocab", lambda p: self.vocab.to_disk(p)), ("tag_map", lambda p: srsly.write_msgpack(p, tag_map)), - ("model", lambda p: p.open("wb").write(self.model.to_bytes())), + ("model", self.model.to_disk), ("cfg", 
lambda p: srsly.write_json(p, self.cfg)) )) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) @@ -1394,7 +1394,7 @@ class EntityLinker(Pipe): serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) if self.model not in (None, True, False): - serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) + serialize["model"] = self.model.to_disk exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 9cfa3fc05..716a4624b 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -24,7 +24,6 @@ def test_language_to_disk_resource_warning(): assert len(w) == 0 -@pytest.mark.xfail def test_vectors_to_disk_resource_warning(): data = numpy.zeros((3, 300), dtype="f") keys = ["cat", "dog", "rat"] @@ -36,7 +35,6 @@ def test_vectors_to_disk_resource_warning(): assert len(w) == 0 -@pytest.mark.xfail def test_custom_pipes_to_disk_resource_warning(): # create dummy pipe partially implementing interface -- only want to test to_disk class SerializableDummy(object): @@ -76,7 +74,6 @@ def test_custom_pipes_to_disk_resource_warning(): assert len(w) == 0 -@pytest.mark.xfail def test_tagger_to_disk_resource_warning(): nlp = Language() nlp.add_pipe(nlp.create_pipe("tagger")) @@ -93,7 +90,6 @@ def test_tagger_to_disk_resource_warning(): assert len(w) == 0 -@pytest.mark.xfail def test_entity_linker_to_disk_resource_warning(): nlp = Language() nlp.add_pipe(nlp.create_pipe("entity_linker")) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index f3c20fb7f..62d176c6c 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -376,8 +376,16 @@ cdef class Vectors: save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) else: save_array = lambda arr, file_: xp.save(file_, arr) + + def save_vectors(path): + # the source of numpy.save indicates that the file object is closed after use. + # but it seems that somehow this does not happen, as ResourceWarnings are raised here. + # in order to not rely on this, wrap in context manager. 
+ with path.open("wb") as _file: + save_array(self.data, _file) + serializers = OrderedDict(( - ("vectors", lambda p: save_array(self.data, p.open("wb"))), + ("vectors", save_vectors), ("key2row", lambda p: srsly.write_msgpack(p, self.key2row)) )) return util.to_disk(path, serializers, []) From cde96f6c64220bf6a82cf4288f6e2bfbbc97eb0a Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Mon, 6 Apr 2020 20:51:12 +0200 Subject: [PATCH 007/119] issue5230: optimized unit test a bit --- spacy/tests/regression/test_issue5230.py | 61 +++++++++--------------- 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 716a4624b..76d4d3e96 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,41 +1,28 @@ # coding: utf8 import warnings -import numpy import pytest import srsly - +from numpy import zeros from spacy.kb import KnowledgeBase from spacy.vectors import Vectors + from spacy.language import Language from spacy.pipeline import Pipe from spacy.tests.util import make_tempdir -def test_language_to_disk_resource_warning(): - nlp = Language() - with make_tempdir() as d: - with warnings.catch_warnings(record=True) as w: - # catch only warnings raised in spacy.language since there may be others from other components or pipelines - warnings.filterwarnings( - "always", module="spacy.language", category=ResourceWarning - ) - nlp.to_disk(d) - assert len(w) == 0 +def nlp(): + return Language() -def test_vectors_to_disk_resource_warning(): - data = numpy.zeros((3, 300), dtype="f") +def vectors(): + data = zeros((3, 1), dtype="f") keys = ["cat", "dog", "rat"] - vectors = Vectors(data=data, keys=keys) - with make_tempdir() as d: - with warnings.catch_warnings(record=True) as w: - warnings.filterwarnings("always", category=ResourceWarning) - vectors.to_disk(d) - assert len(w) == 0 + return Vectors(data=data, keys=keys) -def test_custom_pipes_to_disk_resource_warning(): +def custom_pipe(): # create dummy pipe partially implementing interface -- only want to test to_disk class SerializableDummy(object): def __init__(self, **cfg): @@ -66,15 +53,10 @@ def test_custom_pipes_to_disk_resource_warning(): self.model = SerializableDummy() self.vocab = SerializableDummy() - pipe = MyPipe(None) - with make_tempdir() as d: - with warnings.catch_warnings(record=True) as w: - warnings.filterwarnings("always", category=ResourceWarning) - pipe.to_disk(d) - assert len(w) == 0 + return MyPipe(None) -def test_tagger_to_disk_resource_warning(): +def tagger(): nlp = Language() nlp.add_pipe(nlp.create_pipe("tagger")) tagger = nlp.get_pipe("tagger") @@ -82,15 +64,10 @@ def test_tagger_to_disk_resource_warning(): # 1. no model leads to error in serialization, # 2. 
the affected line is the one for model serialization tagger.begin_training(pipeline=nlp.pipeline) - - with make_tempdir() as d: - with warnings.catch_warnings(record=True) as w: - warnings.filterwarnings("always", category=ResourceWarning) - tagger.to_disk(d) - assert len(w) == 0 + return tagger -def test_entity_linker_to_disk_resource_warning(): +def entity_linker(): nlp = Language() nlp.add_pipe(nlp.create_pipe("entity_linker")) entity_linker = nlp.get_pipe("entity_linker") @@ -100,9 +77,17 @@ def test_entity_linker_to_disk_resource_warning(): kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) entity_linker.set_kb(kb) entity_linker.begin_training(pipeline=nlp.pipeline) + return entity_linker + +@pytest.mark.parametrize( + "obj", + [nlp(), vectors(), custom_pipe(), tagger(), entity_linker()], + ids=["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"], +) +def test_to_disk_resource_warning(obj): with make_tempdir() as d: - with warnings.catch_warnings(record=True) as w: + with warnings.catch_warnings(record=True) as warnings_list: warnings.filterwarnings("always", category=ResourceWarning) - entity_linker.to_disk(d) - assert len(w) == 0 + obj.to_disk(d) + assert len(warnings_list) == 0 From b63871ceff4497ca61bd066c8432603bc73c6a8b Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Mon, 6 Apr 2020 21:04:06 +0200 Subject: [PATCH 008/119] issue5230: added contributors agreement --- .github/contributors/lfiedler.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/lfiedler.md diff --git a/.github/contributors/lfiedler.md b/.github/contributors/lfiedler.md new file mode 100644 index 000000000..61f8ffeb4 --- /dev/null +++ b/.github/contributors/lfiedler.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Leander Fiedler | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 06 April 2020 | +| GitHub username | lfiedler | +| Website (optional) | | \ No newline at end of file From e1e25c7e302876b85dc7a95c0f5cf768fbac3f1d Mon Sep 17 00:00:00 2001 From: lfiedler Date: Mon, 6 Apr 2020 21:36:02 +0200 Subject: [PATCH 009/119] issue5230: added unittest test case for completion --- spacy/tests/regression/test_issue5230.py | 28 +++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 76d4d3e96..1a03fa0d2 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,5 +1,6 @@ # coding: utf8 import warnings +from unittest import TestCase import pytest import srsly @@ -80,14 +81,31 @@ def entity_linker(): return entity_linker -@pytest.mark.parametrize( - "obj", +objects_to_test = ( [nlp(), vectors(), custom_pipe(), tagger(), entity_linker()], - ids=["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"], + ["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"], ) -def test_to_disk_resource_warning(obj): + + +def write_obj_and_catch_warnings(obj): with make_tempdir() as d: with warnings.catch_warnings(record=True) as warnings_list: warnings.filterwarnings("always", category=ResourceWarning) obj.to_disk(d) - assert len(warnings_list) == 0 + return warnings_list + + +@pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) +def test_to_disk_resource_warning(obj): + warnings_list = write_obj_and_catch_warnings(obj) + assert len(warnings_list) == 0 + + +class TestToDiskResourceWarningUnittest(TestCase): + def test_resource_warning(self): + scenarios = zip(*objects_to_test) + + for scenario in scenarios: + with self.subTest(msg=scenario[1]): + warnings_list = write_obj_and_catch_warnings(scenario[0]) + self.assertEqual(len(warnings_list), 0) From 8c1d0d628fb196abd33859b18a597eb0414e6c55 Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Fri, 10 Apr 2020 20:35:52 +0200 Subject: [PATCH 010/119] issue5230 writer now checks instance of loc parameter before trying to operate on it --- spacy/kb.pyx | 4 ++-- spacy/tests/regression/test_issue5230.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 63eb41b42..7c6865eed 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -446,10 +446,10 @@ cdef class KnowledgeBase: cdef class Writer: def __init__(self, object loc): - if path.exists(loc): - assert not path.isdir(loc), "%s is directory." % loc if isinstance(loc, Path): loc = bytes(loc) + if path.exists(loc): + assert not path.isdir(loc), "%s is directory." 
% loc cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'wb') if not self._fp: diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 1a03fa0d2..b7c6b9b1d 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -5,7 +5,7 @@ from unittest import TestCase import pytest import srsly from numpy import zeros -from spacy.kb import KnowledgeBase +from spacy.kb import KnowledgeBase, Writer from spacy.vectors import Vectors from spacy.language import Language @@ -101,6 +101,19 @@ def test_to_disk_resource_warning(obj): assert len(warnings_list) == 0 +def test_writer_with_path_py35(): + writer = None + with make_tempdir() as d: + path = d / "test" + try: + writer = Writer(path) + except Exception as e: + pytest.fail(str(e)) + finally: + if writer: + writer.close() + + class TestToDiskResourceWarningUnittest(TestCase): def test_resource_warning(self): scenarios = zip(*objects_to_test) From a7bdfe42e13bdb2e61edcb3b4bf9203e041ef3f0 Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Fri, 10 Apr 2020 21:14:33 +0200 Subject: [PATCH 011/119] issue5230 added print statement to warnings filter to remotely debug failing python35(win) setup --- spacy/tests/regression/test_issue5230.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index b7c6b9b1d..03027fe39 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -98,6 +98,8 @@ def write_obj_and_catch_warnings(obj): @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) def test_to_disk_resource_warning(obj): warnings_list = write_obj_and_catch_warnings(obj) + for warning in warnings_list: + print(warning.message) assert len(warnings_list) == 0 From 88ca40a15d010fe50da383f4664f8064046f7540 Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Fri, 10 Apr 2020 21:45:53 +0200 Subject: [PATCH 012/119] issue5230 raise warnings as errors to remotely debug failing python35(win) setup --- spacy/tests/regression/test_issue5230.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 03027fe39..adc9307ce 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -90,7 +90,7 @@ objects_to_test = ( def write_obj_and_catch_warnings(obj): with make_tempdir() as d: with warnings.catch_warnings(record=True) as warnings_list: - warnings.filterwarnings("always", category=ResourceWarning) + warnings.filterwarnings("error", category=ResourceWarning) obj.to_disk(d) return warnings_list @@ -98,8 +98,6 @@ def write_obj_and_catch_warnings(obj): @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) def test_to_disk_resource_warning(obj): warnings_list = write_obj_and_catch_warnings(obj) - for warning in warnings_list: - print(warning.message) assert len(warnings_list) == 0 From ca2a7a44db29b3ffbcf24459a8c0332742c8b676 Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Fri, 10 Apr 2020 22:26:55 +0200 Subject: [PATCH 013/119] issue5230 store string values of warnings to remotely debug failing python35(win) setup --- spacy/tests/regression/test_issue5230.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 
adc9307ce..c78a84ad7 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -90,9 +90,9 @@ objects_to_test = ( def write_obj_and_catch_warnings(obj): with make_tempdir() as d: with warnings.catch_warnings(record=True) as warnings_list: - warnings.filterwarnings("error", category=ResourceWarning) + warnings.filterwarnings("always", category=ResourceWarning) obj.to_disk(d) - return warnings_list + return list(map(lambda w: w.message, warnings_list)) @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) From d2bb649227ce5a24e53d7526cf7892643eb297c9 Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Fri, 10 Apr 2020 23:21:13 +0200 Subject: [PATCH 014/119] issue5230 filter warnings in addition to filterwarnings to prevent deprecation warnings in python35(win) setup to pop up --- spacy/tests/regression/test_issue5230.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index c78a84ad7..ae735c7bd 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -92,7 +92,8 @@ def write_obj_and_catch_warnings(obj): with warnings.catch_warnings(record=True) as warnings_list: warnings.filterwarnings("always", category=ResourceWarning) obj.to_disk(d) - return list(map(lambda w: w.message, warnings_list)) + # in python3.5 it seems that deprecation warnings are not filtered by filterwarnings + return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list)) @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) From d60e2d3ebf33fc0c4280117b08f6e3ef9ad63ff9 Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Sun, 12 Apr 2020 09:08:41 +0200 Subject: [PATCH 015/119] issue5230 added unit test for dumping and loading knowledgebase --- spacy/tests/regression/test_issue5230.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index ae735c7bd..337c82255 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -115,6 +115,23 @@ def test_writer_with_path_py35(): writer.close() +def test_save_and_load_knowledge_base(): + nlp = Language() + kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + with make_tempdir() as d: + path = d / "kb" + try: + kb.dump(path) + except Exception as e: + pytest.fail(str(e)) + + try: + kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1) + kb_loaded.load_bulk(path) + except Exception as e: + pytest.fail(str(e)) + + class TestToDiskResourceWarningUnittest(TestCase): def test_resource_warning(self): scenarios = zip(*objects_to_test) From 67000068304b9a125ec792f32bed8491767dbed1 Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Sun, 12 Apr 2020 09:34:54 +0200 Subject: [PATCH 016/119] issue5230 attempted fix of pytest segfault for python3.5 --- spacy/kb.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 7c6865eed..14327f0d6 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -491,10 +491,10 @@ cdef class Writer: cdef class Reader: def __init__(self, object loc): - assert path.exists(loc) - assert not path.isdir(loc) if isinstance(loc, Path): loc = bytes(loc) + assert path.exists(loc) + assert not path.isdir(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'rb') if not 
self._fp: From cef0c909b9dc1afd37511db4cbfd1863f27a371a Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Wed, 15 Apr 2020 19:28:33 +0200 Subject: [PATCH 017/119] issue5230 changed reference to function to anonymous function --- spacy/pipeline/pipes.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index ce95b2752..8af76a0fb 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -202,7 +202,7 @@ class Pipe(object): serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) if self.model not in (None, True, False): - serialize["model"] = self.model.to_disk + serialize["model"] = lambda p: self.model.to_disk(p) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) From a3401b11946b9aba06dd3e83a1877c156e7ddeb4 Mon Sep 17 00:00:00 2001 From: Leander Fiedler Date: Wed, 15 Apr 2020 21:52:52 +0200 Subject: [PATCH 018/119] issue5230 changed reference to function to anonymous function --- spacy/pipeline/pipes.pyx | 4 ++-- spacy/vectors.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 8af76a0fb..fc077fc82 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -625,7 +625,7 @@ class Tagger(Pipe): serialize = OrderedDict(( ("vocab", lambda p: self.vocab.to_disk(p)), ("tag_map", lambda p: srsly.write_msgpack(p, tag_map)), - ("model", self.model.to_disk), + ("model", lambda p: self.model.to_disk(p)), ("cfg", lambda p: srsly.write_json(p, self.cfg)) )) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) @@ -1394,7 +1394,7 @@ class EntityLinker(Pipe): serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) if self.model not in (None, True, False): - serialize["model"] = self.model.to_disk + serialize["model"] = lambda p: self.model.to_disk(p) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 62d176c6c..2877d2d7d 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -385,7 +385,7 @@ cdef class Vectors: save_array(self.data, _file) serializers = OrderedDict(( - ("vectors", save_vectors), + ("vectors", lambda p: save_vectors(p)), ("key2row", lambda p: srsly.write_msgpack(p, self.key2row)) )) return util.to_disk(path, serializers, []) From 565e0eef73fab8c394339239cc48e4a83e068dfd Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 5 May 2020 10:35:33 +0200 Subject: [PATCH 019/119] Add tokenizer option for token match with affixes To fix the slow tokenizer URL (#4374) and allow `token_match` to take priority over prefixes and suffixes by default, introduce a new tokenizer option for a token match pattern that's applied after prefixes and suffixes but before infixes. 
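A rough usage sketch of the new option (not part of the diff below; the URL
regex and the expected tokens are illustrative assumptions — only the
`token_match_with_affixes` attribute itself comes from this change):

    import re
    from spacy.lang.en import English

    nlp = English()
    # illustrative pattern: keep any http(s) URL as a single token once the
    # surrounding prefixes/suffixes (brackets, quotes, ...) have been split off
    url_re = re.compile(r"https?://\S+")
    nlp.tokenizer.token_match_with_affixes = url_re.match

    doc = nlp("(see https://spacy.io)")
    print([t.text for t in doc])
    # roughly: ['(', 'see', 'https://spacy.io', ')']

Unlike `token_match`, which takes priority over all other patterns, this hook
only sees the substring left over after prefix and suffix splitting, so
bracketed or quoted URLs still shed their surrounding punctuation.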
--- spacy/lang/fr/tokenizer_exceptions.py | 4 --- spacy/lang/hu/tokenizer_exceptions.py | 3 +-- spacy/lang/tokenizer_exceptions.py | 7 ++--- spacy/language.py | 5 +++- spacy/tests/tokenizer/test_urls.py | 12 +++------ spacy/tokenizer.pxd | 1 + spacy/tokenizer.pyx | 37 +++++++++++++++++++++++---- website/docs/api/tokenizer.md | 3 ++- 8 files changed, 46 insertions(+), 26 deletions(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index cb1702300..465626d39 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .punctuation import ELISION, HYPHENS -from ..tokenizer_exceptions import URL_PATTERN from ..char_classes import ALPHA_LOWER, ALPHA from ...symbols import ORTH, LEMMA @@ -455,9 +454,6 @@ _regular_exp += [ for hc in _hyphen_combination ] -# URLs -_regular_exp.append(URL_PATTERN) - TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py index c18a2cec2..d328baa22 100644 --- a/spacy/lang/hu/tokenizer_exceptions.py +++ b/spacy/lang/hu/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from ..punctuation import ALPHA_LOWER, CURRENCY -from ..tokenizer_exceptions import URL_PATTERN from ...symbols import ORTH @@ -649,4 +648,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format( TOKENIZER_EXCEPTIONS = _exc -TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match +TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index f1eabd9aa..6a9a5363f 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -13,8 +13,6 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE URL_PATTERN = ( # fmt: off r"^" - # in order to support the prefix tokenization (see prefix test cases in test_urls). - r"(?=[" + ALPHA + "\w])" # protocol identifier (mods: make optional and expand schemes) # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) r"(?:(?:[\w\+\-\.]{2,})://)?" @@ -56,13 +54,12 @@ URL_PATTERN = ( r"(?::\d{2,5})?" # resource path r"(?:[/?#]\S*)?" 
- # in order to support the suffix tokenization (see suffix test cases in test_urls), - r"(?<=[" + ALPHA + "\w/])" r"$" # fmt: on ).strip() -TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match +TOKEN_MATCH = None +TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/language.py b/spacy/language.py index e89f80f08..d4f6c78ec 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -31,7 +31,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES -from .lang.tokenizer_exceptions import TOKEN_MATCH +from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop @@ -86,6 +86,7 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match + token_match_with_affixes = cls.token_match_with_affixes prefix_search = ( util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None ) @@ -103,10 +104,12 @@ class BaseDefaults(object): suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match, + token_match_with_affixes=token_match_with_affixes, ) pipe_names = ["tagger", "parser", "ner"] token_match = TOKEN_MATCH + token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 2d82e213c..2f76111e5 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -56,12 +56,8 @@ URLS_SHOULD_MATCH = [ pytest.param( "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail() ), - pytest.param( - "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail() - ), - pytest.param( - "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail() - ), + "http://foo.com/blah_blah_(wikipedia)", + "http://foo.com/blah_blah_(wikipedia)_(again)", "http://www.foo.co.uk", "http://www.foo.co.uk/", "http://www.foo.co.uk/blah/blah", @@ -126,12 +122,12 @@ SUFFIXES = ['"', ":", ">"] @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is not None + assert en_tokenizer.token_match_with_affixes(url) is not None @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is None + assert en_tokenizer.token_match_with_affixes(url) is None @pytest.mark.parametrize("url", URLS_BASIC) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index dadbad7bd..70d49bb39 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -17,6 +17,7 @@ cdef class Tokenizer: cpdef readonly Vocab vocab cdef object _token_match + cdef object _token_match_with_affixes cdef object _prefix_search cdef object _suffix_search cdef object _infix_finditer diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 16a2cf27b..cf0421158 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -30,7 +30,8 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, - suffix_search=None, infix_finditer=None, token_match=None): + 
suffix_search=None, infix_finditer=None, token_match=None, + token_match_with_affixes=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -43,6 +44,8 @@ cdef class Tokenizer: `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be recognised as tokens. + token_match_with_affixes (callable): A boolean function matching strings to be + recognised as tokens after considering prefixes and suffixes. RETURNS (Tokenizer): The newly constructed object. EXAMPLE: @@ -55,6 +58,7 @@ cdef class Tokenizer: self._cache = PreshMap() self._specials = PreshMap() self.token_match = token_match + self.token_match_with_affixes = token_match_with_affixes self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -70,6 +74,14 @@ cdef class Tokenizer: self._token_match = token_match self._flush_cache() + property token_match_with_affixes: + def __get__(self): + return self._token_match_with_affixes + + def __set__(self, token_match_with_affixes): + self._token_match_with_affixes = token_match_with_affixes + self._flush_cache() + property prefix_search: def __get__(self): return self._prefix_search @@ -108,11 +120,12 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, - self._rules, + self.rules, self.prefix_search, self.suffix_search, self.infix_finditer, - self.token_match) + self.token_match, + self.token_match_with_affixes) return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): @@ -297,7 +310,9 @@ cdef class Tokenizer: cache_hit = self._try_cache(hash_string(string), tokens) if cache_hit: pass - elif self.token_match and self.token_match(string): + elif (self.token_match and self.token_match(string)) or \ + (self.token_match_with_affixes and \ + self.token_match_with_affixes(string)): # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. 
# See Issue #859 @@ -450,6 +465,11 @@ cdef class Tokenizer: suffix_search = self.suffix_search infix_finditer = self.infix_finditer token_match = self.token_match + if token_match is None: + token_match = re.compile("a^").match + token_match_with_affixes = self.token_match_with_affixes + if token_match_with_affixes is None: + token_match_with_affixes = re.compile("a^").match special_cases = {} for orth, special_tokens in self.rules.items(): special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] @@ -485,6 +505,9 @@ cdef class Tokenizer: if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' + elif token_match_with_affixes(substring): + tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring)) + substring = '' elif substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' @@ -549,6 +572,7 @@ cdef class Tokenizer: ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), ("token_match", lambda: _get_regex_pattern(self.token_match)), + ("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)), ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) exclude = util.get_serialization_exclude(serializers, exclude, kwargs) @@ -570,11 +594,12 @@ cdef class Tokenizer: ("suffix_search", lambda b: data.setdefault("suffix_search", b)), ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), ("token_match", lambda b: data.setdefault("token_match", b)), + ("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)), ("exceptions", lambda b: data.setdefault("rules", b)) )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): @@ -585,6 +610,8 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match + if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_): + self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 7462af739..f73e851f7 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -41,7 +41,8 @@ the | `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | | `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | | `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. 
| -| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | | **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} From 36a94c409a50e3d815924197d668e0ae315d4352 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 20 May 2020 23:06:03 +0200 Subject: [PATCH 020/119] failing test to reproduce overlapping spans problem --- spacy/tests/regression/test_issue5458.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 spacy/tests/regression/test_issue5458.py diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py new file mode 100644 index 000000000..33281c858 --- /dev/null +++ b/spacy/tests/regression/test_issue5458.py @@ -0,0 +1,21 @@ +from spacy.lang.en import English +from spacy.lang.en.syntax_iterators import noun_chunks +from spacy.tests.util import get_doc +from spacy.vocab import Vocab + + +def test_issue5458(): + # Test that the noun chuncker does not generate overlapping spans + words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] + vocab = Vocab(strings=words) + dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] + pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] + heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10] + + en_doc = get_doc(vocab, words, pos_tags, heads, dependencies) + en_doc.noun_chunks_iterator = noun_chunks + + # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" + nlp = English() + merge_nps = nlp.create_pipe("merge_noun_chunks") + merge_nps(en_doc) From b509a3e7fcadf84c257c1e5168b6dc926b8b2f3d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 20 May 2020 23:06:39 +0200 Subject: [PATCH 021/119] fix: use actual range in 'seen' instead of subtree --- spacy/lang/en/syntax_iterators.py | 4 ++-- spacy/language.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 5ff848124..22f7fcf81 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -36,7 +36,7 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + if any(j in seen for j in range(word.left_edge.i, word.i + 1)): continue seen.update(j for j in range(word.left_edge.i, word.i + 1)) yield word.left_edge.i, word.i + 1, np_label @@ -46,7 +46,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + if any(j in seen for j in range(word.left_edge.i, word.i + 1)): continue seen.update(j for j in range(word.left_edge.i, word.i + 1)) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/language.py b/spacy/language.py index 703806627..c4eb26bad 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -418,7 +418,7 @@ class Language(object): def __call__(self, text, disable=[], component_cfg=None): """Apply the pipeline to some text. 
The text can span multiple sentences, - and can contain arbtrary whitespace. Alignment into the original string + and can contain arbitrary whitespace. Alignment into the original string is preserved. text (unicode): The text to be processed. From b221bcf1ba3907552d4c3b660d1902b0a1c26b2e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 May 2020 00:17:28 +0200 Subject: [PATCH 022/119] fixing all languages --- spacy/lang/el/syntax_iterators.py | 14 +++++++------- spacy/lang/en/syntax_iterators.py | 10 ++++++---- spacy/lang/fa/syntax_iterators.py | 10 ++++++---- spacy/lang/fr/syntax_iterators.py | 10 ++++++---- spacy/lang/id/syntax_iterators.py | 10 ++++++---- spacy/lang/nb/syntax_iterators.py | 10 ++++++---- spacy/lang/sv/syntax_iterators.py | 10 ++++++---- 7 files changed, 43 insertions(+), 31 deletions(-) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index f02619ac9..5d6398aad 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -31,16 +31,15 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue flag = False if word.pos == NOUN: # check for patterns such as γραμμή παραγωγής for potential_nmod in word.rights: if potential_nmod.dep == nmod: - seen.update( - j for j in range(word.left_edge.i, potential_nmod.i + 1) - ) + w_range = range(word.left_edge.i, potential_nmod.i + 1) + if any(j in seen for j in w_range): + continue + seen.update(j for j in w_range) yield word.left_edge.i, potential_nmod.i + 1, np_label flag = True break @@ -54,9 +53,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 22f7fcf81..0d43ebf37 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -36,9 +36,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(j in seen for j in range(word.left_edge.i, word.i + 1)): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +47,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(j in seen for j in range(word.left_edge.i, word.i + 1)): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 5ff848124..0d43ebf37 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -36,9 +36,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - 
seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +47,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 9495dcf1e..91b338eb3 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -35,9 +35,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -45,9 +46,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 9495dcf1e..91b338eb3 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -35,9 +35,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -45,9 +46,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 9495dcf1e..91b338eb3 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -35,9 +35,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -45,9 +46,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're 
coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 148884efe..31e3302e9 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -36,9 +36,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +47,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label From f7d10da555c089a2015fd0101b6198db395d82fc Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 May 2020 19:15:57 +0200 Subject: [PATCH 023/119] avoid unnecessary loop to check overlapping noun chunks --- spacy/lang/el/syntax_iterators.py | 16 +++++----------- spacy/lang/en/syntax_iterators.py | 14 ++++---------- spacy/lang/fa/syntax_iterators.py | 14 ++++---------- spacy/lang/fr/syntax_iterators.py | 14 ++++---------- spacy/lang/id/syntax_iterators.py | 14 ++++---------- spacy/lang/nb/syntax_iterators.py | 14 ++++---------- spacy/lang/sv/syntax_iterators.py | 14 ++++---------- 7 files changed, 29 insertions(+), 71 deletions(-) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 5d6398aad..b5811c337 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -23,12 +23,12 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") nmod = doc.vocab.strings.add("nmod") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: flag = False @@ -36,15 +36,12 @@ def noun_chunks(obj): # check for patterns such as γραμμή παραγωγής for potential_nmod in word.rights: if potential_nmod.dep == nmod: - w_range = range(word.left_edge.i, potential_nmod.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = potential_nmod.i + 1 yield word.left_edge.i, potential_nmod.i + 1, np_label flag = True break if flag is False: - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: # covers the case: έχει όμορφα και έξυπνα παιδιά @@ -53,10 +50,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in 
seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 0d43ebf37..dbb2d6c9f 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -28,18 +28,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -47,10 +44,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 0d43ebf37..dbb2d6c9f 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -28,18 +28,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -47,10 +44,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 91b338eb3..b38be57fc 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -27,18 +27,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,10 +43,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, 
we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 91b338eb3..b38be57fc 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -27,18 +27,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,10 +43,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 91b338eb3..b38be57fc 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -27,18 +27,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,10 +43,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 31e3302e9..12d351148 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -28,18 +28,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j 
in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -47,10 +44,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label From 51715b9f720e115fe91f4684c589c3e5666cec5b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 May 2020 19:56:56 +0200 Subject: [PATCH 024/119] span / noun chunk has +1 because end is exclusive --- spacy/lang/el/syntax_iterators.py | 6 +++--- spacy/lang/en/syntax_iterators.py | 4 ++-- spacy/lang/fa/syntax_iterators.py | 4 ++-- spacy/lang/fr/syntax_iterators.py | 4 ++-- spacy/lang/id/syntax_iterators.py | 4 ++-- spacy/lang/nb/syntax_iterators.py | 4 ++-- spacy/lang/sv/syntax_iterators.py | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 10fa94f8c..4a40e28c2 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -36,12 +36,12 @@ def noun_chunks(doclike): # check for patterns such as γραμμή παραγωγής for potential_nmod in word.rights: if potential_nmod.dep == nmod: - prev_end = potential_nmod.i + 1 + prev_end = potential_nmod.i yield word.left_edge.i, potential_nmod.i + 1, np_label flag = True break if flag is False: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: # covers the case: έχει όμορφα και έξυπνα παιδιά @@ -50,7 +50,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 91152bd50..0f2b28b58 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -36,7 +36,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -44,7 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 91152bd50..0f2b28b58 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -36,7 +36,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -44,7 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 3523e2f02..d6c12e69f 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -35,7 +35,7 @@ 
def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -43,7 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 3523e2f02..d6c12e69f 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -35,7 +35,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -43,7 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 3523e2f02..d6c12e69f 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -35,7 +35,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -43,7 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 99621e6a9..84d295f96 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -36,7 +36,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -44,7 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label From 0f1beb5ff27bf19e14ddc3a8b80e2521a782c03c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 20:05:03 +0200 Subject: [PATCH 025/119] Tidy up and avoid absolute spacy imports in core --- spacy/cli/evaluate.py | 3 +-- spacy/kb.pxd | 5 ++--- spacy/kb.pyx | 17 ++++++----------- spacy/language.py | 5 +---- 4 files changed, 10 insertions(+), 20 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 8a84684e5..be994de73 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals, division, print_function import plac -import spacy from timeit import default_timer as timer from wasabi import msg @@ -45,7 +44,7 @@ def evaluate( msg.fail("Visualization output directory not found", displacy_path, exits=1) 
corpus = GoldCorpus(data_path, data_path) if model.startswith("blank:"): - nlp = spacy.blank(model.replace("blank:", "")) + nlp = util.get_lang_class(model.replace("blank:", ""))() else: nlp = util.load_model(model) dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index d5aa382b1..518ce0f4e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -6,7 +6,7 @@ from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t from libc.stdio cimport FILE -from spacy.vocab cimport Vocab +from .vocab cimport Vocab from .typedefs cimport hash_t from .structs cimport KBEntryC, AliasC @@ -113,7 +113,7 @@ cdef class KnowledgeBase: return new_index cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil: - """ + """ Initializing the vectors and making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value cf. https://github.com/explosion/preshed/issues/17 @@ -169,4 +169,3 @@ cdef class Reader: cdef int read_alias(self, int64_t* entry_index, float* prob) except -1 cdef int _read(self, void* value, size_t size) except -1 - diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 36a6dbd93..076f25267 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,23 +1,20 @@ # cython: infer_types=True # cython: profile=True # coding: utf8 -import warnings - -from spacy.errors import Errors, Warnings - -from pathlib import Path from cymem.cymem cimport Pool from preshed.maps cimport PreshMap - from cpython.exc cimport PyErr_SetFromErrno - from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek from libc.stdint cimport int32_t, int64_t +from libcpp.vector cimport vector + +import warnings +from os import path +from pathlib import Path from .typedefs cimport hash_t -from os import path -from libcpp.vector cimport vector +from .errors import Errors, Warnings cdef class Candidate: @@ -586,5 +583,3 @@ cdef class Reader: cdef int _read(self, void* value, size_t size) except -1: status = fread(value, size, 1, self._fp) return status - - diff --git a/spacy/language.py b/spacy/language.py index 0e5c46459..dae7d96a2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -4,10 +4,7 @@ from __future__ import absolute_import, unicode_literals import random import itertools import warnings - from thinc.extra import load_nlp - -from spacy.util import minibatch import weakref import functools from collections import OrderedDict @@ -852,7 +849,7 @@ class Language(object): *[mp.Pipe(False) for _ in range(n_process)] ) - batch_texts = minibatch(texts, batch_size) + batch_texts = util.minibatch(texts, batch_size) # Sender sends texts to the workers. # This is necessary to properly handle infinite length of texts. 
# (In this case, all data cannot be sent to the workers at once) From cb02bff0ebe31ab0d3b13fad9fcd2424c09f6c4b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 20:24:07 +0200 Subject: [PATCH 026/119] Add blank:{lang} shortcut to util.load_mode --- spacy/tests/test_misc.py | 11 +++++++++++ spacy/util.py | 2 ++ 2 files changed, 13 insertions(+) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 4075ccf64..3ac621649 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -135,3 +135,14 @@ def test_ascii_filenames(): root = Path(__file__).parent.parent for path in root.glob("**/*"): assert all(ord(c) < 128 for c in path.name), path.name + + +def test_load_model_blank_shortcut(): + """Test that using a model name like "blank:en" works as a shortcut for + spacy.blank("en"). + """ + nlp = util.load_model("blank:en") + assert nlp.lang == "en" + assert nlp.pipeline == [] + with pytest.raises(ImportError): + util.load_model("blank:fjsfijsdof") diff --git a/spacy/util.py b/spacy/util.py index 419c99bc0..5fd296404 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -161,6 +161,8 @@ def load_model(name, **overrides): if not data_path or not data_path.exists(): raise IOError(Errors.E049.format(path=path2str(data_path))) if isinstance(name, basestring_): # in data dir / shortcut + if name.startswith("blank:"): # shortcut for blank model + return get_lang_class(name.replace("blank:", ""))() if name in set([d.name for d in data_path.iterdir()]): return load_model_from_link(name, **overrides) if is_package(name): # installed as package From 53da6bd6724d5ab26da597faa275816fa3e1093e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 20:45:33 +0200 Subject: [PATCH 027/119] Add course to landing [ci skip] --- website/src/styles/landing.module.sass | 1 + website/src/widgets/landing.js | 47 ++++++++++++++------------ 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/website/src/styles/landing.module.sass b/website/src/styles/landing.module.sass index e36e36c0a..c29c0fffb 100644 --- a/website/src/styles/landing.module.sass +++ b/website/src/styles/landing.module.sass @@ -86,6 +86,7 @@ .banner-content-small display: block + margin-bottom: 0 !important .banner-title display: block diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 9aeec0cdc..c96905733 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -9,7 +9,6 @@ import { LandingGrid, LandingCard, LandingCol, - LandingButton, LandingDemo, LandingBannerGrid, LandingBanner, @@ -19,7 +18,8 @@ import { H2 } from '../components/typography' import { Ul, Li } from '../components/list' import Button from '../components/button' import Link from '../components/link' -import irlBackground from '../images/spacy-irl.jpg' + +import courseImage from '../../docs/images/course.jpg' import BenchmarksChoi from 'usage/_benchmarks-choi.md' @@ -148,13 +148,35 @@ const Landing = ({ data }) => { + + + Advanced NLP with spaCy: A free online course + +
+
+ In this free and interactive online course you’ll learn how to + use spaCy to build advanced natural language understanding systems, using both + rule-based and machine learning approaches. It includes{' '} + 55 exercises featuring videos, slide decks, multiple-choice + questions and interactive coding practice in the browser. +
+ Prodigy is an annotation tool so efficient that data scientists @@ -165,25 +187,6 @@ const Landing = ({ data }) => { update your model in real-time and chain models together to build more complex systems. - - - We were pleased to invite the spaCy community and other folks working on Natural - Language Processing to Berlin this summer for a small and intimate event{' '} - July 6, 2019. We booked a beautiful venue, hand-picked an - awesome lineup of speakers and scheduled plenty of social time to get to know - each other and exchange ideas. The YouTube playlist includes 12 talks about NLP - research, development and applications, with keynotes by Sebastian Ruder - (DeepMind) and Yoav Goldberg (Allen AI). -
From 891fa590096ef1d1d9dbef013ebc9b9b34986aee Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 20:52:48 +0200 Subject: [PATCH 028/119] Use backwards-compatible super() --- spacy/errors.py | 2 +- spacy/lang/pl/lemmatizer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 0750ab616..aca94d64e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -7,7 +7,7 @@ def add_codes(err_cls): class ErrorsWithCodes(err_cls): def __getattribute__(self, code): - msg = super().__getattribute__(code) + msg = super(ErrorsWithCodes, self).__getattribute__(code) if code.startswith("__"): # python system attributes like __class__ return msg else: diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index cd555b9c2..d0d843b2a 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -13,7 +13,7 @@ class PolishLemmatizer(Lemmatizer): # lemmatization for nouns def __init__(self, lookups, *args, **kwargs): # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules - super().__init__(lookups) + super(PolishLemmatizer, self).__init__(lookups) self.lemma_lookups = {} for tag in [ "ADJ", From ee027de032ffb30abacabbb410ed66b0877e95b2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 21:54:23 +0200 Subject: [PATCH 029/119] Update universe and display of videos [ci skip] --- website/meta/universe.json | 128 +++++++++++++++++++++++++----- website/src/templates/universe.js | 14 +++- 2 files changed, 118 insertions(+), 24 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 857e26813..58f4cc2aa 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -115,11 +115,11 @@ "print(text)" ], "category": ["scientific", "biomedical"], - "author": "Travis Hoppe", + "author": "Travis Hoppe", "author_links": { "github": "thoppe", - "twitter":"metasemantic", - "website" : "http://thoppe.github.io/" + "twitter": "metasemantic", + "website": "http://thoppe.github.io/" } }, { @@ -1132,7 +1132,7 @@ "type": "education", "id": "spacy-course", "title": "Advanced NLP with spaCy", - "slogan": "spaCy, 2019", + "slogan": "A free online course", "description": "In this free interactive course, you'll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches.", "url": "https://course.spacy.io", "image": "https://i.imgur.com/JC00pHW.jpg", @@ -1185,10 +1185,38 @@ "youtube": "6zm9NC9uRkk", "category": ["videos"] }, + { + "type": "education", + "id": "video-spacy-course", + "title": "Advanced NLP with spaCy · A free online course", + "description": "spaCy is a modern Python library for industrial-strength Natural Language Processing. In this free and interactive online course, you'll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches.", + "url": "https://course.spacy.io/en", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines" + }, + "youtube": "THduWAnG97k", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-course-de", + "title": "Modernes NLP mit spaCy · Ein Gratis-Onlinekurs", + "description": "spaCy ist eine moderne Python-Bibliothek für industriestarkes Natural Language Processing. 
In diesem kostenlosen und interaktiven Onlinekurs lernst du, mithilfe von spaCy fortgeschrittene Systeme für die Analyse natürlicher Sprache zu entwickeln und dabei sowohl regelbasierte Verfahren, als auch moderne Machine-Learning-Technologie einzusetzen.", + "url": "https://course.spacy.io/de", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines" + }, + "youtube": "K1elwpgDdls", + "category": ["videos"] + }, { "type": "education", "id": "video-intro-to-nlp-episode-1", - "title": "Intro to NLP with spaCy", + "title": "Intro to NLP with spaCy (1)", "slogan": "Episode 1: Data exploration", "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.", "author": "Vincent Warmerdam", @@ -1202,7 +1230,7 @@ { "type": "education", "id": "video-intro-to-nlp-episode-2", - "title": "Intro to NLP with spaCy", + "title": "Intro to NLP with spaCy (2)", "slogan": "Episode 2: Rule-based Matching", "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.", "author": "Vincent Warmerdam", @@ -1213,6 +1241,34 @@ "youtube": "KL4-Mpgbahw", "category": ["videos"] }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-3", + "title": "Intro to NLP with spaCy (3)", + "slogan": "Episode 2: Evaluation", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "4V0JDdohxAk", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-4", + "title": "Intro to NLP with spaCy (4)", + "slogan": "Episode 4: Named Entity Recognition", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. 
Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "IqOJU1-_Fi0", + "category": ["videos"] + }, { "type": "education", "id": "video-spacy-irl-entity-linking", @@ -1286,6 +1342,22 @@ }, "category": ["podcasts"] }, + { + "type": "education", + "id": "podcast-init2", + "title": "Podcast.__init__ #256: An Open Source Toolchain For NLP From Explosion AI", + "slogan": "March 2020", + "description": "The state of the art in natural language processing is a constantly moving target. With the rise of deep learning, previously cutting edge techniques have given way to robust language models. Through it all the team at Explosion AI have built a strong presence with the trifecta of SpaCy, Thinc, and Prodigy to support fast and flexible data labeling to feed deep learning models and performant and scalable text processing. In this episode founder and open source author Matthew Honnibal shares his experience growing a business around cutting edge open source libraries for the machine learning developent process.", + "iframe": "https://cdn.podlove.org/web-player/share.html?episode=https%3A%2F%2Fwww.pythonpodcast.com%2F%3Fpodlove_player4%3D614", + "iframe_height": 200, + "thumb": "https://i.imgur.com/rpo6BuY.png", + "url": "https://www.pythonpodcast.com/explosion-ai-natural-language-processing-episode-256/", + "author": "Tobias Macey", + "author_links": { + "website": "https://www.podcastinit.com" + }, + "category": ["podcasts"] + }, { "type": "education", "id": "talk-python-podcast", @@ -1348,6 +1420,18 @@ }, "category": ["podcasts"] }, + { + "type": "education", + "id": "video-entity-linking", + "title": "Training a custom entity linking mode with spaCy", + "author": "Sofie Van Landeghem", + "author_links": { + "twitter": "OxyKodit", + "github": "svlandeg" + }, + "youtube": "8u57WSXVpmw", + "category": ["videos"] + }, { "id": "adam_qas", "title": "ADAM: Question Answering System", @@ -2182,22 +2266,22 @@ "pip": "pyate", "code_example": [ "import spacy", - "from pyate.term_extraction_pipeline import TermExtractionPipeline", - "", - "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe(TermExtractionPipeline())", - "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/", - "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. 
This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'", - "", - "doc = nlp(string)", - "print(doc._.combo_basic.sort_values(ascending=False).head(5))", - "\"\"\"\"\"\"", - "dysfunctional tumor 1.443147", - "tumor suppressors 1.443147", - "genetic changes 1.386294", - "cancer cells 1.386294", - "dysfunctional tumor suppressors 1.298612", - "\"\"\"\"\"\"" + "from pyate.term_extraction_pipeline import TermExtractionPipeline", + "", + "nlp = spacy.load('en_core_web_sm')", + "nlp.add_pipe(TermExtractionPipeline())", + "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/", + "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'", + "", + "doc = nlp(string)", + "print(doc._.combo_basic.sort_values(ascending=False).head(5))", + "\"\"\"\"\"\"", + "dysfunctional tumor 1.443147", + "tumor suppressors 1.443147", + "genetic changes 1.386294", + "cancer cells 1.386294", + "dysfunctional tumor suppressors 1.298612", + "\"\"\"\"\"\"" ], "code_language": "python", "url": "https://github.com/kevinlu1248/pyate", diff --git a/website/src/templates/universe.js b/website/src/templates/universe.js index e49e81b01..4a4e13bec 100644 --- a/website/src/templates/universe.js +++ b/website/src/templates/universe.js @@ -14,7 +14,7 @@ import Sidebar from '../components/sidebar' import Section from '../components/section' import Main from '../components/main' import Footer from '../components/footer' -import { H3, Label, InlineList } from '../components/typography' +import { H3, H5, Label, InlineList } from '../components/typography' import { YouTube, SoundCloud, Iframe } from '../components/embed' import { github, markdownToReact } from '../components/util' @@ -86,7 +86,10 @@ const UniverseContent = ({ content = [], categories, pageContext, location, mdxC ) return cover ? ( @@ -95,6 +98,13 @@ const UniverseContent = ({ content = [], categories, pageContext, location, mdxC {title

+ ) : data.id === 'videos' ? ( +
+ + {header} +
{title}
+ +
) : ( Date: Fri, 22 May 2020 10:14:34 +0200 Subject: [PATCH 030/119] Disallow merging 0-length spans --- spacy/errors.py | 1 + spacy/tests/doc/test_retokenize_merge.py | 7 +++++++ spacy/tokens/_retokenize.pyx | 2 ++ 3 files changed, 10 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index aca94d64e..6d92545d7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -567,6 +567,7 @@ class Errors(object): E197 = ("Row out of bounds, unable to add row {row} for key {key}.") E198 = ("Unable to return {n} most similar vectors for the current vectors " "table, which contains {n_rows} vectors.") + E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") @add_codes diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 5bdf78f39..636b7bb14 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -425,3 +425,10 @@ def test_retokenize_skip_duplicates(en_vocab): retokenizer.merge(doc[0:2]) assert len(doc) == 2 assert doc[0].text == "hello world" + + +def test_retokenize_disallow_zero_length(en_vocab): + doc = Doc(en_vocab, words=["hello", "world", "!"]) + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[1:1]) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 512ad73bc..ce8e510d6 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -55,6 +55,8 @@ cdef class Retokenizer: """ if (span.start, span.end) in self._spans_to_merge: return + if span.end - span.start <= 0: + raise ValueError(Errors.E199.format(start=span.start, end=span.end)) for token in span: if token.i in self.tokens_to_merge: raise ValueError(Errors.E102.format(token=repr(token))) From e4a1b5dab1f2de60fa0ddbb3e80282b0749635da Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 22 May 2020 12:41:03 +0200 Subject: [PATCH 031/119] Rename to url_match Rename to `url_match` and update docs. 
--- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/language.py | 8 ++--- spacy/tests/tokenizer/test_urls.py | 4 +-- spacy/tokenizer.pxd | 2 +- spacy/tokenizer.pyx | 40 +++++++++++------------ website/docs/api/tokenizer.md | 16 ++++----- website/docs/usage/linguistic-features.md | 23 ++++++++----- 7 files changed, 51 insertions(+), 44 deletions(-) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 6a9a5363f..67349916b 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -59,7 +59,7 @@ URL_PATTERN = ( ).strip() TOKEN_MATCH = None -TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match +URL_MATCH = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/language.py b/spacy/language.py index 2c7f4e2b5..53a788f2a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG, NORM from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES -from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES +from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH from .lang.norm_exceptions import BASE_NORMS from .lang.tag_map import TAG_MAP from .tokens import Doc @@ -89,7 +89,7 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match - token_match_with_affixes = cls.token_match_with_affixes + url_match = cls.url_match prefix_search = ( util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None ) @@ -107,12 +107,12 @@ class BaseDefaults(object): suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match, - token_match_with_affixes=token_match_with_affixes, + url_match=url_match, ) pipe_names = ["tagger", "parser", "ner"] token_match = TOKEN_MATCH - token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES + url_match = URL_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 2f76111e5..65ba93d66 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"] @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): - assert en_tokenizer.token_match_with_affixes(url) is not None + assert en_tokenizer.url_match(url) is not None @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): - assert en_tokenizer.token_match_with_affixes(url) is None + assert en_tokenizer.url_match(url) is None @pytest.mark.parametrize("url", URLS_BASIC) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 70d49bb39..694ea49cc 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -17,7 +17,7 @@ cdef class Tokenizer: cpdef readonly Vocab vocab cdef object _token_match - cdef object _token_match_with_affixes + cdef object _url_match cdef object _prefix_search cdef object _suffix_search cdef object _infix_finditer diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index cf0421158..154a42c4f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -31,7 +31,7 @@ cdef class Tokenizer: """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, infix_finditer=None, 
token_match=None, - token_match_with_affixes=None): + url_match=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -44,7 +44,7 @@ cdef class Tokenizer: `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be recognised as tokens. - token_match_with_affixes (callable): A boolean function matching strings to be + url_match (callable): A boolean function matching strings to be recognised as tokens after considering prefixes and suffixes. RETURNS (Tokenizer): The newly constructed object. @@ -58,7 +58,7 @@ cdef class Tokenizer: self._cache = PreshMap() self._specials = PreshMap() self.token_match = token_match - self.token_match_with_affixes = token_match_with_affixes + self.url_match = url_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -74,12 +74,12 @@ cdef class Tokenizer: self._token_match = token_match self._flush_cache() - property token_match_with_affixes: + property url_match: def __get__(self): - return self._token_match_with_affixes + return self._url_match - def __set__(self, token_match_with_affixes): - self._token_match_with_affixes = token_match_with_affixes + def __set__(self, url_match): + self._url_match = url_match self._flush_cache() property prefix_search: @@ -125,7 +125,7 @@ cdef class Tokenizer: self.suffix_search, self.infix_finditer, self.token_match, - self.token_match_with_affixes) + self.url_match) return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): @@ -311,8 +311,8 @@ cdef class Tokenizer: if cache_hit: pass elif (self.token_match and self.token_match(string)) or \ - (self.token_match_with_affixes and \ - self.token_match_with_affixes(string)): + (self.url_match and \ + self.url_match(string)): # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. 
# See Issue #859 @@ -467,9 +467,9 @@ cdef class Tokenizer: token_match = self.token_match if token_match is None: token_match = re.compile("a^").match - token_match_with_affixes = self.token_match_with_affixes - if token_match_with_affixes is None: - token_match_with_affixes = re.compile("a^").match + url_match = self.url_match + if url_match is None: + url_match = re.compile("a^").match special_cases = {} for orth, special_tokens in self.rules.items(): special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] @@ -505,8 +505,8 @@ cdef class Tokenizer: if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' - elif token_match_with_affixes(substring): - tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring)) + elif url_match(substring): + tokens.append(("URL_MATCH", substring)) substring = '' elif substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) @@ -572,7 +572,7 @@ cdef class Tokenizer: ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), ("token_match", lambda: _get_regex_pattern(self.token_match)), - ("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)), + ("url_match", lambda: _get_regex_pattern(self.url_match)), ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) exclude = util.get_serialization_exclude(serializers, exclude, kwargs) @@ -594,12 +594,12 @@ cdef class Tokenizer: ("suffix_search", lambda b: data.setdefault("suffix_search", b)), ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), ("token_match", lambda b: data.setdefault("token_match", b)), - ("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)), + ("url_match", lambda b: data.setdefault("url_match", b)), ("exceptions", lambda b: data.setdefault("rules", b)) )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): @@ -610,8 +610,8 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match - if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_): - self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match + if "url_match" in data and isinstance(data["url_match"], basestring_): + self.url_match = re.compile(data["url_match"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index f73e851f7..6f8badfe8 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -35,15 +35,15 @@ the > ``` | Name | Type | Description | -| ---------------- | ----------- | 
----------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | | `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | -| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 91ca1267b..bcc943436 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -759,6 +759,9 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, if token_match(substring): tokens.append(substring) substring = '' + elif url_match(substring): + tokens.append(substring) + substring = '' elif substring in special_cases: tokens.extend(special_cases[substring]) substring = '' @@ -782,17 +785,19 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, The algorithm can be summarized as follows: 1. Iterate over whitespace-separated substrings. -2. Look for a token match. If there is a match, stop processing and keep this token. -3. Check whether we have an explicitly defined rule for this substring. If we - do, use it. -4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, - so that the token match and special cases always get priority. +2. Look for a token match. If there is a match, stop processing and keep this + token. +3. Check whether we have an explicitly defined special case for this substring. + If we do, use it. +4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to + #2, so that the token match and special cases always get priority. 5. If we didn't consume a prefix, try to consume a suffix and then go back to #2. -6. If we can't consume a prefix or a suffix, look for a special case. -7. Look for "infixes" — stuff like hyphens etc. and split the substring into +6. 
If we can't consume a prefix or a suffix, look for a URL match. +7. If there's no URL match, then look for a special case. +8. Look for "infixes" — stuff like hyphens etc. and split the substring into tokens on all infixes. -8. Once we can't consume any more of the string, handle it as a single token. +9. Once we can't consume any more of the string, handle it as a single token. #### Debugging the tokenizer {#tokenizer-debug new="2.2.3"} @@ -836,6 +841,8 @@ domain. There are five things you would need to define: hyphens etc. 5. An optional boolean function `token_match` matching strings that should never be split, overriding the infix rules. Useful for things like URLs or numbers. +6. An optional boolean function `url_match`, which is similar to `token_match` + except prefixes and suffixes are removed before applying the match. You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its From 65c7e82de24739977d7ca775d585cacc7dc25cd5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 13:50:30 +0200 Subject: [PATCH 032/119] Auto-format and remove 2.3 feature [ci skip] --- website/docs/api/token.md | 150 +++++++++++++++++--------------------- 1 file changed, 67 insertions(+), 83 deletions(-) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 69dac23d6..0fa86b7bc 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -351,25 +351,9 @@ property to `0` for the first word of the document. - assert doc[4].sent_start == 1 + assert doc[4].is_sent_start == True ``` + -## Token.is_sent_end {#is_sent_end tag="property" new="2"} - -A boolean value indicating whether the token ends a sentence. `None` if -unknown. Defaults to `True` for the last token in the `Doc`. - -> #### Example -> -> ```python -> doc = nlp("Give it back! He pleaded.") -> assert doc[3].is_sent_end -> assert not doc[4].is_sent_end -> ``` - -| Name | Type | Description | -| ----------- | ---- | ------------------------------------ | -| **RETURNS** | bool | Whether the token ends a sentence. | - ## Token.has_vector {#has_vector tag="property" model="vectors"} A boolean value indicating whether a word vector is associated with the token. @@ -424,71 +408,71 @@ The L2 norm of the token's vector representation. ## Attributes {#attributes} -| Name | Type | Description | -| -------------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | -| `text` | unicode | Verbatim text content. | -| `text_with_ws` | unicode | Text content, with trailing space character if present. | -| `whitespace_` | unicode | Trailing space character if present. | -| `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | -| `head` | `Token` | The syntactic parent, or "governor", of this token. | -| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. 
| -| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | -| `i` | int | The index of the token within the parent document. | -| `ent_type` | int | Named entity type. | -| `ent_type_` | unicode | Named entity type. | -| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | -| `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | -| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_kb_id_` 2.2 | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `ent_id_` | unicode | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `lemma` | int | Base form of the token, with no inflectional suffixes. | -| `lemma_` | unicode | Base form of the token, with no inflectional suffixes. | -| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `lower` | int | Lowercase form of the token. | -| `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | +| Name | Type | Description | +| -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | +| `text` | unicode | Verbatim text content. | +| `text_with_ws` | unicode | Text content, with trailing space character if present. | +| `whitespace_` | unicode | Trailing space character if present. | +| `orth` | int | ID of the verbatim text content. | +| `orth_` | unicode | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | +| `head` | `Token` | The syntactic parent, or "governor", of this token. | +| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | +| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | +| `i` | int | The index of the token within the parent document. | +| `ent_type` | int | Named entity type. | +| `ent_type_` | unicode | Named entity type. | +| `ent_iob` | int | IOB code of named entity tag. 
`3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | +| `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | +| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_kb_id_` 2.2 | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `ent_id_` | unicode | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `lemma` | int | Base form of the token, with no inflectional suffixes. | +| `lemma_` | unicode | Base form of the token, with no inflectional suffixes. | +| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `lower` | int | Lowercase form of the token. | +| `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | | `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | | `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | -| `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. | -| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | -| `suffix_` | unicode | Length-N substring from the end of the token. Defaults to `N=3`. | -| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | -| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | -| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | -| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | -| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | -| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | -| `is_punct` | bool | Is the token punctuation? | -| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? | -| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? | -| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. 
| -| `is_bracket` | bool | Is the token a bracket? | -| `is_quote` | bool | Is the token a quotation mark? | -| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | -| `like_url` | bool | Does the token resemble a URL? | -| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | -| `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Is the token out-of-vocabulary? | -| `is_stop` | bool | Is the token part of a "stop list"? | -| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `tag` | int | Fine-grained part-of-speech. | -| `tag_` | unicode | Fine-grained part-of-speech. | -| `dep` | int | Syntactic dependency relation. | -| `dep_` | unicode | Syntactic dependency relation. | -| `lang` | int | Language of the parent document's vocabulary. | -| `lang_` | unicode | Language of the parent document's vocabulary. | -| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | -| `idx` | int | The character offset of the token within the parent document. | -| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | -| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `cluster` | int | Brown cluster ID. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | +| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | +| `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. | +| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | +| `suffix_` | unicode | Length-N substring from the end of the token. Defaults to `N=3`. | +| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | +| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | +| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | +| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | +| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | +| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | +| `is_punct` | bool | Is the token punctuation? | +| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? | +| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? | +| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | +| `is_bracket` | bool | Is the token a bracket? | +| `is_quote` | bool | Is the token a quotation mark? | +| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | +| `like_url` | bool | Does the token resemble a URL? | +| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | +| `like_email` | bool | Does the token resemble an email address? 
| +| `is_oov` | bool | Is the token out-of-vocabulary? | +| `is_stop` | bool | Is the token part of a "stop list"? | +| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `tag` | int | Fine-grained part-of-speech. | +| `tag_` | unicode | Fine-grained part-of-speech. | +| `dep` | int | Syntactic dependency relation. | +| `dep_` | unicode | Syntactic dependency relation. | +| `lang` | int | Language of the parent document's vocabulary. | +| `lang_` | unicode | Language of the parent document's vocabulary. | +| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | +| `idx` | int | The character offset of the token within the parent document. | +| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | +| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `cluster` | int | Brown cluster ID. | +| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | From c685ee734ad7e3d103fbd5725033353737563b40 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 May 2020 14:22:36 +0200 Subject: [PATCH 033/119] Fix compat for v2.x branch --- spacy/tests/regression/test_issue5152.py | 3 +++ spacy/tests/regression/test_issue5230.py | 25 +++++++++++-------- spacy/tests/regression/test_issue5458.py | 5 ++++ .../serialize/test_serialize_vocab_strings.py | 2 ++ spacy/tests/vocab_vectors/test_vectors.py | 2 ++ 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py index a9a57746d..758ac9c14 100644 --- a/spacy/tests/regression/test_issue5152.py +++ b/spacy/tests/regression/test_issue5152.py @@ -1,3 +1,6 @@ +# coding: utf8 +from __future__ import unicode_literals + from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 337c82255..2b14ff589 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,16 +1,17 @@ # coding: utf8 import warnings from unittest import TestCase - import pytest import srsly from numpy import zeros from spacy.kb import KnowledgeBase, Writer from spacy.vectors import Vectors - from spacy.language import Language from spacy.pipeline import Pipe -from spacy.tests.util import make_tempdir +from spacy.compat import is_python2 + + +from ..util import make_tempdir def nlp(): @@ -96,12 +97,14 @@ def write_obj_and_catch_warnings(obj): return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list)) +@pytest.mark.skipif(is_python2, reason="ResourceWarning needs Python 3.x") @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) def test_to_disk_resource_warning(obj): warnings_list = write_obj_and_catch_warnings(obj) assert len(warnings_list) == 0 +@pytest.mark.skipif(is_python2, reason="ResourceWarning needs Python 3.x") def test_writer_with_path_py35(): writer = None with make_tempdir() as d: @@ -132,11 +135,13 @@ def test_save_and_load_knowledge_base(): pytest.fail(str(e)) -class 
TestToDiskResourceWarningUnittest(TestCase): - def test_resource_warning(self): - scenarios = zip(*objects_to_test) +if not is_python2: - for scenario in scenarios: - with self.subTest(msg=scenario[1]): - warnings_list = write_obj_and_catch_warnings(scenario[0]) - self.assertEqual(len(warnings_list), 0) + class TestToDiskResourceWarningUnittest(TestCase): + def test_resource_warning(self): + scenarios = zip(*objects_to_test) + + for scenario in scenarios: + with self.subTest(msg=scenario[1]): + warnings_list = write_obj_and_catch_warnings(scenario[0]) + self.assertEqual(len(warnings_list), 0) diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py index 33281c858..3281e2a8c 100644 --- a/spacy/tests/regression/test_issue5458.py +++ b/spacy/tests/regression/test_issue5458.py @@ -1,3 +1,6 @@ +# coding: utf-8 +from __future__ import unicode_literals + from spacy.lang.en import English from spacy.lang.en.syntax_iterators import noun_chunks from spacy.tests.util import get_doc @@ -6,11 +9,13 @@ from spacy.vocab import Vocab def test_issue5458(): # Test that the noun chuncker does not generate overlapping spans + # fmt: off words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] vocab = Vocab(strings=words) dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10] + # fmt: on en_doc = get_doc(vocab, words, pos_tags, heads, dependencies) en_doc.noun_chunks_iterator = noun_chunks diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 3be0a75b3..4727899a3 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -5,6 +5,7 @@ import pytest import pickle from spacy.vocab import Vocab from spacy.strings import StringStore +from spacy.compat import is_python2 from ..util import make_tempdir @@ -134,6 +135,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2): assert list(sstore1_d) != list(sstore2_d) +@pytest.mark.skipif(is_python2, reason="Dict order? Not sure if worth investigating") @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_pickle_vocab(strings, lex_attr): vocab = Vocab(strings=strings) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 1821f8abc..576ca93d2 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -10,6 +10,7 @@ from spacy.vectors import Vectors from spacy.tokenizer import Tokenizer from spacy.strings import hash_string from spacy.tokens import Doc +from spacy.compat import is_python2 from ..util import add_vecs_to_vocab, make_tempdir @@ -339,6 +340,7 @@ def test_vocab_prune_vectors(): assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) +@pytest.mark.skipif(is_python2, reason="Dict order? 
Not sure if worth investigating") def test_vectors_serialize(): data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") v = Vectors(data=data, keys=["A", "B", "C"]) From aa53ce69962103f5e2386210c7c9bf16e2f0bcd7 Mon Sep 17 00:00:00 2001 From: Jannis <34443309+JannisTriesToCode@users.noreply.github.com> Date: Fri, 22 May 2020 19:50:26 +0200 Subject: [PATCH 034/119] Documentation Typo Fix (#5492) * Fix typo Change 'realize' to 'realise' * Add contributer agreement --- .github/contributors/JannisTriesToCode.md | 106 ++++++++++++++++++++++ website/docs/usage/adding-languages.md | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/JannisTriesToCode.md diff --git a/.github/contributors/JannisTriesToCode.md b/.github/contributors/JannisTriesToCode.md new file mode 100644 index 000000000..d834794c5 --- /dev/null +++ b/.github/contributors/JannisTriesToCode.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ----------------------------- | +| Name | Jannis Rauschke | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22.05.2020 | +| GitHub username | JannisTriesToCode | +| Website (optional) | https://twitter.com/JRauschke | diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 70411ec0b..29de08266 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -288,7 +288,7 @@ common spelling. This has no effect on any other token attributes, or tokenization in general, but it ensures that **equivalent tokens receive similar representations**. This can improve the model's predictions on words that weren't common in the training data, but are equivalent to other words – for -example, "realize" and "realize", or "thx" and "thanks". +example, "realise" and "realize", or "thx" and "thanks". Similarly, spaCy also includes [global base norms](https://github.com/explosion/spaCy/tree/master/spacy/lang/norm_exceptions.py) From ae1c179f3a3bdd02906ce340d1d972402b9c0b62 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sat, 23 May 2020 17:58:19 +0200 Subject: [PATCH 035/119] Remove the nested quote --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 58f4cc2aa..23aa42334 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2211,7 +2211,7 @@ "", "nlp = spacy.load('en_core_web_sm')", "nlp.add_pipe(LanguageDetector())", - "doc = nlp('Life is like a box of chocolates. You never know what you're gonna get.')", + "doc = nlp('Life is like a box of chocolates. 
You never know what you are gonna get.')", "", "assert doc._.language == 'en'", "assert doc._.language_score >= 0.8" From e06ca7ea24d151b77719b02f7430d724aa27a406 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 25 May 2020 10:13:56 +0200 Subject: [PATCH 036/119] Switch to new add API in PhraseMatcher unpickle --- spacy/matcher/phrasematcher.pyx | 2 +- spacy/tests/matcher/test_phrase_matcher.py | 24 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index b66ec35b8..00c3357f5 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -332,7 +332,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr): matcher = PhraseMatcher(vocab, attr=attr) for key, specs in docs.items(): callback = callbacks.get(key, None) - matcher.add(key, callback, *specs) + matcher.add(key, specs, on_match=callback) return matcher diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 7a6585e06..60aa584ef 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import srsly from mock import Mock from spacy.matcher import PhraseMatcher from spacy.tokens import Doc @@ -266,3 +267,26 @@ def test_phrase_matcher_basic_check(en_vocab): pattern = Doc(en_vocab, words=["hello", "world"]) with pytest.raises(ValueError): matcher.add("TEST", pattern) + + +def test_phrase_matcher_pickle(en_vocab): + matcher = PhraseMatcher(en_vocab) + mock = Mock() + matcher.add("TEST", [Doc(en_vocab, words=["test"])]) + matcher.add("TEST2", [Doc(en_vocab, words=["test2"])], on_match=mock) + doc = Doc(en_vocab, words=["these", "are", "tests", ":", "test", "test2"]) + assert len(matcher) == 2 + + b = srsly.pickle_dumps(matcher) + matcher_unpickled = srsly.pickle_loads(b) + + # call after pickling to avoid recursion error related to mock + matches = matcher(doc) + matches_unpickled = matcher_unpickled(doc) + + assert len(matcher) == len(matcher_unpickled) + assert matches == matches_unpickled + + # clunky way to vaguely check that callback is unpickled + (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1] + assert isinstance(callbacks.get("TEST2"), Mock) From 8b8efa1b42446b94e70ebd2cc2990f84167ae303 Mon Sep 17 00:00:00 2001 From: Rajat <22280243+R1j1t@users.noreply.github.com> Date: Mon, 25 May 2020 15:00:23 +0530 Subject: [PATCH 037/119] update spacy universe with my project (#5497) * added contextualSpellCheck in spacy universe meta * removed extra formatting by code * updated with permanent links * run json linter used by spacy * filled SCA * updated the description --- .github/contributors/R1j1t.md | 106 ++++++++++++++++++++++++++++++++++ website/meta/universe.json | 30 ++++++++++ 2 files changed, 136 insertions(+) create mode 100644 .github/contributors/R1j1t.md diff --git a/.github/contributors/R1j1t.md b/.github/contributors/R1j1t.md new file mode 100644 index 000000000..a92f1e092 --- /dev/null +++ b/.github/contributors/R1j1t.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. 
The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Rajat | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 24 May 2020 | +| GitHub username | R1j1t | +| Website (optional) | | diff --git a/website/meta/universe.json b/website/meta/universe.json index 23aa42334..58be719ed 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2293,6 +2293,36 @@ }, "category": ["pipeline", "research"], "tags": ["term_extraction"] + }, + { + "id": "contextualSpellCheck", + "title": "Contextual Spell Check", + "slogan": "Contextual spell correction using BERT (bidirectional representations)", + "description": "This package currently focuses on Out of Vocabulary (OOV) word or non-word error (NWE) correction using BERT model. The idea of using BERT was to use the context when correcting NWE. In the coming days, I would like to focus on RWE and optimising the package by implementing it in cython.", + "github": "R1j1t/contextualSpellCheck", + "pip": "contextualSpellCheck", + "code_example": [ + "import spacy", + "import contextualSpellCheck", + "", + "nlp = spacy.load('en')", + "contextualSpellCheck.add_to_pipe(nlp)", + "doc = nlp('Income was $9.4 milion compared to the prior year of $2.7 milion.')", + "", + "print(doc._.performed_spellCheck) #Should be True", + "print(doc._.outcome_spellCheck) #Income was $9.4 million compared to the prior year of $2.7 million." 
+ ], + "code_language": "python", + "url": "https://github.com/R1j1t/contextualSpellCheck", + "thumb": "https://user-images.githubusercontent.com/22280243/82760949-98e68480-9e14-11ea-952e-4738620fd9e3.png", + "image": "https://user-images.githubusercontent.com/22280243/82138959-2852cd00-9842-11ea-918a-49b2a7873ef6.png", + "author": "Rajat Goel", + "author_links": { + "github": "r1j1t", + "website": "https://github.com/R1j1t" + }, + "category": ["pipeline", "conversational", "research"], + "tags": ["spell check", "correction", "preprocessing", "translation", "correction"] } ], From 736f3cb5af4ab2f77b4c7dd5e64331f433355a8d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 25 May 2020 12:03:49 +0200 Subject: [PATCH 038/119] Bump version and deps for v2.3.0 * spacy to v2.3.0 * thinc to v7.4.1 * spacy-lookups-data to v0.3.2 --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 6 +++--- spacy/about.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 827e2a797..fe66494ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,6 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==7.4.0", + "thinc==7.4.1", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index ec30efc16..b93def651 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==7.4.0 +thinc==7.4.1 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index af3579f88..1e29f1ead 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,13 +38,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==7.4.0 + thinc==7.4.1 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==7.4.0 + thinc==7.4.1 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=1.0.2,<1.1.0 @@ -59,7 +59,7 @@ install_requires = [options.extras_require] lookups = - spacy_lookups_data>=0.3.1,<0.4.0 + spacy_lookups_data>=0.3.2,<0.4.0 cuda = cupy>=5.0.0b4,<9.0.0 cuda80 = diff --git a/spacy/about.py b/spacy/about.py index 84dc86aa8..91810fa68 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.4" +__version__ = "2.3.0" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 3f727bc539542e97a8a0d94299f69405eda96ad3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 25 May 2020 12:57:20 +0200 Subject: [PATCH 039/119] Switch to v2.3.0.dev0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 91810fa68..be1b3ae56 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.3.0" +__version__ = "2.3.0.dev0" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From c9c7b135c05fa8688202704e3cbbed80734f76af Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 25 May 2020 15:24:24 +0200 Subject: [PATCH 040/119] Update Makefile for v2.3.0 (#5502) --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/Makefile b/Makefile index cf96d6294..1891692ec 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ VENV := ./env$(PYVER) version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 chmod a+rx $@ dist/pytest.pex : wheelhouse/pytest-*.whl @@ -14,7 +14,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . -w ./wheelhouse - $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse + $(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.22 -w ./wheelhouse touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex From 69897b45d89877d6b243d122ed7d13fca315503c Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 25 May 2020 16:39:22 +0200 Subject: [PATCH 041/119] Handle spacy.pex renaming in Makefile (#5503) --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 1891692ec..2764da118 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,7 @@ version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 chmod a+rx $@ + cp $@ dist/spacy.pex dist/pytest.pex : wheelhouse/pytest-*.whl $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o $@ pytest pytest-timeout mock From 1eed101be9adc8f94036761099d512f26439a2c5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 26 May 2020 09:56:12 +0200 Subject: [PATCH 042/119] Fix Polish lemmatizer for deserialized models Restructure Polish lemmatizer not to depend on lookups data in `__init__` since the lemmatizer is initialized before the lookups data is loaded from a saved model. The lookups tables are accessed first in `__call__` instead once the data is available. 
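As an editorial aside (not part of the patch): a minimal sketch of the deferred-lookup pattern this message describes, assuming the `Lookups` helpers (`add_table`, `has_table`, and the two-argument `get_table` used elsewhere in this series) behave as in spaCy v2.x; the diff below shows the real implementation. The table name and entries here are illustrative only.

```python
from spacy.lookups import Lookups

# At construction time the lemmatizer may only see an empty Lookups object;
# the real tables are filled in later, e.g. while loading a saved model.
lookups = Lookups()
assert not lookups.has_table("lemma_lookup_noun")

# ...deserialization happens here; this toy entry stands in for the
# Morfeusz-derived data shipped separately...
lookups.add_table("lemma_lookup_noun", {"psy": "pies"})

# Inside __call__ the table is fetched lazily, with an empty fallback, so
# nothing breaks if the lemmatizer was built before the data arrived.
table = lookups.get_table("lemma_lookup_noun", {})
assert table.get("psy", "psy") == "pies"
```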
--- spacy/lang/pl/lemmatizer.py | 87 +++++++++++++------------------------ 1 file changed, 31 insertions(+), 56 deletions(-) diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index d0d843b2a..8b8d7fe27 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -6,98 +6,73 @@ from ...parts_of_speech import NAMES class PolishLemmatizer(Lemmatizer): - # This lemmatizer implements lookup lemmatization based on - # the Morfeusz dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS - # It utilizes some prefix based improvements for - # verb and adjectives lemmatization, as well as case-sensitive - # lemmatization for nouns - def __init__(self, lookups, *args, **kwargs): - # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules - super(PolishLemmatizer, self).__init__(lookups) - self.lemma_lookups = {} - for tag in [ - "ADJ", - "ADP", - "ADV", - "AUX", - "NOUN", - "NUM", - "PART", - "PRON", - "VERB", - "X", - ]: - self.lemma_lookups[tag] = self.lookups.get_table( - "lemma_lookup_" + tag.lower(), {} - ) - self.lemma_lookups["DET"] = self.lemma_lookups["X"] - self.lemma_lookups["PROPN"] = self.lemma_lookups["NOUN"] - + # This lemmatizer implements lookup lemmatization based on the Morfeusz + # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS. + # It utilizes some prefix based improvements for verb and adjectives + # lemmatization, as well as case-sensitive lemmatization for nouns. def __call__(self, string, univ_pos, morphology=None): if isinstance(univ_pos, int): univ_pos = NAMES.get(univ_pos, "X") univ_pos = univ_pos.upper() + lookup_pos = univ_pos.lower() + if univ_pos == "PROPN": + lookup_pos = "noun" + lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {}) + if univ_pos == "NOUN": - return self.lemmatize_noun(string, morphology) + return self.lemmatize_noun(string, morphology, lookup_table) if univ_pos != "PROPN": string = string.lower() if univ_pos == "ADJ": - return self.lemmatize_adj(string, morphology) + return self.lemmatize_adj(string, morphology, lookup_table) elif univ_pos == "VERB": - return self.lemmatize_verb(string, morphology) + return self.lemmatize_verb(string, morphology, lookup_table) - lemma_dict = self.lemma_lookups.get(univ_pos, {}) - return [lemma_dict.get(string, string.lower())] + return [lookup_table.get(string, string.lower())] - def lemmatize_adj(self, string, morphology): + def lemmatize_adj(self, string, morphology, lookup_table): # this method utilizes different procedures for adjectives # with 'nie' and 'naj' prefixes - lemma_dict = self.lemma_lookups["ADJ"] - if string[:3] == "nie": search_string = string[3:] if search_string[:3] == "naj": naj_search_string = search_string[3:] - if naj_search_string in lemma_dict: - return [lemma_dict[naj_search_string]] - if search_string in lemma_dict: - return [lemma_dict[search_string]] + if naj_search_string in lookup_table: + return [lookup_table[naj_search_string]] + if search_string in lookup_table: + return [lookup_table[search_string]] if string[:3] == "naj": naj_search_string = string[3:] - if naj_search_string in lemma_dict: - return [lemma_dict[naj_search_string]] + if naj_search_string in lookup_table: + return [lookup_table[naj_search_string]] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] - def lemmatize_verb(self, string, morphology): + def lemmatize_verb(self, string, morphology, lookup_table): # this method utilizes a different procedure for 
verbs # with 'nie' prefix - lemma_dict = self.lemma_lookups["VERB"] - if string[:3] == "nie": search_string = string[3:] - if search_string in lemma_dict: - return [lemma_dict[search_string]] + if search_string in lookup_table: + return [lookup_table[search_string]] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] - def lemmatize_noun(self, string, morphology): + def lemmatize_noun(self, string, morphology, lookup_table): # this method is case-sensitive, in order to work # for incorrectly tagged proper names - lemma_dict = self.lemma_lookups["NOUN"] - if string != string.lower(): - if string.lower() in lemma_dict: - return [lemma_dict[string.lower()]] - elif string in lemma_dict: - return [lemma_dict[string]] + if string.lower() in lookup_table: + return [lookup_table[string.lower()]] + elif string in lookup_table: + return [lookup_table[string]] return [string.lower()] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] def lookup(self, string, orth=None): return string.lower() From b6b5908f5e9ca4e6a2e46ca42a2d370b00119d44 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 26 May 2020 14:50:53 +0200 Subject: [PATCH 043/119] Prefer _SP over SP for default tag map space attrs If `_SP` is already in the tag map, use the mapping from `_SP` instead of `SP` so that `SP` can be a valid non-space tag. (Chinese has a non-space tag `SP` which was overriding the mapping of `_SP` to `SPACE`.) --- spacy/morphology.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index c146094a9..a9bab38ed 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -152,7 +152,10 @@ cdef class Morphology: self.tags = PreshMap() # Add special space symbol. We prefix with underscore, to make sure it # always sorts to the end. - space_attrs = tag_map.get('SP', {POS: SPACE}) + if '_SP' in tag_map: + space_attrs = tag_map.get('_SP') + else: + space_attrs = tag_map.get('SP', {POS: SPACE}) if '_SP' not in tag_map: self.strings.add('_SP') tag_map = dict(tag_map) From f00488ab3078a57211f28dffee7588b669e5b4ca Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 26 May 2020 16:41:39 +0200 Subject: [PATCH 044/119] Update train_intent_parser.py --- examples/training/train_intent_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py index d2472b6b9..a91102093 100644 --- a/examples/training/train_intent_parser.py +++ b/examples/training/train_intent_parser.py @@ -2,7 +2,7 @@ # coding: utf-8 """Using the parser to recognise your own semantics -spaCy's parser component can be used to trained to predict any type of tree +spaCy's parser component can be trained to predict any type of tree structure over your input text. You can also predict trees over whole documents or chat logs, with connections between the sentence-roots used to annotate discourse structure. 
In this example, we'll build a message parser for a common From aad0610a853b4731806397537f867878fec5efa8 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 26 May 2020 22:30:53 +0200 Subject: [PATCH 045/119] Map NR to PROPN (#5512) --- spacy/lang/zh/tag_map.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py index 41e2d2158..f9b5389ac 100644 --- a/spacy/lang/zh/tag_map.py +++ b/spacy/lang/zh/tag_map.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X -from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE +from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE, PROPN # The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn # Treebank tag set. We also map the tags to the simpler Universal Dependencies @@ -28,7 +28,7 @@ TAG_MAP = { "URL": {POS: X}, "INF": {POS: X}, "NN": {POS: NOUN}, - "NR": {POS: NOUN}, + "NR": {POS: PROPN}, "NT": {POS: NOUN}, "VA": {POS: VERB}, "VC": {POS: VERB}, From 25de2a2191c168ce133d922c4e2e041684431228 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 27 May 2020 14:48:54 +0200 Subject: [PATCH 046/119] Improve vector name loading from model meta --- spacy/language.py | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 53a788f2a..2058def8a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -934,15 +934,26 @@ class Language(object): DOCS: https://spacy.io/api/language#from_disk """ + def deserialize_meta(path): + if path.exists(): + data = srsly.read_json(path) + self.meta.update(data) + # self.meta always overrides meta["vectors"] with the metadata + # from self.vocab.vectors, so set the name directly + self.vocab.vectors.name = data.get("vectors", {}).get("name") + + def deserialize_vocab(path): + if path.exists(): + self.vocab.from_disk(path) + _fix_pretrained_vectors_name(self) + if disable is not None: warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable path = util.ensure_path(path) deserializers = OrderedDict() - deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p)) - deserializers["vocab"] = lambda p: self.vocab.from_disk( - p - ) and _fix_pretrained_vectors_name(self) + deserializers["meta.json"] = deserialize_meta + deserializers["vocab"] = deserialize_vocab deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( p, exclude=["vocab"] ) @@ -996,14 +1007,23 @@ class Language(object): DOCS: https://spacy.io/api/language#from_bytes """ + def deserialize_meta(b): + data = srsly.json_loads(b) + self.meta.update(data) + # self.meta always overrides meta["vectors"] with the metadata + # from self.vocab.vectors, so set the name directly + self.vocab.vectors.name = data.get("vectors", {}).get("name") + + def deserialize_vocab(b): + self.vocab.from_bytes(b) + _fix_pretrained_vectors_name(self) + if disable is not None: warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable deserializers = OrderedDict() - deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b)) - deserializers["vocab"] = lambda b: self.vocab.from_bytes( - b - ) and _fix_pretrained_vectors_name(self) + deserializers["meta.json"] = deserialize_meta + deserializers["vocab"] = deserialize_vocab deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( b, exclude=["vocab"] ) @@ -1069,7 +1089,7 @@ class component(object): def 
_fix_pretrained_vectors_name(nlp): # TODO: Replace this once we handle vectors consistently as static # data - if "vectors" in nlp.meta and nlp.meta["vectors"].get("name"): + if "vectors" in nlp.meta and "name" in nlp.meta["vectors"]: nlp.vocab.vectors.name = nlp.meta["vectors"]["name"] elif not nlp.vocab.vectors.size: nlp.vocab.vectors.name = None From 5f0a91cf3771a96e6bcd0c63a9d70e3fc74020d1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 29 May 2020 09:56:29 +0200 Subject: [PATCH 047/119] fix conv-depth parameter --- website/docs/api/cli.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 505977be9..b49a2fb08 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -455,7 +455,7 @@ improvement. ```bash $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] -[--width] [--depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth] +[--width] [--conv-depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth] [--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length] [--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save-every] [--init-tok2vec] [--epoch-start] @@ -467,7 +467,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] | `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | | `output_dir` | positional | Directory to write models to on each epoch. | | `--width`, `-cw` | option | Width of CNN layers. | -| `--depth`, `-cd` | option | Depth of CNN layers. | +| `--conv-depth`, `-cd` | option | Depth of CNN layers. | | `--cnn-window`, `-cW` 2.2.2 | option | Window size for CNN layers. | | `--cnn-pieces`, `-cP` 2.2.2 | option | Maxout size for CNN layers. `1` for [Mish](https://github.com/digantamisra98/Mish). | | `--use-chars`, `-chr` 2.2.2 | flag | Whether to use character-based embedding. | @@ -541,16 +541,16 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| ----------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. 
| +| Argument | Type | Description | +| -------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. | ## Evaluate {#evaluate new="2"} From 04ba37b667764c5b18825a5ee8ce513962e73bcd Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 29 May 2020 13:52:39 +0200 Subject: [PATCH 048/119] fix description --- examples/training/pretrain_textcat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index f3e493f6a..d29e20ad1 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -187,7 +187,7 @@ def evaluate_textcat(tokenizer, textcat, texts, cats): width=("Width of CNN layers", "positional", None, int), embed_size=("Embedding rows", "positional", None, int), pretrain_iters=("Number of iterations to pretrain", "option", "pn", int), - train_iters=("Number of iterations to pretrain", "option", "tn", int), + train_iters=("Number of iterations to train", "option", "tn", int), train_examples=("Number of labelled examples", "option", "eg", int), vectors_model=("Name or path to vectors model to learn from"), ) From e1b7cbd197954928974296c6a622ddb70211dd30 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 29 May 2020 14:33:47 +0200 Subject: [PATCH 049/119] Remove MorphAnalysis __str__ and __repr__ --- spacy/tokens/morphanalysis.pyx | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index e09870741..12f2f6cc3 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -46,12 +46,6 @@ cdef class MorphAnalysis: """The number of features in the analysis.""" return self.c.length - def __str__(self): - return self.to_json() - - def __repr__(self): - return self.to_json() - def __hash__(self): return self.key From 291483157dacfc80ecd6ba2f7e097fbe98a4395a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 29 May 2020 17:38:33 +0200 Subject: [PATCH 050/119] prevent loading a pretrained Tok2Vec layer AND pretrained components --- spacy/cli/train.py | 9 +++++++-- spacy/errors.py | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py 
b/spacy/cli/train.py index 6ce095c15..d4de9aeb4 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -15,6 +15,7 @@ import random from .._ml import create_default_optimizer from ..util import use_gpu as set_gpu +from ..errors import Errors from ..gold import GoldCorpus from ..compat import path2str from ..lookups import Lookups @@ -182,6 +183,7 @@ def train( msg.warn("Unable to activate GPU: {}".format(use_gpu)) msg.text("Using CPU only") use_gpu = -1 + base_components = [] if base_model: msg.text("Starting with base model '{}'".format(base_model)) nlp = util.load_model(base_model) @@ -227,6 +229,7 @@ def train( exits=1, ) msg.text("Extending component from base model '{}'".format(pipe)) + base_components.append(pipe) disabled_pipes = nlp.disable_pipes( [p for p in nlp.pipe_names if p not in pipeline] ) @@ -299,7 +302,7 @@ def train( # Load in pretrained weights if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) + components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # Verify textcat config @@ -642,7 +645,7 @@ def _load_vectors(nlp, vectors): util.load_model(vectors, vocab=nlp.vocab) -def _load_pretrained_tok2vec(nlp, loc): +def _load_pretrained_tok2vec(nlp, loc, base_components): """Load pretrained weights for the 'token-to-vector' part of the component models, which is typically a CNN. See 'spacy pretrain'. Experimental. """ @@ -651,6 +654,8 @@ def _load_pretrained_tok2vec(nlp, loc): loaded = [] for name, component in nlp.pipeline: if hasattr(component, "model") and hasattr(component.model, "tok2vec"): + if name in base_components: + raise ValueError(Errors.E200.format(component=name)) component.tok2vec.from_bytes(weights_data) loaded.append(name) return loaded diff --git a/spacy/errors.py b/spacy/errors.py index 6d92545d7..11b601e19 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -568,6 +568,8 @@ class Errors(object): E198 = ("Unable to return {n} most similar vectors for the current vectors " "table, which contains {n_rows} vectors.") E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") + E200 = ("Specifying a base model with a pretrained component '{component}' " + "can not be combined with adding a pretrained Tok2Vec layer.") @add_codes From 64adda32029b867b25bc6f3313863abfc70a6fd1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 29 May 2020 23:21:55 +0200 Subject: [PATCH 051/119] Revert "Remove peeking from Parser.begin_training (#5456)" This reverts commit 9393253b66b5f9fc6c5e58806cf261da5afd1778. The model shouldn't need to see all examples, and actually in v3 there's no equivalent step. All examples are provided to the component, for the component to do stuff like figuring out the labels. The model just needs to do stuff like shape inference. 
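A minimal plain-Python sketch of the pattern this revert restores: cap the number of gold tuples used for model shape inference with `itertools.islice` instead of materializing every example. The helper name `sample_for_shape_inference`, the 1000-example default, and the dict form of the gold annotations are assumptions for illustration only; the actual parser code restored in the diff below builds `Doc` and `GoldParse` objects instead.

```python
from itertools import islice

def sample_for_shape_inference(get_gold_tuples, limit=1000):
    """Collect a bounded sample of examples for model shape inference.

    Shape inference only needs representative inputs, so islice stops
    after `limit` raw texts rather than walking the whole corpus.
    """
    word_seqs = []
    gold_dicts = []
    for raw_text, annots_brackets in islice(get_gold_tuples(), limit):
        for annots, brackets in annots_brackets:
            ids, words, tags, heads, deps, ents = annots
            word_seqs.append(words)
            gold_dicts.append(
                {"tags": tags, "heads": heads, "deps": deps, "entities": ents}
            )
    return word_seqs, gold_dicts
```

Because `get_gold_tuples()` is only iterated lazily, the component can still be handed the full corpus for label discovery while the model itself sees just this capped sample.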
--- spacy/syntax/nn_parser.pyx | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index fafa492c6..d5c6bf2a8 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -9,6 +9,7 @@ import numpy cimport cython.parallel import numpy.random cimport numpy as np +from itertools import islice from cpython.ref cimport PyObject, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from libc.math cimport exp @@ -620,15 +621,15 @@ cdef class Parser: self.model, cfg = self.Model(self.moves.n_moves, **cfg) if sgd is None: sgd = self.create_optimizer() - docs = [] - golds = [] - for raw_text, annots_brackets in get_gold_tuples(): + doc_sample = [] + gold_sample = [] + for raw_text, annots_brackets in islice(get_gold_tuples(), 1000): for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots - docs.append(Doc(self.vocab, words=words)) - golds.append(GoldParse(docs[-1], words=words, tags=tags, - heads=heads, deps=deps, entities=ents)) - self.model.begin_training(docs, golds) + doc_sample.append(Doc(self.vocab, words=words)) + gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags, + heads=heads, deps=deps, entities=ents)) + self.model.begin_training(doc_sample, gold_sample) if pipeline is not None: self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab) From 15134ef611f6d63ccf45afaacc0e6240d26576f9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 30 May 2020 12:53:32 +0200 Subject: [PATCH 052/119] fix deserialization order --- spacy/vectors.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 51ddc3f9a..aec086e6c 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -425,9 +425,9 @@ cdef class Vectors: self.data = xp.load(str(path)) serializers = OrderedDict(( - ("key2row", load_key2row), - ("keys", load_keys), ("vectors", load_vectors), + ("keys", load_keys), + ("key2row", load_key2row), )) util.from_disk(path, serializers, []) self._sync_unset() From a005ccd6d7b0d62018481cd5f0ffe34d7fb51ab3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 31 May 2020 19:57:54 +0200 Subject: [PATCH 053/119] Preserve _SP when filtering tag map in Tagger To allow "SP" as a tag (for Chinese OntoNotes), preserve "_SP" if present as the reference `SPACE` POS in the tag map in `Tagger.begin_training()`. 
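A rough standalone sketch of the tag-map filtering this patch adjusts, assuming a `filter_tag_map` helper and toy tag maps that are not part of the actual Tagger code: only tags observed in the training data are kept (unknowns fall back to `X`), so the special `_SP` whitespace entry, which may never occur as a training tag, has to be copied over explicitly.

```python
from spacy.symbols import POS, NOUN, SPACE, X

def filter_tag_map(orig_tag_map, seen_tags):
    """Keep only tags seen in training, but always preserve "_SP"."""
    new_tag_map = {}
    for tag in seen_tags:
        if tag in orig_tag_map:
            new_tag_map[tag] = orig_tag_map[tag]
        else:
            # Unknown tag: map it to the generic X part of speech
            new_tag_map[tag] = {POS: X}
    # "_SP" rarely appears as a training label, but the vocab still needs a
    # reference SPACE entry, so carry it over from the original map.
    if "_SP" in orig_tag_map:
        new_tag_map["_SP"] = orig_tag_map["_SP"]
    return new_tag_map

# Hypothetical usage with a Chinese-OntoNotes-style map where "SP" is a real tag
orig = {"NN": {POS: NOUN}, "SP": {POS: SPACE}, "_SP": {POS: SPACE}}
print(filter_tag_map(orig, seen_tags={"NN", "SP"}))
```

With this sketch, `SP` survives as an ordinary training tag while `_SP` is re-added purely as the reference whitespace entry, which is the behaviour the two added lines in the diff below provide.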
--- spacy/pipeline/pipes.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index ccd847ef1..105ce00e6 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -526,6 +526,8 @@ class Tagger(Pipe): new_tag_map[tag] = orig_tag_map[tag] else: new_tag_map[tag] = {POS: X} + if "_SP" in orig_tag_map: + new_tag_map["_SP"] = orig_tag_map["_SP"] cdef Vocab vocab = self.vocab if new_tag_map: vocab.morphology = Morphology(vocab.strings, new_tag_map, From 7d5a89661e690473114e52d3ca1c27ef2ff733e9 Mon Sep 17 00:00:00 2001 From: Leo <3646521+leomrocha@users.noreply.github.com> Date: Sun, 31 May 2020 20:13:39 +0200 Subject: [PATCH 054/119] contributor agreement signed (#5525) --- .github/contributors/leomrocha.md | 106 ++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/leomrocha.md diff --git a/.github/contributors/leomrocha.md b/.github/contributors/leomrocha.md new file mode 100644 index 000000000..495654153 --- /dev/null +++ b/.github/contributors/leomrocha.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Leonardo M. Rocha | +| Company name (if applicable) | | +| Title or role (if applicable) | Eng. 
| +| Date | 31/05/2020 | +| GitHub username | leomrocha | +| Website (optional) | | From c21c308ecbf1021a180d2a6c201eda57e73078ca Mon Sep 17 00:00:00 2001 From: Leo <3646521+leomrocha@users.noreply.github.com> Date: Sun, 31 May 2020 22:08:12 +0200 Subject: [PATCH 055/119] corrected issue #5524 changed 'STRING TERMINATOR' for LATIN SMALL LIGATURE OE' (#5526) --- spacy/lang/fr/_tokenizer_exceptions_list.py | 65 ++++++++++----------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/spacy/lang/fr/_tokenizer_exceptions_list.py b/spacy/lang/fr/_tokenizer_exceptions_list.py index c9fcfff2d..0fcf02351 100644 --- a/spacy/lang/fr/_tokenizer_exceptions_list.py +++ b/spacy/lang/fr/_tokenizer_exceptions_list.py @@ -534,7 +534,6 @@ FR_BASE_EXCEPTIONS = [ "Beaumont-Hamel", "Beaumont-Louestault", "Beaumont-Monteux", - "Beaumont-Pied-de-Bœuf", "Beaumont-Pied-de-Bœuf", "Beaumont-Sardolles", "Beaumont-Village", @@ -951,7 +950,7 @@ FR_BASE_EXCEPTIONS = [ "Buxières-sous-les-Côtes", "Buzy-Darmont", "Byhleguhre-Byhlen", - "Bœurs-en-Othe", + "Bœurs-en-Othe", "Bâle-Campagne", "Bâle-Ville", "Béard-Géovreissiat", @@ -1589,11 +1588,11 @@ FR_BASE_EXCEPTIONS = [ "Cruci-Falgardiens", "Cruquius-Oost", "Cruviers-Lascours", - "Crèvecœur-en-Auge", - "Crèvecœur-en-Brie", - "Crèvecœur-le-Grand", - "Crèvecœur-le-Petit", - "Crèvecœur-sur-l'Escaut", + "Crèvecœur-en-Auge", + "Crèvecœur-en-Brie", + "Crèvecœur-le-Grand", + "Crèvecœur-le-Petit", + "Crèvecœur-sur-l'Escaut", "Crécy-Couvé", "Créon-d'Armagnac", "Cubjac-Auvézère-Val-d'Ans", @@ -1619,7 +1618,7 @@ FR_BASE_EXCEPTIONS = [ "Cuxac-Cabardès", "Cuxac-d'Aude", "Cuyk-Sainte-Agathe", - "Cœuvres-et-Valsery", + "Cœuvres-et-Valsery", "Céaux-d'Allègre", "Céleste-Empire", "Cénac-et-Saint-Julien", @@ -1682,7 +1681,7 @@ FR_BASE_EXCEPTIONS = [ "Devrai-Gondragnières", "Dhuys et Morin-en-Brie", "Diane-Capelle", - "Dieffenbach-lès-Wœrth", + "Dieffenbach-lès-Wœrth", "Diekhusen-Fahrstedt", "Diennes-Aubigny", "Diensdorf-Radlow", @@ -1755,7 +1754,7 @@ FR_BASE_EXCEPTIONS = [ "Durdat-Larequille", "Durfort-Lacapelette", "Durfort-et-Saint-Martin-de-Sossenac", - "Dœuil-sur-le-Mignon", + "Dœuil-sur-le-Mignon", "Dão-Lafões", "Débats-Rivière-d'Orpra", "Décines-Charpieu", @@ -2690,8 +2689,8 @@ FR_BASE_EXCEPTIONS = [ "Kuhlen-Wendorf", "KwaZulu-Natal", "Kyzyl-Arvat", - "Kœur-la-Grande", - "Kœur-la-Petite", + "Kœur-la-Grande", + "Kœur-la-Petite", "Kölln-Reisiek", "Königsbach-Stein", "Königshain-Wiederau", @@ -4027,7 +4026,7 @@ FR_BASE_EXCEPTIONS = [ "Marcilly-d'Azergues", "Marcillé-Raoul", "Marcillé-Robert", - "Marcq-en-Barœul", + "Marcq-en-Barœul", "Marcy-l'Etoile", "Marcy-l'Étoile", "Mareil-Marly", @@ -4261,7 +4260,7 @@ FR_BASE_EXCEPTIONS = [ "Monlezun-d'Armagnac", "Monléon-Magnoac", "Monnetier-Mornex", - "Mons-en-Barœul", + "Mons-en-Barœul", "Monsempron-Libos", "Monsteroux-Milieu", "Montacher-Villegardin", @@ -4351,7 +4350,7 @@ FR_BASE_EXCEPTIONS = [ "Mornay-Berry", "Mortain-Bocage", "Morteaux-Couliboeuf", - "Morteaux-Coulibœuf", + "Morteaux-Coulibœuf", "Morteaux-Coulibœuf", "Mortes-Frontières", "Mory-Montcrux", @@ -4394,7 +4393,7 @@ FR_BASE_EXCEPTIONS = [ "Muncq-Nieurlet", "Murtin-Bogny", "Murtin-et-le-Châtelet", - "Mœurs-Verdey", + "Mœurs-Verdey", "Ménestérol-Montignac", "Ménil'muche", "Ménil-Annelles", @@ -4615,7 +4614,7 @@ FR_BASE_EXCEPTIONS = [ "Neuves-Maisons", "Neuvic-Entier", "Neuvicq-Montguyon", - "Neuville-lès-Lœuilly", + "Neuville-lès-Lœuilly", "Neuvy-Bouin", "Neuvy-Deux-Clochers", "Neuvy-Grandchamp", @@ -4776,8 +4775,8 @@ FR_BASE_EXCEPTIONS = [ "Nuncq-Hautecôte", 
"Nurieux-Volognat", "Nuthe-Urstromtal", - "Nœux-les-Mines", - "Nœux-lès-Auxi", + "Nœux-les-Mines", + "Nœux-lès-Auxi", "Nâves-Parmelan", "Nézignan-l'Evêque", "Nézignan-l'Évêque", @@ -5346,7 +5345,7 @@ FR_BASE_EXCEPTIONS = [ "Quincy-Voisins", "Quincy-sous-le-Mont", "Quint-Fonsegrives", - "Quœux-Haut-Maînil", + "Quœux-Haut-Maînil", "Quœux-Haut-Maînil", "Qwa-Qwa", "R.-V.", @@ -5634,12 +5633,12 @@ FR_BASE_EXCEPTIONS = [ "Saint Aulaye-Puymangou", "Saint Geniez d'Olt et d'Aubrac", "Saint Martin de l'If", - "Saint-Denœux", - "Saint-Jean-de-Bœuf", - "Saint-Martin-le-Nœud", - "Saint-Michel-Tubœuf", + "Saint-Denœux", + "Saint-Jean-de-Bœuf", + "Saint-Martin-le-Nœud", + "Saint-Michel-Tubœuf", "Saint-Paul - Flaugnac", - "Saint-Pierre-de-Bœuf", + "Saint-Pierre-de-Bœuf", "Saint-Thegonnec Loc-Eguiner", "Sainte-Alvère-Saint-Laurent Les Bâtons", "Salignac-Eyvignes", @@ -6211,7 +6210,7 @@ FR_BASE_EXCEPTIONS = [ "Tite-Live", "Titisee-Neustadt", "Tobel-Tägerschen", - "Togny-aux-Bœufs", + "Togny-aux-Bœufs", "Tongre-Notre-Dame", "Tonnay-Boutonne", "Tonnay-Charente", @@ -6339,7 +6338,7 @@ FR_BASE_EXCEPTIONS = [ "Vals-près-le-Puy", "Valverde-Enrique", "Valzin-en-Petite-Montagne", - "Vandœuvre-lès-Nancy", + "Vandœuvre-lès-Nancy", "Varces-Allières-et-Risset", "Varenne-l'Arconce", "Varenne-sur-le-Doubs", @@ -6460,9 +6459,9 @@ FR_BASE_EXCEPTIONS = [ "Villenave-d'Ornon", "Villequier-Aumont", "Villerouge-Termenès", - "Villers-aux-Nœuds", + "Villers-aux-Nœuds", "Villez-sur-le-Neubourg", - "Villiers-en-Désœuvre", + "Villiers-en-Désœuvre", "Villieu-Loyes-Mollon", "Villingen-Schwenningen", "Villié-Morgon", @@ -6470,7 +6469,7 @@ FR_BASE_EXCEPTIONS = [ "Vilosnes-Haraumont", "Vilters-Wangs", "Vincent-Froideville", - "Vincy-Manœuvre", + "Vincy-Manœuvre", "Vincy-Manœuvre", "Vincy-Reuil-et-Magny", "Vindrac-Alayrac", @@ -6514,8 +6513,8 @@ FR_BASE_EXCEPTIONS = [ "Vrigne-Meusiens", "Vrijhoeve-Capelle", "Vuisternens-devant-Romont", - "Vœlfling-lès-Bouzonville", - "Vœuil-et-Giget", + "Vœlfling-lès-Bouzonville", + "Vœuil-et-Giget", "Vélez-Blanco", "Vélez-Málaga", "Vélez-Rubio", @@ -6618,7 +6617,7 @@ FR_BASE_EXCEPTIONS = [ "Wust-Fischbeck", "Wutha-Farnroda", "Wy-dit-Joli-Village", - "Wœlfling-lès-Sarreguemines", + "Wœlfling-lès-Sarreguemines", "Wünnewil-Flamatt", "X-SAMPA", "X-arbre", From 925e93857034c29c46a8b582db4969df7ba50c06 Mon Sep 17 00:00:00 2001 From: Leo <3646521+leomrocha@users.noreply.github.com> Date: Mon, 1 Jun 2020 18:18:34 +0200 Subject: [PATCH 056/119] Spanish tokenizer exception and examples improvement (#5531) * Spanish tokenizer exception additions. 
Added Spanish question examples * erased slang tokenization examples --- spacy/lang/es/examples.py | 6 +++++- spacy/lang/es/tokenizer_exceptions.py | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py index 0e31b56af..7ab0a7dfe 100644 --- a/spacy/lang/es/examples.py +++ b/spacy/lang/es/examples.py @@ -18,5 +18,9 @@ sentences = [ "El gato come pescado.", "Veo al hombre con el telescopio.", "La araña come moscas.", - "El pingüino incuba en su nido.", + "El pingüino incuba en su nido sobre el hielo.", + "¿Dónde estais?", + "¿Quién es el presidente Francés?", + "¿Dónde está encuentra la capital de Argentina?", + "¿Cuándo nació José de San Martín?", ] diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 2c2631086..891323705 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -4,15 +4,16 @@ from __future__ import unicode_literals from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA -_exc = { - "pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}], - "pala": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "la", LEMMA: "la", NORM: "la"}], -} +_exc = {} for exc_data in [ + {ORTH: "n°", LEMMA: "número"}, + {ORTH: "°C", LEMMA: "grados Celcius"}, {ORTH: "aprox.", LEMMA: "aproximadamente"}, {ORTH: "dna.", LEMMA: "docena"}, + {ORTH: "dpto.", LEMMA: "departamento"}, + {ORTH: "ej.", LEMMA: "ejemplo"}, {ORTH: "esq.", LEMMA: "esquina"}, {ORTH: "pág.", LEMMA: "página"}, {ORTH: "p.ej.", LEMMA: "por ejemplo"}, @@ -20,6 +21,8 @@ for exc_data in [ {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, + {ORTH: "vol.", NORM: "volúmen"}, + ]: _exc[exc_data[ORTH]] = [exc_data] @@ -39,10 +42,14 @@ for h in range(1, 12 + 1): for orth in [ "a.C.", "a.J.C.", + "d.C.", + "d.J.C.", "apdo.", "Av.", "Avda.", "Cía.", + "Dr.", + "Dra.", "EE.UU.", "etc.", "fig.", @@ -58,8 +65,10 @@ for orth in [ "Prof.", "Profa.", "q.e.p.d.", + "Q.E.P.D." "S.A.", "S.L.", + "S.R.L." "s.s.s.", "Sr.", "Sra.", From bbc1836581932d24818df064da8d64c7ec03ca23 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 2 Jun 2020 17:23:16 +0200 Subject: [PATCH 057/119] Add rudimentary version checks on model load --- spacy/errors.py | 12 ++++++++++++ spacy/tests/test_misc.py | 30 ++++++++++++++++++++++++++++++ spacy/util.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 11b601e19..baed574f8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -115,6 +115,18 @@ class Warnings(object): "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" " to check the alignment. Misaligned entities ('-') will be " "ignored during training.") + W031 = ("Model '{model}' ({model_version}) requires spaCy {version} and " + "is incompatible with the current spaCy version ({current}). This " + "may lead to unexpected results or runtime errors. To resolve " + "this, download a newer compatible model or retrain your custom " + "model with the current spaCy version. For more details and " + "available updates, run: python -m spacy validate") + W032 = ("Unable to determine model compatibility for model '{model}' " + "({model_version}) with the current spaCy version ({current}). " + "This may lead to unexpected results or runtime errors. 
To resolve " + "this, download a newer compatible model or retrain your custom " + "model with the current spaCy version. For more details and " + "available updates, run: python -m spacy validate") @add_codes diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 3ac621649..bb7ade35e 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -10,6 +10,7 @@ from spacy import prefer_gpu, require_gpu from spacy.compat import symlink_to, symlink_remove, path2str, is_windows from spacy._ml import PrecomputableAffine from subprocess import CalledProcessError +from .util import make_tempdir @pytest.fixture @@ -146,3 +147,32 @@ def test_load_model_blank_shortcut(): assert nlp.pipeline == [] with pytest.raises(ImportError): util.load_model("blank:fjsfijsdof") + + +def test_load_model_version_compat(): + """Test warnings for various spacy_version specifications in meta. Since + this is more of a hack for v2, manually specify the current major.minor + version to simplify test creation.""" + nlp = util.load_model("blank:en") + assert nlp.meta["spacy_version"].startswith(">=2.3") + with make_tempdir() as d: + # no change: compatible + nlp.to_disk(d) + nlp2 = util.load_model(d) + + # additional compatible upper pin + nlp.meta["spacy_version"] = ">=2.3.0,<2.4.0" + nlp.to_disk(d) + nlp2 = util.load_model(d) + + # incompatible older version + nlp.meta["spacy_version"] = ">=2.2.5" + nlp.to_disk(d) + with pytest.warns(UserWarning): + nlp_reloaded = util.load_model(d) + + # invalid version specification + nlp.meta["spacy_version"] = ">@#$%_invalid_version" + nlp.to_disk(d) + with pytest.warns(UserWarning): + nlp_reloaded = util.load_model(d) diff --git a/spacy/util.py b/spacy/util.py index 5fd296404..36df5725f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -17,6 +17,7 @@ import srsly import catalogue import sys import warnings +from . 
import about try: import jsonschema @@ -250,6 +251,33 @@ def get_model_meta(path): for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: raise ValueError(Errors.E054.format(setting=setting)) + if "spacy_version" in meta: + about_major_minor = ".".join(about.__version__.split(".")[:2]) + if about_major_minor is not None and not meta["spacy_version"].startswith( + ">=" + about_major_minor + ): + # try to simplify version requirements from model meta to vx.x + # for warning message + meta_spacy_version = "v" + ".".join( + meta["spacy_version"].replace(">=", "").split(".")[:2] + ) + # if the format is unexpected, supply the full version + if not re.match(r"v\d+\.\d+", meta_spacy_version): + meta_spacy_version = meta["spacy_version"] + warn_msg = Warnings.W031.format( + model=meta["lang"] + "_" + meta["name"], + model_version=meta["version"], + version=meta_spacy_version, + current=about.__version__, + ) + warnings.warn(warn_msg) + else: + warn_msg = Warnings.W032.format( + model=meta["lang"] + "_" + meta["name"], + model_version=meta["version"], + current=about.__version__, + ) + warnings.warn(warn_msg) return meta From 75f08ad62d9d0b9ea8ecf0454da332f99b00ec45 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 2 Jun 2020 17:41:25 +0200 Subject: [PATCH 058/119] Remove unnecessary check --- spacy/util.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 36df5725f..5362952e2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -253,9 +253,7 @@ def get_model_meta(path): raise ValueError(Errors.E054.format(setting=setting)) if "spacy_version" in meta: about_major_minor = ".".join(about.__version__.split(".")[:2]) - if about_major_minor is not None and not meta["spacy_version"].startswith( - ">=" + about_major_minor - ): + if not meta["spacy_version"].startswith(">=" + about_major_minor): # try to simplify version requirements from model meta to vx.x # for warning message meta_spacy_version = "v" + ".".join( From a57bdeecacb664e80e0c8408492e28eb9dd31a79 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Jun 2020 12:10:12 +0200 Subject: [PATCH 059/119] Test util.get_model_meta instead of util.load_model --- spacy/tests/test_misc.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index bb7ade35e..a361d5c0f 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import pytest import os import ctypes +import srsly from pathlib import Path from spacy import util from spacy import prefer_gpu, require_gpu @@ -158,21 +159,22 @@ def test_load_model_version_compat(): with make_tempdir() as d: # no change: compatible nlp.to_disk(d) - nlp2 = util.load_model(d) + meta_path = Path(d / "meta.json") + util.get_model_meta(d) # additional compatible upper pin nlp.meta["spacy_version"] = ">=2.3.0,<2.4.0" - nlp.to_disk(d) - nlp2 = util.load_model(d) + srsly.write_json(Path(d / "meta.json"), nlp.meta) + util.get_model_meta(d) # incompatible older version nlp.meta["spacy_version"] = ">=2.2.5" - nlp.to_disk(d) + srsly.write_json(Path(d / "meta.json"), nlp.meta) with pytest.warns(UserWarning): - nlp_reloaded = util.load_model(d) + util.get_model_meta(d) # invalid version specification nlp.meta["spacy_version"] = ">@#$%_invalid_version" - nlp.to_disk(d) + srsly.write_json(Path(d / "meta.json"), nlp.meta) with pytest.warns(UserWarning): - nlp_reloaded = util.load_model(d) + 
util.get_model_meta(d) From 8c758ed1ebc3d35f03707e593b83b214d40f434b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 3 Jun 2020 12:11:57 +0200 Subject: [PATCH 060/119] Fix meta path --- spacy/tests/test_misc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index a361d5c0f..d48ba24a2 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -164,17 +164,17 @@ def test_load_model_version_compat(): # additional compatible upper pin nlp.meta["spacy_version"] = ">=2.3.0,<2.4.0" - srsly.write_json(Path(d / "meta.json"), nlp.meta) + srsly.write_json(meta_path, nlp.meta) util.get_model_meta(d) # incompatible older version nlp.meta["spacy_version"] = ">=2.2.5" - srsly.write_json(Path(d / "meta.json"), nlp.meta) + srsly.write_json(meta_path, nlp.meta) with pytest.warns(UserWarning): util.get_model_meta(d) # invalid version specification nlp.meta["spacy_version"] = ">@#$%_invalid_version" - srsly.write_json(Path(d / "meta.json"), nlp.meta) + srsly.write_json(meta_path, nlp.meta) with pytest.warns(UserWarning): util.get_model_meta(d) From 410fb7ee437b649c8bd291da84db5dc7cd65db45 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 5 Jun 2020 02:15:43 +0900 Subject: [PATCH 061/119] Add Japanese Model (#5544) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add more rules to deal with Japanese UD mappings Japanese UD rules sometimes give different UD tags to tokens with the same underlying POS tag. The UD spec indicates these cases should be disambiguated using the output of a tool called "comainu", but rules are enough to get the right result. These rules are taken from Ginza at time of writing, see #3756. * Add new tags from GSD This is a few rare tags that aren't in Unidic but are in the GSD data. * Add basic Japanese sentencization This code is taken from Ginza again. * Add sentenceizer quote handling Could probably add more paired characters but this will do for now. Also includes some tests. * Replace fugashi with SudachiPy * Modify tag format to match GSD annotations Some of the tests still need to be updated, but I want to get this up for testing training. * Deal with case with closing punct without opening * refactor resolve_pos() * change tag field separator from "," to "-" * add TAG_ORTH_MAP * add TAG_BIGRAM_MAP * revise rules for 連体詞 * revise rules for 連体詞 * improve POS about 2% * add syntax_iterator.py (not mature yet) * improve syntax_iterators.py * improve syntax_iterators.py * add phrases including nouns and drop NPs consist of STOP_WORDS * First take at noun chunks This works in many situations but still has issues in others. If the start of a subtree has no noun, then nested phrases can be generated. また行きたい、そんな気持ちにさせてくれるお店です。 [そんな気持ち, また行きたい、そんな気持ちにさせてくれるお店] For some reason て gets included sometimes. Not sure why. ゲンに連れ添って円盤生物を調査するパートナーとなる。 [て円盤生物, ...] Some phrases that look like they should be split are grouped together; not entirely sure that's wrong. This whole thing becomes one chunk: 道の駅遠山郷北側からかぐら大橋南詰現道交点までの1.060kmのみ開通済み * Use new generic get_words_and_spaces The new get_words_and_spaces function is simpler than what was used in Japanese, so it's good to be able to switch to it. However, there was an issue. The new function works just on text, so POS info could get out of sync. Fixing this required a small change to the way dtokens (tokens with POS and lemma info) were generated. 
Specifically, multiple extraneous spaces now become a single token, so when generating dtokens multiple space tokens should be created in a row. * Fix noun_chunks, should be working now * Fix some tests, add naughty strings tests Some of the existing tests changed because the tokenization mode of Sudachi changed to the more fine-grained A mode. Sudachi also has issues with some strings, so this adds a test against the naughty strings. * Remove empty Sudachi tokens Not doing this creates zero-length tokens and causes errors in the internal spaCy processing. * Add yield_bunsetu back in as a separate piece of code Co-authored-by: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com> Co-authored-by: hiroshi --- spacy/lang/ja/__init__.py | 152 ++++++++++++++------- spacy/lang/ja/bunsetu.py | 144 ++++++++++++++++++++ spacy/lang/ja/syntax_iterators.py | 55 ++++++++ spacy/lang/ja/tag_bigram_map.py | 37 +++++ spacy/lang/ja/tag_map.py | 158 ++++++++++++---------- spacy/lang/ja/tag_orth_map.py | 30 ++++ spacy/tests/lang/ja/test_lemmatization.py | 2 +- spacy/tests/lang/ja/test_tokenizer.py | 35 +++-- 8 files changed, 486 insertions(+), 127 deletions(-) create mode 100644 spacy/lang/ja/bunsetu.py create mode 100644 spacy/lang/ja/syntax_iterators.py create mode 100644 spacy/lang/ja/tag_bigram_map.py create mode 100644 spacy/lang/ja/tag_orth_map.py diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 22590043f..09546467e 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -5,97 +5,148 @@ import re from collections import namedtuple from .stop_words import STOP_WORDS +from .syntax_iterators import SYNTAX_ITERATORS from .tag_map import TAG_MAP +from .tag_orth_map import TAG_ORTH_MAP +from .tag_bigram_map import TAG_BIGRAM_MAP from ...attrs import LANG -from ...language import Language -from ...tokens import Doc from ...compat import copy_reg -from ...util import DummyTokenizer +from ...language import Language +from ...symbols import POS +from ...tokens import Doc +from ...util import DummyTokenizer, get_words_and_spaces + +# Hold the attributes we need with convenient names +DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"]) # Handling for multiple spaces in a row is somewhat awkward, this simplifies # the flow by creating a dummy with the same interface. -DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"]) -DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"]) -DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" ")) +DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"]) +DummySpace = DummyNode(" ", " ", " ") -def try_fugashi_import(): - """Fugashi is required for Japanese support, so check for it. +def try_sudachi_import(): + """SudachiPy is required for Japanese support, so check for it. It it's not available blow up and explain how to fix it.""" try: - import fugashi + from sudachipy import dictionary, tokenizer - return fugashi + tok = dictionary.Dictionary().create( + mode=tokenizer.Tokenizer.SplitMode.A + ) + return tok except ImportError: raise ImportError( - "Japanese support requires Fugashi: " "https://github.com/polm/fugashi" + "Japanese support requires SudachiPy: " "https://github.com/WorksApplications/SudachiPy" ) -def resolve_pos(token): +def resolve_pos(token, next_token): """If necessary, add a field to the POS tag for UD mapping. 
Under Universal Dependencies, sometimes the same Unidic POS tag can be mapped differently depending on the literal token or its context - in the sentence. This function adds information to the POS tag to - resolve ambiguous mappings. + in the sentence. This function returns resolved POSs for both token + and next_token by tuple. """ - # this is only used for consecutive ascii spaces - if token.surface == " ": - return "空白" + # Some tokens have their UD tag decided based on the POS of the following + # token. - # TODO: This is a first take. The rules here are crude approximations. - # For many of these, full dependencies are needed to properly resolve - # PoS mappings. - if token.pos == "連体詞,*,*,*": - if re.match(r"[こそあど此其彼]の", token.surface): - return token.pos + ",DET" - if re.match(r"[こそあど此其彼]", token.surface): - return token.pos + ",PRON" - return token.pos + ",ADJ" - return token.pos + # orth based rules + if token.pos in TAG_ORTH_MAP: + orth_map = TAG_ORTH_MAP[token.pos[0]] + if token.surface in orth_map: + return orth_map[token.surface], None + + # tag bi-gram mapping + if next_token: + tag_bigram = token.pos[0], next_token.pos[0] + if tag_bigram in TAG_BIGRAM_MAP: + bipos = TAG_BIGRAM_MAP[tag_bigram] + if bipos[0] is None: + return TAG_MAP[token.pos[0]][POS], bipos[1] + else: + return bipos + + return TAG_MAP[token.pos[0]][POS], None -def get_words_and_spaces(tokenizer, text): - """Get the individual tokens that make up the sentence and handle white space. +# Use a mapping of paired punctuation to avoid splitting quoted sentences. +pairpunct = {'「':'」', '『': '』', '【': '】'} - Japanese doesn't usually use white space, and MeCab's handling of it for - multiple spaces in a row is somewhat awkward. + +def separate_sentences(doc): + """Given a doc, mark tokens that start sentences based on Unidic tags. """ - tokens = tokenizer.parseToNodeList(text) + stack = [] # save paired punctuation + for i, token in enumerate(doc[:-2]): + # Set all tokens after the first to false by default. This is necessary + # for the doc code to be aware we've done sentencization, see + # `is_sentenced`. + token.sent_start = (i == 0) + if token.tag_: + if token.tag_ == "補助記号-括弧開": + ts = str(token) + if ts in pairpunct: + stack.append(pairpunct[ts]) + elif stack and ts == stack[-1]: + stack.pop() + + if token.tag_ == "補助記号-句点": + next_token = doc[i+1] + if next_token.tag_ != token.tag_ and not stack: + next_token.sent_start = True + + +def get_dtokens(tokenizer, text): + tokens = tokenizer.tokenize(text) words = [] - spaces = [] - for token in tokens: - # If there's more than one space, spaces after the first become tokens - for ii in range(len(token.white_space) - 1): - words.append(DummySpace) - spaces.append(False) - - words.append(token) - spaces.append(bool(token.white_space)) - return words, spaces + for ti, token in enumerate(tokens): + tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']) + inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*']) + dtoken = DetailedToken( + token.surface(), + (tag, inf), + token.dictionary_form()) + if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白': + # don't add multiple space tokens in a row + continue + words.append(dtoken) + # remove empty tokens. These can be produced with characters like … that + # Sudachi normalizes internally. 
+ words = [ww for ww in words if len(ww.surface) > 0] + return words class JapaneseTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.tokenizer = try_fugashi_import().Tagger() - self.tokenizer.parseToNodeList("") # see #2901 + self.tokenizer = try_sudachi_import() def __call__(self, text): - dtokens, spaces = get_words_and_spaces(self.tokenizer, text) + dtokens = get_dtokens(self.tokenizer, text) + words = [x.surface for x in dtokens] + words, spaces = get_words_and_spaces(words, text) + unidic_tags = [",".join(x.pos) for x in dtokens] doc = Doc(self.vocab, words=words, spaces=spaces) - unidic_tags = [] - for token, dtoken in zip(doc, dtokens): - unidic_tags.append(dtoken.pos) - token.tag_ = resolve_pos(dtoken) + next_pos = None + for ii, (token, dtoken) in enumerate(zip(doc, dtokens)): + ntoken = dtokens[ii+1] if ii+1 < len(dtokens) else None + token.tag_ = dtoken.pos[0] + if next_pos: + token.pos = next_pos + next_pos = None + else: + token.pos, next_pos = resolve_pos(dtoken, ntoken) # if there's no lemma info (it's an unk) just use the surface - token.lemma_ = dtoken.feature.lemma or dtoken.surface + token.lemma_ = dtoken.lemma doc.user_data["unidic_tags"] = unidic_tags + + separate_sentences(doc) return doc @@ -104,6 +155,7 @@ class JapaneseDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda _text: "ja" stop_words = STOP_WORDS tag_map = TAG_MAP + syntax_iterators = SYNTAX_ITERATORS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} @classmethod diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py new file mode 100644 index 000000000..7c3eee336 --- /dev/null +++ b/spacy/lang/ja/bunsetu.py @@ -0,0 +1,144 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + + +POS_PHRASE_MAP = { + "NOUN": "NP", + "NUM": "NP", + "PRON": "NP", + "PROPN": "NP", + + "VERB": "VP", + + "ADJ": "ADJP", + + "ADV": "ADVP", + + "CCONJ": "CCONJP", +} + + +# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)] +def yield_bunsetu(doc, debug=False): + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + prev = None + prev_tag = None + prev_dep = None + prev_head = None + for t in doc: + pos = t.pos_ + pos_type = POS_PHRASE_MAP.get(pos, None) + tag = t.tag_ + dep = t.dep_ + head = t.head.i + if debug: + print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu) + + # DET is always an individual bunsetu + if pos == "DET": + if bunsetu: + yield bunsetu, phrase_type, phrase + yield [t], None, None + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + + # PRON or Open PUNCT always splits bunsetu + elif tag == "補助記号-括弧開": + if bunsetu: + yield bunsetu, phrase_type, phrase + bunsetu = [t] + bunsetu_may_end = True + phrase_type = None + phrase = None + + # bunsetu head not appeared + elif phrase_type is None: + if bunsetu and prev_tag == "補助記号-読点": + yield bunsetu, phrase_type, phrase + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + bunsetu.append(t) + if pos_type: # begin phrase + phrase = [t] + phrase_type = pos_type + if pos_type in {"ADVP", "CCONJP"}: + bunsetu_may_end = True + + # entering new bunsetu + elif pos_type and ( + pos_type != phrase_type or # different phrase type arises + bunsetu_may_end # same phrase type but bunsetu already ended + ): + # exceptional case: NOUN to VERB + if 
phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i: + bunsetu.append(t) + phrase_type = "VP" + phrase.append(t) + # exceptional case: VERB to NOUN + elif phrase_type == "VP" and pos_type == "NP" and ( + prev_dep == 'compound' and prev_head == t.i or + dep == 'compound' and prev == head or + prev_dep == 'nmod' and prev_head == t.i + ): + bunsetu.append(t) + phrase_type = "NP" + phrase.append(t) + else: + yield bunsetu, phrase_type, phrase + bunsetu = [t] + bunsetu_may_end = False + phrase_type = pos_type + phrase = [t] + + # NOUN bunsetu + elif phrase_type == "NP": + bunsetu.append(t) + if not bunsetu_may_end and (( + (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'} + ) or ( + pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' + )): + phrase.append(t) + else: + bunsetu_may_end = True + + # VERB bunsetu + elif phrase_type == "VP": + bunsetu.append(t) + if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound': + phrase.append(t) + else: + bunsetu_may_end = True + + # ADJ bunsetu + elif phrase_type == "ADJP" and tag != '連体詞': + bunsetu.append(t) + if not bunsetu_may_end and (( + pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'} + ) or ( + pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' + )): + phrase.append(t) + else: + bunsetu_may_end = True + + # other bunsetu + else: + bunsetu.append(t) + + prev = t.i + prev_tag = t.tag_ + prev_dep = t.dep_ + prev_head = head + + if bunsetu: + yield bunsetu, phrase_type, phrase diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py new file mode 100644 index 000000000..cd1e4fde7 --- /dev/null +++ b/spacy/lang/ja/syntax_iterators.py @@ -0,0 +1,55 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import NOUN, PROPN, PRON, VERB + +# XXX this can probably be pruned a bit +labels = [ + "nsubj", + "nmod", + "dobj", + "nsubjpass", + "pcomp", + "pobj", + "obj", + "obl", + "dative", + "appos", + "attr", + "ROOT", +] + +def noun_chunks(obj): + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + + doc = obj.doc # Ensure works on both Doc and Span. + np_deps = [doc.vocab.strings.add(label) for label in labels] + conj = doc.vocab.strings.add("conj") + np_label = doc.vocab.strings.add("NP") + seen = set() + for i, word in enumerate(obj): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.i in seen: + continue + if word.dep in np_deps: + unseen = [w.i for w in word.subtree if w.i not in seen] + if not unseen: + continue + + # this takes care of particles etc. 
+ seen.update(j.i for j in word.subtree) + # This avoids duplicating embedded clauses + seen.update(range(word.i + 1)) + + # if the head of this is a verb, mark that and rights seen + # Don't do the subtree as that can hide other phrases + if word.head.pos == VERB: + seen.add(word.head.i) + seen.update(w.i for w in word.head.rights) + yield unseen[0], word.i + 1, np_label + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ja/tag_bigram_map.py b/spacy/lang/ja/tag_bigram_map.py new file mode 100644 index 000000000..5ed9aec89 --- /dev/null +++ b/spacy/lang/ja/tag_bigram_map.py @@ -0,0 +1,37 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB + +# mapping from tag bi-gram to pos of previous token +TAG_BIGRAM_MAP = { + # This covers only small part of AUX. + ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None), + + ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None), + # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ), + + # This covers acl, advcl, obl and root, but has side effect for compound. + ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX), + # This covers almost all of the deps + ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX), + + ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB), + ("副詞", "動詞-非自立可能"): (None, VERB), + ("形容詞-一般", "動詞-非自立可能"): (None, VERB), + ("形容詞-非自立可能", "動詞-非自立可能"): (None, VERB), + ("接頭辞", "動詞-非自立可能"): (None, VERB), + ("助詞-係助詞", "動詞-非自立可能"): (None, VERB), + ("助詞-副助詞", "動詞-非自立可能"): (None, VERB), + ("助詞-格助詞", "動詞-非自立可能"): (None, VERB), + ("補助記号-読点", "動詞-非自立可能"): (None, VERB), + + ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART), + + ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN), + ("連体詞", "形状詞-助動詞語幹"): (None, NOUN), + + ("動詞-一般", "助詞-副助詞"): (None, PART), + ("動詞-非自立可能", "助詞-副助詞"): (None, PART), + ("助動詞", "助詞-副助詞"): (None, PART), +} diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py index 4ff0a35ee..ad416e109 100644 --- a/spacy/lang/ja/tag_map.py +++ b/spacy/lang/ja/tag_map.py @@ -1,82 +1,104 @@ # encoding: utf8 from __future__ import unicode_literals -from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN +from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, CCONJ, SCONJ, NOUN from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE TAG_MAP = { # Explanation of Unidic tags: # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf - # Universal Dependencies Mapping: + # Universal Dependencies Mapping: (Some of the entries in this mapping are updated to v2.6 in the list below) # http://universaldependencies.org/ja/overview/morphology.html # http://universaldependencies.org/ja/pos/all.html - "記号,一般,*,*": { - POS: PUNCT + "記号-一般": { + POS: NOUN }, # this includes characters used to represent sounds like ドレミ - "記号,文字,*,*": { - POS: PUNCT - }, # this is for Greek and Latin characters used as sumbols, as in math - "感動詞,フィラー,*,*": {POS: INTJ}, - "感動詞,一般,*,*": {POS: INTJ}, - # this is specifically for unicode full-width space - "空白,*,*,*": {POS: X}, - # This is used when sequential half-width spaces are present + "記号-文字": { + POS: NOUN + }, # this is for Greek and Latin characters having some meanings, or used as symbols, as in math + "感動詞-フィラー": {POS: INTJ}, + "感動詞-一般": {POS: INTJ}, + "空白": {POS: SPACE}, - "形状詞,一般,*,*": {POS: ADJ}, - "形状詞,タリ,*,*": {POS: ADJ}, - "形状詞,助動詞語幹,*,*": {POS: ADJ}, - "形容詞,一般,*,*": {POS: ADJ}, - "形容詞,非自立可能,*,*": {POS: AUX}, # XXX ADJ if alone, AUX otherwise - "助詞,格助詞,*,*": {POS: ADP}, - "助詞,係助詞,*,*": {POS: ADP}, - "助詞,終助詞,*,*": {POS: PART}, - "助詞,準体助詞,*,*": {POS: 
SCONJ}, # の as in 走るのが速い - "助詞,接続助詞,*,*": {POS: SCONJ}, # verb ending て - "助詞,副助詞,*,*": {POS: PART}, # ばかり, つつ after a verb - "助動詞,*,*,*": {POS: AUX}, - "接続詞,*,*,*": {POS: SCONJ}, # XXX: might need refinement - "接頭辞,*,*,*": {POS: NOUN}, - "接尾辞,形状詞的,*,*": {POS: ADJ}, # がち, チック - "接尾辞,形容詞的,*,*": {POS: ADJ}, # -らしい - "接尾辞,動詞的,*,*": {POS: NOUN}, # -じみ - "接尾辞,名詞的,サ変可能,*": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* - "接尾辞,名詞的,一般,*": {POS: NOUN}, - "接尾辞,名詞的,助数詞,*": {POS: NOUN}, - "接尾辞,名詞的,副詞可能,*": {POS: NOUN}, # -後, -過ぎ - "代名詞,*,*,*": {POS: PRON}, - "動詞,一般,*,*": {POS: VERB}, - "動詞,非自立可能,*,*": {POS: VERB}, # XXX VERB if alone, AUX otherwise - "動詞,非自立可能,*,*,AUX": {POS: AUX}, - "動詞,非自立可能,*,*,VERB": {POS: VERB}, - "副詞,*,*,*": {POS: ADV}, - "補助記号,AA,一般,*": {POS: SYM}, # text art - "補助記号,AA,顔文字,*": {POS: SYM}, # kaomoji - "補助記号,一般,*,*": {POS: SYM}, - "補助記号,括弧開,*,*": {POS: PUNCT}, # open bracket - "補助記号,括弧閉,*,*": {POS: PUNCT}, # close bracket - "補助記号,句点,*,*": {POS: PUNCT}, # period or other EOS marker - "補助記号,読点,*,*": {POS: PUNCT}, # comma - "名詞,固有名詞,一般,*": {POS: PROPN}, # general proper noun - "名詞,固有名詞,人名,一般": {POS: PROPN}, # person's name - "名詞,固有名詞,人名,姓": {POS: PROPN}, # surname - "名詞,固有名詞,人名,名": {POS: PROPN}, # first name - "名詞,固有名詞,地名,一般": {POS: PROPN}, # place name - "名詞,固有名詞,地名,国": {POS: PROPN}, # country name - "名詞,助動詞語幹,*,*": {POS: AUX}, - "名詞,数詞,*,*": {POS: NUM}, # includes Chinese numerals - "名詞,普通名詞,サ変可能,*": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun - "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN}, - "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB}, - "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN}, # ex: 下手 - "名詞,普通名詞,一般,*": {POS: NOUN}, - "名詞,普通名詞,形状詞可能,*": {POS: NOUN}, # XXX: sometimes ADJ in UDv2 - "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN}, - "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ}, - "名詞,普通名詞,助数詞可能,*": {POS: NOUN}, # counter / unit - "名詞,普通名詞,副詞可能,*": {POS: NOUN}, - "連体詞,*,*,*": {POS: ADJ}, # XXX this has exceptions based on literal token - "連体詞,*,*,*,ADJ": {POS: ADJ}, - "連体詞,*,*,*,PRON": {POS: PRON}, - "連体詞,*,*,*,DET": {POS: DET}, + + "形状詞-一般": {POS: ADJ}, + "形状詞-タリ": {POS: ADJ}, + "形状詞-助動詞語幹": {POS: AUX}, + + "形容詞-一般": {POS: ADJ}, + + "形容詞-非自立可能": {POS: ADJ}, # XXX ADJ if alone, AUX otherwise + + "助詞-格助詞": {POS: ADP}, + + "助詞-係助詞": {POS: ADP}, + + "助詞-終助詞": {POS: PART}, + "助詞-準体助詞": {POS: SCONJ}, # の as in 走るのが速い + "助詞-接続助詞": {POS: SCONJ}, # verb ending て0 + + "助詞-副助詞": {POS: ADP}, # ばかり, つつ after a verb + + "助動詞": {POS: AUX}, + + "接続詞": {POS: CCONJ}, # XXX: might need refinement + "接頭辞": {POS: NOUN}, + "接尾辞-形状詞的": {POS: PART}, # がち, チック + + "接尾辞-形容詞的": {POS: AUX}, # -らしい + + "接尾辞-動詞的": {POS: PART}, # -じみ + "接尾辞-名詞的-サ変可能": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* + "接尾辞-名詞的-一般": {POS: NOUN}, + "接尾辞-名詞的-助数詞": {POS: NOUN}, + "接尾辞-名詞的-副詞可能": {POS: NOUN}, # -後, -過ぎ + + "代名詞": {POS: PRON}, + + "動詞-一般": {POS: VERB}, + + "動詞-非自立可能": {POS: AUX}, # XXX VERB if alone, AUX otherwise + + "副詞": {POS: ADV}, + + "補助記号-AA-一般": {POS: SYM}, # text art + "補助記号-AA-顔文字": {POS: PUNCT}, # kaomoji + + "補助記号-一般": {POS: SYM}, + + "補助記号-括弧開": {POS: PUNCT}, # open bracket + "補助記号-括弧閉": {POS: PUNCT}, # close bracket + "補助記号-句点": {POS: PUNCT}, # period or other EOS marker + "補助記号-読点": {POS: PUNCT}, # comma + + "名詞-固有名詞-一般": {POS: PROPN}, # general proper noun + "名詞-固有名詞-人名-一般": {POS: PROPN}, # person's name + "名詞-固有名詞-人名-姓": {POS: PROPN}, # surname + "名詞-固有名詞-人名-名": {POS: PROPN}, # first name + "名詞-固有名詞-地名-一般": {POS: PROPN}, # place name + "名詞-固有名詞-地名-国": {POS: PROPN}, # country name + + "名詞-助動詞語幹": {POS: AUX}, + "名詞-数詞": {POS: NUM}, # includes Chinese 
numerals + + "名詞-普通名詞-サ変可能": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun + + "名詞-普通名詞-サ変形状詞可能": {POS: NOUN}, + + "名詞-普通名詞-一般": {POS: NOUN}, + + "名詞-普通名詞-形状詞可能": {POS: NOUN}, # XXX: sometimes ADJ in UDv2 + + "名詞-普通名詞-助数詞可能": {POS: NOUN}, # counter / unit + + "名詞-普通名詞-副詞可能": {POS: NOUN}, + + "連体詞": {POS: DET}, # XXX this has exceptions based on literal token + + # GSD tags. These aren't in Unidic, but we need them for the GSD data. + "外国語": {POS: PROPN}, # Foreign words + + "絵文字・記号等": {POS: SYM}, # emoji / kaomoji ^^; + } diff --git a/spacy/lang/ja/tag_orth_map.py b/spacy/lang/ja/tag_orth_map.py new file mode 100644 index 000000000..355cc655b --- /dev/null +++ b/spacy/lang/ja/tag_orth_map.py @@ -0,0 +1,30 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X + +# mapping from tag bi-gram to pos of previous token +TAG_ORTH_MAP = { + "空白": { + " ": SPACE, + " ": X, + }, + "助詞-副助詞": { + "たり": PART, + }, + "連体詞": { + "あの": DET, + "かの": DET, + "この": DET, + "その": DET, + "どの": DET, + "彼の": DET, + "此の": DET, + "其の": DET, + "ある": PRON, + "こんな": PRON, + "そんな": PRON, + "どんな": PRON, + "あらゆる": PRON, + }, +} diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py index cfff0fcfe..58cd3f3bf 100644 --- a/spacy/tests/lang/ja/test_lemmatization.py +++ b/spacy/tests/lang/ja/test_lemmatization.py @@ -6,7 +6,7 @@ import pytest @pytest.mark.parametrize( "word,lemma", - [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "凄い"), ("いただきました", "頂く"), ("なった", "成る")], + [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "すごい"), ("いただきました", "いただく"), ("なった", "なる")], ) def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma): test_lemma = ja_tokenizer(word)[0].lemma_ diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index ad8bfaa00..5213aed58 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -14,20 +14,26 @@ TOKENIZER_TESTS = [ ] TAG_TESTS = [ - ("日本語だよ", ['名詞,固有名詞,地名,国', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '助詞,終助詞,*,*']), - ("東京タワーの近くに住んでいます。", ['名詞,固有名詞,地名,一般', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '動詞,非自立可能,*,*', '助動詞,*,*,*', '補助記号,句点,*,*']), - ("吾輩は猫である。", ['代名詞,*,*,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '動詞,非自立可能,*,*', '補助記号,句点,*,*']), - ("月に代わって、お仕置きよ!", ['名詞,普通名詞,助数詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '補助記号,読点,*,*', '接頭辞,*,*,*', '名詞,普通名詞,一般,*', '助詞,終助詞,*,*', '補助記号,句点,*,*']), - ("すもももももももものうち", ['名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*']) + ("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']), + ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']), + ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']), + ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']), + ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能']) ] POS_TESTS = [ - ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']), + ('日本語だよ', ['fish', 'NOUN', 'AUX', 'PART']), ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']), ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']), ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 
'PART', 'PUNCT']), ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN']) ] + +SENTENCE_TESTS = [ + ('あれ。これ。', ['あれ。', 'これ。']), + ('「伝染るんです。」という漫画があります。', + ['「伝染るんです。」という漫画があります。']), + ] # fmt: on @@ -43,14 +49,27 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags): assert tags == expected_tags +#XXX This isn't working? Always passes @pytest.mark.parametrize("text,expected_pos", POS_TESTS) def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): pos = [token.pos_ for token in ja_tokenizer(text)] assert pos == expected_pos +@pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS) +def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents): + sents = [str(sent) for sent in ja_tokenizer(text).sents] + assert sents == expected_sents + def test_extra_spaces(ja_tokenizer): # note: three spaces after "I" tokens = ja_tokenizer("I like cheese.") - assert tokens[1].orth_ == " " - assert tokens[2].orth_ == " " + assert tokens[1].orth_ == " " + +from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS + +@pytest.mark.parametrize("text", NAUGHTY_STRINGS) +def test_tokenizer_naughty_strings(ja_tokenizer, text): + tokens = ja_tokenizer(text) + assert tokens.text_with_ws == text + From 4d1ba6feb414177457fcec5983038216e32f1a12 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 4 Jun 2020 19:16:33 +0200 Subject: [PATCH 062/119] add tag variant for 2.3 (#5542) --- website/docs/api/cli.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index b49a2fb08..6f4b8bb73 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -541,16 +541,16 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| -------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. 
| +| Argument | Type | Description | +| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. | ## Evaluate {#evaluate new="2"} From 1ac43d78f9f8e1d0fea518be0c020888cf117bda Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 4 Jun 2020 20:02:05 +0200 Subject: [PATCH 063/119] Avoid libc.stdint for UINT64_MAX (#5545) --- spacy/lexeme.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index dec2993fa..1df516dcb 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -12,7 +12,6 @@ import numpy import warnings from thinc.neural.util import get_array_module -from libc.stdint cimport UINT64_MAX from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP @@ -23,7 +22,7 @@ from .attrs import intify_attrs from .errors import Errors, Warnings -OOV_RANK = UINT64_MAX +OOV_RANK = 0xffffffffffffffff # UINT64_MAX memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) EMPTY_LEXEME.id = OOV_RANK From 009119fa66c39f86fe500e35c087cb67dec5a4a8 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sat, 6 Jun 2020 00:22:18 +0200 Subject: [PATCH 064/119] Requirements/setup for Japanese (#5553) * Add sudachipy and sudachidict_core to Makefile * Switch ja requirements from fugashi to sudachipy --- Makefile | 4 ++-- setup.cfg | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 2764da118..865bf44c5 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ VENV := ./env$(PYVER) version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core chmod a+rx $@ cp $@ dist/spacy.pex @@ -15,7 +15,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* 
spacy/*/*.py* $(VENV)/bin/pip wheel . -w ./wheelhouse - $(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.22 -w ./wheelhouse + $(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core -w ./wheelhouse touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex diff --git a/setup.cfg b/setup.cfg index 1e29f1ead..e556ba19c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -78,7 +78,8 @@ cuda102 = cupy-cuda102>=5.0.0b4,<9.0.0 # Language tokenizers with external dependencies ja = - fugashi>=0.1.3 + sudachipy>=0.4.5 + sudachidict_core>=20200330 ko = natto-py==0.9.0 th = From 456bf47f5184127510e39aaef7135a8ed979bc86 Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com> Date: Mon, 8 Jun 2020 22:49:34 +0900 Subject: [PATCH 065/119] fix a bug causing mis-alignments (#5560) --- .github/contributors/hiroshi-matsuda-rit.md | 106 ++++++++++++++++++++ spacy/lang/ja/__init__.py | 82 +++++++++++---- 2 files changed, 169 insertions(+), 19 deletions(-) create mode 100644 .github/contributors/hiroshi-matsuda-rit.md diff --git a/.github/contributors/hiroshi-matsuda-rit.md b/.github/contributors/hiroshi-matsuda-rit.md new file mode 100644 index 000000000..bf19125fb --- /dev/null +++ b/.github/contributors/hiroshi-matsuda-rit.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Hiroshi Matsuda | +| Company name (if applicable) | Megagon Labs, Tokyo | +| Title or role (if applicable) | Research Scientist | +| Date | June 6, 2020 | +| GitHub username | hiroshi-matsuda-rit | +| Website (optional) | | diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 09546467e..a623c7bdd 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -1,7 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -import re from collections import namedtuple from .stop_words import STOP_WORDS @@ -14,7 +13,9 @@ from ...compat import copy_reg from ...language import Language from ...symbols import POS from ...tokens import Doc -from ...util import DummyTokenizer, get_words_and_spaces +from ...util import DummyTokenizer + +from ...errors import Errors # Hold the attributes we need with convenient names DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"]) @@ -41,7 +42,7 @@ def try_sudachi_import(): ) -def resolve_pos(token, next_token): +def resolve_pos(orth, pos, next_pos): """If necessary, add a field to the POS tag for UD mapping. Under Universal Dependencies, sometimes the same Unidic POS tag can be mapped differently depending on the literal token or its context @@ -53,22 +54,22 @@ def resolve_pos(token, next_token): # token. # orth based rules - if token.pos in TAG_ORTH_MAP: - orth_map = TAG_ORTH_MAP[token.pos[0]] - if token.surface in orth_map: - return orth_map[token.surface], None + if pos[0] in TAG_ORTH_MAP: + orth_map = TAG_ORTH_MAP[pos[0]] + if orth in orth_map: + return orth_map[orth], None # tag bi-gram mapping - if next_token: - tag_bigram = token.pos[0], next_token.pos[0] + if next_pos: + tag_bigram = pos[0], next_pos[0] if tag_bigram in TAG_BIGRAM_MAP: bipos = TAG_BIGRAM_MAP[tag_bigram] if bipos[0] is None: - return TAG_MAP[token.pos[0]][POS], bipos[1] + return TAG_MAP[pos[0]][POS], bipos[1] else: return bipos - return TAG_MAP[token.pos[0]][POS], None + return TAG_MAP[pos[0]][POS], None # Use a mapping of paired punctuation to avoid splitting quoted sentences. 
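A minimal standalone sketch of the lookup order implemented by the refactored `resolve_pos` above, which now works on plain strings (`orth`, `pos`, `next_pos`) and checks, in order, a literal-orth override table, a tag-bigram table keyed on the current and next Unidic tag, and finally the plain tag map. The tables and the helper name `toy_resolve_pos` below are illustrative stand-ins, not the real mappings shipped in `tag_orth_map.py`, `tag_bigram_map.py` and `tag_map.py`:

```python
# Toy sketch of the three-step POS lookup in the refactored resolve_pos.
# These tables are invented for illustration, not the real Unidic mappings.
TOY_TAG_MAP = {"連体詞": "DET", "名詞-普通名詞-一般": "NOUN"}
TOY_TAG_ORTH_MAP = {"連体詞": {"この": "DET", "こんな": "PRON"}}
TOY_TAG_BIGRAM_MAP = {("連体詞", "名詞-普通名詞-一般"): ("DET", None)}


def toy_resolve_pos(orth, tag, next_tag):
    # 1) literal-token overrides registered for this tag
    orth_map = TOY_TAG_ORTH_MAP.get(tag, {})
    if orth in orth_map:
        return orth_map[orth]
    # 2) tag-bigram rules that look one token ahead
    if next_tag is not None and (tag, next_tag) in TOY_TAG_BIGRAM_MAP:
        pos, _ = TOY_TAG_BIGRAM_MAP[(tag, next_tag)]
        if pos is not None:
            return pos
    # 3) fall back to the plain tag map
    return TOY_TAG_MAP[tag]


print(toy_resolve_pos("こんな", "連体詞", "名詞-普通名詞-一般"))  # PRON: the orth override wins
print(toy_resolve_pos("大きな", "連体詞", "名詞-普通名詞-一般"))  # DET: falls through to the bigram rule
```

Because the lookup no longer needs token objects, it can run before the `Doc` is built, which is what allows the alignment helper added in the next hunk to supply tags and lemmas up front.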
@@ -120,6 +121,48 @@ def get_dtokens(tokenizer, text): words = [ww for ww in words if len(ww.surface) > 0] return words + +def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")): + words = [x.surface for x in dtokens] + if "".join("".join(words).split()) != "".join(text.split()): + raise ValueError(Errors.E194.format(text=text, words=words)) + text_words = [] + text_lemmas = [] + text_tags = [] + text_spaces = [] + text_pos = 0 + # normalize words to remove all whitespace tokens + norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()]) + # align words with text + for word, dtoken in zip(norm_words, norm_dtokens): + try: + word_start = text[text_pos:].index(word) + except ValueError: + raise ValueError(Errors.E194.format(text=text, words=words)) + if word_start > 0: + w = text[text_pos:text_pos + word_start] + text_words.append(w) + text_lemmas.append(w) + text_tags.append(gap_tag) + text_spaces.append(False) + text_pos += word_start + text_words.append(word) + text_lemmas.append(dtoken.lemma) + text_tags.append(dtoken.pos) + text_spaces.append(False) + text_pos += len(word) + if text_pos < len(text) and text[text_pos] == " ": + text_spaces[-1] = True + text_pos += 1 + if text_pos < len(text): + w = text[text_pos:] + text_words.append(w) + text_lemmas.append(w) + text_tags.append(gap_tag) + text_spaces.append(False) + return text_words, text_lemmas, text_tags, text_spaces + + class JapaneseTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) @@ -128,22 +171,23 @@ class JapaneseTokenizer(DummyTokenizer): def __call__(self, text): dtokens = get_dtokens(self.tokenizer, text) - words = [x.surface for x in dtokens] - words, spaces = get_words_and_spaces(words, text) - unidic_tags = [",".join(x.pos) for x in dtokens] + words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text) doc = Doc(self.vocab, words=words, spaces=spaces) next_pos = None - for ii, (token, dtoken) in enumerate(zip(doc, dtokens)): - ntoken = dtokens[ii+1] if ii+1 < len(dtokens) else None - token.tag_ = dtoken.pos[0] + for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)): + token.tag_ = unidic_tag[0] if next_pos: token.pos = next_pos next_pos = None else: - token.pos, next_pos = resolve_pos(dtoken, ntoken) + token.pos, next_pos = resolve_pos( + token.orth_, + unidic_tag, + unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None + ) # if there's no lemma info (it's an unk) just use the surface - token.lemma_ = dtoken.lemma + token.lemma_ = lemma doc.user_data["unidic_tags"] = unidic_tags separate_sentences(doc) From 3bf111585d251ceb6dc41ca5c097a85ca194fb3f Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 8 Jun 2020 16:29:05 +0200 Subject: [PATCH 066/119] Update Japanese tokenizer config and add serialization (#5562) * Use `config` dict for tokenizer settings * Add serialization of split mode setting * Add tests for tokenizer split modes and serialization of split mode setting Based on #5561 --- spacy/lang/ja/__init__.py | 77 +++++++++++++++++++++++---- spacy/tests/lang/ja/test_serialize.py | 37 +++++++++++++ spacy/tests/lang/ja/test_tokenizer.py | 26 +++++++-- 3 files changed, 127 insertions(+), 13 deletions(-) create mode 100644 spacy/tests/lang/ja/test_serialize.py diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index a623c7bdd..294c6b38d 100644 --- a/spacy/lang/ja/__init__.py +++ 
b/spacy/lang/ja/__init__.py @@ -1,7 +1,8 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from collections import namedtuple +import srsly +from collections import namedtuple, OrderedDict from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS @@ -10,12 +11,13 @@ from .tag_orth_map import TAG_ORTH_MAP from .tag_bigram_map import TAG_BIGRAM_MAP from ...attrs import LANG from ...compat import copy_reg +from ...errors import Errors from ...language import Language from ...symbols import POS from ...tokens import Doc from ...util import DummyTokenizer +from ... import util -from ...errors import Errors # Hold the attributes we need with convenient names DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"]) @@ -26,14 +28,20 @@ DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"]) DummySpace = DummyNode(" ", " ", " ") -def try_sudachi_import(): +def try_sudachi_import(split_mode="A"): """SudachiPy is required for Japanese support, so check for it. - It it's not available blow up and explain how to fix it.""" + It it's not available blow up and explain how to fix it. + split_mode should be one of these values: "A", "B", "C", None->"A".""" try: from sudachipy import dictionary, tokenizer - + split_mode = { + None: tokenizer.Tokenizer.SplitMode.A, + "A": tokenizer.Tokenizer.SplitMode.A, + "B": tokenizer.Tokenizer.SplitMode.B, + "C": tokenizer.Tokenizer.SplitMode.C, + }[split_mode] tok = dictionary.Dictionary().create( - mode=tokenizer.Tokenizer.SplitMode.A + mode=split_mode ) return tok except ImportError: @@ -164,9 +172,10 @@ def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")): class JapaneseTokenizer(DummyTokenizer): - def __init__(self, cls, nlp=None): + def __init__(self, cls, nlp=None, config={}): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.tokenizer = try_sudachi_import() + self.split_mode = config.get("split_mode", None) + self.tokenizer = try_sudachi_import(self.split_mode) def __call__(self, text): dtokens = get_dtokens(self.tokenizer, text) @@ -193,6 +202,54 @@ class JapaneseTokenizer(DummyTokenizer): separate_sentences(doc) return doc + def _get_config(self): + config = OrderedDict( + ( + ("split_mode", self.split_mode), + ) + ) + return config + + def _set_config(self, config={}): + self.split_mode = config.get("split_mode", None) + + def to_bytes(self, **kwargs): + serializers = OrderedDict( + ( + ("cfg", lambda: srsly.json_dumps(self._get_config())), + ) + ) + return util.to_bytes(serializers, []) + + def from_bytes(self, data, **kwargs): + deserializers = OrderedDict( + ( + ("cfg", lambda b: self._set_config(srsly.json_loads(b))), + ) + ) + util.from_bytes(data, deserializers, []) + self.tokenizer = try_sudachi_import(self.split_mode) + return self + + def to_disk(self, path, **kwargs): + path = util.ensure_path(path) + serializers = OrderedDict( + ( + ("cfg", lambda p: srsly.write_json(p, self._get_config())), + ) + ) + return util.to_disk(path, serializers, []) + + def from_disk(self, path, **kwargs): + path = util.ensure_path(path) + serializers = OrderedDict( + ( + ("cfg", lambda p: self._set_config(srsly.read_json(p))), + ) + ) + util.from_disk(path, serializers, []) + self.tokenizer = try_sudachi_import(self.split_mode) + class JapaneseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) @@ -203,8 +260,8 @@ class JapaneseDefaults(Language.Defaults): writing_system = {"direction": "ltr", "has_case": False, 
"has_letters": False} @classmethod - def create_tokenizer(cls, nlp=None): - return JapaneseTokenizer(cls, nlp) + def create_tokenizer(cls, nlp=None, config={}): + return JapaneseTokenizer(cls, nlp, config) class Japanese(Language): diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py new file mode 100644 index 000000000..018e645bb --- /dev/null +++ b/spacy/tests/lang/ja/test_serialize.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.ja import Japanese +from ...util import make_tempdir + + +def test_ja_tokenizer_serialize(ja_tokenizer): + tokenizer_bytes = ja_tokenizer.to_bytes() + nlp = Japanese() + nlp.tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + assert nlp.tokenizer.split_mode == None + + with make_tempdir() as d: + file_path = d / "tokenizer" + ja_tokenizer.to_disk(file_path) + nlp = Japanese() + nlp.tokenizer.from_disk(file_path) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + assert nlp.tokenizer.split_mode == None + + # split mode is (de)serialized correctly + nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) + nlp_r = Japanese() + nlp_bytes = nlp.to_bytes() + nlp_r.from_bytes(nlp_bytes) + assert nlp_bytes == nlp_r.to_bytes() + assert nlp_r.tokenizer.split_mode == "B" + + with make_tempdir() as d: + nlp.to_disk(d) + nlp_r = Japanese() + nlp_r.from_disk(d) + assert nlp_bytes == nlp_r.to_bytes() + assert nlp_r.tokenizer.split_mode == "B" diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 5213aed58..82c43fe4c 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import pytest +from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS +from spacy.lang.ja import Japanese # fmt: off TOKENIZER_TESTS = [ @@ -55,21 +57,39 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): pos = [token.pos_ for token in ja_tokenizer(text)] assert pos == expected_pos + @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS) def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents): sents = [str(sent) for sent in ja_tokenizer(text).sents] assert sents == expected_sents -def test_extra_spaces(ja_tokenizer): +def test_ja_tokenizer_extra_spaces(ja_tokenizer): # note: three spaces after "I" tokens = ja_tokenizer("I like cheese.") assert tokens[1].orth_ == " " -from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS @pytest.mark.parametrize("text", NAUGHTY_STRINGS) -def test_tokenizer_naughty_strings(ja_tokenizer, text): +def test_ja_tokenizer_naughty_strings(ja_tokenizer, text): tokens = ja_tokenizer(text) assert tokens.text_with_ws == text + +@pytest.mark.parametrize("text,len_a,len_b,len_c", + [ + ("選挙管理委員会", 4, 3, 1), + ("客室乗務員", 3, 2, 1), + ("労働者協同組合", 4, 3, 1), + ("機能性食品", 3, 2, 1), + ] +) +def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): + nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}}) + nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) + nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}}) + + assert len(ja_tokenizer(text)) == len_a + assert len(nlp_a(text)) == len_a + assert len(nlp_b(text)) == len_b + assert len(nlp_c(text)) == len_c From d1799da200782fb5f3b09bee58cf00092e5a05f0 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 8 Jun 2020 19:47:32 +0200 Subject: 
[PATCH 067/119] bot for answered issues (#5563) * add tiangolo's issue manager * fix formatting * spaces, tabs, who knows * formatting * I'll get this right at some point * maybe one more space ? --- .github/workflows/issue-manager.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/issue-manager.yml diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml new file mode 100644 index 000000000..8a5c1ee94 --- /dev/null +++ b/.github/workflows/issue-manager.yml @@ -0,0 +1,28 @@ +name: Issue Manager + +on: + schedule: + - cron: "0 0 * * *" + issue_comment: + types: + - created + - edited + issues: + types: + - labeled + +jobs: + issue-manager: + runs-on: ubuntu-latest + steps: + - uses: tiangolo/issue-manager@0.2.0 + with: + token: ${{ secrets.GITHUB_TOKEN }} + config: > + { + "answered": { + "delay": "P3D", + "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.", + "remove_label": true + } + } From de00f967ce5fd720633c717252aae83b6f2b1602 Mon Sep 17 00:00:00 2001 From: Martino Mensio Date: Mon, 8 Jun 2020 19:26:30 +0100 Subject: [PATCH 068/119] adding spacy-universal-sentence-encoder (#5534) * adding spacy-universal-sentence-encoder * update affiliation * updated code example --- .github/contributors/MartinoMensio.md | 4 ++-- website/meta/universe.json | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/contributors/MartinoMensio.md b/.github/contributors/MartinoMensio.md index 1cd32d622..27e453699 100644 --- a/.github/contributors/MartinoMensio.md +++ b/.github/contributors/MartinoMensio.md @@ -99,8 +99,8 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | | Name | Martino Mensio | -| Company name (if applicable) | Polytechnic University of Turin | -| Title or role (if applicable) | Student | +| Company name (if applicable) | The Open University | +| Title or role (if applicable) | PhD Student | | Date | 17 November 2017 | | GitHub username | MartinoMensio | | Website (optional) | https://martinomensio.github.io/ | diff --git a/website/meta/universe.json b/website/meta/universe.json index 58be719ed..2c74a2964 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,29 @@ { "resources": [ + { + "id": "spacy-universal-sentence-encoder", + "title": "SpaCy - Universal Sentence Encoder", + "slogan": "Make use of Google's Universal Sentence Encoder directly within SpaCy", + "description": "This library lets you use Universal Sentence Encoder embeddings of Docs, Spans and Tokens directly from TensorFlow Hub", + "github": "MartinoMensio/spacy-universal-sentence-encoder-tfhub", + "code_example": [ + "import spacy_universal_sentence_encoder", + "load one of the models: ['en_use_md', 'en_use_lg', 'xx_use_md', 'xx_use_lg']", + "nlp = spacy_universal_sentence_encoder.load_model('en_use_lg')", + "# get two documents", + "doc_1 = nlp('Hi there, how are you?')", + "doc_2 = nlp('Hello there, how are you doing today?')", + "# use the similarity method that is based on the vectors, on Doc, Span or Token", + "print(doc_1.similarity(doc_2[0:7]))" + ], + "category": ["models", "pipeline"], + "author": "Martino Mensio", + "author_links": { + "twitter": "MartinoMensio", + "github": "MartinoMensio", + "website": "https://martinomensio.github.io" + } + }, { "id": "whatlies", "title": "whatlies", From f162815f45c69dd71e194361284dbef3939fb9fc Mon Sep 
17 00:00:00 2001 From: adrianeboyd Date: Mon, 8 Jun 2020 21:09:23 +0200 Subject: [PATCH 069/119] Handle empty and whitespace-only docs for Japanese (#5564) Handle empty and whitespace-only docs in the custom alignment method used by the Japanese tokenizer. --- spacy/lang/ja/__init__.py | 10 ++++++++++ spacy/tests/lang/ja/test_tokenizer.py | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 294c6b38d..39e0445c2 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -139,6 +139,16 @@ def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")): text_tags = [] text_spaces = [] text_pos = 0 + # handle empty and whitespace-only texts + if len(words) == 0: + return text_words, text_lemmas, text_tags, text_spaces + elif len([word for word in words if not word.isspace()]) == 0: + assert text.isspace() + text_words = [text] + text_lemmas = [text] + text_tags = [gap_tag] + text_spaces = [False] + return text_words, text_lemmas, text_tags, text_spaces # normalize words to remove all whitespace tokens norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()]) # align words with text diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 82c43fe4c..30cba42b1 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -93,3 +93,12 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): assert len(nlp_a(text)) == len_a assert len(nlp_b(text)) == len_b assert len(nlp_c(text)) == len_c + + +def test_ja_tokenizer_emptyish_texts(ja_tokenizer): + doc = ja_tokenizer("") + assert len(doc) == 0 + doc = ja_tokenizer(" ") + assert len(doc) == 1 + doc = ja_tokenizer("\n\n\n \t\t \n\n\n") + assert len(doc) == 1 From 86112d2168dc1d763a233a8c531f09002101818e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 9 Jun 2020 08:57:38 +0200 Subject: [PATCH 070/119] update issue manager's version --- .github/workflows/issue-manager.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index 8a5c1ee94..b789494a2 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -15,7 +15,7 @@ jobs: issue-manager: runs-on: ubuntu-latest steps: - - uses: tiangolo/issue-manager@0.2.0 + - uses: tiangolo/issue-manager@0.2.1 with: token: ${{ secrets.GITHUB_TOKEN }} config: > From b7e6e1b9a75ea1301ea8253cd2c6a5d3740cef12 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 9 Jun 2020 12:00:59 +0200 Subject: [PATCH 071/119] Disable sentence segmentation in ja tokenizer (#5566) --- spacy/lang/ja/__init__.py | 1 - spacy/tests/lang/ja/test_tokenizer.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 39e0445c2..371cc0f98 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -209,7 +209,6 @@ class JapaneseTokenizer(DummyTokenizer): token.lemma_ = lemma doc.user_data["unidic_tags"] = unidic_tags - separate_sentences(doc) return doc def _get_config(self): diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 30cba42b1..26be5cf59 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -58,6 +58,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): assert pos == expected_pos 
+@pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy") @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS) def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents): sents = [str(sent) for sent in ja_tokenizer(text).sents] From 0a70bd62811778b59429fa23871b6ca862678636 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 9 Jun 2020 15:47:31 +0200 Subject: [PATCH 072/119] Bump version to 2.3.0.dev1 (#5567) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index be1b3ae56..90b5f9245 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.3.0.dev0" +__version__ = "2.3.0.dev1" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 12c1965070a1a8bbe80efaae5755116633d94886 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 10 Jun 2020 10:46:12 +0200 Subject: [PATCH 073/119] set delay to 7 days --- .github/workflows/issue-manager.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index b789494a2..b52095fe8 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -21,7 +21,7 @@ jobs: config: > { "answered": { - "delay": "P3D", + "delay": "P7D", "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.", "remove_label": true } From 28db7dd5d9aaf53a3c4e9b13048415502d998aae Mon Sep 17 00:00:00 2001 From: Jones Martins Date: Wed, 10 Jun 2020 13:47:04 -0300 Subject: [PATCH 074/119] Add missing pronoums/determiners (#5569) * Add missing pronoums/determiners * Add test for missing pronoums * Add contributor file --- .github/contributors/jonesmartins.md | 106 +++++++++++++++++++++++++ spacy/lang/en/tokenizer_exceptions.py | 2 +- spacy/tests/lang/en/test_exceptions.py | 2 +- 3 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 .github/contributors/jonesmartins.md diff --git a/.github/contributors/jonesmartins.md b/.github/contributors/jonesmartins.md new file mode 100644 index 000000000..5663f6193 --- /dev/null +++ b/.github/contributors/jonesmartins.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jones Martins | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-10 | +| GitHub username | jonesmartins | +| Website (optional) | | diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 6a553052b..f8367c0f5 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -139,7 +139,7 @@ for pron in ["he", "she", "it"]: # W-words, relative pronouns, prepositions etc. -for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: +for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]: for orth in [word, word.title()]: _exc[orth + "'s"] = [ {ORTH: orth, LEMMA: word, NORM: word}, diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index a78e1815f..1ff64eff2 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -46,7 +46,7 @@ def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text): assert tokens[0].text == text -@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll"]) +@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll", "this'll", "those'll"]) def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 From bab30e4ad2ad35d7133b7f8027a3558a02e018e4 Mon Sep 17 00:00:00 2001 From: Jones Martins Date: Wed, 10 Jun 2020 16:54:06 -0300 Subject: [PATCH 075/119] Add "c'mon" token exception (#5570) * Add "c'mon" exception * Fix typo in "C'mon" exception --- spacy/lang/en/tokenizer_exceptions.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index f8367c0f5..964a714ae 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -399,6 +399,14 @@ _other_exc = { {ORTH: "Let", LEMMA: "let", NORM: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}, ], + "c'mon": [ + {ORTH: "c'm", NORM: "come", LEMMA: "come"}, + {ORTH: "on"} + ], + "C'mon": [ + {ORTH: "C'm", NORM: "come", LEMMA: "come"}, + {ORTH: "on"} + ] } _exc.update(_other_exc) From fe167fcf7d23ee6c73877a11351984221a9aacd5 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 11 Jun 2020 10:23:50 +0200 Subject: [PATCH 076/119] Update pytest conf for sudachipy with Japanese (#5574) --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 63bbf2e0a..1f13da5d6 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -140,7 +140,7 @@ def it_tokenizer(): @pytest.fixture(scope="session") def ja_tokenizer(): - pytest.importorskip("fugashi") + pytest.importorskip("sudachipy") return get_lang_class("ja").Defaults.create_tokenizer() From 556895177edbc5d7dc64e0f95e36273a2fb16478 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 11 Jun 2020 13:47:37 +0200 Subject: [PATCH 077/119] Expand Japanese requirements warning (#5572) 
Include explicit install instructions in Japanese requirements warning. --- spacy/lang/ja/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 371cc0f98..a7ad0846e 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -46,7 +46,10 @@ def try_sudachi_import(split_mode="A"): return tok except ImportError: raise ImportError( - "Japanese support requires SudachiPy: " "https://github.com/WorksApplications/SudachiPy" + "Japanese support requires SudachiPy and SudachiDict-core " + "(https://github.com/WorksApplications/SudachiPy). " + "Install with `pip install sudachipy sudachidict_core` or " + "install spaCy with `pip install spacy[ja]`." ) From 18c6dc8093df4e075f6168b98afd500a73a384e6 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 11 Jun 2020 14:09:40 +0200 Subject: [PATCH 078/119] removing label both on comment and on close --- .github/workflows/issue-manager.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index b52095fe8..3fb42ed01 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -20,9 +20,10 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} config: > { - "answered": { + "resolved": { "delay": "P7D", "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.", - "remove_label": true + "remove_label_on_comment": true, + "remove_label_on_close": true } } From fa46e0bef2226d1ba673537d2097d92f151304c5 Mon Sep 17 00:00:00 2001 From: theudas Date: Fri, 12 Jun 2020 02:03:23 +0200 Subject: [PATCH 079/119] Added Parameter to NEL to take n sentences into account (#5548) * added setting for neighbour sentence in NEL * added spaCy contributor agreement * added multi sentence also for training * made the try-except block smaller --- .github/contributors/theudas.md | 106 ++++++++++++++++++++++++++ spacy/pipeline/pipes.pyx | 131 ++++++++++++++++++++------------ 2 files changed, 189 insertions(+), 48 deletions(-) create mode 100644 .github/contributors/theudas.md diff --git a/.github/contributors/theudas.md b/.github/contributors/theudas.md new file mode 100644 index 000000000..3d8a2bd95 --- /dev/null +++ b/.github/contributors/theudas.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Philipp Sodmann | +| Company name (if applicable) | Empolis | +| Title or role (if applicable) | | +| Date | 2017-05-06 | +| GitHub username | theudas | +| Website (optional) | | diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 105ce00e6..01472a6d0 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1170,6 +1170,9 @@ class EntityLinker(Pipe): self.model = True self.kb = None self.cfg = dict(cfg) + + # how many neightbour sentences to take into account + self.n_sents = cfg.get("n_sents", 0) def set_kb(self, kb): self.kb = kb @@ -1218,6 +1221,9 @@ class EntityLinker(Pipe): for doc, gold in zip(docs, golds): ents_by_offset = dict() + + sentences = [s for s in doc.sents] + for ent in doc.ents: ents_by_offset[(ent.start_char, ent.end_char)] = ent @@ -1228,17 +1234,34 @@ class EntityLinker(Pipe): # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt if not (start, end) in ents_by_offset: raise RuntimeError(Errors.E188) + ent = ents_by_offset[(start, end)] for kb_id, value in kb_dict.items(): # Currently only training on the positive instances if value: try: - sentence_docs.append(ent.sent.as_doc()) + # find the sentence in the list of sentences. + sent_index = sentences.index(ent.sent) + except AttributeError: # Catch the exception when ent.sent is None and provide a user-friendly warning raise RuntimeError(Errors.E030) + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - self.n_sents) + + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + + # append that span as a doc to training + sent_doc = doc[start_token:end_token].as_doc() + sentence_docs.append(sent_doc) + sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None) bp_context(d_scores, sgd=sgd) @@ -1309,69 +1332,81 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] + for i, doc in enumerate(docs): + sentences = [s for s in doc.sents] + if len(doc) > 0: # Looping through each sentence and each entity # This may go wrong if there are entities across sentences - which shouldn't happen normally. 
- for sent in doc.sents: - sent_doc = sent.as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) + for sent_index, sent in enumerate(sentences): + if sent.ents: + # get n_neightbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - for ent in sent_doc.ents: - entity_count += 1 + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - final_tensors.append(sentence_encoding) + sent_doc = doc[start_token:end_token].as_doc() - else: - candidates = self.kb.get_candidates(ent.text) - if not candidates: - # no prediction possible for this entity - setting to NIL + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model([sent_doc])[0] + xp = get_array_module(sentence_encoding) + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + + for ent in sent.ents: + entity_count += 1 + + to_discard = self.cfg.get("labels_discard", []) + if to_discard and ent.label_ in to_discard: + # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - - # TODO: thresholding - final_kb_ids.append(candidates[0].entity_) - final_tensors.append(sentence_encoding) - else: - random.shuffle(candidates) + candidates = self.kb.get_candidates(ent.text) + if not candidates: + # no prediction possible for this entity - setting to NIL + final_kb_ids.append(self.NIL) + final_tensors.append(sentence_encoding) - # this will set all prior probabilities to 0 if they should be excluded from the model - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.cfg.get("incl_prior", True): - prior_probs = xp.asarray([0.0 for c in candidates]) - scores = prior_probs + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate - # add in similarity from the context - if self.cfg.get("incl_context", True): - entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + final_tensors.append(sentence_encoding) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + else: + random.shuffle(candidates) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs*sims) + # this will set all prior probabilities to 0 if they should be excluded from the model + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.cfg.get("incl_prior", True): + prior_probs = xp.asarray([0.0 for c in candidates]) + scores = prior_probs - # TODO: thresholding - best_index = scores.argmax() - best_candidate = candidates[best_index] - 
final_kb_ids.append(best_candidate.entity_) - final_tensors.append(sentence_encoding) + # add in similarity from the context + if self.cfg.get("incl_context", True): + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + + if len(entity_encodings) != len(prior_probs): + raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs*sims) + + # TODO: thresholding + best_index = scores.argmax() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) + final_tensors.append(sentence_encoding) if not (len(final_tensors) == len(final_kb_ids) == entity_count): raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) From 44967a3f9cfc3e20375aac3782897325785e15a9 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 11 Jun 2020 10:23:50 +0200 Subject: [PATCH 080/119] Update pytest conf for sudachipy with Japanese (#5574) --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 63bbf2e0a..1f13da5d6 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -140,7 +140,7 @@ def it_tokenizer(): @pytest.fixture(scope="session") def ja_tokenizer(): - pytest.importorskip("fugashi") + pytest.importorskip("sudachipy") return get_lang_class("ja").Defaults.create_tokenizer() From 4724fa4cf4b24be92a15c39c564d571eeae1470a Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 11 Jun 2020 13:47:37 +0200 Subject: [PATCH 081/119] Expand Japanese requirements warning (#5572) Include explicit install instructions in Japanese requirements warning. --- spacy/lang/ja/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 371cc0f98..a7ad0846e 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -46,7 +46,10 @@ def try_sudachi_import(split_mode="A"): return tok except ImportError: raise ImportError( - "Japanese support requires SudachiPy: " "https://github.com/WorksApplications/SudachiPy" + "Japanese support requires SudachiPy and SudachiDict-core " + "(https://github.com/WorksApplications/SudachiPy). " + "Install with `pip install sudachipy sudachidict_core` or " + "install spaCy with `pip install spacy[ja]`." 
) From 3f5e2f9d99bc8ad3b86c53b8c9eadcba56c5a1a7 Mon Sep 17 00:00:00 2001 From: theudas Date: Fri, 12 Jun 2020 02:03:23 +0200 Subject: [PATCH 082/119] Added Parameter to NEL to take n sentences into account (#5548) * added setting for neighbour sentence in NEL * added spaCy contributor agreement * added multi sentence also for training * made the try-except block smaller --- .github/contributors/theudas.md | 106 ++++++++++++++++++++++++++ spacy/pipeline/pipes.pyx | 131 ++++++++++++++++++++------------ 2 files changed, 189 insertions(+), 48 deletions(-) create mode 100644 .github/contributors/theudas.md diff --git a/.github/contributors/theudas.md b/.github/contributors/theudas.md new file mode 100644 index 000000000..3d8a2bd95 --- /dev/null +++ b/.github/contributors/theudas.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Philipp Sodmann | +| Company name (if applicable) | Empolis | +| Title or role (if applicable) | | +| Date | 2017-05-06 | +| GitHub username | theudas | +| Website (optional) | | diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 105ce00e6..01472a6d0 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1170,6 +1170,9 @@ class EntityLinker(Pipe): self.model = True self.kb = None self.cfg = dict(cfg) + + # how many neightbour sentences to take into account + self.n_sents = cfg.get("n_sents", 0) def set_kb(self, kb): self.kb = kb @@ -1218,6 +1221,9 @@ class EntityLinker(Pipe): for doc, gold in zip(docs, golds): ents_by_offset = dict() + + sentences = [s for s in doc.sents] + for ent in doc.ents: ents_by_offset[(ent.start_char, ent.end_char)] = ent @@ -1228,17 +1234,34 @@ class EntityLinker(Pipe): # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt if not (start, end) in ents_by_offset: raise RuntimeError(Errors.E188) + ent = ents_by_offset[(start, end)] for kb_id, value in kb_dict.items(): # Currently only training on the positive instances if value: try: - sentence_docs.append(ent.sent.as_doc()) + # find the sentence in the list of sentences. + sent_index = sentences.index(ent.sent) + except AttributeError: # Catch the exception when ent.sent is None and provide a user-friendly warning raise RuntimeError(Errors.E030) + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - self.n_sents) + + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + + # append that span as a doc to training + sent_doc = doc[start_token:end_token].as_doc() + sentence_docs.append(sent_doc) + sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None) bp_context(d_scores, sgd=sgd) @@ -1309,69 +1332,81 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] + for i, doc in enumerate(docs): + sentences = [s for s in doc.sents] + if len(doc) > 0: # Looping through each sentence and each entity # This may go wrong if there are entities across sentences - which shouldn't happen normally. 
- for sent in doc.sents: - sent_doc = sent.as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) + for sent_index, sent in enumerate(sentences): + if sent.ents: + # get n_neightbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - for ent in sent_doc.ents: - entity_count += 1 + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - final_tensors.append(sentence_encoding) + sent_doc = doc[start_token:end_token].as_doc() - else: - candidates = self.kb.get_candidates(ent.text) - if not candidates: - # no prediction possible for this entity - setting to NIL + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model([sent_doc])[0] + xp = get_array_module(sentence_encoding) + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + + for ent in sent.ents: + entity_count += 1 + + to_discard = self.cfg.get("labels_discard", []) + if to_discard and ent.label_ in to_discard: + # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - - # TODO: thresholding - final_kb_ids.append(candidates[0].entity_) - final_tensors.append(sentence_encoding) - else: - random.shuffle(candidates) + candidates = self.kb.get_candidates(ent.text) + if not candidates: + # no prediction possible for this entity - setting to NIL + final_kb_ids.append(self.NIL) + final_tensors.append(sentence_encoding) - # this will set all prior probabilities to 0 if they should be excluded from the model - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.cfg.get("incl_prior", True): - prior_probs = xp.asarray([0.0 for c in candidates]) - scores = prior_probs + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate - # add in similarity from the context - if self.cfg.get("incl_context", True): - entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + final_tensors.append(sentence_encoding) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + else: + random.shuffle(candidates) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs*sims) + # this will set all prior probabilities to 0 if they should be excluded from the model + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.cfg.get("incl_prior", True): + prior_probs = xp.asarray([0.0 for c in candidates]) + scores = prior_probs - # TODO: thresholding - best_index = scores.argmax() - best_candidate = candidates[best_index] - 
final_kb_ids.append(best_candidate.entity_) - final_tensors.append(sentence_encoding) + # add in similarity from the context + if self.cfg.get("incl_context", True): + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + + if len(entity_encodings) != len(prior_probs): + raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs*sims) + + # TODO: thresholding + best_index = scores.argmax() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) + final_tensors.append(sentence_encoding) if not (len(final_tensors) == len(final_kb_ids) == entity_count): raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) From aa5b40fa6423916ae79bf6e750a17c50020f4078 Mon Sep 17 00:00:00 2001 From: Arvind Srinivasan Date: Sat, 13 Jun 2020 19:26:26 +0530 Subject: [PATCH 083/119] Added Tamil Example Sentences (#5583) * Added Examples for Tamil Sentences #### Description This PR add example sentences for the Tamil language which were missing as per issue #1107 #### Type of Change This is an enhancement. * Accepting spaCy Contributor Agreement * Signed on my behalf as an individual --- .github/contributors/Arvindcheenu.md | 106 +++++++++++++++++++++++++++ spacy/lang/ta/examples.py | 5 ++ 2 files changed, 111 insertions(+) create mode 100644 .github/contributors/Arvindcheenu.md diff --git a/.github/contributors/Arvindcheenu.md b/.github/contributors/Arvindcheenu.md new file mode 100644 index 000000000..707a9821d --- /dev/null +++ b/.github/contributors/Arvindcheenu.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Arvind Srinivasan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-13 | +| GitHub username | arvindcheenu | +| Website (optional) | | diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 3ce3c3544..c34e77129 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -18,4 +18,9 @@ sentences = [ "இந்த ஃபோனுடன் சுமார் ரூ.2,990 மதிப்புள்ள போட் ராக்கர்ஸ் நிறுவனத்தின் ஸ்போர்ட் புளூடூத் ஹெட்போன்ஸ் இலவசமாக வழங்கப்படவுள்ளது.", "மட்டக்களப்பில் பல இடங்களில் வீட்டுத் திட்டங்களுக்கு இன்று அடிக்கல் நாட்டல்", "ஐ போன்க்கு முகத்தை வைத்து அன்லாக் செய்யும் முறை மற்றும் விரலால் தொட்டு அன்லாக் செய்யும் முறையை வாட்ஸ் ஆப் நிறுவனம் இதற்கு முன் கண்டுபிடித்தது", + "இது ஒரு வாக்கியம்.", + "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது", + "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன", + "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது", + "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்." ] From c482f20778f3464fefbc7aa57782de5fe713a77f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 15 Jun 2020 14:56:04 +0200 Subject: [PATCH 084/119] Fix and add warnings related to spacy-lookups-data (#5588) * Fix warning message for lemmatization tables * Add a warning when the `lexeme_norm` table is empty. (Given the relatively lang-specific loading for `Lookups`, it seemed like too much overhead to dynamically extract the list of languages, so for now it's hard-coded.) --- spacy/errors.py | 13 ++++++++++--- spacy/pipeline/pipes.pyx | 2 ++ spacy/syntax/nn_parser.pyx | 5 ++++- spacy/tests/parser/test_ner.py | 17 +++++++++++++++++ spacy/tests/test_lemmatizer.py | 6 +++--- 5 files changed, 36 insertions(+), 7 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index baed574f8..a25661a20 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -92,9 +92,9 @@ class Warnings(object): W022 = ("Training a new part-of-speech tagger using a model with no " "lemmatization rules or data. This means that the trained model " "may not be able to lemmatize correctly. If this is intentional " - "or the language you're using doesn't have lemmatization data. " - "If this is surprising, make sure you have the spacy-lookups-data " - "package installed.") + "or the language you're using doesn't have lemmatization data, " + "please ignore this warning. If this is surprising, make sure you " + "have the spacy-lookups-data package installed.") W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " @@ -127,6 +127,13 @@ class Warnings(object): "this, download a newer compatible model or retrain your custom " "model with the current spaCy version. For more details and " "available updates, run: python -m spacy validate") + W033 = ("Training a new {model} using a model with no lexeme normalization " + "table. This may degrade the performance of the model to some " + "degree. If this is intentional or the language you're using " + "doesn't have a normalization table, please ignore this warning. " + "If this is surprising, make sure you have the spacy-lookups-data " + "package installed. 
The languages with lexeme normalization tables " + "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 01472a6d0..3f40cb545 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -516,6 +516,8 @@ class Tagger(Pipe): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): warnings.warn(Warnings.W022) + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="part-of-speech tagger")) orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() for raw_text, annots_brackets in get_gold_tuples(): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index d5c6bf2a8..6944e9113 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -26,6 +26,7 @@ from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module from thinc.linalg cimport Vec, VecVec import srsly +import warnings from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid @@ -37,7 +38,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse -from ..errors import Errors, TempErrors +from ..errors import Errors, TempErrors, Warnings from .. import util from .stateclass cimport StateClass from ._state cimport StateC @@ -601,6 +602,8 @@ cdef class Parser: **self.cfg.get('optimizer', {})) def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg): + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="parser or NER")) if 'model' in cfg: self.model = cfg['model'] if not hasattr(get_gold_tuples, '__call__'): diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 244e9fa25..dd623e07f 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -4,6 +4,8 @@ from __future__ import unicode_literals import pytest from spacy.lang.en import English +from spacy.language import Language +from spacy.lookups import Lookups from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown @@ -305,6 +307,21 @@ def test_change_number_features(): nlp("hello world") +def test_ner_warns_no_lookups(): + nlp = Language() + nlp.vocab.lookups = Lookups() + assert not len(nlp.vocab.lookups) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + with pytest.warns(UserWarning): + nlp.begin_training() + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" + with pytest.warns(None) as record: + nlp.begin_training() + assert not record.list + + class BlockerComponent1(object): name = "my_blocker" diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index bcda2999a..fce3772c4 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -33,17 +33,17 @@ def test_lemmatizer_reflects_lookups_changes(): assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world" -def test_tagger_warns_no_lemma_lookups(): +def test_tagger_warns_no_lookups(): nlp = Language() nlp.vocab.lookups = Lookups() assert not len(nlp.vocab.lookups) tagger = nlp.create_pipe("tagger") - with pytest.warns(UserWarning): - 
tagger.begin_training() nlp.add_pipe(tagger) with pytest.warns(UserWarning): nlp.begin_training() nlp.vocab.lookups.add_table("lemma_lookup") + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" with pytest.warns(None) as record: nlp.begin_training() assert not record.list From c94f7d0e75e9e4ce25b719edee3adb4ecd74ee50 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 15 Jun 2020 14:56:51 +0200 Subject: [PATCH 085/119] Updates to docstrings (#5589) --- spacy/gold.pyx | 1 + spacy/vocab.pyx | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index cf67a2ac7..e69ff5933 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -640,6 +640,7 @@ cdef class GoldParse: representing the external IDs in a knowledge base (KB) mapped to either 1.0 or 0.0, indicating positive and negative examples respectively. + make_projective (bool): Whether to projectivize the dependency tree. RETURNS (GoldParse): The newly constructed object. """ self.mem = Pool() diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 68f0ac0db..1b1b04e13 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -46,7 +46,8 @@ cdef class Vocab: vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. lookups_extra (Lookups): Container for optional lookup tables and dictionaries. - name (unicode): Optional name to identify the vectors table. + oov_prob (float): Default OOV probability. + vectors_name (unicode): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} From f698007907518833d853740a5add8cd2b2a253b1 Mon Sep 17 00:00:00 2001 From: Arvind Srinivasan Date: Sat, 13 Jun 2020 19:26:26 +0530 Subject: [PATCH 086/119] Added Tamil Example Sentences (#5583) * Added Examples for Tamil Sentences #### Description This PR add example sentences for the Tamil language which were missing as per issue #1107 #### Type of Change This is an enhancement. * Accepting spaCy Contributor Agreement * Signed on my behalf as an individual --- .github/contributors/Arvindcheenu.md | 106 +++++++++++++++++++++++++++ spacy/lang/ta/examples.py | 5 ++ 2 files changed, 111 insertions(+) create mode 100644 .github/contributors/Arvindcheenu.md diff --git a/.github/contributors/Arvindcheenu.md b/.github/contributors/Arvindcheenu.md new file mode 100644 index 000000000..707a9821d --- /dev/null +++ b/.github/contributors/Arvindcheenu.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. 
These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Arvind Srinivasan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-13 | +| GitHub username | arvindcheenu | +| Website (optional) | | diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 3ce3c3544..c34e77129 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -18,4 +18,9 @@ sentences = [ "இந்த ஃபோனுடன் சுமார் ரூ.2,990 மதிப்புள்ள போட் ராக்கர்ஸ் நிறுவனத்தின் ஸ்போர்ட் புளூடூத் ஹெட்போன்ஸ் இலவசமாக வழங்கப்படவுள்ளது.", "மட்டக்களப்பில் பல இடங்களில் வீட்டுத் திட்டங்களுக்கு இன்று அடிக்கல் நாட்டல்", "ஐ போன்க்கு முகத்தை வைத்து அன்லாக் செய்யும் முறை மற்றும் விரலால் தொட்டு அன்லாக் செய்யும் முறையை வாட்ஸ் ஆப் நிறுவனம் இதற்கு முன் கண்டுபிடித்தது", + "இது ஒரு வாக்கியம்.", + "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது", + "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன", + "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது", + "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்." ] From e867e9fa8ffe8b7eec9185bb1d35c39c835458d1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 15 Jun 2020 14:56:04 +0200 Subject: [PATCH 087/119] Fix and add warnings related to spacy-lookups-data (#5588) * Fix warning message for lemmatization tables * Add a warning when the `lexeme_norm` table is empty. (Given the relatively lang-specific loading for `Lookups`, it seemed like too much overhead to dynamically extract the list of languages, so for now it's hard-coded.) --- spacy/errors.py | 13 ++++++++++--- spacy/pipeline/pipes.pyx | 2 ++ spacy/syntax/nn_parser.pyx | 5 ++++- spacy/tests/parser/test_ner.py | 17 +++++++++++++++++ spacy/tests/test_lemmatizer.py | 6 +++--- 5 files changed, 36 insertions(+), 7 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index baed574f8..a25661a20 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -92,9 +92,9 @@ class Warnings(object): W022 = ("Training a new part-of-speech tagger using a model with no " "lemmatization rules or data. This means that the trained model " "may not be able to lemmatize correctly. If this is intentional " - "or the language you're using doesn't have lemmatization data. " - "If this is surprising, make sure you have the spacy-lookups-data " - "package installed.") + "or the language you're using doesn't have lemmatization data, " + "please ignore this warning. If this is surprising, make sure you " + "have the spacy-lookups-data package installed.") W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " @@ -127,6 +127,13 @@ class Warnings(object): "this, download a newer compatible model or retrain your custom " "model with the current spaCy version. For more details and " "available updates, run: python -m spacy validate") + W033 = ("Training a new {model} using a model with no lexeme normalization " + "table. This may degrade the performance of the model to some " + "degree. 
If this is intentional or the language you're using " + "doesn't have a normalization table, please ignore this warning. " + "If this is surprising, make sure you have the spacy-lookups-data " + "package installed. The languages with lexeme normalization tables " + "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 01472a6d0..3f40cb545 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -516,6 +516,8 @@ class Tagger(Pipe): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): warnings.warn(Warnings.W022) + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="part-of-speech tagger")) orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() for raw_text, annots_brackets in get_gold_tuples(): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index d5c6bf2a8..6944e9113 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -26,6 +26,7 @@ from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module from thinc.linalg cimport Vec, VecVec import srsly +import warnings from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid @@ -37,7 +38,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse -from ..errors import Errors, TempErrors +from ..errors import Errors, TempErrors, Warnings from .. import util from .stateclass cimport StateClass from ._state cimport StateC @@ -601,6 +602,8 @@ cdef class Parser: **self.cfg.get('optimizer', {})) def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg): + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="parser or NER")) if 'model' in cfg: self.model = cfg['model'] if not hasattr(get_gold_tuples, '__call__'): diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 244e9fa25..dd623e07f 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -4,6 +4,8 @@ from __future__ import unicode_literals import pytest from spacy.lang.en import English +from spacy.language import Language +from spacy.lookups import Lookups from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown @@ -305,6 +307,21 @@ def test_change_number_features(): nlp("hello world") +def test_ner_warns_no_lookups(): + nlp = Language() + nlp.vocab.lookups = Lookups() + assert not len(nlp.vocab.lookups) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + with pytest.warns(UserWarning): + nlp.begin_training() + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" + with pytest.warns(None) as record: + nlp.begin_training() + assert not record.list + + class BlockerComponent1(object): name = "my_blocker" diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index bcda2999a..fce3772c4 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -33,17 +33,17 @@ def test_lemmatizer_reflects_lookups_changes(): assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world" -def 
test_tagger_warns_no_lemma_lookups(): +def test_tagger_warns_no_lookups(): nlp = Language() nlp.vocab.lookups = Lookups() assert not len(nlp.vocab.lookups) tagger = nlp.create_pipe("tagger") - with pytest.warns(UserWarning): - tagger.begin_training() nlp.add_pipe(tagger) with pytest.warns(UserWarning): nlp.begin_training() nlp.vocab.lookups.add_table("lemma_lookup") + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" with pytest.warns(None) as record: nlp.begin_training() assert not record.list From 0d8405aafac08353d91ead0cf060fd2962e540da Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 15 Jun 2020 14:56:51 +0200 Subject: [PATCH 088/119] Updates to docstrings (#5589) --- spacy/gold.pyx | 1 + spacy/vocab.pyx | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index cf67a2ac7..e69ff5933 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -640,6 +640,7 @@ cdef class GoldParse: representing the external IDs in a knowledge base (KB) mapped to either 1.0 or 0.0, indicating positive and negative examples respectively. + make_projective (bool): Whether to projectivize the dependency tree. RETURNS (GoldParse): The newly constructed object. """ self.mem = Pool() diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 68f0ac0db..1b1b04e13 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -46,7 +46,8 @@ cdef class Vocab: vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. lookups_extra (Lookups): Container for optional lookup tables and dictionaries. - name (unicode): Optional name to identify the vectors table. + oov_prob (float): Default OOV probability. + vectors_name (unicode): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. 
""" lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} From 7ff447c5a0198600bfb8f4a43b042a6ed8276126 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 15 Jun 2020 18:22:25 +0200 Subject: [PATCH 089/119] Set version to v2.3.0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 90b5f9245..91810fa68 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.3.0.dev1" +__version__ = "2.3.0" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From d5110ffbf2474339ffde948fc6d899873484285e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 16 Jun 2020 15:37:35 +0200 Subject: [PATCH 090/119] Documentation updates for v2.3.0 (#5593) * Update website models for v2.3.0 * Add docs for Chinese word segmentation * Tighten up Chinese docs section * Merge branch 'master' into docs/v2.3.0 [ci skip] * Merge branch 'master' into docs/v2.3.0 [ci skip] * Auto-format and update version * Update matcher.md * Update languages and sorting * Typo in landing page * Infobox about token_match behavior * Add meta and basic docs for Japanese * POS -> TAG in models table * Add info about lookups for normalization * Updates to API docs for v2.3 * Update adding norm exceptions for adding languages * Add --omit-extra-lookups to CLI API docs * Add initial draft of "What's New in v2.3" * Add new in v2.3 tags to Chinese and Japanese sections * Add tokenizer to migration section * Add new in v2.3 flags to init-model * Typo * More what's new in v2.3 Co-authored-by: Ines Montani --- README.md | 17 +- website/docs/api/cli.md | 21 ++- website/docs/api/cython-structs.md | 3 - website/docs/api/goldparse.md | 1 + website/docs/api/lexeme.md | 2 +- website/docs/api/matcher.md | 11 +- website/docs/api/sentencizer.md | 2 +- website/docs/api/token.md | 2 +- website/docs/api/vocab.md | 3 + website/docs/usage/adding-languages.md | 34 +++- website/docs/usage/linguistic-features.md | 23 ++- website/docs/usage/models.md | 117 ++++++++++++ website/docs/usage/v2-3.md | 213 ++++++++++++++++++++++ website/meta/languages.json | 149 ++++++++++----- website/meta/sidebars.json | 1 + website/src/templates/models.js | 2 +- website/src/widgets/landing.js | 2 +- website/src/widgets/languages.js | 4 +- 18 files changed, 519 insertions(+), 88 deletions(-) create mode 100644 website/docs/usage/v2-3.md diff --git a/README.md b/README.md index 31dc78d63..4b5f3d0fa 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,12 @@ spaCy is a library for advanced Natural Language Processing in Python and Cython. It's built on the very latest research, and was designed from day one to be used in real products. spaCy comes with [pretrained statistical models](https://spacy.io/models) and word vectors, and -currently supports tokenization for **50+ languages**. It features +currently supports tokenization for **60+ languages**. It features state-of-the-art speed, convolutional **neural network models** for tagging, parsing and **named entity recognition** and easy **deep learning** integration. It's commercial open-source software, released under the MIT license. 
-💫 **Version 2.2 out now!** +💫 **Version 2.3 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines]()](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) @@ -32,7 +32,7 @@ It's commercial open-source software, released under the MIT license. | --------------- | -------------------------------------------------------------- | | [spaCy 101] | New to spaCy? Here's everything you need to know! | | [Usage Guides] | How to use spaCy and its features. | -| [New in v2.2] | New features, backwards incompatibilities and migration guide. | +| [New in v2.3] | New features, backwards incompatibilities and migration guide. | | [API Reference] | The detailed reference for spaCy's API. | | [Models] | Download statistical language models for spaCy. | | [Universe] | Libraries, extensions, demos, books and courses. | @@ -40,7 +40,7 @@ It's commercial open-source software, released under the MIT license. | [Contribute] | How to contribute to the spaCy project and code base. | [spacy 101]: https://spacy.io/usage/spacy-101 -[new in v2.2]: https://spacy.io/usage/v2-2 +[new in v2.3]: https://spacy.io/usage/v2-3 [usage guides]: https://spacy.io/usage/ [api reference]: https://spacy.io/api/ [models]: https://spacy.io/models @@ -113,12 +113,13 @@ of `v2.0.13`). pip install spacy ``` -To install additional data tables for lemmatization in **spaCy v2.2+** you can -run `pip install spacy[lookups]` or install +To install additional data tables for lemmatization and normalization in +**spaCy v2.2+** you can run `pip install spacy[lookups]` or install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) separately. The lookups package is needed to create blank models with -lemmatization data, and to lemmatize in languages that don't yet come with -pretrained models and aren't powered by third-party libraries. +lemmatization data for v2.2+ plus normalization data for v2.3+, and to +lemmatize in languages that don't yet come with pretrained models and aren't +powered by third-party libraries. When using pip it is generally recommended to install packages in a virtual environment to avoid modifying system state: diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 6f4b8bb73..fe8877c69 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -541,16 +541,17 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. 
| -| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. | +| Argument | Type | Description | +| ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | +| `--omit-extra-lookups`, `-OEL` 2.3 | flag | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. | ## Evaluate {#evaluate new="2"} diff --git a/website/docs/api/cython-structs.md b/website/docs/api/cython-structs.md index 935bce25d..8ee1f1b9a 100644 --- a/website/docs/api/cython-structs.md +++ b/website/docs/api/cython-structs.md @@ -171,9 +171,6 @@ struct. | `shape` | `attr_t` | Transform of the lexeme's string, to show orthographic features. | | `prefix` | `attr_t` | Length-N substring from the start of the lexeme. Defaults to `N=1`. | | `suffix` | `attr_t` | Length-N substring from the end of the lexeme. Defaults to `N=3`. | -| `cluster` | `attr_t` | Brown cluster ID. | -| `prob` | `float` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | -| `sentiment` | `float` | A scalar value indicating positivity or negativity. | ### Lexeme.get_struct_attr {#lexeme_get_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"} diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 443913311..5df625991 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -22,6 +22,7 @@ missing – the gradient for those labels will be zero. | `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. 
| | `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). | | `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). | +| `make_projective` | bool | Whether to projectivize the dependency tree. Defaults to `False.`. | | **RETURNS** | `GoldParse` | The newly constructed object. | ## GoldParse.\_\_len\_\_ {#len tag="method"} diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index feb167a9d..f7f6d654c 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -156,7 +156,7 @@ The L2 norm of the lexeme's vector representation. | `like_url` | bool | Does the lexeme resemble a URL? | | `like_num` | bool | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. | | `like_email` | bool | Does the lexeme resemble an email address? | -| `is_oov` | bool | Is the lexeme out-of-vocabulary? | +| `is_oov` | bool | Does the lexeme have a word vector? | | `is_stop` | bool | Is the lexeme part of a "stop list"? | | `lang` | int | Language of the parent vocabulary. | | `lang_` | unicode | Language of the parent vocabulary. | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index bfd4fb0ec..ac2f898e0 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -40,7 +40,8 @@ string where an integer is expected) or unexpected property names. ## Matcher.\_\_call\_\_ {#call tag="method"} -Find all token sequences matching the supplied patterns on the `Doc`. +Find all token sequences matching the supplied patterns on the `Doc`. As of +spaCy v2.3, the `Matcher` can also be called on `Span` objects. > #### Example > @@ -54,10 +55,10 @@ Find all token sequences matching the supplied patterns on the `Doc`. > matches = matcher(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `doc` | `Doc` | The document to match over. | -| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. | +| Name | Type | Description | +| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3).. | +| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. | diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index c9b935f22..5a1ea162a 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -42,7 +42,7 @@ Initialize the sentencizer. | Name | Type | Description | | ------------- | ------------- | ------------------------------------------------------------------------------------------------------ | -| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. 
Defaults to `[".", "!", "?"].` | +| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. Defaults to `['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。']`. | | **RETURNS** | `Sentencizer` | The newly constructed object. | ## Sentencizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 0fa86b7bc..9f8594c96 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -459,7 +459,7 @@ The L2 norm of the token's vector representation. | `like_url` | bool | Does the token resemble a URL? | | `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | | `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Is the token out-of-vocabulary? | +| `is_oov` | bool | Does the token have a word vector? | | `is_stop` | bool | Is the token part of a "stop list"? | | `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | | `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index e024ab54a..2be6d67ed 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -27,6 +27,9 @@ Create the vocabulary. | `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. | | `lemmatizer` | object | A lemmatizer. Defaults to `None`. | | `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | +| `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. | +| `lookups_extra` 2.3 | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. | +| `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. | | `vectors_name` 2.2 | unicode | A name to identify the vectors table. | | **RETURNS** | `Vocab` | The newly constructed object. | diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 29de08266..d42aad705 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -297,9 +297,35 @@ though `$` and `€` are very different, spaCy normalizes them both to `$`. This way, they'll always be seen as similar, no matter how common they were in the training data. -Norm exceptions can be provided as a simple dictionary. For more examples, see -the English -[`norm_exceptions.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/norm_exceptions.py). 
+As of spaCy v2.3, language-specific norm exceptions are provided as a +JSON dictionary in the package +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) rather +than in the main library. For a full example, see +[`en_lexeme_norm.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_lexeme_norm.json). + +```json +### Example +{ + "cos": "because", + "fav": "favorite", + "accessorise": "accessorize", + "accessorised": "accessorized" +} +``` + +If you're adding tables for a new languages, be sure to add the tables to +[`spacy_lookups_data/__init__.py`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/__init__.py) +and register the entry point under `spacy_lookups` in +[`setup.cfg`](https://github.com/explosion/spacy-lookups-data/blob/master/setup.cfg). + +Alternatively, you can initialize your language [`Vocab`](/api/vocab) with a +[`Lookups`](/api/lookups) object that includes the table `lexeme_norm`. + + + +Previously in spaCy v2.0-v2.2, norm exceptions were provided as a simple python +dictionary. For more examples, see the English +[`norm_exceptions.py`](https://github.com/explosion/spaCy/tree/v2.2.x/spacy/lang/en/norm_exceptions.py). ```python ### Example @@ -327,6 +353,8 @@ norm exceptions overwrite any of the global exceptions, they should be added first. Also note that the tokenizer exceptions will always have priority over the attribute getters. + + ### Lexical attributes {#lex-attrs new="2"} spaCy provides a range of [`Token` attributes](/api/token#attributes) that diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index bcc943436..84bb3d71b 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -732,7 +732,7 @@ rather than performance: ```python def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, - infix_finditer, token_match): + infix_finditer, token_match, url_match): tokens = [] for substring in text.split(): suffixes = [] @@ -829,7 +829,7 @@ for t in tok_exp: ### Customizing spaCy's Tokenizer class {#native-tokenizers} Let's imagine you wanted to create a tokenizer for a new language or specific -domain. There are five things you would need to define: +domain. There are six things you may need to define: 1. A dictionary of **special cases**. This handles things like contractions, units of measurement, emoticons, certain abbreviations, etc. @@ -840,9 +840,22 @@ domain. There are five things you would need to define: 4. A function `infixes_finditer`, to handle non-whitespace separators, such as hyphens etc. 5. An optional boolean function `token_match` matching strings that should never - be split, overriding the infix rules. Useful for things like URLs or numbers. + be split, overriding the infix rules. Useful for things like numbers. 6. An optional boolean function `url_match`, which is similar to `token_match` - except prefixes and suffixes are removed before applying the match. + except that prefixes and suffixes are removed before applying the match. + + + +In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match` +above and there was no match pattern applied before prefixes and suffixes were +analyzed. As of spaCy v2.3.0, the `token_match` has been reverted to its +behavior in v2.2.1 and earlier with precedence over prefixes and suffixes. 
+ +The `url_match` is introduced in v2.3.0 to handle cases like URLs where the +tokenizer should remove prefixes and suffixes (e.g., a comma at the end of a +URL) before applying the match. + + You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its @@ -865,7 +878,7 @@ def custom_tokenizer(nlp): prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer, - token_match=simple_url_re.match) + url_match=simple_url_re.match) nlp = spacy.load("en_core_web_sm") nlp.tokenizer = custom_tokenizer(nlp) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 5fd92f8f3..382193157 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -85,6 +85,123 @@ To load your model with the neutral, multi-language class, simply set `meta.json`. You can also import the class directly, or call [`util.get_lang_class()`](/api/top-level#util.get_lang_class) for lazy-loading. +### Chinese language support {#chinese new=2.3} + +The Chinese language class supports three word segmentation options: + +> ```python +> from spacy.lang.zh import Chinese +> +> # Disable jieba to use character segmentation +> Chinese.Defaults.use_jieba = False +> nlp = Chinese() +> +> # Disable jieba through tokenizer config options +> cfg = {"use_jieba": False} +> nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +> +> # Load with "default" model provided by pkuseg +> cfg = {"pkuseg_model": "default", "require_pkuseg": True} +> nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +> ``` + +1. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word + segmentation by default. It's enabled when you create a new `Chinese` + language class or call `spacy.blank("zh")`. +2. **Character segmentation:** Character segmentation is supported by disabling + `jieba` and setting `Chinese.Defaults.use_jieba = False` _before_ + initializing the language class. As of spaCy v2.3.0, the `meta` tokenizer + config options can be used to configure `use_jieba`. +3. **PKUSeg**: In spaCy v2.3.0, support for + [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support + better segmentation for Chinese OntoNotes and the new + [Chinese models](/models/zh). + + + +The `meta` argument of the `Chinese` language class supports the following +following tokenizer config settings: + +| Name | Type | Description | +| ------------------ | ------- | ---------------------------------------------------------------------------------------------------- | +| `pkuseg_model` | unicode | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory. | +| `pkuseg_user_dict` | unicode | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. | +| `require_pkuseg` | bool | Overrides all `jieba` settings (optional but strongly recommended). 
| + +```python +### Examples +# Load "default" model +cfg = {"pkuseg_model": "default", "require_pkuseg": True} +nlp = Chinese(meta={"tokenizer": {"config": cfg}}) + +# Load local model +cfg = {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True} +nlp = Chinese(meta={"tokenizer": {"config": cfg}}) + +# Override the user directory +cfg = {"pkuseg_model": "default", "require_pkuseg": True, "pkuseg_user_dict": "/path"} +nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +``` + +You can also modify the user dictionary on-the-fly: + +```python +# Append words to user dict +nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"]) + +# Remove all words from user dict and replace with new words +nlp.tokenizer.pkuseg_update_user_dict(["中国"], reset=True) + +# Remove all words from user dict +nlp.tokenizer.pkuseg_update_user_dict([], reset=True) +``` + + + + + +The [Chinese models](/models/zh) provided by spaCy include a custom `pkuseg` +model trained only on +[Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the +models provided by `pkuseg` include data restricted to research use. For +research use, `pkuseg` provides models for several different domains +(`"default"`, `"news"` `"web"`, `"medicine"`, `"tourism"`) and for other uses, +`pkuseg` provides a simple +[training API](https://github.com/lancopku/pkuseg-python/blob/master/readme/readme_english.md#usage): + +```python +import pkuseg +from spacy.lang.zh import Chinese + +# Train pkuseg model +pkuseg.train("train.utf8", "test.utf8", "/path/to/pkuseg_model") +# Load pkuseg model in spaCy Chinese tokenizer +nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}}}) +``` + + + +### Japanese language support {#japanese new=2.3} + +> ```python +> from spacy.lang.ja import Japanese +> +> # Load SudachiPy with split mode A (default) +> nlp = Japanese() +> +> # Load SudachiPy with split mode B +> cfg = {"split_mode": "B"} +> nlp = Japanese(meta={"tokenizer": {"config": cfg}}) +> ``` + +The Japanese language class uses +[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word +segmentation and part-of-speech tagging. The default Japanese language class +and the provided Japanese models use SudachiPy split mode `A`. + +The `meta` argument of the `Japanese` language class can be used to configure +the split mode to `A`, `B` or `C`. + ## Installing and using models {#download} > #### Downloading models in spaCy < v1.7 diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md new file mode 100644 index 000000000..ba75b01ab --- /dev/null +++ b/website/docs/usage/v2-3.md @@ -0,0 +1,213 @@ +--- +title: What's New in v2.3 +teaser: New features, backwards incompatibilities and migration guide +menu: + - ['New Features', 'features'] + - ['Backwards Incompatibilities', 'incompat'] + - ['Migrating from v2.2', 'migrating'] +--- + +## New Features {#features hidden="true"} + +spaCy v2.3 features new pretrained models for five languages, word vectors for +all language models, and decreased model size and loading times for models with +vectors. We've added pretrained models for **Chinese, Danish, Japanese, Polish +and Romanian** and updated the training data and vectors for most languages. +Model packages with vectors are about **2×** smaller on disk and load +**2-4×** faster. For the full changelog, see the [release notes on +GitHub](https://github.com/explosion/spaCy/releases/tag/v2.3.0). 
For more +details and a behind-the-scenes look at the new release, [see our blog +post](https://explosion.ai/blog/spacy-v2-3). + +### Expanded model families with vectors {#models} + +> #### Example +> +> ```bash +> python -m spacy download da_core_news_sm +> python -m spacy download ja_core_news_sm +> python -m spacy download pl_core_news_sm +> python -m spacy download ro_core_news_sm +> python -m spacy download zh_core_web_sm +> ``` + +With new model families for Chinese, Danish, Polish, Romanian and Chinese plus +`md` and `lg` models with word vectors for all languages, this release provides +a total of 46 model packages. For models trained using [Universal +Dependencies](https://universaldependencies.org) corpora, the training data has +been updated to UD v2.5 (v2.6 for Japanese, v2.3 for Polish) and Dutch has been +extended to include both UD Dutch Alpino and LassySmall. + + + +**Models:** [Models directory](/models) **Benchmarks: ** +[Release notes](https://github.com/explosion/spaCy/releases/tag/v2.3.0) + + + +### Chinese {#chinese} + +> #### Example +> ```python +> from spacy.lang.zh import Chinese +> +> # Load with "default" model provided by pkuseg +> cfg = {"pkuseg_model": "default", "require_pkuseg": True} +> nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +> +> # Append words to user dict +> nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"]) + +This release adds support for +[pkuseg](https://github.com/lancopku/pkuseg-python) for word segmentation and +the new Chinese models ship with a custom pkuseg model trained on OntoNotes. +The Chinese tokenizer can be initialized with both `pkuseg` and custom models +and the `pkuseg` user dictionary is easy to customize. + + + +**Chinese:** [Chinese tokenizer usage](/usage/models#chinese) + + + +### Japanese {#japanese} + +The updated Japanese language class switches to +[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word +segmentation and part-of-speech tagging. Using `sudachipy` greatly simplifies +installing spaCy for Japanese, which is now possible with a single command: +`pip install spacy[ja]`. + + + +**Japanese:** [Japanese tokenizer usage](/usage/models#japanese) + + + +### Small CLI updates + +- `spacy debug-data` provides the coverage of the vectors in a base model with + `spacy debug-data lang train dev -b base_model` +- `spacy evaluate` supports `blank:lg` (e.g. `spacy evaluate blank:en + dev.json`) to evaluate the tokenization accuracy without loading a model +- `spacy train` on GPU restricts the CPU timing evaluation to the first + iteration + +## Backwards incompatibilities {#incompat} + + + +If you've been training **your own models**, you'll need to **retrain** them +with the new version. Also don't forget to upgrade all models to the latest +versions. Models for earlier v2 releases (v2.0, v2.1, v2.2) aren't compatible +with models for v2.3. To check if all of your models are up to date, you can +run the [`spacy validate`](/api/cli#validate) command. + + + +> #### Install with lookups data +> +> ```bash +> $ pip install spacy[lookups] +> ``` +> +> You can also install +> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) +> directly. + +- If you're training new models, you'll want to install the package + [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data), + which now includes both the lemmatization tables (as in v2.2) and the + normalization tables (new in v2.3). 
If you're using pretrained models, + **nothing changes**, because the relevant tables are included in the model + packages. +- Due to the updated Universal Dependencies training data, the fine-grained + part-of-speech tags will change for many provided language models. The + coarse-grained part-of-speech tagset remains the same, but the mapping from + particular fine-grained to coarse-grained tags may show minor differences. +- For French, Italian, Portuguese and Spanish, the fine-grained part-of-speech + tagsets contain new merged tags related to contracted forms, such as + `ADP_DET` for French `"au"`, which maps to UPOS `ADP` based on the head + `"à"`. This increases the accuracy of the models by improving the alignment + between spaCy's tokenization and Universal Dependencies multi-word tokens + used for contractions. + +### Migrating from spaCy 2.2 {#migrating} + +#### Tokenizer settings + +In spaCy v2.2.2-v2.2.4, there was a change to the precedence of `token_match` +that gave prefixes and suffixes priority over `token_match`, which caused +problems for many custom tokenizer configurations. This has been reverted in +v2.3 so that `token_match` has priority over prefixes and suffixes as in v2.2.1 +and earlier versions. + +A new tokenizer setting `url_match` has been introduced in v2.3.0 to handle +cases like URLs where the tokenizer should remove prefixes and suffixes (e.g., +a comma at the end of a URL) before applying the match. See the full [tokenizer +documentation](/usage/linguistic-features#tokenization) and try out +[`nlp.tokenizer.explain()`](/usage/linguistic-features#tokenizer-debug) when +debugging your tokenizer configuration. + +#### Warnings configuration + +spaCy's custom warnings have been replaced with native python +[`warnings`](https://docs.python.org/3/library/warnings.html). Instead of +setting `SPACY_WARNING_IGNORE`, use the [warnings +filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) +to manage warnings. + +#### Normalization tables + +The normalization tables have moved from the language data in +[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) to +the package +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). If +you're adding data for a new language, the normalization table should be added +to `spacy-lookups-data`. See [adding norm +exceptions](/usage/adding-languages#norm-exceptions). + +#### Probability and cluster features + +> #### Load and save extra prob lookups table +> +> ```python +> from spacy.lang.en import English +> nlp = English() +> doc = nlp("the") +> print(doc[0].prob) # lazily loads extra prob table +> nlp.to_disk("/path/to/model") # includes prob table +> ``` + +The `Token.prob` and `Token.cluster` features, which are no longer used by the +core pipeline components as of spaCy v2, are no longer provided in the +pretrained models to reduce the model size. To keep these features available +for users relying on them, the `prob` and `cluster` features for the most +frequent 1M tokens have been moved to +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) as +`extra` features for the relevant languages (English, German, Greek and +Spanish). + +The extra tables are loaded lazily, so if you have `spacy-lookups-data` +installed and your code accesses `Token.prob`, the full table is loaded into +the model vocab, which will take a few seconds on initial loading. 
When you +save this model after loading the `prob` table, the full `prob` table will be +saved as part of the model vocab. + +If you'd like to include custom `cluster`, `prob`, or `sentiment` tables as +part of a new model, add the data to +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) under +the entry point `lg_extra`, e.g. `en_extra` for English. Alternatively, you can +initialize your [`Vocab`](/api/vocab) with the `lookups_extra` argument with a +[`Lookups`](/api/lookups) object that includes the tables `lexeme_cluster`, +`lexeme_prob`, `lexeme_sentiment` or `lexeme_settings`. `lexeme_settings` is +currently only used to provide a custom `oov_prob`. See examples in the [`data` +directory](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data) +in `spacy-lookups-data`. + +#### Initializing new models without extra lookups tables + +When you initialize a new model with [`spacy init-model`](/api/cli#init-model), +the `prob` table from `spacy-lookups-data` may be loaded as part of the +initialization. If you'd like to omit this extra data as in spaCy's provided +v2.3 models, use the new flag `--omit-extra-lookups`. diff --git a/website/meta/languages.json b/website/meta/languages.json index 41c1bce7f..facfc3541 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -1,5 +1,35 @@ { "languages": [ + { + "code": "zh", + "name": "Chinese", + "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"], + "dependencies": [ + { + "name": "Jieba", + "url": "https://github.com/fxsjy/jieba" + }, + { + "name": "PKUSeg", + "url": "https://github.com/lancopku/PKUSeg-python" + } + ], + "has_examples": true + }, + { + "code": "da", + "name": "Danish", + "example": "Dette er en sætning.", + "has_examples": true, + "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"] + }, + { + "code": "nl", + "name": "Dutch", + "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"], + "example": "Dit is een zin.", + "has_examples": true + }, { "code": "en", "name": "English", @@ -14,68 +44,91 @@ "example": "This is a sentence.", "has_examples": true }, + { + "code": "fr", + "name": "French", + "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"], + "example": "C'est une phrase.", + "has_examples": true + }, { "code": "de", "name": "German", - "models": ["de_core_news_sm", "de_core_news_md"], + "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"], "starters": ["de_trf_bertbasecased_lg"], "example": "Dies ist ein Satz.", "has_examples": true }, { - "code": "fr", - "name": "French", - "models": ["fr_core_news_sm", "fr_core_news_md"], - "example": "C'est une phrase.", - "has_examples": true - }, - { - "code": "es", - "name": "Spanish", - "models": ["es_core_news_sm", "es_core_news_md"], - "example": "Esto es una frase.", - "has_examples": true - }, - { - "code": "pt", - "name": "Portuguese", - "models": ["pt_core_news_sm"], - "example": "Esta é uma frase.", + "code": "el", + "name": "Greek", + "models": ["el_core_news_sm", "el_core_news_md", "el_core_news_lg"], + "example": "Αυτή είναι μια πρόταση.", "has_examples": true }, { "code": "it", "name": "Italian", - "models": ["it_core_news_sm"], + "models": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"], "example": "Questa è una frase.", "has_examples": true }, { - "code": "nl", - "name": "Dutch", - "models": ["nl_core_news_sm"], - "example": "Dit is een zin.", + "code": "ja", + "name": "Japanese", + "models": ["ja_core_news_sm", 
"ja_core_news_md", "ja_core_news_lg"], + "dependencies": [ + { + "name": "SudachiPy", + "url": "https://github.com/WorksApplications/SudachiPy" + } + ], "has_examples": true }, { - "code": "el", - "name": "Greek", - "models": ["el_core_news_sm", "el_core_news_md"], - "example": "Αυτή είναι μια πρόταση.", - "has_examples": true + "code": "lt", + "name": "Lithuanian", + "has_examples": true, + "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"] }, - { "code": "sv", "name": "Swedish", "has_examples": true }, - { "code": "fi", "name": "Finnish", "has_examples": true }, { "code": "nb", "name": "Norwegian Bokmål", "example": "Dette er en setning.", "has_examples": true, - "models": ["nb_core_news_sm"] + "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"] }, - { "code": "da", "name": "Danish", "example": "Dette er en sætning.", "has_examples": true }, + { + "code": "pl", + "name": "Polish", + "example": "To jest zdanie.", + "has_examples": true, + "models": ["pl_core_news_sm", "pl_core_news_md", "pl_core_news_lg"] + }, + { + "code": "pt", + "name": "Portuguese", + "models": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"], + "example": "Esta é uma frase.", + "has_examples": true + }, + { + "code": "ro", + "name": "Romanian", + "example": "Aceasta este o propoziție.", + "has_examples": true, + "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"] + }, + { + "code": "es", + "name": "Spanish", + "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"], + "example": "Esto es una frase.", + "has_examples": true + }, + { "code": "sv", "name": "Swedish", "has_examples": true }, + { "code": "fi", "name": "Finnish", "has_examples": true }, { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true }, - { "code": "pl", "name": "Polish", "example": "To jest zdanie.", "has_examples": true }, { "code": "ru", "name": "Russian", @@ -88,12 +141,6 @@ "has_examples": true, "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] }, - { - "code": "ro", - "name": "Romanian", - "example": "Aceasta este o propoziție.", - "has_examples": true - }, { "code": "hr", "name": "Croatian", "has_examples": true }, { "code": "eu", "name": "Basque", "has_examples": true }, { "code": "yo", "name": "Yoruba", "has_examples": true }, @@ -123,7 +170,6 @@ { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true }, { "code": "cs", "name": "Czech" }, { "code": "is", "name": "Icelandic" }, - { "code": "lt", "name": "Lithuanian", "has_examples": true, "models": ["lt_core_news_sm"] }, { "code": "lv", "name": "Latvian" }, { "code": "sr", "name": "Serbian" }, { "code": "sk", "name": "Slovak" }, @@ -145,12 +191,6 @@ "example": "นี่คือประโยค", "has_examples": true }, - { - "code": "zh", - "name": "Chinese", - "dependencies": [{ "name": "Jieba", "url": "https://github.com/fxsjy/jieba" }], - "has_examples": true - }, { "code": "ja", "name": "Japanese", @@ -187,6 +227,21 @@ "example": "Sta chì a l'é unna fraxe.", "has_examples": true }, + { + "code": "hy", + "name": "Armenian", + "has_examples": true + }, + { + "code": "gu", + "name": "Gujarati", + "has_examples": true + }, + { + "code": "ml", + "name": "Malayalam", + "has_examples": true + }, { "code": "xx", "name": "Multi-language", diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 3fafc52b0..d7129875f 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -9,6 +9,7 @@ { "text": "Models & 
Languages", "url": "/usage/models" }, { "text": "Facts & Figures", "url": "/usage/facts-figures" }, { "text": "spaCy 101", "url": "/usage/spacy-101" }, + { "text": "New in v2.3", "url": "/usage/v2-3" }, { "text": "New in v2.2", "url": "/usage/v2-2" }, { "text": "New in v2.1", "url": "/usage/v2-1" }, { "text": "New in v2.0", "url": "/usage/v2" } diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 845fec65d..5bba1922b 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -83,7 +83,7 @@ function formatVectors(data) { function formatAccuracy(data) { if (!data) return null - const labels = { tags_acc: 'POS', ents_f: 'NER F', ents_p: 'NER P', ents_r: 'NER R' } + const labels = { tags_acc: 'TAG', ents_f: 'NER F', ents_p: 'NER P', ents_r: 'NER R' } const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key) const isNer = key => key.startsWith('ents_') return Object.keys(data).map(key => ({ diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index c96905733..1f788877c 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -124,7 +124,7 @@ const Landing = ({ data }) => { {counts.modelLangs} languages
  • - pretrained word vectors + Pretrained word vectors
  • State-of-the-art speed
  • diff --git a/website/src/widgets/languages.js b/website/src/widgets/languages.js index 55645f951..bb26e57cd 100644 --- a/website/src/widgets/languages.js +++ b/website/src/widgets/languages.js @@ -38,10 +38,10 @@ const Languages = () => ( const langs = site.siteMetadata.languages const withModels = langs .filter(({ models }) => models && !!models.length) - .sort((a, b) => a.code.localeCompare(b.code)) + .sort((a, b) => a.name.localeCompare(b.name)) const withoutModels = langs .filter(({ models }) => !models || !models.length) - .sort((a, b) => a.code.localeCompare(b.code)) + .sort((a, b) => a.name.localeCompare(b.name)) const withDeps = langs.filter(({ dependencies }) => dependencies && dependencies.length) return ( <> From bb54f54369be830651658191807c4e8625abb48c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 16 Jun 2020 16:10:12 +0200 Subject: [PATCH 091/119] Fix model accuracy table [ci skip] --- website/src/templates/models.js | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 845fec65d..3c5e9d2a4 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -1,4 +1,4 @@ -import React, { useEffect, useState, useMemo } from 'react' +import React, { useEffect, useState, useMemo, Fragment } from 'react' import { StaticQuery, graphql } from 'gatsby' import { window } from 'browser-monads' @@ -83,15 +83,24 @@ function formatVectors(data) { function formatAccuracy(data) { if (!data) return null - const labels = { tags_acc: 'POS', ents_f: 'NER F', ents_p: 'NER P', ents_r: 'NER R' } + const labels = { + las: 'LAS', + uas: 'UAS', + tags_acc: 'TAG', + ents_f: 'NER F', + ents_p: 'NER P', + ents_r: 'NER R', + } const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key) const isNer = key => key.startsWith('ents_') - return Object.keys(data).map(key => ({ - label: labels[key] || key.toUpperCase(), - value: data[key].toFixed(2), - help: MODEL_META[key], - type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, - })) + return Object.keys(data) + .filter(key => labels[key]) + .map(key => ({ + label: labels[key], + value: data[key].toFixed(2), + help: MODEL_META[key], + type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, + })) } function formatModelMeta(data) { @@ -115,11 +124,11 @@ function formatModelMeta(data) { function formatSources(data = []) { const sources = data.map(s => (isString(s) ? { name: s } : s)) return sources.map(({ name, url, author }, i) => ( - <> + {i > 0 &&
    } {name && url ? {name} : name} {author && ` (${author})`} - +
    )) } @@ -308,12 +317,12 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl {labelNames.map((label, i) => ( - <> + {i > 0 && ', '} {label} - + ))} From a9e5b840ee43746cd39213da9d27a01188be1904 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 16 Jun 2020 16:38:45 +0200 Subject: [PATCH 092/119] Fix typos and auto-format [ci skip] --- website/docs/api/goldparse.md | 49 ++++++++++++++++++----------------- website/docs/api/matcher.md | 2 +- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 5df625991..bc33dd4e6 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -12,18 +12,18 @@ expects true examples of a label to have the value `1.0`, and negative examples of a label to have the value `0.0`. Labels not in the dictionary are treated as missing – the gradient for those labels will be zero. -| Name | Type | Description | -| ----------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document the annotations refer to. | -| `words` | iterable | A sequence of unicode word strings. | -| `tags` | iterable | A sequence of strings, representing tag annotations. | -| `heads` | iterable | A sequence of integers, representing syntactic head offsets. | -| `deps` | iterable | A sequence of strings, representing the syntactic relation types. | -| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | -| `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). | -| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). | -| `make_projective` | bool | Whether to projectivize the dependency tree. Defaults to `False.`. | -| **RETURNS** | `GoldParse` | The newly constructed object. | +| Name | Type | Description | +| ----------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The document the annotations refer to. | +| `words` | iterable | A sequence of unicode word strings. | +| `tags` | iterable | A sequence of strings, representing tag annotations. | +| `heads` | iterable | A sequence of integers, representing syntactic head offsets. | +| `deps` | iterable | A sequence of strings, representing the syntactic relation types. | +| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | +| `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). 
| +| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). | +| `make_projective` | bool | Whether to projectivize the dependency tree. Defaults to `False`. | +| **RETURNS** | `GoldParse` | The newly constructed object. | ## GoldParse.\_\_len\_\_ {#len tag="method"} @@ -43,17 +43,17 @@ Whether the provided syntactic annotations form a projective dependency tree. ## Attributes {#attributes} -| Name | Type | Description | -| ------------------------------------ | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `words` | list | The words. | -| `tags` | list | The part-of-speech tag annotations. | -| `heads` | list | The syntactic head annotations. | -| `labels` | list | The syntactic relation-type annotations. | -| `ner` | list | The named entity annotations as BILUO tags. | -| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. | -| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. | -| `cats` 2 | dict | Keys in the dictionary are string category labels with values `1.0` or `0.0`. | -| `links` 2.2 | dict | Keys in the dictionary are `(start_char, end_char)` triples, and the values are dictionaries with `kb_id:value` entries. | +| Name | Type | Description | +| ------------------------------------ | ---- | ------------------------------------------------------------------------------------------------------------------------ | +| `words` | list | The words. | +| `tags` | list | The part-of-speech tag annotations. | +| `heads` | list | The syntactic head annotations. | +| `labels` | list | The syntactic relation-type annotations. | +| `ner` | list | The named entity annotations as BILUO tags. | +| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. | +| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. | +| `cats` 2 | dict | Keys in the dictionary are string category labels with values `1.0` or `0.0`. | +| `links` 2.2 | dict | Keys in the dictionary are `(start_char, end_char)` triples, and the values are dictionaries with `kb_id:value` entries. | ## Utilities {#util} @@ -61,7 +61,8 @@ Whether the provided syntactic annotations form a projective dependency tree. Convert a list of Doc objects into the [JSON-serializable format](/api/annotation#json-input) used by the -[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc. +[`spacy train`](/api/cli#train) command. Each input doc will be treated as a +'paragraph' in the output doc. > #### Example > diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index ac2f898e0..7b195e352 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -57,7 +57,7 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects. | Name | Type | Description | | ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3).. | +| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). 
| | **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. | From 44af53bdd93713b24ac28459c5d2543f03c47a18 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 16 Jun 2020 17:13:35 +0200 Subject: [PATCH 093/119] Add pkuseg warnings and auto-format [ci skip] --- website/docs/usage/models.md | 16 ++++- website/docs/usage/v2-3.md | 121 ++++++++++++++++++----------------- 2 files changed, 78 insertions(+), 59 deletions(-) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 382193157..4549e8433 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -117,6 +117,18 @@ The Chinese language class supports three word segmentation options: better segmentation for Chinese OntoNotes and the new [Chinese models](/models/zh). + + +Note that [`pkuseg`](https://github.com/lancopku/pkuseg-python) doesn't yet ship +with pre-compiled wheels for Python 3.8. If you're running Python 3.8, you can +install it from our fork and compile it locally: + +```bash +$ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip +``` + + + The `meta` argument of the `Chinese` language class supports the following @@ -196,8 +208,8 @@ nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_mo The Japanese language class uses [SudachiPy](https://github.com/WorksApplications/SudachiPy) for word -segmentation and part-of-speech tagging. The default Japanese language class -and the provided Japanese models use SudachiPy split mode `A`. +segmentation and part-of-speech tagging. The default Japanese language class and +the provided Japanese models use SudachiPy split mode `A`. The `meta` argument of the `Japanese` language class can be used to configure the split mode to `A`, `B` or `C`. diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md index ba75b01ab..d59b50a6e 100644 --- a/website/docs/usage/v2-3.md +++ b/website/docs/usage/v2-3.md @@ -14,10 +14,10 @@ all language models, and decreased model size and loading times for models with vectors. We've added pretrained models for **Chinese, Danish, Japanese, Polish and Romanian** and updated the training data and vectors for most languages. Model packages with vectors are about **2×** smaller on disk and load -**2-4×** faster. For the full changelog, see the [release notes on -GitHub](https://github.com/explosion/spaCy/releases/tag/v2.3.0). For more -details and a behind-the-scenes look at the new release, [see our blog -post](https://explosion.ai/blog/spacy-v2-3). +**2-4×** faster. For the full changelog, see the +[release notes on GitHub](https://github.com/explosion/spaCy/releases/tag/v2.3.0). +For more details and a behind-the-scenes look at the new release, +[see our blog post](https://explosion.ai/blog/spacy-v2-3). ### Expanded model families with vectors {#models} @@ -33,10 +33,10 @@ post](https://explosion.ai/blog/spacy-v2-3). With new model families for Chinese, Danish, Polish, Romanian and Chinese plus `md` and `lg` models with word vectors for all languages, this release provides -a total of 46 model packages. For models trained using [Universal -Dependencies](https://universaldependencies.org) corpora, the training data has -been updated to UD v2.5 (v2.6 for Japanese, v2.3 for Polish) and Dutch has been -extended to include both UD Dutch Alpino and LassySmall. +a total of 46 model packages. 
For models trained using +[Universal Dependencies](https://universaldependencies.org) corpora, the +training data has been updated to UD v2.5 (v2.6 for Japanese, v2.3 for Polish) +and Dutch has been extended to include both UD Dutch Alpino and LassySmall. @@ -48,6 +48,7 @@ extended to include both UD Dutch Alpino and LassySmall. ### Chinese {#chinese} > #### Example +> > ```python > from spacy.lang.zh import Chinese > @@ -57,41 +58,49 @@ extended to include both UD Dutch Alpino and LassySmall. > > # Append words to user dict > nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"]) +> ``` This release adds support for -[pkuseg](https://github.com/lancopku/pkuseg-python) for word segmentation and -the new Chinese models ship with a custom pkuseg model trained on OntoNotes. -The Chinese tokenizer can be initialized with both `pkuseg` and custom models -and the `pkuseg` user dictionary is easy to customize. +[`pkuseg`](https://github.com/lancopku/pkuseg-python) for word segmentation and +the new Chinese models ship with a custom pkuseg model trained on OntoNotes. The +Chinese tokenizer can be initialized with both `pkuseg` and custom models and +the `pkuseg` user dictionary is easy to customize. Note that +[`pkuseg`](https://github.com/lancopku/pkuseg-python) doesn't yet ship with +pre-compiled wheels for Python 3.8. See the +[usage documentation](/usage/models#chinese) for details on how to install it on +Python 3.8. -**Chinese:** [Chinese tokenizer usage](/usage/models#chinese) +**Models:** [Chinese models](/models/zh) **Usage: ** +[Chinese tokenizer usage](/usage/models#chinese) ### Japanese {#japanese} The updated Japanese language class switches to -[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word -segmentation and part-of-speech tagging. Using `sudachipy` greatly simplifies +[`SudachiPy`](https://github.com/WorksApplications/SudachiPy) for word +segmentation and part-of-speech tagging. Using `SudachiPy` greatly simplifies installing spaCy for Japanese, which is now possible with a single command: `pip install spacy[ja]`. -**Japanese:** [Japanese tokenizer usage](/usage/models#japanese) +**Models:** [Japanese models](/models/ja) **Usage:** +[Japanese tokenizer usage](/usage/models#japanese) ### Small CLI updates -- `spacy debug-data` provides the coverage of the vectors in a base model with - `spacy debug-data lang train dev -b base_model` -- `spacy evaluate` supports `blank:lg` (e.g. `spacy evaluate blank:en - dev.json`) to evaluate the tokenization accuracy without loading a model -- `spacy train` on GPU restricts the CPU timing evaluation to the first - iteration +- [`spacy debug-data`](/api/cli#debug-data) provides the coverage of the vectors + in a base model with `spacy debug-data lang train dev -b base_model` +- [`spacy evaluate`](/api/cli#evaluate) supports `blank:lg` (e.g. + `spacy evaluate blank:en dev.json`) to evaluate the tokenization accuracy + without loading a model +- [`spacy train`](/api/cli#train) on GPU restricts the CPU timing evaluation to + the first iteration ## Backwards incompatibilities {#incompat} @@ -100,8 +109,8 @@ installing spaCy for Japanese, which is now possible with a single command: If you've been training **your own models**, you'll need to **retrain** them with the new version. Also don't forget to upgrade all models to the latest versions. Models for earlier v2 releases (v2.0, v2.1, v2.2) aren't compatible -with models for v2.3. 
To check if all of your models are up to date, you can -run the [`spacy validate`](/api/cli#validate) command. +with models for v2.3. To check if all of your models are up to date, you can run +the [`spacy validate`](/api/cli#validate) command. @@ -116,21 +125,20 @@ run the [`spacy validate`](/api/cli#validate) command. > directly. - If you're training new models, you'll want to install the package - [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data), - which now includes both the lemmatization tables (as in v2.2) and the - normalization tables (new in v2.3). If you're using pretrained models, - **nothing changes**, because the relevant tables are included in the model - packages. + [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data), which + now includes both the lemmatization tables (as in v2.2) and the normalization + tables (new in v2.3). If you're using pretrained models, **nothing changes**, + because the relevant tables are included in the model packages. - Due to the updated Universal Dependencies training data, the fine-grained part-of-speech tags will change for many provided language models. The coarse-grained part-of-speech tagset remains the same, but the mapping from particular fine-grained to coarse-grained tags may show minor differences. - For French, Italian, Portuguese and Spanish, the fine-grained part-of-speech - tagsets contain new merged tags related to contracted forms, such as - `ADP_DET` for French `"au"`, which maps to UPOS `ADP` based on the head - `"à"`. This increases the accuracy of the models by improving the alignment - between spaCy's tokenization and Universal Dependencies multi-word tokens - used for contractions. + tagsets contain new merged tags related to contracted forms, such as `ADP_DET` + for French `"au"`, which maps to UPOS `ADP` based on the head `"à"`. This + increases the accuracy of the models by improving the alignment between + spaCy's tokenization and Universal Dependencies multi-word tokens used for + contractions. ### Migrating from spaCy 2.2 {#migrating} @@ -143,29 +151,28 @@ v2.3 so that `token_match` has priority over prefixes and suffixes as in v2.2.1 and earlier versions. A new tokenizer setting `url_match` has been introduced in v2.3.0 to handle -cases like URLs where the tokenizer should remove prefixes and suffixes (e.g., -a comma at the end of a URL) before applying the match. See the full [tokenizer -documentation](/usage/linguistic-features#tokenization) and try out +cases like URLs where the tokenizer should remove prefixes and suffixes (e.g., a +comma at the end of a URL) before applying the match. See the full +[tokenizer documentation](/usage/linguistic-features#tokenization) and try out [`nlp.tokenizer.explain()`](/usage/linguistic-features#tokenizer-debug) when debugging your tokenizer configuration. #### Warnings configuration -spaCy's custom warnings have been replaced with native python +spaCy's custom warnings have been replaced with native Python [`warnings`](https://docs.python.org/3/library/warnings.html). Instead of -setting `SPACY_WARNING_IGNORE`, use the [warnings -filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) +setting `SPACY_WARNING_IGNORE`, use the +[`warnings` filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) to manage warnings. 
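For example, a minimal filter setup could look like the following sketch; the warning code used in the message filter is only an illustration, not an exhaustive list:

```python
import warnings

# Silence one specific spaCy warning by matching the start of its message
# (the "[W008]" code here is just an example)
warnings.filterwarnings("ignore", message=r"\[W008\]")

# Or escalate a whole category to an error while debugging
warnings.filterwarnings("error", category=DeprecationWarning)
```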
#### Normalization tables The normalization tables have moved from the language data in -[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) to -the package -[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). If -you're adding data for a new language, the normalization table should be added -to `spacy-lookups-data`. See [adding norm -exceptions](/usage/adding-languages#norm-exceptions). +[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) to the +package [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). +If you're adding data for a new language, the normalization table should be +added to `spacy-lookups-data`. See +[adding norm exceptions](/usage/adding-languages#norm-exceptions). #### Probability and cluster features @@ -181,28 +188,28 @@ exceptions](/usage/adding-languages#norm-exceptions). The `Token.prob` and `Token.cluster` features, which are no longer used by the core pipeline components as of spaCy v2, are no longer provided in the -pretrained models to reduce the model size. To keep these features available -for users relying on them, the `prob` and `cluster` features for the most -frequent 1M tokens have been moved to +pretrained models to reduce the model size. To keep these features available for +users relying on them, the `prob` and `cluster` features for the most frequent +1M tokens have been moved to [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) as `extra` features for the relevant languages (English, German, Greek and Spanish). The extra tables are loaded lazily, so if you have `spacy-lookups-data` -installed and your code accesses `Token.prob`, the full table is loaded into -the model vocab, which will take a few seconds on initial loading. When you -save this model after loading the `prob` table, the full `prob` table will be -saved as part of the model vocab. +installed and your code accesses `Token.prob`, the full table is loaded into the +model vocab, which will take a few seconds on initial loading. When you save +this model after loading the `prob` table, the full `prob` table will be saved +as part of the model vocab. -If you'd like to include custom `cluster`, `prob`, or `sentiment` tables as -part of a new model, add the data to +If you'd like to include custom `cluster`, `prob`, or `sentiment` tables as part +of a new model, add the data to [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) under the entry point `lg_extra`, e.g. `en_extra` for English. Alternatively, you can initialize your [`Vocab`](/api/vocab) with the `lookups_extra` argument with a [`Lookups`](/api/lookups) object that includes the tables `lexeme_cluster`, `lexeme_prob`, `lexeme_sentiment` or `lexeme_settings`. `lexeme_settings` is -currently only used to provide a custom `oov_prob`. See examples in the [`data` -directory](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data) +currently only used to provide a custom `oov_prob`. See examples in the +[`data` directory](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data) in `spacy-lookups-data`. 
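As a rough sketch of that alternative (with made-up table values, and assuming the `oov_prob` key described above), a `Vocab` could be initialized with hand-built extra tables like this:

```python
from spacy.lookups import Lookups
from spacy.vocab import Vocab

# Hand-built extra tables; the values below are invented for illustration only
lookups_extra = Lookups()
lookups_extra.add_table("lexeme_prob", {"the": -3.5})
lookups_extra.add_table("lexeme_settings", {"oov_prob": -10.0})

vocab = Vocab(lookups_extra=lookups_extra)
# "the" should resolve to the table value, unseen words to the custom oov_prob
print(vocab["the"].prob, vocab["xyzzy"].prob)
```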
#### Initializing new models without extra lookups tables From fd89f44c0c81bd1f1a2c1ec396c0ff3a29ac6423 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 16 Jun 2020 17:34:26 +0200 Subject: [PATCH 094/119] Update Binder URL [ci skip] --- website/meta/site.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/site.json b/website/meta/site.json index 29d71048e..9083e98a0 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -23,7 +23,7 @@ "apiKey": "371e26ed49d29a27bd36273dfdaf89af", "indexName": "spacy" }, - "binderUrl": "ines/spacy-io-binder", + "binderUrl": "explosion/spacy-io-binder", "binderBranch": "live", "binderVersion": "2.2.0", "sections": [ From 41003a5117d23f519c99edddfb4fc3a80370d7d1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 16 Jun 2020 17:41:23 +0200 Subject: [PATCH 095/119] Update Binder version [ci skip] --- website/meta/site.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/site.json b/website/meta/site.json index 9083e98a0..8b8424f82 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -25,7 +25,7 @@ }, "binderUrl": "explosion/spacy-io-binder", "binderBranch": "live", - "binderVersion": "2.2.0", + "binderVersion": "2.3.0", "sections": [ { "id": "usage", "title": "Usage Documentation", "theme": "blue" }, { "id": "models", "title": "Models Documentation", "theme": "blue" }, From 457babfa0c581d868fe16b418c0dcef357d78a97 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 16 Jun 2020 20:22:03 +0200 Subject: [PATCH 096/119] Update alignment example for new gold.align --- website/docs/usage/linguistic-features.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index bcc943436..a442cc7a0 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1130,9 +1130,9 @@ from spacy.gold import align other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."] spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."] cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens) -print("Misaligned tokens:", cost) # 2 +print("Edit distance:", cost) # 3 print("One-to-one mappings a -> b", a2b) # array([0, 1, 2, 3, -1, -1, 5, 6]) -print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, 5, 6, 7]) +print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, -1, 6, 7]) print("Many-to-one mappings a -> b", a2b_multi) # {4: 4, 5: 4} print("Many-to-one mappings b-> a", b2a_multi) # {} ``` @@ -1140,7 +1140,7 @@ print("Many-to-one mappings b-> a", b2a_multi) # {} Here are some insights from the alignment information generated in the example above: -- Two tokens are misaligned. +- The edit distance (cost) is `3`: two deletions and one insertion. - The one-to-one mappings for the first four tokens are identical, which means they map to each other. This makes sense because they're also identical in the input: `"i"`, `"listened"`, `"to"` and `"obama"`. 
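As a small follow-up sketch reusing the alignment example above, the one-to-one mapping can be used to carry token-level annotations from one tokenization to the other; `-1` entries are skipped because those tokens have no single counterpart:

```python
from spacy.gold import align

other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)

# Project each one-to-one aligned token; "'" and "s" map many-to-one to "'s"
# and are therefore skipped in this loop
for i, j in enumerate(a2b):
    if j != -1:
        print(other_tokens[i], "->", spacy_tokens[j])
```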
From 9aff317ca788cc996da5125e7d9c4783c8ab9f7e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 16 Jun 2020 20:26:57 +0200 Subject: [PATCH 097/119] Update POS in tagging example --- website/docs/usage/101/_pos-deps.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md index 1a438e424..1e8960edf 100644 --- a/website/docs/usage/101/_pos-deps.md +++ b/website/docs/usage/101/_pos-deps.md @@ -36,7 +36,7 @@ for token in doc: | Text | Lemma | POS | Tag | Dep | Shape | alpha | stop | | ------- | ------- | ------- | ----- | ---------- | ------- | ------- | ------- | | Apple | apple | `PROPN` | `NNP` | `nsubj` | `Xxxxx` | `True` | `False` | -| is | be | `VERB` | `VBZ` | `aux` | `xx` | `True` | `True` | +| is | be | `AUX` | `VBZ` | `aux` | `xx` | `True` | `True` | | looking | look | `VERB` | `VBG` | `ROOT` | `xxxx` | `True` | `False` | | at | at | `ADP` | `IN` | `prep` | `xx` | `True` | `True` | | buying | buy | `VERB` | `VBG` | `pcomp` | `xxxx` | `True` | `False` | From a6abdfbc3c5a298b9d0e547451701f6705fd09b7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 16 Jun 2020 20:35:45 +0200 Subject: [PATCH 098/119] Fix numpy.zeros() dtype for Doc.from_array --- website/docs/usage/linguistic-features.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index a442cc7a0..1e3b129ac 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -471,7 +471,7 @@ doc = nlp.make_doc("London is a big city in the United Kingdom.") print("Before", doc.ents) # [] header = [ENT_IOB, ENT_TYPE] -attr_array = numpy.zeros((len(doc), len(header))) +attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64") attr_array[0, 0] = 3 # B attr_array[0, 1] = doc.vocab.strings["GPE"] doc.from_array(header, attr_array) From f0fd77648fb488c26852cd1494b69073e5766b65 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 16 Jun 2020 20:36:21 +0200 Subject: [PATCH 099/119] Change example title to Dr. Change example title to Dr. so the current model does exclude the title in the initial example. --- website/docs/usage/rule-based-matching.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 1db2405d1..f7866fe31 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1158,17 +1158,17 @@ what you need for your application. > available corpus. For example, the corpus spaCy's [English models](/models/en) were trained on -defines a `PERSON` entity as just the **person name**, without titles like "Mr" -or "Dr". This makes sense, because it makes it easier to resolve the entity type -back to a knowledge base. But what if your application needs the full names, -_including_ the titles? +defines a `PERSON` entity as just the **person name**, without titles like "Mr." +or "Dr.". This makes sense, because it makes it easier to resolve the entity +type back to a knowledge base. But what if your application needs the full +names, _including_ the titles? ```python ### {executable="true"} import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.") +doc = nlp("Dr. 
Alex Smith chaired first board meeting of Acme Corp Inc.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` @@ -1233,7 +1233,7 @@ def expand_person_entities(doc): # Add the component after the named entity recognizer nlp.add_pipe(expand_person_entities, after='ner') -doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.") +doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` From 02369f91d307a6ba43f1d9ad97efbb5e348cc599 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 16 Jun 2020 20:41:17 +0200 Subject: [PATCH 100/119] Fix spacy convert argument --- website/docs/usage/adding-languages.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 29de08266..98d4fdec9 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -634,7 +634,7 @@ One thing to keep in mind is that spaCy expects to train its models from **whole documents**, not just single sentences. If your corpus only contains single sentences, spaCy's models will never learn to expect multi-sentence documents, leading to low performance on real text. To mitigate this problem, you can use -the `-N` argument to the `spacy convert` command, to merge some of the sentences +the `-n` argument to the `spacy convert` command, to merge some of the sentences into longer pseudo-documents. ### Training the tagger and parser {#train-tagger-parser} From 931d80de72db45bb11d571e767d7062a45209182 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 19 Jun 2020 12:43:41 +0200 Subject: [PATCH 101/119] Warning for sudachipy 0.4.5 (#5611) --- website/docs/usage/models.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 4549e8433..b11e6347a 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -214,6 +214,14 @@ the provided Japanese models use SudachiPy split mode `A`. The `meta` argument of the `Japanese` language class can be used to configure the split mode to `A`, `B` or `C`. + + +If you run into errors related to `sudachipy`, which is currently under active +development, we suggest downgrading to `sudachipy==0.4.5`, which is the version +used for training the current [Japanese models](/models/ja). + + + ## Installing and using models {#download} > #### Downloading models in spaCy < v1.7 From ccd7edf04bac4a8a29431433e73e1a2474acc0dd Mon Sep 17 00:00:00 2001 From: "Marat M. Yavrumyan" Date: Fri, 19 Jun 2020 20:34:27 +0400 Subject: [PATCH 102/119] Create myavrum.md (#5612) --- .github/contributors/myavrum.md | 106 ++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/myavrum.md diff --git a/.github/contributors/myavrum.md b/.github/contributors/myavrum.md new file mode 100644 index 000000000..dc8f1bb84 --- /dev/null +++ b/.github/contributors/myavrum.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). 
The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. 
export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Marat M. Yavrumyan | +| Company name (if applicable) | YSU, UD_Armenian Project | +| Title or role (if applicable) | Dr., Principal Investigator | +| Date | 2020-06-19 | +| GitHub username | myavrum | +| Website (optional) | http://armtreebank.yerevann.com/ | From 8120b641ccb66b088fa70c028e5be542bf561dfd Mon Sep 17 00:00:00 2001 From: "Marat M. Yavrumyan" Date: Fri, 19 Jun 2020 22:00:34 +0400 Subject: [PATCH 103/119] Update lex_attrs.py (#5608) --- spacy/lang/hy/lex_attrs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py index 910625fb8..b556d679c 100644 --- a/spacy/lang/hy/lex_attrs.py +++ b/spacy/lang/hy/lex_attrs.py @@ -5,8 +5,8 @@ from ...attrs import LIKE_NUM _num_words = [ - "զրօ", - "մէկ", + "զրո", + "մեկ", "երկու", "երեք", "չորս", @@ -28,10 +28,10 @@ _num_words = [ "քսան" "երեսուն", "քառասուն", "հիսուն", - "վաթցսուն", + "վաթսուն", "յոթանասուն", "ութսուն", - "ինիսուն", + "իննսուն", "հարյուր", "հազար", "միլիոն", From ff6a084e9cdf9114cbc8cb55fe0e9c69e4cabc34 Mon Sep 17 00:00:00 2001 From: Karen Hambardzumyan Date: Sat, 20 Jun 2020 13:14:26 +0400 Subject: [PATCH 104/119] Create mahnerak.md (#5615) --- .github/contributors/mahnerak.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/mahnerak.md diff --git a/.github/contributors/mahnerak.md b/.github/contributors/mahnerak.md new file mode 100644 index 000000000..cc7739681 --- /dev/null +++ b/.github/contributors/mahnerak.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. 
These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Karen Hambardzumyan | +| Company name (if applicable) | YerevaNN | +| Title or role (if applicable) | Researcher | +| Date | 2020-06-19 | +| GitHub username | mahnerak | +| Website (optional) | https://mahnerak.com/| From 0cdb631e6c328bdc985f631125dcbb3e5a55c673 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 20 Jun 2020 16:02:42 +0200 Subject: [PATCH 105/119] Fix merge errors --- spacy/pipeline/pipes.pyx | 2 +- spacy/tokenizer.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 946cd5366..7c800eed8 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1337,7 +1337,7 @@ class EntityLinker(Pipe): final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - sent_doc = doc[start_token:end_token].as_doc() + sent_doc = doc[sent.start:sent.end].as_doc() # currently, the context is the same for each entity in a sentence (should be refined) sentence_encoding = self.model([sent_doc])[0] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index ef5b14d87..b40113460 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -782,7 +782,7 @@ cdef class Tokenizer: "suffix_search": lambda b: data.setdefault("suffix_search", b), "infix_finditer": lambda b: data.setdefault("infix_finditer", b), "token_match": lambda b: data.setdefault("token_match", b), - "url_match": lambda b: data.setdefault("url_match", b) + "url_match": lambda b: data.setdefault("url_match", b), "exceptions": lambda b: data.setdefault("rules", b) } exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) @@ -795,7 +795,7 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], str): self.token_match = re.compile(data["token_match"]).match - if "url_match" in data and isinstance(data["url_match"], basestring_): + if "url_match" in data and isinstance(data["url_match"], str): self.url_match = re.compile(data["url_match"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions From 296b5d633b94ca51ed038b31d207edb5f53e0acb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 20 Jun 2020 16:11:13 +0200 Subject: [PATCH 106/119] Remove references to Python 2 / is_python2 --- spacy/tests/regression/test_issue5230.py | 19 +++++++------------ .../serialize/test_serialize_vocab_strings.py | 2 -- spacy/tests/vocab_vectors/test_vectors.py | 2 -- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 2b14ff589..9e83d6818 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -8,7 +8,6 @@ from spacy.kb import KnowledgeBase, Writer from spacy.vectors import Vectors from spacy.language import Language from spacy.pipeline import Pipe -from spacy.compat import is_python2 from ..util import make_tempdir @@ -97,14 +96,12 @@ def 
write_obj_and_catch_warnings(obj): return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list)) -@pytest.mark.skipif(is_python2, reason="ResourceWarning needs Python 3.x") @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) def test_to_disk_resource_warning(obj): warnings_list = write_obj_and_catch_warnings(obj) assert len(warnings_list) == 0 -@pytest.mark.skipif(is_python2, reason="ResourceWarning needs Python 3.x") def test_writer_with_path_py35(): writer = None with make_tempdir() as d: @@ -135,13 +132,11 @@ def test_save_and_load_knowledge_base(): pytest.fail(str(e)) -if not is_python2: +class TestToDiskResourceWarningUnittest(TestCase): + def test_resource_warning(self): + scenarios = zip(*objects_to_test) - class TestToDiskResourceWarningUnittest(TestCase): - def test_resource_warning(self): - scenarios = zip(*objects_to_test) - - for scenario in scenarios: - with self.subTest(msg=scenario[1]): - warnings_list = write_obj_and_catch_warnings(scenario[0]) - self.assertEqual(len(warnings_list), 0) + for scenario in scenarios: + with self.subTest(msg=scenario[1]): + warnings_list = write_obj_and_catch_warnings(scenario[0]) + self.assertEqual(len(warnings_list), 0) diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index f0bad9c10..e570b1025 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -2,7 +2,6 @@ import pytest import pickle from spacy.vocab import Vocab from spacy.strings import StringStore -from spacy.compat import is_python2 from ..util import make_tempdir @@ -135,7 +134,6 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2): assert list(sstore1_d) != list(sstore2_d) -@pytest.mark.skipif(is_python2, reason="Dict order? Not sure if worth investigating") @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_pickle_vocab(strings, lex_attr): vocab = Vocab(strings=strings) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 819338eeb..cc95252a6 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -6,7 +6,6 @@ from spacy.vectors import Vectors from spacy.tokenizer import Tokenizer from spacy.strings import hash_string from spacy.tokens import Doc -from spacy.compat import is_python2 from ..util import add_vecs_to_vocab, get_cosine, make_tempdir @@ -336,7 +335,6 @@ def test_vocab_prune_vectors(): assert_allclose(similarity, get_cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) -@pytest.mark.skipif(is_python2, reason="Dict order? Not sure if worth investigating") def test_vectors_serialize(): data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") v = Vectors(data=data, keys=["A", "B", "C"]) From 63c22969f4cc73cbee577fd026b46f2c4ecff43e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 20 Jun 2020 16:17:48 +0200 Subject: [PATCH 107/119] Update test_issue5230.py --- spacy/tests/regression/test_issue5230.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 9e83d6818..42b08eeff 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -76,6 +76,7 @@ def entity_linker(): # 1. no model leads to error in serialization, # 2. 
the affected line is the one for model serialization kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + kb.add_entity("test", 0.0, zeros((1, 1), dtype="f")) entity_linker.set_kb(kb) entity_linker.begin_training(pipeline=nlp.pipeline) return entity_linker From 5424b70e51277049fac470a5c5458830202d03f0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 20 Jun 2020 16:18:53 +0200 Subject: [PATCH 108/119] Remove v2 test --- spacy/tests/test_misc.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index cc6d3a57d..5f9e72f79 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -107,36 +107,6 @@ def test_load_model_blank_shortcut(): util.load_model("blank:fjsfijsdof") -def test_load_model_version_compat(): - """Test warnings for various spacy_version specifications in meta. Since - this is more of a hack for v2, manually specify the current major.minor - version to simplify test creation.""" - nlp = util.load_model("blank:en") - assert nlp.meta["spacy_version"].startswith(">=2.3") - with make_tempdir() as d: - # no change: compatible - nlp.to_disk(d) - meta_path = Path(d / "meta.json") - util.get_model_meta(d) - - # additional compatible upper pin - nlp.meta["spacy_version"] = ">=2.3.0,<2.4.0" - srsly.write_json(meta_path, nlp.meta) - util.get_model_meta(d) - - # incompatible older version - nlp.meta["spacy_version"] = ">=2.2.5" - srsly.write_json(meta_path, nlp.meta) - with pytest.warns(UserWarning): - util.get_model_meta(d) - - # invalid version specification - nlp.meta["spacy_version"] = ">@#$%_invalid_version" - srsly.write_json(meta_path, nlp.meta) - with pytest.warns(UserWarning): - util.get_model_meta(d) - - @pytest.mark.parametrize( "version,constraint,compatible", [ From dc069e90b39ef3ca0604e950780a84482386973f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 20 Jun 2020 21:13:11 +0200 Subject: [PATCH 109/119] fix token.morph_ for v.3 (cf PR #5517) --- spacy/tokens/morphanalysis.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index e0db52d5b..77e499968 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -73,3 +73,10 @@ cdef class MorphAnalysis: """Produce a dict representation. 
""" return self.vocab.morphology.feats_to_dict(self.to_json()) + + def __str__(self): + return self.to_json() + + def __repr__(self): + return self.to_json() + From c9242e9bf49f751907debfac92e80ae3f93057e8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 20 Jun 2020 21:47:23 +0200 Subject: [PATCH 110/119] fix entity linker (cf PR #5548) --- spacy/pipeline/pipes.pyx | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 7c800eed8..536c2a8a5 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1219,13 +1219,11 @@ class EntityLinker(Pipe): sent_doc = doc[start_token:end_token].as_doc() sentence_docs.append(sent_doc) - sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) - loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None) - bp_context(d_scores, sgd=sgd) set_dropout_rate(self.model, drop) sentence_encodings, bp_context = self.model.begin_update(sentence_docs) loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds) bp_context(d_scores) + if sgd is not None: self.model.finish_update(sgd) @@ -1306,22 +1304,28 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] - for i, doc in enumerate(docs): sentences = [s for s in doc.sents] if len(doc) > 0: # Looping through each sentence and each entity # This may go wrong if there are entities across sentences - which shouldn't happen normally. - for sent in doc.sents: - sent_doc = sent.as_doc() + for sent_index, sent in enumerate(sentences): + # get n_neightbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + + sent_doc = doc[start_token:end_token].as_doc() # currently, the context is the same for each entity in a sentence (should be refined) sentence_encoding = self.model.predict([sent_doc])[0] xp = get_array_module(sentence_encoding) sentence_encoding_t = sentence_encoding.T sentence_norm = xp.linalg.norm(sentence_encoding_t) - for ent in sent_doc.ents: + for ent in sent.ents: entity_count += 1 to_discard = self.cfg.get("labels_discard", []) @@ -1337,21 +1341,11 @@ class EntityLinker(Pipe): final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - sent_doc = doc[sent.start:sent.end].as_doc() + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) - - for ent in sent.ents: - entity_count += 1 - - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) final_tensors.append(sentence_encoding) else: From 5cb812e0ab2f1cbddcc13b9cf442482112d28ced Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 20 Jun 2020 22:04:18 +0200 Subject: [PATCH 111/119] fix NER warn empty lookups (cf PR #5588) --- spacy/syntax/nn_parser.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/syntax/nn_parser.pyx 
b/spacy/syntax/nn_parser.pyx index 8218240f0..1dcb92016 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -608,6 +608,8 @@ cdef class Parser: def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): self.cfg.update(kwargs) + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="parser or NER")) if not hasattr(get_examples, '__call__'): gold_tuples = get_examples get_examples = lambda: gold_tuples From 256d4c27c838f0f995b3e5beb5712f649ecf9ba1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 20 Jun 2020 22:38:00 +0200 Subject: [PATCH 112/119] fix tagger begin_training being called without examples --- spacy/pipeline/pipes.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 536c2a8a5..b3fa77732 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -394,12 +394,11 @@ class Tagger(Pipe): new_tag_map[tag] = orig_tag_map[tag] else: new_tag_map[tag] = {POS: X} - # TODO: do we still need this? - if "_SP" in orig_tag_map: - new_tag_map["_SP"] = orig_tag_map["_SP"] cdef Vocab vocab = self.vocab if new_tag_map: + if "_SP" in orig_tag_map: + new_tag_map["_SP"] = orig_tag_map["_SP"] vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology.lemmatizer, exc=vocab.morphology.exc) From 617977427897bd2c2bb1fce9ff190a3045169cf9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 20 Jun 2020 22:49:37 +0200 Subject: [PATCH 113/119] fix test_build_dependencies by ignoring new libs --- spacy/language.py | 2 +- spacy/tests/package/test_requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 36ecad68b..94da63a1a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1166,7 +1166,7 @@ def _fix_pretrained_vectors_name(nlp): else: raise ValueError(Errors.E092) if nlp.vocab.vectors.size != 0: - link_vectors_to_models(nlp.vocab, skip_rank=True) + link_vectors_to_models(nlp.vocab) for name, proc in nlp.pipeline: if not hasattr(proc, "cfg"): continue diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index 0dc0f9d6c..a7c9a3ea4 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -10,7 +10,7 @@ def test_build_dependencies(): "mock", "flake8", ] - libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] + libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"] # check requirements.txt req_dict = {} From 12dc8ab208720e019ed02b3b63432f59982bc5bd Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 20 Jun 2020 23:07:42 +0200 Subject: [PATCH 114/119] remove redundant code from master in EntityLinker --- spacy/pipeline/pipes.pyx | 8 -------- spacy/tests/regression/test_issue5230.py | 7 +++---- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index b3fa77732..98414736b 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1136,14 +1136,6 @@ class EntityLinker(Pipe): # how many neightbour sentences to take into account self.n_sents = cfg.get("n_sents", 0) - def set_kb(self, kb): - self.kb = kb - - def require_model(self): - # Raise an error if the component's model is not initialized. 
- if getattr(self, "model", None) in (None, True, False): - raise ValueError(Errors.E109.format(name=self.name)) - def require_kb(self): # Raise an error if the knowledge base is not initialized. if len(self.kb) == 0: diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 42b08eeff..b46bf9063 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -70,14 +70,13 @@ def tagger(): def entity_linker(): nlp = Language() - nlp.add_pipe(nlp.create_pipe("entity_linker")) + kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + kb.add_entity("test", 0.0, zeros((1, 1), dtype="f")) + nlp.add_pipe(nlp.create_pipe("entity_linker", {"kb": kb})) entity_linker = nlp.get_pipe("entity_linker") # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization - kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) - kb.add_entity("test", 0.0, zeros((1, 1), dtype="f")) - entity_linker.set_kb(kb) entity_linker.begin_training(pipeline=nlp.pipeline) return entity_linker From 2f6062a8a4353a3ee8c0602acbd1dba22f857fe4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 20 Jun 2020 23:14:45 +0200 Subject: [PATCH 115/119] add line that got removed from EntityLinker --- spacy/pipeline/pipes.pyx | 105 ++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 52 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 98414736b..4e04b96b5 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1302,71 +1302,72 @@ class EntityLinker(Pipe): # Looping through each sentence and each entity # This may go wrong if there are entities across sentences - which shouldn't happen normally. 
for sent_index, sent in enumerate(sentences): - # get n_neightbour sentences, clipped to the length of the document - start_sentence = max(0, sent_index - self.n_sents) - end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + if sent.ents: + # get n_neightbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end - sent_doc = doc[start_token:end_token].as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model.predict([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) + sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model.predict([sent_doc])[0] + xp = get_array_module(sentence_encoding) + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) - for ent in sent.ents: - entity_count += 1 + for ent in sent.ents: + entity_count += 1 - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - final_tensors.append(sentence_encoding) - - else: - candidates = self.kb.get_candidates(ent.text) - if not candidates: - # no prediction possible for this entity - setting to NIL + to_discard = self.cfg.get("labels_discard", []) + if to_discard and ent.label_ in to_discard: + # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - - # TODO: thresholding - final_kb_ids.append(candidates[0].entity_) - final_tensors.append(sentence_encoding) - else: - random.shuffle(candidates) + candidates = self.kb.get_candidates(ent.text) + if not candidates: + # no prediction possible for this entity - setting to NIL + final_kb_ids.append(self.NIL) + final_tensors.append(sentence_encoding) - # this will set all prior probabilities to 0 if they should be excluded from the model - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.cfg.get("incl_prior", True): - prior_probs = xp.asarray([0.0 for c in candidates]) - scores = prior_probs + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate - # add in similarity from the context - if self.cfg.get("incl_context", True): - entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + final_tensors.append(sentence_encoding) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + else: + random.shuffle(candidates) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs*sims) + # this will set all prior probabilities to 0 if they should be excluded from the model + 
prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.cfg.get("incl_prior", True): + prior_probs = xp.asarray([0.0 for c in candidates]) + scores = prior_probs - # TODO: thresholding - best_index = scores.argmax().item() - best_candidate = candidates[best_index] - final_kb_ids.append(best_candidate.entity_) - final_tensors.append(sentence_encoding) + # add in similarity from the context + if self.cfg.get("incl_context", True): + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + + if len(entity_encodings) != len(prior_probs): + raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs*sims) + + # TODO: thresholding + best_index = scores.argmax().item() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) + final_tensors.append(sentence_encoding) if not (len(final_tensors) == len(final_kb_ids) == entity_count): raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) From 689600e17d0e3734b29bf758e09068b7b4413437 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 20 Jun 2020 23:23:57 +0200 Subject: [PATCH 116/119] add additional test back in (it works now) --- spacy/tests/test_lemmatizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index 4f7c0a026..050206539 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -35,6 +35,8 @@ def test_tagger_warns_no_lookups(): nlp.vocab.lookups = Lookups() assert not len(nlp.vocab.lookups) tagger = nlp.create_pipe("tagger") + with pytest.warns(UserWarning): + tagger.begin_training() nlp.add_pipe(tagger) with pytest.warns(UserWarning): nlp.begin_training() From 40bb918a4c8507f5c54a722e0388eda1da1e2b7a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 21 Jun 2020 22:34:10 +0200 Subject: [PATCH 117/119] Remove unicode declarations and tidy up --- spacy/lang/es/punctuation.py | 3 - spacy/lang/gu/__init__.py | 3 - spacy/lang/gu/examples.py | 4 -- spacy/lang/gu/stop_words.py | 3 - spacy/lang/hy/__init__.py | 3 - spacy/lang/hy/examples.py | 3 - spacy/lang/hy/lex_attrs.py | 3 - spacy/lang/hy/stop_words.py | 3 - spacy/lang/hy/tag_map.py | 3 - spacy/lang/ja/bunsetu.py | 92 ++++++++++++++++-------- spacy/lang/ja/syntax_iterators.py | 29 ++++---- spacy/lang/kn/examples.py | 4 -- spacy/lang/ml/__init__.py | 3 - spacy/lang/ml/examples.py | 4 -- spacy/lang/ml/lex_attrs.py | 3 - spacy/lang/ml/stop_words.py | 4 -- spacy/lang/pl/lemmatizer.py | 3 - spacy/lang/sv/lex_attrs.py | 3 - spacy/tests/lang/de/test_noun_chunks.py | 3 - spacy/tests/lang/el/test_noun_chunks.py | 3 - spacy/tests/lang/es/test_noun_chunks.py | 3 - spacy/tests/lang/fa/test_noun_chunks.py | 3 - spacy/tests/lang/fr/test_noun_chunks.py | 3 - spacy/tests/lang/gu/test_text.py | 3 - spacy/tests/lang/hy/test_text.py | 3 - spacy/tests/lang/hy/test_tokenizer.py | 3 - spacy/tests/lang/id/test_noun_chunks.py | 3 - spacy/tests/lang/ja/test_serialize.py | 4 -- spacy/tests/lang/ml/test_text.py | 3 - spacy/tests/lang/nb/test_noun_chunks.py | 3 - spacy/tests/lang/sv/test_lex_attrs.py | 3 - spacy/tests/lang/zh/test_serialize.py | 3 - spacy/tests/regression/test_issue5152.py | 3 - 
spacy/tests/regression/test_issue5230.py | 1 - spacy/tests/regression/test_issue5458.py | 3 - 35 files changed, 76 insertions(+), 147 deletions(-) diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py index f989221c2..e9552371e 100644 --- a/spacy/lang/es/punctuation.py +++ b/spacy/lang/es/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py index 1f080c7c2..bc8fc260c 100644 --- a/spacy/lang/gu/__init__.py +++ b/spacy/lang/gu/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language diff --git a/spacy/lang/gu/examples.py b/spacy/lang/gu/examples.py index 202a8d022..1cf75fd32 100644 --- a/spacy/lang/gu/examples.py +++ b/spacy/lang/gu/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/gu/stop_words.py b/spacy/lang/gu/stop_words.py index 85d33763d..2c859681b 100644 --- a/spacy/lang/gu/stop_words.py +++ b/spacy/lang/gu/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ એમ diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py index 6aaa965bb..8928e52ae 100644 --- a/spacy/lang/hy/__init__.py +++ b/spacy/lang/hy/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py index 323f77b1c..69e354688 100644 --- a/spacy/lang/hy/examples.py +++ b/spacy/lang/hy/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
>>> from spacy.lang.hy.examples import sentences diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py index b556d679c..f84472d60 100644 --- a/spacy/lang/hy/lex_attrs.py +++ b/spacy/lang/hy/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py index d75aad6e2..46d0f6b51 100644 --- a/spacy/lang/hy/stop_words.py +++ b/spacy/lang/hy/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ նա diff --git a/spacy/lang/hy/tag_map.py b/spacy/lang/hy/tag_map.py index 722270110..09be1fd8d 100644 --- a/spacy/lang/hy/tag_map.py +++ b/spacy/lang/hy/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py index 7c3eee336..e8c802246 100644 --- a/spacy/lang/ja/bunsetu.py +++ b/spacy/lang/ja/bunsetu.py @@ -1,21 +1,11 @@ -# coding: utf8 -from __future__ import unicode_literals - -from .stop_words import STOP_WORDS - - POS_PHRASE_MAP = { "NOUN": "NP", "NUM": "NP", "PRON": "NP", "PROPN": "NP", - "VERB": "VP", - "ADJ": "ADJP", - "ADV": "ADVP", - "CCONJ": "CCONJP", } @@ -37,7 +27,18 @@ def yield_bunsetu(doc, debug=False): dep = t.dep_ head = t.head.i if debug: - print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu) + print( + t.i, + t.orth_, + pos, + pos_type, + dep, + head, + bunsetu_may_end, + phrase_type, + phrase, + bunsetu, + ) # DET is always an individual bunsetu if pos == "DET": @@ -75,19 +76,31 @@ def yield_bunsetu(doc, debug=False): # entering new bunsetu elif pos_type and ( - pos_type != phrase_type or # different phrase type arises - bunsetu_may_end # same phrase type but bunsetu already ended + pos_type != phrase_type + or bunsetu_may_end # different phrase type arises # same phrase type but bunsetu already ended ): # exceptional case: NOUN to VERB - if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i: + if ( + phrase_type == "NP" + and pos_type == "VP" + and prev_dep == "compound" + and prev_head == t.i + ): bunsetu.append(t) phrase_type = "VP" phrase.append(t) # exceptional case: VERB to NOUN - elif phrase_type == "VP" and pos_type == "NP" and ( - prev_dep == 'compound' and prev_head == t.i or - dep == 'compound' and prev == head or - prev_dep == 'nmod' and prev_head == t.i + elif ( + phrase_type == "VP" + and pos_type == "NP" + and ( + prev_dep == "compound" + and prev_head == t.i + or dep == "compound" + and prev == head + or prev_dep == "nmod" + and prev_head == t.i + ) ): bunsetu.append(t) phrase_type = "NP" @@ -102,11 +115,18 @@ def yield_bunsetu(doc, debug=False): # NOUN bunsetu elif phrase_type == "NP": bunsetu.append(t) - if not bunsetu_may_end and (( - (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'} - ) or ( - pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' - )): + if not bunsetu_may_end and ( + ( + (pos_type == "NP" or pos == "SYM") + and (prev_head == t.i or prev_head == head) + and prev_dep in {"compound", "nummod"} + ) + or ( + pos == "PART" + and (prev == head or prev_head == head) + and dep == "mark" + ) + ): phrase.append(t) else: bunsetu_may_end = True @@ -114,19 +134,31 @@ def 
yield_bunsetu(doc, debug=False): # VERB bunsetu elif phrase_type == "VP": bunsetu.append(t) - if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound': + if ( + not bunsetu_may_end + and pos == "VERB" + and prev_head == t.i + and prev_dep == "compound" + ): phrase.append(t) else: bunsetu_may_end = True # ADJ bunsetu - elif phrase_type == "ADJP" and tag != '連体詞': + elif phrase_type == "ADJP" and tag != "連体詞": bunsetu.append(t) - if not bunsetu_may_end and (( - pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'} - ) or ( - pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' - )): + if not bunsetu_may_end and ( + ( + pos == "NOUN" + and (prev_head == t.i or prev_head == head) + and prev_dep in {"amod", "compound"} + ) + or ( + pos == "PART" + and (prev == head or prev_head == head) + and dep == "mark" + ) + ): phrase.append(t) else: bunsetu_may_end = True diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py index cd1e4fde7..3f6e4bfa3 100644 --- a/spacy/lang/ja/syntax_iterators.py +++ b/spacy/lang/ja/syntax_iterators.py @@ -1,24 +1,22 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON, VERB # XXX this can probably be pruned a bit labels = [ - "nsubj", - "nmod", - "dobj", - "nsubjpass", - "pcomp", - "pobj", - "obj", - "obl", - "dative", - "appos", - "attr", - "ROOT", + "nsubj", + "nmod", + "dobj", + "nsubjpass", + "pcomp", + "pobj", + "obj", + "obl", + "dative", + "appos", + "attr", + "ROOT", ] + def noun_chunks(obj): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. @@ -52,4 +50,5 @@ def noun_chunks(obj): seen.update(w.i for w in word.head.rights) yield unseen[0], word.i + 1, np_label + SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/kn/examples.py b/spacy/lang/kn/examples.py index d82630432..3e055752e 100644 --- a/spacy/lang/kn/examples.py +++ b/spacy/lang/kn/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py index d052ded1b..e92a7617f 100644 --- a/spacy/lang/ml/__init__.py +++ b/spacy/lang/ml/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language diff --git a/spacy/lang/ml/examples.py b/spacy/lang/ml/examples.py index a2a0ed10e..9794eab29 100644 --- a/spacy/lang/ml/examples.py +++ b/spacy/lang/ml/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/ml/lex_attrs.py b/spacy/lang/ml/lex_attrs.py index 468ad88f8..9ac19b6a7 100644 --- a/spacy/lang/ml/lex_attrs.py +++ b/spacy/lang/ml/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ml/stop_words.py b/spacy/lang/ml/stop_words.py index 8bd6a7e02..441e93586 100644 --- a/spacy/lang/ml/stop_words.py +++ b/spacy/lang/ml/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ അത് diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index 8b8d7fe27..b80a1a143 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ...lemmatizer import Lemmatizer from ...parts_of_speech import NAMES diff --git a/spacy/lang/sv/lex_attrs.py b/spacy/lang/sv/lex_attrs.py index 24d06a97a..f8ada9e2e 100644 --- a/spacy/lang/sv/lex_attrs.py +++ b/spacy/lang/sv/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py index 8d76ddd79..ff9f8d5e5 100644 --- a/spacy/tests/lang/de/test_noun_chunks.py +++ b/spacy/tests/lang/de/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py index 4f24865d0..38e72b0b2 100644 --- a/spacy/tests/lang/el/test_noun_chunks.py +++ b/spacy/tests/lang/el/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py index 66bbd8c3a..a7ec4e562 100644 --- a/spacy/tests/lang/es/test_noun_chunks.py +++ b/spacy/tests/lang/es/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py index a98aae061..767e91f6b 100644 --- a/spacy/tests/lang/fa/test_noun_chunks.py +++ b/spacy/tests/lang/fa/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index ea93a5a35..5fd6897f7 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/gu/test_text.py b/spacy/tests/lang/gu/test_text.py index aa8d442a2..2d251166f 100644 --- a/spacy/tests/lang/gu/test_text.py +++ b/spacy/tests/lang/gu/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/hy/test_text.py b/spacy/tests/lang/hy/test_text.py index cbdb77e4e..ac0f1e128 100644 --- a/spacy/tests/lang/hy/test_text.py +++ b/spacy/tests/lang/hy/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.lang.hy.lex_attrs import like_num diff --git a/spacy/tests/lang/hy/test_tokenizer.py b/spacy/tests/lang/hy/test_tokenizer.py index 3eeb8b54e..e9efb224a 100644 --- a/spacy/tests/lang/hy/test_tokenizer.py +++ b/spacy/tests/lang/hy/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: 
utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py index add76f9b9..445643933 100644 --- a/spacy/tests/lang/id/test_noun_chunks.py +++ b/spacy/tests/lang/id/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py index 018e645bb..9e703e63d 100644 --- a/spacy/tests/lang/ja/test_serialize.py +++ b/spacy/tests/lang/ja/test_serialize.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest from spacy.lang.ja import Japanese from ...util import make_tempdir diff --git a/spacy/tests/lang/ml/test_text.py b/spacy/tests/lang/ml/test_text.py index 2883cf5bb..aced78461 100644 --- a/spacy/tests/lang/ml/test_text.py +++ b/spacy/tests/lang/ml/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py index 653491a64..c6a00354b 100644 --- a/spacy/tests/lang/nb/test_noun_chunks.py +++ b/spacy/tests/lang/nb/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sv/test_lex_attrs.py b/spacy/tests/lang/sv/test_lex_attrs.py index abe6b0f7b..656c4706b 100644 --- a/spacy/tests/lang/sv/test_lex_attrs.py +++ b/spacy/tests/lang/sv/test_lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.sv.lex_attrs import like_num diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py index 56f092ed8..d84920c3e 100644 --- a/spacy/tests/lang/zh/test_serialize.py +++ b/spacy/tests/lang/zh/test_serialize.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.zh import Chinese from ...util import make_tempdir diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py index 758ac9c14..a9a57746d 100644 --- a/spacy/tests/regression/test_issue5152.py +++ b/spacy/tests/regression/test_issue5152.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index b46bf9063..9ffa3862c 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,4 +1,3 @@ -# coding: utf8 import warnings from unittest import TestCase import pytest diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py index 3281e2a8c..a7a2959df 100644 --- a/spacy/tests/regression/test_issue5458.py +++ b/spacy/tests/regression/test_issue5458.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.lang.en.syntax_iterators import noun_chunks from spacy.tests.util import get_doc From ef5f548fb0b8f4737a41a838c0d1123752e12346 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 21 Jun 2020 22:38:04 +0200 Subject: [PATCH 118/119] Tidy up and auto-format --- spacy/lang/en/tokenizer_exceptions.py | 24 +++++++++++++++--------- spacy/lang/ja/syntax_iterators.py | 2 +- spacy/lang/ja/tag_bigram_map.py | 11 +---------- spacy/lang/ja/tag_orth_map.py | 14 +++----------- spacy/lang/ta/examples.py | 2 
+- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/tests/lang/ja/test_serialize.py | 4 ++-- spacy/tests/lang/ja/test_tokenizer.py | 16 ++++++++-------- spacy/tests/package/test_requirements.py | 8 +++++++- spacy/tests/test_misc.py | 3 --- 10 files changed, 39 insertions(+), 47 deletions(-) diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index e024dd483..dc8a5c04d 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -136,7 +136,19 @@ for pron in ["he", "she", "it"]: # W-words, relative pronouns, prepositions etc. -for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]: +for word in [ + "who", + "what", + "when", + "where", + "why", + "how", + "there", + "that", + "this", + "these", + "those", +]: for orth in [word, word.title()]: _exc[orth + "'s"] = [ {ORTH: orth, LEMMA: word, NORM: word}, @@ -396,14 +408,8 @@ _other_exc = { {ORTH: "Let", LEMMA: "let", NORM: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}, ], - "c'mon": [ - {ORTH: "c'm", NORM: "come", LEMMA: "come"}, - {ORTH: "on"} - ], - "C'mon": [ - {ORTH: "C'm", NORM: "come", LEMMA: "come"}, - {ORTH: "on"} - ] + "c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}], + "C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}], } _exc.update(_other_exc) diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py index 3f6e4bfa3..bb0554cf9 100644 --- a/spacy/lang/ja/syntax_iterators.py +++ b/spacy/lang/ja/syntax_iterators.py @@ -24,7 +24,7 @@ def noun_chunks(obj): doc = obj.doc # Ensure works on both Doc and Span. np_deps = [doc.vocab.strings.add(label) for label in labels] - conj = doc.vocab.strings.add("conj") + doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() for i, word in enumerate(obj): diff --git a/spacy/lang/ja/tag_bigram_map.py b/spacy/lang/ja/tag_bigram_map.py index 5ed9aec89..9d15fc520 100644 --- a/spacy/lang/ja/tag_bigram_map.py +++ b/spacy/lang/ja/tag_bigram_map.py @@ -1,21 +1,15 @@ -# encoding: utf8 -from __future__ import unicode_literals - -from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB +from ...symbols import ADJ, AUX, NOUN, PART, VERB # mapping from tag bi-gram to pos of previous token TAG_BIGRAM_MAP = { # This covers only small part of AUX. ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None), - ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None), # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ), - # This covers acl, advcl, obl and root, but has side effect for compound. 
("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX), # This covers almost all of the deps ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX), - ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB), ("副詞", "動詞-非自立可能"): (None, VERB), ("形容詞-一般", "動詞-非自立可能"): (None, VERB), @@ -25,12 +19,9 @@ TAG_BIGRAM_MAP = { ("助詞-副助詞", "動詞-非自立可能"): (None, VERB), ("助詞-格助詞", "動詞-非自立可能"): (None, VERB), ("補助記号-読点", "動詞-非自立可能"): (None, VERB), - ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART), - ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN), ("連体詞", "形状詞-助動詞語幹"): (None, NOUN), - ("動詞-一般", "助詞-副助詞"): (None, PART), ("動詞-非自立可能", "助詞-副助詞"): (None, PART), ("助動詞", "助詞-副助詞"): (None, PART), diff --git a/spacy/lang/ja/tag_orth_map.py b/spacy/lang/ja/tag_orth_map.py index 355cc655b..9d32cdea7 100644 --- a/spacy/lang/ja/tag_orth_map.py +++ b/spacy/lang/ja/tag_orth_map.py @@ -1,17 +1,9 @@ -# encoding: utf8 -from __future__ import unicode_literals - -from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X +from ...symbols import DET, PART, PRON, SPACE, X # mapping from tag bi-gram to pos of previous token TAG_ORTH_MAP = { - "空白": { - " ": SPACE, - " ": X, - }, - "助詞-副助詞": { - "たり": PART, - }, + "空白": {" ": SPACE, " ": X}, + "助詞-副助詞": {"たり": PART}, "連体詞": { "あの": DET, "かの": DET, diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 245b8ba1a..c3c47e66e 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -18,5 +18,5 @@ sentences = [ "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது", "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன", "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது", - "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்." + "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்.", ] diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 28bc51228..f732a9097 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -1,6 +1,6 @@ import re -from .char_classes import ALPHA_LOWER, ALPHA +from .char_classes import ALPHA_LOWER from ..symbols import ORTH, POS, TAG, LEMMA, SPACE diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py index 9e703e63d..4d4174b03 100644 --- a/spacy/tests/lang/ja/test_serialize.py +++ b/spacy/tests/lang/ja/test_serialize.py @@ -7,7 +7,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer): nlp = Japanese() nlp.tokenizer.from_bytes(tokenizer_bytes) assert tokenizer_bytes == nlp.tokenizer.to_bytes() - assert nlp.tokenizer.split_mode == None + assert nlp.tokenizer.split_mode is None with make_tempdir() as d: file_path = d / "tokenizer" @@ -15,7 +15,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer): nlp = Japanese() nlp.tokenizer.from_disk(file_path) assert tokenizer_bytes == nlp.tokenizer.to_bytes() - assert nlp.tokenizer.split_mode == None + assert nlp.tokenizer.split_mode is None # split mode is (de)serialized correctly nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index ee532cb81..f76a9067a 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -29,10 +29,9 @@ POS_TESTS = [ ] SENTENCE_TESTS = [ - ('あれ。これ。', ['あれ。', 'これ。']), - ('「伝染るんです。」という漫画があります。', - ['「伝染るんです。」という漫画があります。']), - ] + ("あれ。これ。", ["あれ。", "これ。"]), + ("「伝染るんです。」という漫画があります。", ["「伝染るんです。」という漫画があります。"]), +] # fmt: on @@ -48,7 +47,7 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags): assert tags == 
expected_tags -#XXX This isn't working? Always passes +# XXX This isn't working? Always passes @pytest.mark.parametrize("text,expected_pos", POS_TESTS) def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): pos = [token.pos_ for token in ja_tokenizer(text)] @@ -57,7 +56,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): @pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy") @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS) -def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents): +def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents): sents = [str(sent) for sent in ja_tokenizer(text).sents] assert sents == expected_sents @@ -74,13 +73,14 @@ def test_ja_tokenizer_naughty_strings(ja_tokenizer, text): assert tokens.text_with_ws == text -@pytest.mark.parametrize("text,len_a,len_b,len_c", +@pytest.mark.parametrize( + "text,len_a,len_b,len_c", [ ("選挙管理委員会", 4, 3, 1), ("客室乗務員", 3, 2, 1), ("労働者協同組合", 4, 3, 1), ("機能性食品", 3, 2, 1), - ] + ], ) def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}}) diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index a7c9a3ea4..6cc8fa6a8 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -10,7 +10,13 @@ def test_build_dependencies(): "mock", "flake8", ] - libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"] + libs_ignore_setup = [ + "fugashi", + "natto-py", + "pythainlp", + "sudachipy", + "sudachidict_core", + ] # check requirements.txt req_dict = {} diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 5f9e72f79..f6724f632 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -1,7 +1,6 @@ import pytest import os import ctypes -import srsly from pathlib import Path from spacy.about import __version__ as spacy_version from spacy import util @@ -9,8 +8,6 @@ from spacy import prefer_gpu, require_gpu from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding -from .util import make_tempdir - @pytest.fixture def is_admin(): From 0ee6d7a4d1dea48547c8c78d59bbc3d3a2c4ff45 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 22 Jun 2020 14:54:38 +0200 Subject: [PATCH 119/119] Remove project stuff from this branch --- spacy/cli/__init__.py | 1 - spacy/cli/project.py | 162 ------------------------------------------ 2 files changed, 163 deletions(-) delete mode 100644 spacy/cli/project.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 14623000a..206f8dd3b 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,7 +15,6 @@ from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 -from .project import project_clone, project_get_assets, project_run # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/project.py b/spacy/cli/project.py deleted file mode 100644 index c33f6a395..000000000 --- a/spacy/cli/project.py +++ /dev/null @@ -1,162 +0,0 @@ -from typing import List, Dict, Any -import typer -import srsly -from pathlib import Path -from wasabi import msg -import subprocess -import shlex -import os -import re - -from ._app import app, Arg, Opt -from .. 
import about -from ..schemas import ProjectConfigSchema, validate -from ..util import ensure_path, run_command - - -CONFIG_FILE = "project.yml" -DIRS = ["assets", "configs", "packages", "metrics", "scripts", "notebooks", "training"] -CACHES = [ - Path.home() / ".torch", - Path.home() / ".caches" / "torch", - os.environ.get("TORCH_HOME"), - Path.home() / ".keras", -] - -project_cli = typer.Typer(help="Command-line interface for spaCy projects") - - -@project_cli.callback(invoke_without_command=True) -def callback(): - # This runs before every project command and ensures DVC is installed - try: - subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) - except Exception: - msg.fail( - "spaCy projects require DVC (Data Version Control) and the 'dvc' command", - "You can install the Python package from pip (pip install dvc) or " - "conda (conda install -c conda-forge dvc). For more details, see the " - "documentation: https://dvc.org/doc/install", - exits=1, - ) - - -@project_cli.command("clone") -def project_clone_cli( - # fmt: off - name: str = Arg(..., help="The name of the template to fetch"), - dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=True, file_okay=False), - repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), - # fmt: on -): - """Clone a project template from a repository.""" - project_clone(name, dest, repo=repo) - - -def project_clone(name: str, dest: Path, repo: str = about.__projects__) -> None: - dest = ensure_path(dest) - if not dest or not dest.exists() or not dest.is_dir(): - msg.fail("Not a valid directory to clone project", dest, exits=1) - cmd = ["dvc", "get", repo, name, "-o", str(dest)] - msg.info(" ".join(cmd)) - run_command(cmd) - msg.good(f"Cloned project '{name}' from {repo}") - with msg.loading("Setting up directories..."): - for sub_dir in DIRS: - dir_path = dest / sub_dir - if not dir_path.exists(): - dir_path.mkdir(parents=True) - msg.good(f"Your project is now ready!", dest.resolve()) - - -@project_cli.command("get-assets") -def project_get_assets_cli( - path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False) -): - """Use Data Version Control to get the assets for the project.""" - project_get_assets(path) - - -def project_get_assets(project_path: Path) -> None: - project_path = ensure_path(project_path) - config = load_project_config(project_path) - assets = config.get("assets", {}) - if not assets: - msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) - msg.info(f"Getting {len(assets)} asset(s)") - variables = config.get("variables", {}) - for asset in assets: - url = asset["url"].format(**variables) - dest = asset["dest"].format(**variables) - dest_path = project_path / dest - check_asset(url) - cmd = ["dvc", "get-url", url, str(dest_path)] - msg.info(" ".join(cmd)) - run_command(cmd) - msg.good(f"Got asset {dest}") - - -@project_cli.command("run") -def project_run_cli( - # fmt: off - project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), - subcommand: str = Arg(None, help="Name of command defined in project config") - # fmt: on -): - """Run scripts defined in the project.""" - project_run(project_dir, subcommand) - - -def project_run(project_dir: Path, subcommand: str) -> None: - config = load_project_config(project_dir) - config_commands = config.get("commands", []) - variables = config.get("variables", {}) - commands = {cmd["name"]: cmd for cmd in config_commands} - if 
subcommand is None: - all_commands = config.get("run", []) - if not all_commands: - msg.warn("No run commands defined in project config", exits=0) - msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - for command in all_commands: - if command not in commands: - msg.fail(f"Can't find command '{command}' in project config", exits=1) - msg.divider(command) - run_commands(commands[command]["script"], variables) - return - if subcommand not in commands: - msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) - run_commands(commands[subcommand]["script"], variables) - - -app.add_typer(project_cli, name="project") - - -def load_project_config(path: Path) -> Dict[str, Any]: - config_path = path / CONFIG_FILE - if not config_path.exists(): - msg.fail("Can't find project config", config_path, exits=1) - config = srsly.read_yaml(config_path) - errors = validate(ProjectConfigSchema, config) - if errors: - msg.fail(f"Invalid project config in {CONFIG_FILE}", "\n".join(errors), exits=1) - return config - - -def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) -> None: - for command in commands: - # Substitute variables, e.g. "./{NAME}.json" - command = command.format(**variables) - msg.info(command) - run_command(shlex.split(command)) - - -def check_asset(url: str) -> None: - # If the asset URL is a regular GitHub URL it's likely a mistake - # TODO: support loading from GitHub URLs? Automatically convert to raw? - if re.match("(http(s?)):\/\/github.com", url): - msg.warn( - "Downloading from a regular GitHub URL. This will only download " - "the source of the page, not the actual file. If you want to " - "download the raw file, click on 'Download' on the GitHub page " - "and copy the raw.githubusercontent.com URL instead." - )
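
Editor's note on the final patch above: the deleted spacy/cli/project.py resolved a named command from the project config, substituted {VARIABLE} placeholders into each script line, and executed the result without a shell. The following is a minimal, self-contained sketch of that flow under simplifying assumptions — it uses a plain in-memory dict instead of reading project.yml via srsly, print() instead of wasabi.msg, and skips the DVC checks; the sample config values (the NAME variable and the "hello" command) are invented for illustration and do not come from the patch.

import shlex
import subprocess
from typing import Any, Dict, List


def run_commands(commands: List[str], variables: Dict[str, str]) -> None:
    """Substitute {VARIABLE} placeholders into each script line and run it."""
    for command in commands:
        command = command.format(**variables)  # e.g. "./{NAME}.json" -> "./demo.json"
        print(f"Running: {command}")
        subprocess.run(shlex.split(command), check=True)


def project_run(config: Dict[str, Any], subcommand: str) -> None:
    """Look up a named command in the project config and run its script lines."""
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    if subcommand not in commands:
        raise SystemExit(f"Can't find command '{subcommand}' in project config")
    run_commands(commands[subcommand]["script"], config.get("variables", {}))


if __name__ == "__main__":
    # Hypothetical config mirroring the structure load_project_config() expected
    # to find in project.yml; the names and values here are made up.
    config = {
        "variables": {"NAME": "demo"},
        "commands": [
            # Assumes a `python` executable is available on PATH.
            {"name": "hello", "script": ["python -c \"print('Hello, {NAME}')\""]},
        ],
    }
    project_run(config, "hello")

Note the ordering the removed helper relied on: str.format(**variables) runs before shlex.split, so any quoting written in the config survives variable substitution, and the resulting argument list is executed directly rather than through a shell.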