From 1139247532d42ccc16e2e1c548924d83d7615637 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Mon, 9 Mar 2020 12:09:41 +0100 Subject: [PATCH 001/203] Revert changes to token_match priority from #4374 * Revert changes to priority of `token_match` so that it has priority over all other tokenizer patterns * Add lookahead and potentially slow lookbehind back to the default URL pattern * Expand character classes in URL pattern to improve matching around lookaheads and lookbehinds related to #4882 * Revert changes to Hungarian tokenizer * Revert (xfail) several URL tests to their status before #4374 * Update `tokenizer.explain()` and docs accordingly --- spacy/lang/hu/punctuation.py | 6 +++--- spacy/lang/tokenizer_exceptions.py | 6 +++++- spacy/tests/tokenizer/test_urls.py | 8 ++++++-- spacy/tokenizer.pyx | 14 +++++++++---- website/docs/usage/linguistic-features.md | 24 +++++++++++++---------- 5 files changed, 38 insertions(+), 20 deletions(-) diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index bc043486f..a010bb7ae 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "") _currency = r"\$¢£€¥฿" _quotes = CONCAT_QUOTES.replace("'", "") -_units = UNITS.replace("%", "") _prefixes = ( LIST_PUNCT @@ -21,7 +20,8 @@ _prefixes = ( ) _suffixes = ( - LIST_PUNCT + [r"\+"] + + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + [_concat_icons] @@ -29,7 +29,7 @@ _suffixes = ( r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:[{c}])".format(c=_currency), - r"(?<=[0-9])(?:{u})".format(u=_units), + r"(?<=[0-9])(?:{u})".format(u=UNITS), r"(?<=[{al}{e}{q}(?:{c})])\.".format( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency ), diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 2c0fc9cf7..42dbc7bac 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .char_classes import ALPHA_LOWER +from .char_classes import ALPHA_LOWER, ALPHA from ..symbols import ORTH, POS, TAG, LEMMA, SPACE @@ -13,6 +13,8 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE URL_PATTERN = ( # fmt: off r"^" + # in order to support the prefix tokenization (see prefix test cases in test_urls). + r"(?=[" + ALPHA + "\w])" # protocol identifier (mods: make optional and expand schemes) # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) r"(?:(?:[\w\+\-\.]{2,})://)?" @@ -54,6 +56,8 @@ URL_PATTERN = ( r"(?::\d{2,5})?" # resource path r"(?:[/?#]\S*)?" 
+ # in order to support the suffix tokenization (see suffix test cases in test_urls), + r"(?<=[" + ALPHA + "\w/])" r"$" # fmt: on ).strip() diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 58e9d73f3..2d82e213c 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -56,8 +56,12 @@ URLS_SHOULD_MATCH = [ pytest.param( "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail() ), - "http://foo.com/blah_blah_(wikipedia)", - "http://foo.com/blah_blah_(wikipedia)_(again)", + pytest.param( + "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail() + ), + pytest.param( + "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail() + ), "http://www.foo.co.uk", "http://www.foo.co.uk/", "http://www.foo.co.uk/blah/blah", diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4da081259..6f7e44061 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -239,6 +239,8 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: + if self.token_match and self.token_match(string): + break if self._specials.get(hash_string(string)) != NULL: has_special[0] = 1 break @@ -455,6 +457,10 @@ cdef class Tokenizer: suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(("TOKEN_MATCH", substring)) + substring = '' + break if substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' @@ -475,12 +481,12 @@ cdef class Tokenizer: break suffixes.append(("SUFFIX", substring[split:])) substring = substring[:split] - if substring in special_cases: - tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' + elif substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 685619c88..60a6699a9 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -740,6 +740,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(substring) + substring = '' + break if substring in special_cases: tokens.extend(special_cases[substring]) substring = '' @@ -754,12 +758,12 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, split = suffix_search(substring).start() suffixes.append(substring[split:]) substring = substring[:split] - if substring in special_cases: - tokens.extend(special_cases[substring]) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(substring) substring = '' + elif substring in special_cases: + tokens.extend(special_cases[substring]) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 @@ -780,14 +784,14 @@ def tokenizer_pseudo_code(self, special_cases, 
prefix_search, suffix_search, The algorithm can be summarized as follows: 1. Iterate over whitespace-separated substrings. -2. Check whether we have an explicitly defined rule for this substring. If we +2. Look for a token match. If there is a match, stop processing and keep this token. +3. Check whether we have an explicitly defined rule for this substring. If we do, use it. -3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, - so that special cases always get priority. -4. If we didn't consume a prefix, try to consume a suffix and then go back to +4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, + so that the token match and special cases always get priority. +5. If we didn't consume a prefix, try to consume a suffix and then go back to #2. -5. If we can't consume a prefix or a suffix, look for a special case. -6. Next, look for a token match. +6. If we can't consume a prefix or a suffix, look for a special case. 7. Look for "infixes" — stuff like hyphens etc. and split the substring into tokens on all infixes. 8. Once we can't consume any more of the string, handle it as a single token. From 0c31f03ec5525cd33224a880b6d678c69019727d Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Mon, 9 Mar 2020 13:41:01 +0100 Subject: [PATCH 002/203] Update docs [ci skip] --- website/docs/usage/linguistic-features.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 60a6699a9..0ceae4c4f 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -838,8 +838,6 @@ domain. There are five things you would need to define: hyphens etc. 5. An optional boolean function `token_match` matching strings that should never be split, overriding the infix rules. Useful for things like URLs or numbers. - Note that prefixes and suffixes will be split off before `token_match` is - applied. You shouldn't usually need to create a `Tokenizer` subclass. 
Standard usage is to use `re.compile()` to build a regular expression object, and pass its From 493c77462a236fae204920e8a3fa22d70833d2fc Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Mon, 6 Apr 2020 18:46:51 +0200 Subject: [PATCH 003/203] issue5230: test cases covering known sources of resource warnings --- spacy/tests/regression/test_issue5230.py | 112 +++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 spacy/tests/regression/test_issue5230.py diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py new file mode 100644 index 000000000..e3d7c7e82 --- /dev/null +++ b/spacy/tests/regression/test_issue5230.py @@ -0,0 +1,112 @@ +import warnings + +import numpy +import pytest +import srsly + +from spacy.kb import KnowledgeBase +from spacy.vectors import Vectors +from spacy.language import Language +from spacy.pipeline import Pipe +from spacy.tests.util import make_tempdir + + +@pytest.mark.xfail +def test_language_to_disk_resource_warning(): + nlp = Language() + with make_tempdir() as d: + with warnings.catch_warnings(record=True) as w: + # catch only warnings raised in spacy.language since there may be others from other components or pipelines + warnings.filterwarnings( + "always", module="spacy.language", category=ResourceWarning + ) + nlp.to_disk(d) + assert len(w) == 0 + + +@pytest.mark.xfail +def test_vectors_to_disk_resource_warning(): + data = numpy.zeros((3, 300), dtype="f") + keys = ["cat", "dog", "rat"] + vectors = Vectors(data=data, keys=keys) + with make_tempdir() as d: + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings("always", category=ResourceWarning) + vectors.to_disk(d) + assert len(w) == 0 + + +@pytest.mark.xfail +def test_custom_pipes_to_disk_resource_warning(): + # create dummy pipe partially implementing interface -- only want to test to_disk + class SerializableDummy(object): + def __init__(self, **cfg): + if cfg: + self.cfg = cfg + else: + self.cfg = None + super(SerializableDummy, self).__init__() + + def to_bytes(self, exclude=tuple(), disable=None, **kwargs): + return srsly.msgpack_dumps({"dummy": srsly.json_dumps(None)}) + + def from_bytes(self, bytes_data, exclude): + return self + + def to_disk(self, path, exclude=tuple(), **kwargs): + pass + + def from_disk(self, path, exclude=tuple(), **kwargs): + return self + + class MyPipe(Pipe): + def __init__(self, vocab, model=True, **cfg): + if cfg: + self.cfg = cfg + else: + self.cfg = None + self.model = SerializableDummy() + self.vocab = SerializableDummy() + + pipe = MyPipe(None) + with make_tempdir() as d: + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings("always", category=ResourceWarning) + pipe.to_disk(d) + assert len(w) == 0 + + +@pytest.mark.xfail +def test_tagger_to_disk_resource_warning(): + nlp = Language() + nlp.add_pipe(nlp.create_pipe("tagger")) + tagger = nlp.get_pipe("tagger") + # need to add model for two reasons: + # 1. no model leads to error in serialization, + # 2. 
the affected line is the one for model serialization + tagger.begin_training(pipeline=nlp.pipeline) + + with make_tempdir() as d: + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings("always", category=ResourceWarning) + tagger.to_disk(d) + assert len(w) == 0 + + +@pytest.mark.xfail +def test_entity_linker_to_disk_resource_warning(): + nlp = Language() + nlp.add_pipe(nlp.create_pipe("entity_linker")) + entity_linker = nlp.get_pipe("entity_linker") + # need to add model for two reasons: + # 1. no model leads to error in serialization, + # 2. the affected line is the one for model serialization + kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + entity_linker.set_kb(kb) + entity_linker.begin_training(pipeline=nlp.pipeline) + + with make_tempdir() as d: + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings("always", category=ResourceWarning) + entity_linker.to_disk(d) + assert len(w) == 0 From 1cd975d4a5cf50eb5a2b16a30e8b520c7778af40 Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Mon, 6 Apr 2020 18:54:32 +0200 Subject: [PATCH 004/203] issue5230: fixed resource warnings in language --- spacy/language.py | 5 ++--- spacy/tests/regression/test_issue5230.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 56619080d..0eb062eae 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -903,9 +903,8 @@ class Language(object): serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( p, exclude=["vocab"] ) - serializers["meta.json"] = lambda p: p.open("w").write( - srsly.json_dumps(self.meta) - ) + serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta) + for name, proc in self.pipeline: if not hasattr(proc, "name"): continue diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index e3d7c7e82..be84875e7 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -11,7 +11,6 @@ from spacy.pipeline import Pipe from spacy.tests.util import make_tempdir -@pytest.mark.xfail def test_language_to_disk_resource_warning(): nlp = Language() with make_tempdir() as d: From 273ed452bb4ba148d491dcec4b321a6293bdcd30 Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Mon, 6 Apr 2020 19:22:32 +0200 Subject: [PATCH 005/203] issue5230: added unicode declaration at top of the file --- spacy/tests/regression/test_issue5230.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index be84875e7..9cfa3fc05 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,3 +1,4 @@ +# coding: utf8 import warnings import numpy From 71cc903d65b8946a4c6cd04cb2ca38b8a19eb5c4 Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Mon, 6 Apr 2020 20:30:41 +0200 Subject: [PATCH 006/203] issue5230: replaced open statements on path objects so that serialization still works an files are closed --- spacy/pipeline/pipes.pyx | 6 +++--- spacy/tests/regression/test_issue5230.py | 4 ---- spacy/vectors.pyx | 10 +++++++++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a20c9b6df..ce95b2752 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -202,7 +202,7 @@ class Pipe(object): serialize["cfg"] = lambda p: 
srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) if self.model not in (None, True, False): - serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) + serialize["model"] = self.model.to_disk exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) @@ -625,7 +625,7 @@ class Tagger(Pipe): serialize = OrderedDict(( ("vocab", lambda p: self.vocab.to_disk(p)), ("tag_map", lambda p: srsly.write_msgpack(p, tag_map)), - ("model", lambda p: p.open("wb").write(self.model.to_bytes())), + ("model", self.model.to_disk), ("cfg", lambda p: srsly.write_json(p, self.cfg)) )) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) @@ -1394,7 +1394,7 @@ class EntityLinker(Pipe): serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) if self.model not in (None, True, False): - serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) + serialize["model"] = self.model.to_disk exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 9cfa3fc05..716a4624b 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -24,7 +24,6 @@ def test_language_to_disk_resource_warning(): assert len(w) == 0 -@pytest.mark.xfail def test_vectors_to_disk_resource_warning(): data = numpy.zeros((3, 300), dtype="f") keys = ["cat", "dog", "rat"] @@ -36,7 +35,6 @@ def test_vectors_to_disk_resource_warning(): assert len(w) == 0 -@pytest.mark.xfail def test_custom_pipes_to_disk_resource_warning(): # create dummy pipe partially implementing interface -- only want to test to_disk class SerializableDummy(object): @@ -76,7 +74,6 @@ def test_custom_pipes_to_disk_resource_warning(): assert len(w) == 0 -@pytest.mark.xfail def test_tagger_to_disk_resource_warning(): nlp = Language() nlp.add_pipe(nlp.create_pipe("tagger")) @@ -93,7 +90,6 @@ def test_tagger_to_disk_resource_warning(): assert len(w) == 0 -@pytest.mark.xfail def test_entity_linker_to_disk_resource_warning(): nlp = Language() nlp.add_pipe(nlp.create_pipe("entity_linker")) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index f3c20fb7f..62d176c6c 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -376,8 +376,16 @@ cdef class Vectors: save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) else: save_array = lambda arr, file_: xp.save(file_, arr) + + def save_vectors(path): + # the source of numpy.save indicates that the file object is closed after use. + # but it seems that somehow this does not happen, as ResourceWarnings are raised here. + # in order to not rely on this, wrap in context manager. 
+ with path.open("wb") as _file: + save_array(self.data, _file) + serializers = OrderedDict(( - ("vectors", lambda p: save_array(self.data, p.open("wb"))), + ("vectors", save_vectors), ("key2row", lambda p: srsly.write_msgpack(p, self.key2row)) )) return util.to_disk(path, serializers, []) From cde96f6c64220bf6a82cf4288f6e2bfbbc97eb0a Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Mon, 6 Apr 2020 20:51:12 +0200 Subject: [PATCH 007/203] issue5230: optimized unit test a bit --- spacy/tests/regression/test_issue5230.py | 61 +++++++++--------------- 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 716a4624b..76d4d3e96 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,41 +1,28 @@ # coding: utf8 import warnings -import numpy import pytest import srsly - +from numpy import zeros from spacy.kb import KnowledgeBase from spacy.vectors import Vectors + from spacy.language import Language from spacy.pipeline import Pipe from spacy.tests.util import make_tempdir -def test_language_to_disk_resource_warning(): - nlp = Language() - with make_tempdir() as d: - with warnings.catch_warnings(record=True) as w: - # catch only warnings raised in spacy.language since there may be others from other components or pipelines - warnings.filterwarnings( - "always", module="spacy.language", category=ResourceWarning - ) - nlp.to_disk(d) - assert len(w) == 0 +def nlp(): + return Language() -def test_vectors_to_disk_resource_warning(): - data = numpy.zeros((3, 300), dtype="f") +def vectors(): + data = zeros((3, 1), dtype="f") keys = ["cat", "dog", "rat"] - vectors = Vectors(data=data, keys=keys) - with make_tempdir() as d: - with warnings.catch_warnings(record=True) as w: - warnings.filterwarnings("always", category=ResourceWarning) - vectors.to_disk(d) - assert len(w) == 0 + return Vectors(data=data, keys=keys) -def test_custom_pipes_to_disk_resource_warning(): +def custom_pipe(): # create dummy pipe partially implementing interface -- only want to test to_disk class SerializableDummy(object): def __init__(self, **cfg): @@ -66,15 +53,10 @@ def test_custom_pipes_to_disk_resource_warning(): self.model = SerializableDummy() self.vocab = SerializableDummy() - pipe = MyPipe(None) - with make_tempdir() as d: - with warnings.catch_warnings(record=True) as w: - warnings.filterwarnings("always", category=ResourceWarning) - pipe.to_disk(d) - assert len(w) == 0 + return MyPipe(None) -def test_tagger_to_disk_resource_warning(): +def tagger(): nlp = Language() nlp.add_pipe(nlp.create_pipe("tagger")) tagger = nlp.get_pipe("tagger") @@ -82,15 +64,10 @@ def test_tagger_to_disk_resource_warning(): # 1. no model leads to error in serialization, # 2. 
the affected line is the one for model serialization tagger.begin_training(pipeline=nlp.pipeline) - - with make_tempdir() as d: - with warnings.catch_warnings(record=True) as w: - warnings.filterwarnings("always", category=ResourceWarning) - tagger.to_disk(d) - assert len(w) == 0 + return tagger -def test_entity_linker_to_disk_resource_warning(): +def entity_linker(): nlp = Language() nlp.add_pipe(nlp.create_pipe("entity_linker")) entity_linker = nlp.get_pipe("entity_linker") @@ -100,9 +77,17 @@ def test_entity_linker_to_disk_resource_warning(): kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) entity_linker.set_kb(kb) entity_linker.begin_training(pipeline=nlp.pipeline) + return entity_linker + +@pytest.mark.parametrize( + "obj", + [nlp(), vectors(), custom_pipe(), tagger(), entity_linker()], + ids=["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"], +) +def test_to_disk_resource_warning(obj): with make_tempdir() as d: - with warnings.catch_warnings(record=True) as w: + with warnings.catch_warnings(record=True) as warnings_list: warnings.filterwarnings("always", category=ResourceWarning) - entity_linker.to_disk(d) - assert len(w) == 0 + obj.to_disk(d) + assert len(warnings_list) == 0 From b63871ceff4497ca61bd066c8432603bc73c6a8b Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Mon, 6 Apr 2020 21:04:06 +0200 Subject: [PATCH 008/203] issue5230: added contributors agreement --- .github/contributors/lfiedler.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/lfiedler.md diff --git a/.github/contributors/lfiedler.md b/.github/contributors/lfiedler.md new file mode 100644 index 000000000..61f8ffeb4 --- /dev/null +++ b/.github/contributors/lfiedler.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Leander Fiedler | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 06 April 2020 | +| GitHub username | lfiedler | +| Website (optional) | | \ No newline at end of file From e1e25c7e302876b85dc7a95c0f5cf768fbac3f1d Mon Sep 17 00:00:00 2001 From: lfiedler <lfiedler@users.noreply.github.com> Date: Mon, 6 Apr 2020 21:36:02 +0200 Subject: [PATCH 009/203] issue5230: added unittest test case for completion --- spacy/tests/regression/test_issue5230.py | 28 +++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 76d4d3e96..1a03fa0d2 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,5 +1,6 @@ # coding: utf8 import warnings +from unittest import TestCase import pytest import srsly @@ -80,14 +81,31 @@ def entity_linker(): return entity_linker -@pytest.mark.parametrize( - "obj", +objects_to_test = ( [nlp(), vectors(), custom_pipe(), tagger(), entity_linker()], - ids=["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"], + ["nlp", "vectors", "custom_pipe", "tagger", "entity_linker"], ) -def test_to_disk_resource_warning(obj): + + +def write_obj_and_catch_warnings(obj): with make_tempdir() as d: with warnings.catch_warnings(record=True) as warnings_list: warnings.filterwarnings("always", category=ResourceWarning) obj.to_disk(d) - assert len(warnings_list) == 0 + return warnings_list + + +@pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) +def test_to_disk_resource_warning(obj): + warnings_list = write_obj_and_catch_warnings(obj) + assert len(warnings_list) == 0 + + +class TestToDiskResourceWarningUnittest(TestCase): + def test_resource_warning(self): + scenarios = zip(*objects_to_test) + + for scenario in scenarios: + with self.subTest(msg=scenario[1]): + warnings_list = write_obj_and_catch_warnings(scenario[0]) + self.assertEqual(len(warnings_list), 0) From 8c1d0d628fb196abd33859b18a597eb0414e6c55 Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Fri, 10 Apr 2020 20:35:52 +0200 Subject: [PATCH 010/203] issue5230 writer now checks instance of loc parameter before trying to operate on it --- spacy/kb.pyx | 4 ++-- spacy/tests/regression/test_issue5230.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 63eb41b42..7c6865eed 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -446,10 +446,10 @@ cdef class KnowledgeBase: cdef class Writer: def __init__(self, object loc): - if path.exists(loc): - assert not path.isdir(loc), "%s is directory." % loc if isinstance(loc, Path): loc = bytes(loc) + if path.exists(loc): + assert not path.isdir(loc), "%s is directory." 
% loc cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(<char*>bytes_loc, 'wb') if not self._fp: diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 1a03fa0d2..b7c6b9b1d 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -5,7 +5,7 @@ from unittest import TestCase import pytest import srsly from numpy import zeros -from spacy.kb import KnowledgeBase +from spacy.kb import KnowledgeBase, Writer from spacy.vectors import Vectors from spacy.language import Language @@ -101,6 +101,19 @@ def test_to_disk_resource_warning(obj): assert len(warnings_list) == 0 +def test_writer_with_path_py35(): + writer = None + with make_tempdir() as d: + path = d / "test" + try: + writer = Writer(path) + except Exception as e: + pytest.fail(str(e)) + finally: + if writer: + writer.close() + + class TestToDiskResourceWarningUnittest(TestCase): def test_resource_warning(self): scenarios = zip(*objects_to_test) From a7bdfe42e13bdb2e61edcb3b4bf9203e041ef3f0 Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Fri, 10 Apr 2020 21:14:33 +0200 Subject: [PATCH 011/203] issue5230 added print statement to warnings filter to remotely debug failing python35(win) setup --- spacy/tests/regression/test_issue5230.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index b7c6b9b1d..03027fe39 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -98,6 +98,8 @@ def write_obj_and_catch_warnings(obj): @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) def test_to_disk_resource_warning(obj): warnings_list = write_obj_and_catch_warnings(obj) + for warning in warnings_list: + print(warning.message) assert len(warnings_list) == 0 From 88ca40a15d010fe50da383f4664f8064046f7540 Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Fri, 10 Apr 2020 21:45:53 +0200 Subject: [PATCH 012/203] issue5230 raise warnings as errors to remotely debug failing python35(win) setup --- spacy/tests/regression/test_issue5230.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 03027fe39..adc9307ce 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -90,7 +90,7 @@ objects_to_test = ( def write_obj_and_catch_warnings(obj): with make_tempdir() as d: with warnings.catch_warnings(record=True) as warnings_list: - warnings.filterwarnings("always", category=ResourceWarning) + warnings.filterwarnings("error", category=ResourceWarning) obj.to_disk(d) return warnings_list @@ -98,8 +98,6 @@ def write_obj_and_catch_warnings(obj): @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) def test_to_disk_resource_warning(obj): warnings_list = write_obj_and_catch_warnings(obj) - for warning in warnings_list: - print(warning.message) assert len(warnings_list) == 0 From ca2a7a44db29b3ffbcf24459a8c0332742c8b676 Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Fri, 10 Apr 2020 22:26:55 +0200 Subject: [PATCH 013/203] issue5230 store string values of warnings to remotely debug failing python35(win) setup --- spacy/tests/regression/test_issue5230.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index adc9307ce..c78a84ad7 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -90,9 +90,9 @@ objects_to_test = ( def write_obj_and_catch_warnings(obj): with make_tempdir() as d: with warnings.catch_warnings(record=True) as warnings_list: - warnings.filterwarnings("error", category=ResourceWarning) + warnings.filterwarnings("always", category=ResourceWarning) obj.to_disk(d) - return warnings_list + return list(map(lambda w: w.message, warnings_list)) @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) From d2bb649227ce5a24e53d7526cf7892643eb297c9 Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Fri, 10 Apr 2020 23:21:13 +0200 Subject: [PATCH 014/203] issue5230 filter warnings in addition to filterwarnings to prevent deprecation warnings in python35(win) setup to pop up --- spacy/tests/regression/test_issue5230.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index c78a84ad7..ae735c7bd 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -92,7 +92,8 @@ def write_obj_and_catch_warnings(obj): with warnings.catch_warnings(record=True) as warnings_list: warnings.filterwarnings("always", category=ResourceWarning) obj.to_disk(d) - return list(map(lambda w: w.message, warnings_list)) + # in python3.5 it seems that deprecation warnings are not filtered by filterwarnings + return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list)) @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) From d60e2d3ebf33fc0c4280117b08f6e3ef9ad63ff9 Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Sun, 12 Apr 2020 09:08:41 +0200 Subject: [PATCH 015/203] issue5230 added unit test for dumping and loading knowledgebase --- spacy/tests/regression/test_issue5230.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index ae735c7bd..337c82255 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -115,6 +115,23 @@ def test_writer_with_path_py35(): writer.close() +def test_save_and_load_knowledge_base(): + nlp = Language() + kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + with make_tempdir() as d: + path = d / "kb" + try: + kb.dump(path) + except Exception as e: + pytest.fail(str(e)) + + try: + kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1) + kb_loaded.load_bulk(path) + except Exception as e: + pytest.fail(str(e)) + + class TestToDiskResourceWarningUnittest(TestCase): def test_resource_warning(self): scenarios = zip(*objects_to_test) From 67000068304b9a125ec792f32bed8491767dbed1 Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Sun, 12 Apr 2020 09:34:54 +0200 Subject: [PATCH 016/203] issue5230 attempted fix of pytest segfault for python3.5 --- spacy/kb.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 7c6865eed..14327f0d6 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -491,10 +491,10 @@ cdef class Writer: cdef class Reader: def __init__(self, object loc): - assert path.exists(loc) - assert not 
path.isdir(loc) if isinstance(loc, Path): loc = bytes(loc) + assert path.exists(loc) + assert not path.isdir(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(<char*>bytes_loc, 'rb') if not self._fp: From cef0c909b9dc1afd37511db4cbfd1863f27a371a Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Wed, 15 Apr 2020 19:28:33 +0200 Subject: [PATCH 017/203] issue5230 changed reference to function to anonymous function --- spacy/pipeline/pipes.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index ce95b2752..8af76a0fb 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -202,7 +202,7 @@ class Pipe(object): serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) if self.model not in (None, True, False): - serialize["model"] = self.model.to_disk + serialize["model"] = lambda p: self.model.to_disk(p) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) From a3401b11946b9aba06dd3e83a1877c156e7ddeb4 Mon Sep 17 00:00:00 2001 From: Leander Fiedler <lfiedler@users.noreply.github.com> Date: Wed, 15 Apr 2020 21:52:52 +0200 Subject: [PATCH 018/203] issue5230 changed reference to function to anonymous function --- spacy/pipeline/pipes.pyx | 4 ++-- spacy/vectors.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 8af76a0fb..fc077fc82 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -625,7 +625,7 @@ class Tagger(Pipe): serialize = OrderedDict(( ("vocab", lambda p: self.vocab.to_disk(p)), ("tag_map", lambda p: srsly.write_msgpack(p, tag_map)), - ("model", self.model.to_disk), + ("model", lambda p: self.model.to_disk(p)), ("cfg", lambda p: srsly.write_json(p, self.cfg)) )) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) @@ -1394,7 +1394,7 @@ class EntityLinker(Pipe): serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) if self.model not in (None, True, False): - serialize["model"] = self.model.to_disk + serialize["model"] = lambda p: self.model.to_disk(p) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 62d176c6c..2877d2d7d 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -385,7 +385,7 @@ cdef class Vectors: save_array(self.data, _file) serializers = OrderedDict(( - ("vectors", save_vectors), + ("vectors", lambda p: save_vectors(p)), ("key2row", lambda p: srsly.write_msgpack(p, self.key2row)) )) return util.to_disk(path, serializers, []) From 565e0eef73fab8c394339239cc48e4a83e068dfd Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Tue, 5 May 2020 10:35:33 +0200 Subject: [PATCH 019/203] Add tokenizer option for token match with affixes To fix the slow tokenizer URL (#4374) and allow `token_match` to take priority over prefixes and suffixes by default, introduce a new tokenizer option for a token match pattern that's applied after prefixes and suffixes but before infixes. 
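
For illustration (not part of this diff): a minimal sketch of how the two hooks differ once this change is applied. The simplified URL regex, the sample string and the use of `spacy.blank("en")` are assumptions chosen for the demo; only the `token_match` / `token_match_with_affixes` attributes come from this patch.

    import re
    import spacy

    URL = re.compile(r"^https?://\S+$").match  # simplified stand-in for URL_PATTERN
    nlp = spacy.blank("en")

    # token_match is checked before prefixes/suffixes are split off, so a match
    # keeps the trailing punctuation attached to the token.
    nlp.tokenizer.token_match = URL
    print([t.text for t in nlp("https://example.com),")])
    # expected: ['https://example.com),']

    # token_match_with_affixes is only consulted after prefixes and suffixes
    # have been removed, so the punctuation is still split off.
    nlp.tokenizer.token_match = None
    nlp.tokenizer.token_match_with_affixes = URL
    print([t.text for t in nlp("https://example.com),")])
    # expected: ['https://example.com', ')', ',']
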
--- spacy/lang/fr/tokenizer_exceptions.py | 4 --- spacy/lang/hu/tokenizer_exceptions.py | 3 +-- spacy/lang/tokenizer_exceptions.py | 7 ++--- spacy/language.py | 5 +++- spacy/tests/tokenizer/test_urls.py | 12 +++------ spacy/tokenizer.pxd | 1 + spacy/tokenizer.pyx | 37 +++++++++++++++++++++++---- website/docs/api/tokenizer.md | 3 ++- 8 files changed, 46 insertions(+), 26 deletions(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index cb1702300..465626d39 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .punctuation import ELISION, HYPHENS -from ..tokenizer_exceptions import URL_PATTERN from ..char_classes import ALPHA_LOWER, ALPHA from ...symbols import ORTH, LEMMA @@ -455,9 +454,6 @@ _regular_exp += [ for hc in _hyphen_combination ] -# URLs -_regular_exp.append(URL_PATTERN) - TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py index c18a2cec2..d328baa22 100644 --- a/spacy/lang/hu/tokenizer_exceptions.py +++ b/spacy/lang/hu/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from ..punctuation import ALPHA_LOWER, CURRENCY -from ..tokenizer_exceptions import URL_PATTERN from ...symbols import ORTH @@ -649,4 +648,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format( TOKENIZER_EXCEPTIONS = _exc -TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match +TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index f1eabd9aa..6a9a5363f 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -13,8 +13,6 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE URL_PATTERN = ( # fmt: off r"^" - # in order to support the prefix tokenization (see prefix test cases in test_urls). - r"(?=[" + ALPHA + "\w])" # protocol identifier (mods: make optional and expand schemes) # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) r"(?:(?:[\w\+\-\.]{2,})://)?" @@ -56,13 +54,12 @@ URL_PATTERN = ( r"(?::\d{2,5})?" # resource path r"(?:[/?#]\S*)?" 
- # in order to support the suffix tokenization (see suffix test cases in test_urls), - r"(?<=[" + ALPHA + "\w/])" r"$" # fmt: on ).strip() -TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match +TOKEN_MATCH = None +TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/language.py b/spacy/language.py index e89f80f08..d4f6c78ec 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -31,7 +31,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES -from .lang.tokenizer_exceptions import TOKEN_MATCH +from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop @@ -86,6 +86,7 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match + token_match_with_affixes = cls.token_match_with_affixes prefix_search = ( util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None ) @@ -103,10 +104,12 @@ class BaseDefaults(object): suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match, + token_match_with_affixes=token_match_with_affixes, ) pipe_names = ["tagger", "parser", "ner"] token_match = TOKEN_MATCH + token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 2d82e213c..2f76111e5 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -56,12 +56,8 @@ URLS_SHOULD_MATCH = [ pytest.param( "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail() ), - pytest.param( - "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail() - ), - pytest.param( - "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail() - ), + "http://foo.com/blah_blah_(wikipedia)", + "http://foo.com/blah_blah_(wikipedia)_(again)", "http://www.foo.co.uk", "http://www.foo.co.uk/", "http://www.foo.co.uk/blah/blah", @@ -126,12 +122,12 @@ SUFFIXES = ['"', ":", ">"] @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is not None + assert en_tokenizer.token_match_with_affixes(url) is not None @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is None + assert en_tokenizer.token_match_with_affixes(url) is None @pytest.mark.parametrize("url", URLS_BASIC) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index dadbad7bd..70d49bb39 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -17,6 +17,7 @@ cdef class Tokenizer: cpdef readonly Vocab vocab cdef object _token_match + cdef object _token_match_with_affixes cdef object _prefix_search cdef object _suffix_search cdef object _infix_finditer diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 16a2cf27b..cf0421158 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -30,7 +30,8 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, - suffix_search=None, infix_finditer=None, token_match=None): + 
suffix_search=None, infix_finditer=None, token_match=None, + token_match_with_affixes=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -43,6 +44,8 @@ cdef class Tokenizer: `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be recognised as tokens. + token_match_with_affixes (callable): A boolean function matching strings to be + recognised as tokens after considering prefixes and suffixes. RETURNS (Tokenizer): The newly constructed object. EXAMPLE: @@ -55,6 +58,7 @@ cdef class Tokenizer: self._cache = PreshMap() self._specials = PreshMap() self.token_match = token_match + self.token_match_with_affixes = token_match_with_affixes self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -70,6 +74,14 @@ cdef class Tokenizer: self._token_match = token_match self._flush_cache() + property token_match_with_affixes: + def __get__(self): + return self._token_match_with_affixes + + def __set__(self, token_match_with_affixes): + self._token_match_with_affixes = token_match_with_affixes + self._flush_cache() + property prefix_search: def __get__(self): return self._prefix_search @@ -108,11 +120,12 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, - self._rules, + self.rules, self.prefix_search, self.suffix_search, self.infix_finditer, - self.token_match) + self.token_match, + self.token_match_with_affixes) return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): @@ -297,7 +310,9 @@ cdef class Tokenizer: cache_hit = self._try_cache(hash_string(string), tokens) if cache_hit: pass - elif self.token_match and self.token_match(string): + elif (self.token_match and self.token_match(string)) or \ + (self.token_match_with_affixes and \ + self.token_match_with_affixes(string)): # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. 
# See Issue #859 @@ -450,6 +465,11 @@ cdef class Tokenizer: suffix_search = self.suffix_search infix_finditer = self.infix_finditer token_match = self.token_match + if token_match is None: + token_match = re.compile("a^").match + token_match_with_affixes = self.token_match_with_affixes + if token_match_with_affixes is None: + token_match_with_affixes = re.compile("a^").match special_cases = {} for orth, special_tokens in self.rules.items(): special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] @@ -485,6 +505,9 @@ cdef class Tokenizer: if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' + elif token_match_with_affixes(substring): + tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring)) + substring = '' elif substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' @@ -549,6 +572,7 @@ cdef class Tokenizer: ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), ("token_match", lambda: _get_regex_pattern(self.token_match)), + ("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)), ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) exclude = util.get_serialization_exclude(serializers, exclude, kwargs) @@ -570,11 +594,12 @@ cdef class Tokenizer: ("suffix_search", lambda b: data.setdefault("suffix_search", b)), ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), ("token_match", lambda b: data.setdefault("token_match", b)), + ("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)), ("exceptions", lambda b: data.setdefault("rules", b)) )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): @@ -585,6 +610,8 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match + if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_): + self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 7462af739..f73e851f7 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -41,7 +41,8 @@ the | `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | | `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | | `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. 
| -| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | | **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} From 36a94c409a50e3d815924197d668e0ae315d4352 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Wed, 20 May 2020 23:06:03 +0200 Subject: [PATCH 020/203] failing test to reproduce overlapping spans problem --- spacy/tests/regression/test_issue5458.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 spacy/tests/regression/test_issue5458.py diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py new file mode 100644 index 000000000..33281c858 --- /dev/null +++ b/spacy/tests/regression/test_issue5458.py @@ -0,0 +1,21 @@ +from spacy.lang.en import English +from spacy.lang.en.syntax_iterators import noun_chunks +from spacy.tests.util import get_doc +from spacy.vocab import Vocab + + +def test_issue5458(): + # Test that the noun chuncker does not generate overlapping spans + words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] + vocab = Vocab(strings=words) + dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] + pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] + heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10] + + en_doc = get_doc(vocab, words, pos_tags, heads, dependencies) + en_doc.noun_chunks_iterator = noun_chunks + + # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" + nlp = English() + merge_nps = nlp.create_pipe("merge_noun_chunks") + merge_nps(en_doc) From b509a3e7fcadf84c257c1e5168b6dc926b8b2f3d Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Wed, 20 May 2020 23:06:39 +0200 Subject: [PATCH 021/203] fix: use actual range in 'seen' instead of subtree --- spacy/lang/en/syntax_iterators.py | 4 ++-- spacy/language.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 5ff848124..22f7fcf81 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -36,7 +36,7 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + if any(j in seen for j in range(word.left_edge.i, word.i + 1)): continue seen.update(j for j in range(word.left_edge.i, word.i + 1)) yield word.left_edge.i, word.i + 1, np_label @@ -46,7 +46,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + if any(j in seen for j in range(word.left_edge.i, word.i + 1)): continue seen.update(j for j in range(word.left_edge.i, word.i + 1)) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/language.py b/spacy/language.py index 703806627..c4eb26bad 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -418,7 +418,7 @@ class Language(object): def __call__(self, text, disable=[], 
component_cfg=None): """Apply the pipeline to some text. The text can span multiple sentences, - and can contain arbtrary whitespace. Alignment into the original string + and can contain arbitrary whitespace. Alignment into the original string is preserved. text (unicode): The text to be processed. From b221bcf1ba3907552d4c3b660d1902b0a1c26b2e Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Thu, 21 May 2020 00:17:28 +0200 Subject: [PATCH 022/203] fixing all languages --- spacy/lang/el/syntax_iterators.py | 14 +++++++------- spacy/lang/en/syntax_iterators.py | 10 ++++++---- spacy/lang/fa/syntax_iterators.py | 10 ++++++---- spacy/lang/fr/syntax_iterators.py | 10 ++++++---- spacy/lang/id/syntax_iterators.py | 10 ++++++---- spacy/lang/nb/syntax_iterators.py | 10 ++++++---- spacy/lang/sv/syntax_iterators.py | 10 ++++++---- 7 files changed, 43 insertions(+), 31 deletions(-) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index f02619ac9..5d6398aad 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -31,16 +31,15 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue flag = False if word.pos == NOUN: # check for patterns such as γραμμή παραγωγής for potential_nmod in word.rights: if potential_nmod.dep == nmod: - seen.update( - j for j in range(word.left_edge.i, potential_nmod.i + 1) - ) + w_range = range(word.left_edge.i, potential_nmod.i + 1) + if any(j in seen for j in w_range): + continue + seen.update(j for j in w_range) yield word.left_edge.i, potential_nmod.i + 1, np_label flag = True break @@ -54,9 +53,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 22f7fcf81..0d43ebf37 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -36,9 +36,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(j in seen for j in range(word.left_edge.i, word.i + 1)): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +47,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(j in seen for j in range(word.left_edge.i, word.i + 1)): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 5ff848124..0d43ebf37 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -36,9 +36,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = 
range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +47,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 9495dcf1e..91b338eb3 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -35,9 +35,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -45,9 +46,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 9495dcf1e..91b338eb3 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -35,9 +35,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -45,9 +46,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 9495dcf1e..91b338eb3 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -35,9 +35,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -45,9 
+46,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 148884efe..31e3302e9 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -36,9 +36,10 @@ def noun_chunks(obj): if word.i in seen: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +47,10 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): + w_range = range(word.left_edge.i, word.right_edge.i + 1) + if any(j in seen for j in w_range): continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + seen.update(j for j in w_range) yield word.left_edge.i, word.right_edge.i + 1, np_label From f7d10da555c089a2015fd0101b6198db395d82fc Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Thu, 21 May 2020 19:15:57 +0200 Subject: [PATCH 023/203] avoid unnecessary loop to check overlapping noun chunks --- spacy/lang/el/syntax_iterators.py | 16 +++++----------- spacy/lang/en/syntax_iterators.py | 14 ++++---------- spacy/lang/fa/syntax_iterators.py | 14 ++++---------- spacy/lang/fr/syntax_iterators.py | 14 ++++---------- spacy/lang/id/syntax_iterators.py | 14 ++++---------- spacy/lang/nb/syntax_iterators.py | 14 ++++---------- spacy/lang/sv/syntax_iterators.py | 14 ++++---------- 7 files changed, 29 insertions(+), 71 deletions(-) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 5d6398aad..b5811c337 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -23,12 +23,12 @@ def noun_chunks(obj): conj = doc.vocab.strings.add("conj") nmod = doc.vocab.strings.add("nmod") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: flag = False @@ -36,15 +36,12 @@ def noun_chunks(obj): # check for patterns such as γραμμή παραγωγής for potential_nmod in word.rights: if potential_nmod.dep == nmod: - w_range = range(word.left_edge.i, potential_nmod.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = potential_nmod.i + 1 yield word.left_edge.i, potential_nmod.i + 1, np_label flag = True break if flag is False: - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: # covers the case: έχει όμορφα και έξυπνα παιδιά @@ -53,10 +50,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're 
coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 0d43ebf37..dbb2d6c9f 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -28,18 +28,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -47,10 +44,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 0d43ebf37..dbb2d6c9f 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -28,18 +28,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -47,10 +44,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.i + 1 yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 91b338eb3..b38be57fc 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -27,18 +27,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = 
word.head @@ -46,10 +43,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 91b338eb3..b38be57fc 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -27,18 +27,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,10 +43,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 91b338eb3..b38be57fc 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -27,18 +27,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,10 +43,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 31e3302e9..12d351148 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -28,18 +28,15 @@ def noun_chunks(obj): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - w_range 
= range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -47,10 +44,7 @@ def noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - w_range = range(word.left_edge.i, word.right_edge.i + 1) - if any(j in seen for j in w_range): - continue - seen.update(j for j in w_range) + prev_end = word.right_edge.i + 1 yield word.left_edge.i, word.right_edge.i + 1, np_label From 51715b9f720e115fe91f4684c589c3e5666cec5b Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Thu, 21 May 2020 19:56:56 +0200 Subject: [PATCH 024/203] span / noun chunk has +1 because end is exclusive --- spacy/lang/el/syntax_iterators.py | 6 +++--- spacy/lang/en/syntax_iterators.py | 4 ++-- spacy/lang/fa/syntax_iterators.py | 4 ++-- spacy/lang/fr/syntax_iterators.py | 4 ++-- spacy/lang/id/syntax_iterators.py | 4 ++-- spacy/lang/nb/syntax_iterators.py | 4 ++-- spacy/lang/sv/syntax_iterators.py | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 10fa94f8c..4a40e28c2 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -36,12 +36,12 @@ def noun_chunks(doclike): # check for patterns such as γραμμή παραγωγής for potential_nmod in word.rights: if potential_nmod.dep == nmod: - prev_end = potential_nmod.i + 1 + prev_end = potential_nmod.i yield word.left_edge.i, potential_nmod.i + 1, np_label flag = True break if flag is False: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: # covers the case: έχει όμορφα και έξυπνα παιδιά @@ -50,7 +50,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 91152bd50..0f2b28b58 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -36,7 +36,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -44,7 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 91152bd50..0f2b28b58 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -36,7 +36,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -44,7 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.i + 1 + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fr/syntax_iterators.py 
b/spacy/lang/fr/syntax_iterators.py index 3523e2f02..d6c12e69f 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -35,7 +35,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -43,7 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 3523e2f02..d6c12e69f 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -35,7 +35,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -43,7 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 3523e2f02..d6c12e69f 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -35,7 +35,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -43,7 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 99621e6a9..84d295f96 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -36,7 +36,7 @@ def noun_chunks(doclike): if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -44,7 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - prev_end = word.right_edge.i + 1 + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label From 0f1beb5ff27bf19e14ddc3a8b80e2521a782c03c Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 21 May 2020 20:05:03 +0200 Subject: [PATCH 025/203] Tidy up and avoid absolute spacy imports in core --- spacy/cli/evaluate.py | 3 +-- spacy/kb.pxd | 5 ++--- spacy/kb.pyx | 17 ++++++----------- spacy/language.py | 5 +---- 4 files changed, 10 insertions(+), 20 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 8a84684e5..be994de73 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals, division, print_function import plac 
-import spacy from timeit import default_timer as timer from wasabi import msg @@ -45,7 +44,7 @@ def evaluate( msg.fail("Visualization output directory not found", displacy_path, exits=1) corpus = GoldCorpus(data_path, data_path) if model.startswith("blank:"): - nlp = spacy.blank(model.replace("blank:", "")) + nlp = util.get_lang_class(model.replace("blank:", ""))() else: nlp = util.load_model(model) dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index d5aa382b1..518ce0f4e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -6,7 +6,7 @@ from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t from libc.stdio cimport FILE -from spacy.vocab cimport Vocab +from .vocab cimport Vocab from .typedefs cimport hash_t from .structs cimport KBEntryC, AliasC @@ -113,7 +113,7 @@ cdef class KnowledgeBase: return new_index cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil: - """ + """ Initializing the vectors and making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value cf. https://github.com/explosion/preshed/issues/17 @@ -169,4 +169,3 @@ cdef class Reader: cdef int read_alias(self, int64_t* entry_index, float* prob) except -1 cdef int _read(self, void* value, size_t size) except -1 - diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 36a6dbd93..076f25267 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,23 +1,20 @@ # cython: infer_types=True # cython: profile=True # coding: utf8 -import warnings - -from spacy.errors import Errors, Warnings - -from pathlib import Path from cymem.cymem cimport Pool from preshed.maps cimport PreshMap - from cpython.exc cimport PyErr_SetFromErrno - from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek from libc.stdint cimport int32_t, int64_t +from libcpp.vector cimport vector + +import warnings +from os import path +from pathlib import Path from .typedefs cimport hash_t -from os import path -from libcpp.vector cimport vector +from .errors import Errors, Warnings cdef class Candidate: @@ -586,5 +583,3 @@ cdef class Reader: cdef int _read(self, void* value, size_t size) except -1: status = fread(value, size, 1, self._fp) return status - - diff --git a/spacy/language.py b/spacy/language.py index 0e5c46459..dae7d96a2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -4,10 +4,7 @@ from __future__ import absolute_import, unicode_literals import random import itertools import warnings - from thinc.extra import load_nlp - -from spacy.util import minibatch import weakref import functools from collections import OrderedDict @@ -852,7 +849,7 @@ class Language(object): *[mp.Pipe(False) for _ in range(n_process)] ) - batch_texts = minibatch(texts, batch_size) + batch_texts = util.minibatch(texts, batch_size) # Sender sends texts to the workers. # This is necessary to properly handle infinite length of texts. 
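The `blank:{lang}` shortcut introduced in the next patch makes `util.load_model` accept names like `"blank:en"` directly, matching what the evaluate CLI change above does for model names that start with `"blank:"`. A minimal usage sketch, assuming a spaCy v2.x build that already includes that change (the assertions mirror the new `test_load_model_blank_shortcut` test):

```python
import spacy
from spacy import util

# "blank:en" resolves to a blank English pipeline with no components,
# equivalent to calling spacy.blank("en") directly.
nlp = util.load_model("blank:en")
assert nlp.lang == "en"
assert nlp.pipeline == []

# An unknown language code after "blank:" raises ImportError,
# as covered by the accompanying test.
# util.load_model("blank:fjsfijsdof")
```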
# (In this case, all data cannot be sent to the workers at once) From cb02bff0ebe31ab0d3b13fad9fcd2424c09f6c4b Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 21 May 2020 20:24:07 +0200 Subject: [PATCH 026/203] Add blank:{lang} shortcut to util.load_mode --- spacy/tests/test_misc.py | 11 +++++++++++ spacy/util.py | 2 ++ 2 files changed, 13 insertions(+) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 4075ccf64..3ac621649 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -135,3 +135,14 @@ def test_ascii_filenames(): root = Path(__file__).parent.parent for path in root.glob("**/*"): assert all(ord(c) < 128 for c in path.name), path.name + + +def test_load_model_blank_shortcut(): + """Test that using a model name like "blank:en" works as a shortcut for + spacy.blank("en"). + """ + nlp = util.load_model("blank:en") + assert nlp.lang == "en" + assert nlp.pipeline == [] + with pytest.raises(ImportError): + util.load_model("blank:fjsfijsdof") diff --git a/spacy/util.py b/spacy/util.py index 419c99bc0..5fd296404 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -161,6 +161,8 @@ def load_model(name, **overrides): if not data_path or not data_path.exists(): raise IOError(Errors.E049.format(path=path2str(data_path))) if isinstance(name, basestring_): # in data dir / shortcut + if name.startswith("blank:"): # shortcut for blank model + return get_lang_class(name.replace("blank:", ""))() if name in set([d.name for d in data_path.iterdir()]): return load_model_from_link(name, **overrides) if is_package(name): # installed as package From 53da6bd6724d5ab26da597faa275816fa3e1093e Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 21 May 2020 20:45:33 +0200 Subject: [PATCH 027/203] Add course to landing [ci skip] --- website/src/styles/landing.module.sass | 1 + website/src/widgets/landing.js | 47 ++++++++++++++------------ 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/website/src/styles/landing.module.sass b/website/src/styles/landing.module.sass index e36e36c0a..c29c0fffb 100644 --- a/website/src/styles/landing.module.sass +++ b/website/src/styles/landing.module.sass @@ -86,6 +86,7 @@ .banner-content-small display: block + margin-bottom: 0 !important .banner-title display: block diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 9aeec0cdc..c96905733 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -9,7 +9,6 @@ import { LandingGrid, LandingCard, LandingCol, - LandingButton, LandingDemo, LandingBannerGrid, LandingBanner, @@ -19,7 +18,8 @@ import { H2 } from '../components/typography' import { Ul, Li } from '../components/list' import Button from '../components/button' import Link from '../components/link' -import irlBackground from '../images/spacy-irl.jpg' + +import courseImage from '../../docs/images/course.jpg' import BenchmarksChoi from 'usage/_benchmarks-choi.md' @@ -148,13 +148,35 @@ const Landing = ({ data }) => { </LandingGrid> <LandingBannerGrid> + <LandingBanner + to="https://course.spacy.io" + button="Start the course" + background="#f6f6f6" + color="#252a33" + small + > + <Link to="https://course.spacy.io" hidden> + <img + src={courseImage} + alt="Advanced NLP with spaCy: A free online course" + /> + </Link> + <br /> + <br /> + In this <strong>free and interactive online course</strong> you’ll learn how to + use spaCy to build advanced natural language understanding systems, using both + rule-based and machine learning 
approaches. It includes{' '} + <strong>55 exercises</strong> featuring videos, slide decks, multiple-choice + questions and interactive coding practice in the browser. + </LandingBanner> + <LandingBanner title="Prodigy: Radically efficient machine teaching" label="From the makers of spaCy" to="https://prodi.gy" button="Try it out" background="#eee" - color="#252a33" + color="#000" small > Prodigy is an <strong>annotation tool</strong> so efficient that data scientists @@ -165,25 +187,6 @@ const Landing = ({ data }) => { update your model in real-time and chain models together to build more complex systems. </LandingBanner> - - <LandingBanner - title="spaCy IRL 2019: Two days of NLP" - label="Watch the videos" - to="https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc" - button="Watch the videos" - background="#ffc194" - backgroundImage={irlBackground} - color="#1a1e23" - small - > - We were pleased to invite the spaCy community and other folks working on Natural - Language Processing to Berlin this summer for a small and intimate event{' '} - <strong>July 6, 2019</strong>. We booked a beautiful venue, hand-picked an - awesome lineup of speakers and scheduled plenty of social time to get to know - each other and exchange ideas. The YouTube playlist includes 12 talks about NLP - research, development and applications, with keynotes by Sebastian Ruder - (DeepMind) and Yoav Goldberg (Allen AI). - </LandingBanner> </LandingBannerGrid> <LandingLogos title="spaCy is trusted by" logos={data.logosUsers}> From 891fa590096ef1d1d9dbef013ebc9b9b34986aee Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 21 May 2020 20:52:48 +0200 Subject: [PATCH 028/203] Use backwards-compatible super() --- spacy/errors.py | 2 +- spacy/lang/pl/lemmatizer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 0750ab616..aca94d64e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -7,7 +7,7 @@ def add_codes(err_cls): class ErrorsWithCodes(err_cls): def __getattribute__(self, code): - msg = super().__getattribute__(code) + msg = super(ErrorsWithCodes, self).__getattribute__(code) if code.startswith("__"): # python system attributes like __class__ return msg else: diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index cd555b9c2..d0d843b2a 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -13,7 +13,7 @@ class PolishLemmatizer(Lemmatizer): # lemmatization for nouns def __init__(self, lookups, *args, **kwargs): # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules - super().__init__(lookups) + super(PolishLemmatizer, self).__init__(lookups) self.lemma_lookups = {} for tag in [ "ADJ", From ee027de032ffb30abacabbb410ed66b0877e95b2 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 21 May 2020 21:54:23 +0200 Subject: [PATCH 029/203] Update universe and display of videos [ci skip] --- website/meta/universe.json | 128 +++++++++++++++++++++++++----- website/src/templates/universe.js | 14 +++- 2 files changed, 118 insertions(+), 24 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 857e26813..58f4cc2aa 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -115,11 +115,11 @@ "print(text)" ], "category": ["scientific", "biomedical"], - "author": "Travis Hoppe", + "author": "Travis Hoppe", "author_links": { "github": "thoppe", - "twitter":"metasemantic", - "website" : 
"http://thoppe.github.io/" + "twitter": "metasemantic", + "website": "http://thoppe.github.io/" } }, { @@ -1132,7 +1132,7 @@ "type": "education", "id": "spacy-course", "title": "Advanced NLP with spaCy", - "slogan": "spaCy, 2019", + "slogan": "A free online course", "description": "In this free interactive course, you'll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches.", "url": "https://course.spacy.io", "image": "https://i.imgur.com/JC00pHW.jpg", @@ -1185,10 +1185,38 @@ "youtube": "6zm9NC9uRkk", "category": ["videos"] }, + { + "type": "education", + "id": "video-spacy-course", + "title": "Advanced NLP with spaCy · A free online course", + "description": "spaCy is a modern Python library for industrial-strength Natural Language Processing. In this free and interactive online course, you'll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches.", + "url": "https://course.spacy.io/en", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines" + }, + "youtube": "THduWAnG97k", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-course-de", + "title": "Modernes NLP mit spaCy · Ein Gratis-Onlinekurs", + "description": "spaCy ist eine moderne Python-Bibliothek für industriestarkes Natural Language Processing. In diesem kostenlosen und interaktiven Onlinekurs lernst du, mithilfe von spaCy fortgeschrittene Systeme für die Analyse natürlicher Sprache zu entwickeln und dabei sowohl regelbasierte Verfahren, als auch moderne Machine-Learning-Technologie einzusetzen.", + "url": "https://course.spacy.io/de", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines" + }, + "youtube": "K1elwpgDdls", + "category": ["videos"] + }, { "type": "education", "id": "video-intro-to-nlp-episode-1", - "title": "Intro to NLP with spaCy", + "title": "Intro to NLP with spaCy (1)", "slogan": "Episode 1: Data exploration", "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.", "author": "Vincent Warmerdam", @@ -1202,7 +1230,7 @@ { "type": "education", "id": "video-intro-to-nlp-episode-2", - "title": "Intro to NLP with spaCy", + "title": "Intro to NLP with spaCy (2)", "slogan": "Episode 2: Rule-based Matching", "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. 
Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", "author": "Vincent Warmerdam", "author_links": { "twitter": "fishnets88", "github": "koaning" }, "youtube": "KL4-Mpgbahw", "category": ["videos"] }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-3", + "title": "Intro to NLP with spaCy (3)", + "slogan": "Episode 3: Evaluation", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "4V0JDdohxAk", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-4", + "title": "Intro to NLP with spaCy (4)", + "slogan": "Episode 4: Named Entity Recognition", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "IqOJU1-_Fi0", + "category": ["videos"] + }, { "type": "education", "id": "video-spacy-irl-entity-linking", @@ -1286,6 +1342,22 @@ }, "category": ["podcasts"] }, + { + "type": "education", + "id": "podcast-init2", + "title": "Podcast.__init__ #256: An Open Source Toolchain For NLP From Explosion AI", + "slogan": "March 2020", + "description": "The state of the art in natural language processing is a constantly moving target. With the rise of deep learning, previously cutting edge techniques have given way to robust language models. Through it all the team at Explosion AI have built a strong presence with the trifecta of SpaCy, Thinc, and Prodigy to support fast and flexible data labeling to feed deep learning models and performant and scalable text processing.
In this episode founder and open source author Matthew Honnibal shares his experience growing a business around cutting edge open source libraries for the machine learning developent process.", + "iframe": "https://cdn.podlove.org/web-player/share.html?episode=https%3A%2F%2Fwww.pythonpodcast.com%2F%3Fpodlove_player4%3D614", + "iframe_height": 200, + "thumb": "https://i.imgur.com/rpo6BuY.png", + "url": "https://www.pythonpodcast.com/explosion-ai-natural-language-processing-episode-256/", + "author": "Tobias Macey", + "author_links": { + "website": "https://www.podcastinit.com" + }, + "category": ["podcasts"] + }, { "type": "education", "id": "talk-python-podcast", @@ -1348,6 +1420,18 @@ }, "category": ["podcasts"] }, + { + "type": "education", + "id": "video-entity-linking", + "title": "Training a custom entity linking mode with spaCy", + "author": "Sofie Van Landeghem", + "author_links": { + "twitter": "OxyKodit", + "github": "svlandeg" + }, + "youtube": "8u57WSXVpmw", + "category": ["videos"] + }, { "id": "adam_qas", "title": "ADAM: Question Answering System", @@ -2182,22 +2266,22 @@ "pip": "pyate", "code_example": [ "import spacy", - "from pyate.term_extraction_pipeline import TermExtractionPipeline", - "", - "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe(TermExtractionPipeline())", - "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/", - "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'", - "", - "doc = nlp(string)", - "print(doc._.combo_basic.sort_values(ascending=False).head(5))", - "\"\"\"\"\"\"", - "dysfunctional tumor 1.443147", - "tumor suppressors 1.443147", - "genetic changes 1.386294", - "cancer cells 1.386294", - "dysfunctional tumor suppressors 1.298612", - "\"\"\"\"\"\"" + "from pyate.term_extraction_pipeline import TermExtractionPipeline", + "", + "nlp = spacy.load('en_core_web_sm')", + "nlp.add_pipe(TermExtractionPipeline())", + "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/", + "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. 
Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'", + "", + "doc = nlp(string)", + "print(doc._.combo_basic.sort_values(ascending=False).head(5))", + "\"\"\"\"\"\"", + "dysfunctional tumor 1.443147", + "tumor suppressors 1.443147", + "genetic changes 1.386294", + "cancer cells 1.386294", + "dysfunctional tumor suppressors 1.298612", + "\"\"\"\"\"\"" ], "code_language": "python", "url": "https://github.com/kevinlu1248/pyate", diff --git a/website/src/templates/universe.js b/website/src/templates/universe.js index e49e81b01..4a4e13bec 100644 --- a/website/src/templates/universe.js +++ b/website/src/templates/universe.js @@ -14,7 +14,7 @@ import Sidebar from '../components/sidebar' import Section from '../components/section' import Main from '../components/main' import Footer from '../components/footer' -import { H3, Label, InlineList } from '../components/typography' +import { H3, H5, Label, InlineList } from '../components/typography' import { YouTube, SoundCloud, Iframe } from '../components/embed' import { github, markdownToReact } from '../components/util' @@ -86,7 +86,10 @@ const UniverseContent = ({ content = [], categories, pageContext, location, mdxC <img src={`https://img.youtube.com/vi/${youtube}/0.jpg`} alt="" - style={{ clipPath: 'inset(12.5% 0)' }} + style={{ + clipPath: 'inset(12.9% 0)', + marginBottom: 'calc(-12.9% + 1rem)', + }} /> ) return cover ? ( @@ -95,6 +98,13 @@ const UniverseContent = ({ content = [], categories, pageContext, location, mdxC <img src={cover} alt={title || id} /> </Link> </p> + ) : data.id === 'videos' ? 
( + <div> + <Link key={id} to={url} hidden> + {header} + <H5>{title}</H5> + </Link> + </div> ) : ( <Card key={id} From 71fe61fdcd6c04de739391251bb346ba1de94e4e Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Fri, 22 May 2020 10:14:34 +0200 Subject: [PATCH 030/203] Disallow merging 0-length spans --- spacy/errors.py | 1 + spacy/tests/doc/test_retokenize_merge.py | 7 +++++++ spacy/tokens/_retokenize.pyx | 2 ++ 3 files changed, 10 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index aca94d64e..6d92545d7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -567,6 +567,7 @@ class Errors(object): E197 = ("Row out of bounds, unable to add row {row} for key {key}.") E198 = ("Unable to return {n} most similar vectors for the current vectors " "table, which contains {n_rows} vectors.") + E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") @add_codes diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 5bdf78f39..636b7bb14 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -425,3 +425,10 @@ def test_retokenize_skip_duplicates(en_vocab): retokenizer.merge(doc[0:2]) assert len(doc) == 2 assert doc[0].text == "hello world" + + +def test_retokenize_disallow_zero_length(en_vocab): + doc = Doc(en_vocab, words=["hello", "world", "!"]) + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[1:1]) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 512ad73bc..ce8e510d6 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -55,6 +55,8 @@ cdef class Retokenizer: """ if (span.start, span.end) in self._spans_to_merge: return + if span.end - span.start <= 0: + raise ValueError(Errors.E199.format(start=span.start, end=span.end)) for token in span: if token.i in self.tokens_to_merge: raise ValueError(Errors.E102.format(token=repr(token))) From e4a1b5dab1f2de60fa0ddbb3e80282b0749635da Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Fri, 22 May 2020 12:41:03 +0200 Subject: [PATCH 031/203] Rename to url_match Rename to `url_match` and update docs. 
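As a rough sketch of how the renamed argument is used (illustrative only — it assumes a spaCy build that includes this patch, and `simple_url` below is a made-up pattern rather than spaCy's default `URL_MATCH`): `url_match` is only checked after prefixes and suffixes have been split off, while `token_match` keeps its priority over all other patterns.

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
# Hypothetical stand-in for the default URL regex: match bare domains
# once surrounding punctuation (prefixes/suffixes) has been removed.
simple_url = re.compile(r"^[\w.-]+\.(?:com|org|io)(?:/\S*)?$").match

nlp.tokenizer = Tokenizer(
    nlp.vocab,
    rules=nlp.Defaults.tokenizer_exceptions,
    prefix_search=nlp.tokenizer.prefix_search,
    suffix_search=nlp.tokenizer.suffix_search,
    infix_finditer=nlp.tokenizer.infix_finditer,
    token_match=nlp.tokenizer.token_match,
    url_match=simple_url,  # previously token_match_with_affixes
)
# "(" and ")" are still split off as prefix/suffix; the URL-like string
# in between is kept as a single token by url_match.
print([t.text for t in nlp("(see example.com/docs)")])
```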
--- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/language.py | 8 ++--- spacy/tests/tokenizer/test_urls.py | 4 +-- spacy/tokenizer.pxd | 2 +- spacy/tokenizer.pyx | 40 +++++++++++------------ website/docs/api/tokenizer.md | 16 ++++----- website/docs/usage/linguistic-features.md | 23 ++++++++----- 7 files changed, 51 insertions(+), 44 deletions(-) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 6a9a5363f..67349916b 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -59,7 +59,7 @@ URL_PATTERN = ( ).strip() TOKEN_MATCH = None -TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match +URL_MATCH = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/language.py b/spacy/language.py index 2c7f4e2b5..53a788f2a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG, NORM from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES -from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES +from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH from .lang.norm_exceptions import BASE_NORMS from .lang.tag_map import TAG_MAP from .tokens import Doc @@ -89,7 +89,7 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match - token_match_with_affixes = cls.token_match_with_affixes + url_match = cls.url_match prefix_search = ( util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None ) @@ -107,12 +107,12 @@ class BaseDefaults(object): suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match, - token_match_with_affixes=token_match_with_affixes, + url_match=url_match, ) pipe_names = ["tagger", "parser", "ner"] token_match = TOKEN_MATCH - token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES + url_match = URL_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 2f76111e5..65ba93d66 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"] @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): - assert en_tokenizer.token_match_with_affixes(url) is not None + assert en_tokenizer.url_match(url) is not None @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): - assert en_tokenizer.token_match_with_affixes(url) is None + assert en_tokenizer.url_match(url) is None @pytest.mark.parametrize("url", URLS_BASIC) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 70d49bb39..694ea49cc 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -17,7 +17,7 @@ cdef class Tokenizer: cpdef readonly Vocab vocab cdef object _token_match - cdef object _token_match_with_affixes + cdef object _url_match cdef object _prefix_search cdef object _suffix_search cdef object _infix_finditer diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index cf0421158..154a42c4f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -31,7 +31,7 @@ cdef class Tokenizer: """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, infix_finditer=None, 
token_match=None, - token_match_with_affixes=None): + url_match=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -44,7 +44,7 @@ cdef class Tokenizer: `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be recognised as tokens. - token_match_with_affixes (callable): A boolean function matching strings to be + url_match (callable): A boolean function matching strings to be recognised as tokens after considering prefixes and suffixes. RETURNS (Tokenizer): The newly constructed object. @@ -58,7 +58,7 @@ cdef class Tokenizer: self._cache = PreshMap() self._specials = PreshMap() self.token_match = token_match - self.token_match_with_affixes = token_match_with_affixes + self.url_match = url_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -74,12 +74,12 @@ cdef class Tokenizer: self._token_match = token_match self._flush_cache() - property token_match_with_affixes: + property url_match: def __get__(self): - return self._token_match_with_affixes + return self._url_match - def __set__(self, token_match_with_affixes): - self._token_match_with_affixes = token_match_with_affixes + def __set__(self, url_match): + self._url_match = url_match self._flush_cache() property prefix_search: @@ -125,7 +125,7 @@ cdef class Tokenizer: self.suffix_search, self.infix_finditer, self.token_match, - self.token_match_with_affixes) + self.url_match) return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): @@ -311,8 +311,8 @@ cdef class Tokenizer: if cache_hit: pass elif (self.token_match and self.token_match(string)) or \ - (self.token_match_with_affixes and \ - self.token_match_with_affixes(string)): + (self.url_match and \ + self.url_match(string)): # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. 
# See Issue #859 @@ -467,9 +467,9 @@ cdef class Tokenizer: token_match = self.token_match if token_match is None: token_match = re.compile("a^").match - token_match_with_affixes = self.token_match_with_affixes - if token_match_with_affixes is None: - token_match_with_affixes = re.compile("a^").match + url_match = self.url_match + if url_match is None: + url_match = re.compile("a^").match special_cases = {} for orth, special_tokens in self.rules.items(): special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] @@ -505,8 +505,8 @@ cdef class Tokenizer: if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' - elif token_match_with_affixes(substring): - tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring)) + elif url_match(substring): + tokens.append(("URL_MATCH", substring)) substring = '' elif substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) @@ -572,7 +572,7 @@ cdef class Tokenizer: ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), ("token_match", lambda: _get_regex_pattern(self.token_match)), - ("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)), + ("url_match", lambda: _get_regex_pattern(self.url_match)), ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) exclude = util.get_serialization_exclude(serializers, exclude, kwargs) @@ -594,12 +594,12 @@ cdef class Tokenizer: ("suffix_search", lambda b: data.setdefault("suffix_search", b)), ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), ("token_match", lambda b: data.setdefault("token_match", b)), - ("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)), + ("url_match", lambda b: data.setdefault("url_match", b)), ("exceptions", lambda b: data.setdefault("rules", b)) )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): @@ -610,8 +610,8 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match - if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_): - self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match + if "url_match" in data and isinstance(data["url_match"], basestring_): + self.url_match = re.compile(data["url_match"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index f73e851f7..6f8badfe8 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -35,15 +35,15 @@ the > ``` | Name | Type | Description | -| ---------------- | ----------- | 
----------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | | `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | -| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 91ca1267b..bcc943436 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -759,6 +759,9 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, if token_match(substring): tokens.append(substring) substring = '' + elif url_match(substring): + tokens.append(substring) + substring = '' elif substring in special_cases: tokens.extend(special_cases[substring]) substring = '' @@ -782,17 +785,19 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, The algorithm can be summarized as follows: 1. Iterate over whitespace-separated substrings. -2. Look for a token match. If there is a match, stop processing and keep this token. -3. Check whether we have an explicitly defined rule for this substring. If we - do, use it. -4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, - so that the token match and special cases always get priority. +2. Look for a token match. If there is a match, stop processing and keep this + token. +3. Check whether we have an explicitly defined special case for this substring. + If we do, use it. +4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to + #2, so that the token match and special cases always get priority. 5. If we didn't consume a prefix, try to consume a suffix and then go back to #2. -6. If we can't consume a prefix or a suffix, look for a special case. -7. Look for "infixes" — stuff like hyphens etc. and split the substring into +6. 
If we can't consume a prefix or a suffix, look for a URL match. +7. If there's no URL match, then look for a special case. +8. Look for "infixes" — stuff like hyphens etc. and split the substring into tokens on all infixes. -8. Once we can't consume any more of the string, handle it as a single token. +9. Once we can't consume any more of the string, handle it as a single token. #### Debugging the tokenizer {#tokenizer-debug new="2.2.3"} @@ -836,6 +841,8 @@ domain. There are five things you would need to define: hyphens etc. 5. An optional boolean function `token_match` matching strings that should never be split, overriding the infix rules. Useful for things like URLs or numbers. +6. An optional boolean function `url_match`, which is similar to `token_match` + except prefixes and suffixes are removed before applying the match. You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its From 65c7e82de24739977d7ca775d585cacc7dc25cd5 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Fri, 22 May 2020 13:50:30 +0200 Subject: [PATCH 032/203] Auto-format and remove 2.3 feature [ci skip] --- website/docs/api/token.md | 150 +++++++++++++++++--------------------- 1 file changed, 67 insertions(+), 83 deletions(-) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 69dac23d6..0fa86b7bc 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -351,25 +351,9 @@ property to `0` for the first word of the document. - assert doc[4].sent_start == 1 + assert doc[4].is_sent_start == True ``` + </Infobox> -## Token.is_sent_end {#is_sent_end tag="property" new="2"} - -A boolean value indicating whether the token ends a sentence. `None` if -unknown. Defaults to `True` for the last token in the `Doc`. - -> #### Example -> -> ```python -> doc = nlp("Give it back! He pleaded.") -> assert doc[3].is_sent_end -> assert not doc[4].is_sent_end -> ``` - -| Name | Type | Description | -| ----------- | ---- | ------------------------------------ | -| **RETURNS** | bool | Whether the token ends a sentence. | - ## Token.has_vector {#has_vector tag="property" model="vectors"} A boolean value indicating whether a word vector is associated with the token. @@ -424,71 +408,71 @@ The L2 norm of the token's vector representation. ## Attributes {#attributes} -| Name | Type | Description | -| -------------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `sent` <Tag variant="new">2.0.12</Tag> | `Span` | The sentence span that this token is a part of. | -| `text` | unicode | Verbatim text content. | -| `text_with_ws` | unicode | Text content, with trailing space character if present. | -| `whitespace_` | unicode | Trailing space character if present. | -| `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `tensor` <Tag variant="new">2.1.7</Tag> | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | -| `head` | `Token` | The syntactic parent, or "governor", of this token. 
| -| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | -| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | -| `i` | int | The index of the token within the parent document. | -| `ent_type` | int | Named entity type. | -| `ent_type_` | unicode | Named entity type. | -| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | -| `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | -| `ent_kb_id` <Tag variant="new">2.2</Tag> | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_kb_id_` <Tag variant="new">2.2</Tag> | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `ent_id_` | unicode | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `lemma` | int | Base form of the token, with no inflectional suffixes. | -| `lemma_` | unicode | Base form of the token, with no inflectional suffixes. | -| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `lower` | int | Lowercase form of the token. | -| `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | +| Name | Type | Description | +| -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `sent` <Tag variant="new">2.0.12</Tag> | `Span` | The sentence span that this token is a part of. | +| `text` | unicode | Verbatim text content. | +| `text_with_ws` | unicode | Text content, with trailing space character if present. | +| `whitespace_` | unicode | Trailing space character if present. | +| `orth` | int | ID of the verbatim text content. | +| `orth_` | unicode | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `tensor` <Tag variant="new">2.1.7</Tag> | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | +| `head` | `Token` | The syntactic parent, or "governor", of this token. | +| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | +| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | +| `i` | int | The index of the token within the parent document. | +| `ent_type` | int | Named entity type. 
| +| `ent_type_` | unicode | Named entity type. | +| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | +| `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | +| `ent_kb_id` <Tag variant="new">2.2</Tag> | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_kb_id_` <Tag variant="new">2.2</Tag> | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `ent_id_` | unicode | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `lemma` | int | Base form of the token, with no inflectional suffixes. | +| `lemma_` | unicode | Base form of the token, with no inflectional suffixes. | +| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `lower` | int | Lowercase form of the token. | +| `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | | `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | | `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | -| `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. | -| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | -| `suffix_` | unicode | Length-N substring from the end of the token. Defaults to `N=3`. | -| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | -| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | -| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | -| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | -| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | -| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | -| `is_punct` | bool | Is the token punctuation? | -| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? 
| -| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? | -| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | -| `is_bracket` | bool | Is the token a bracket? | -| `is_quote` | bool | Is the token a quotation mark? | -| `is_currency` <Tag variant="new">2.0.8</Tag> | bool | Is the token a currency symbol? | -| `like_url` | bool | Does the token resemble a URL? | -| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | -| `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Is the token out-of-vocabulary? | -| `is_stop` | bool | Is the token part of a "stop list"? | -| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `tag` | int | Fine-grained part-of-speech. | -| `tag_` | unicode | Fine-grained part-of-speech. | -| `dep` | int | Syntactic dependency relation. | -| `dep_` | unicode | Syntactic dependency relation. | -| `lang` | int | Language of the parent document's vocabulary. | -| `lang_` | unicode | Language of the parent document's vocabulary. | -| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | -| `idx` | int | The character offset of the token within the parent document. | -| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | -| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `cluster` | int | Brown cluster ID. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | +| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | +| `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. | +| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | +| `suffix_` | unicode | Length-N substring from the end of the token. Defaults to `N=3`. | +| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | +| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | +| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | +| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | +| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | +| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | +| `is_punct` | bool | Is the token punctuation? | +| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? | +| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? | +| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | +| `is_bracket` | bool | Is the token a bracket? | +| `is_quote` | bool | Is the token a quotation mark? 
| +| `is_currency` <Tag variant="new">2.0.8</Tag> | bool | Is the token a currency symbol? | +| `like_url` | bool | Does the token resemble a URL? | +| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | +| `like_email` | bool | Does the token resemble an email address? | +| `is_oov` | bool | Is the token out-of-vocabulary? | +| `is_stop` | bool | Is the token part of a "stop list"? | +| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `tag` | int | Fine-grained part-of-speech. | +| `tag_` | unicode | Fine-grained part-of-speech. | +| `dep` | int | Syntactic dependency relation. | +| `dep_` | unicode | Syntactic dependency relation. | +| `lang` | int | Language of the parent document's vocabulary. | +| `lang_` | unicode | Language of the parent document's vocabulary. | +| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | +| `idx` | int | The character offset of the token within the parent document. | +| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | +| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `cluster` | int | Brown cluster ID. | +| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | From c685ee734ad7e3d103fbd5725033353737563b40 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Fri, 22 May 2020 14:22:36 +0200 Subject: [PATCH 033/203] Fix compat for v2.x branch --- spacy/tests/regression/test_issue5152.py | 3 +++ spacy/tests/regression/test_issue5230.py | 25 +++++++++++-------- spacy/tests/regression/test_issue5458.py | 5 ++++ .../serialize/test_serialize_vocab_strings.py | 2 ++ spacy/tests/vocab_vectors/test_vectors.py | 2 ++ 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py index a9a57746d..758ac9c14 100644 --- a/spacy/tests/regression/test_issue5152.py +++ b/spacy/tests/regression/test_issue5152.py @@ -1,3 +1,6 @@ +# coding: utf8 +from __future__ import unicode_literals + from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 337c82255..2b14ff589 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,16 +1,17 @@ # coding: utf8 import warnings from unittest import TestCase - import pytest import srsly from numpy import zeros from spacy.kb import KnowledgeBase, Writer from spacy.vectors import Vectors - from spacy.language import Language from spacy.pipeline import Pipe -from spacy.tests.util import make_tempdir +from spacy.compat import is_python2 + + +from ..util import make_tempdir def nlp(): @@ -96,12 +97,14 @@ def write_obj_and_catch_warnings(obj): return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list)) +@pytest.mark.skipif(is_python2, reason="ResourceWarning needs Python 3.x") @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) def test_to_disk_resource_warning(obj): 
warnings_list = write_obj_and_catch_warnings(obj) assert len(warnings_list) == 0 +@pytest.mark.skipif(is_python2, reason="ResourceWarning needs Python 3.x") def test_writer_with_path_py35(): writer = None with make_tempdir() as d: @@ -132,11 +135,13 @@ def test_save_and_load_knowledge_base(): pytest.fail(str(e)) -class TestToDiskResourceWarningUnittest(TestCase): - def test_resource_warning(self): - scenarios = zip(*objects_to_test) +if not is_python2: - for scenario in scenarios: - with self.subTest(msg=scenario[1]): - warnings_list = write_obj_and_catch_warnings(scenario[0]) - self.assertEqual(len(warnings_list), 0) + class TestToDiskResourceWarningUnittest(TestCase): + def test_resource_warning(self): + scenarios = zip(*objects_to_test) + + for scenario in scenarios: + with self.subTest(msg=scenario[1]): + warnings_list = write_obj_and_catch_warnings(scenario[0]) + self.assertEqual(len(warnings_list), 0) diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py index 33281c858..3281e2a8c 100644 --- a/spacy/tests/regression/test_issue5458.py +++ b/spacy/tests/regression/test_issue5458.py @@ -1,3 +1,6 @@ +# coding: utf-8 +from __future__ import unicode_literals + from spacy.lang.en import English from spacy.lang.en.syntax_iterators import noun_chunks from spacy.tests.util import get_doc @@ -6,11 +9,13 @@ from spacy.vocab import Vocab def test_issue5458(): # Test that the noun chuncker does not generate overlapping spans + # fmt: off words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] vocab = Vocab(strings=words) dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10] + # fmt: on en_doc = get_doc(vocab, words, pos_tags, heads, dependencies) en_doc.noun_chunks_iterator = noun_chunks diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 3be0a75b3..4727899a3 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -5,6 +5,7 @@ import pytest import pickle from spacy.vocab import Vocab from spacy.strings import StringStore +from spacy.compat import is_python2 from ..util import make_tempdir @@ -134,6 +135,7 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2): assert list(sstore1_d) != list(sstore2_d) +@pytest.mark.skipif(is_python2, reason="Dict order? Not sure if worth investigating") @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_pickle_vocab(strings, lex_attr): vocab = Vocab(strings=strings) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 1821f8abc..576ca93d2 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -10,6 +10,7 @@ from spacy.vectors import Vectors from spacy.tokenizer import Tokenizer from spacy.strings import hash_string from spacy.tokens import Doc +from spacy.compat import is_python2 from ..util import add_vecs_to_vocab, make_tempdir @@ -339,6 +340,7 @@ def test_vocab_prune_vectors(): assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) +@pytest.mark.skipif(is_python2, reason="Dict order? 
Not sure if worth investigating") def test_vectors_serialize(): data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") v = Vectors(data=data, keys=["A", "B", "C"]) From aa53ce69962103f5e2386210c7c9bf16e2f0bcd7 Mon Sep 17 00:00:00 2001 From: Jannis <34443309+JannisTriesToCode@users.noreply.github.com> Date: Fri, 22 May 2020 19:50:26 +0200 Subject: [PATCH 034/203] Documentation Typo Fix (#5492) * Fix typo Change 'realize' to 'realise' * Add contributer agreement --- .github/contributors/JannisTriesToCode.md | 106 ++++++++++++++++++++++ website/docs/usage/adding-languages.md | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/JannisTriesToCode.md diff --git a/.github/contributors/JannisTriesToCode.md b/.github/contributors/JannisTriesToCode.md new file mode 100644 index 000000000..d834794c5 --- /dev/null +++ b/.github/contributors/JannisTriesToCode.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ----------------------------- | +| Name | Jannis Rauschke | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 22.05.2020 | +| GitHub username | JannisTriesToCode | +| Website (optional) | https://twitter.com/JRauschke | diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 70411ec0b..29de08266 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -288,7 +288,7 @@ common spelling. This has no effect on any other token attributes, or tokenization in general, but it ensures that **equivalent tokens receive similar representations**. This can improve the model's predictions on words that weren't common in the training data, but are equivalent to other words – for -example, "realize" and "realize", or "thx" and "thanks". +example, "realise" and "realize", or "thx" and "thanks". Similarly, spaCy also includes [global base norms](https://github.com/explosion/spaCy/tree/master/spacy/lang/norm_exceptions.py) From ae1c179f3a3bdd02906ce340d1d972402b9c0b62 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Date: Sat, 23 May 2020 17:58:19 +0200 Subject: [PATCH 035/203] Remove the nested quote --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 58f4cc2aa..23aa42334 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2211,7 +2211,7 @@ "", "nlp = spacy.load('en_core_web_sm')", "nlp.add_pipe(LanguageDetector())", - "doc = nlp('Life is like a box of chocolates. You never know what you're gonna get.')", + "doc = nlp('Life is like a box of chocolates. 
You never know what you are gonna get.')", "", "assert doc._.language == 'en'", "assert doc._.language_score >= 0.8" From e06ca7ea24d151b77719b02f7430d724aa27a406 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Mon, 25 May 2020 10:13:56 +0200 Subject: [PATCH 036/203] Switch to new add API in PhraseMatcher unpickle --- spacy/matcher/phrasematcher.pyx | 2 +- spacy/tests/matcher/test_phrase_matcher.py | 24 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index b66ec35b8..00c3357f5 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -332,7 +332,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr): matcher = PhraseMatcher(vocab, attr=attr) for key, specs in docs.items(): callback = callbacks.get(key, None) - matcher.add(key, callback, *specs) + matcher.add(key, specs, on_match=callback) return matcher diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 7a6585e06..60aa584ef 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import srsly from mock import Mock from spacy.matcher import PhraseMatcher from spacy.tokens import Doc @@ -266,3 +267,26 @@ def test_phrase_matcher_basic_check(en_vocab): pattern = Doc(en_vocab, words=["hello", "world"]) with pytest.raises(ValueError): matcher.add("TEST", pattern) + + +def test_phrase_matcher_pickle(en_vocab): + matcher = PhraseMatcher(en_vocab) + mock = Mock() + matcher.add("TEST", [Doc(en_vocab, words=["test"])]) + matcher.add("TEST2", [Doc(en_vocab, words=["test2"])], on_match=mock) + doc = Doc(en_vocab, words=["these", "are", "tests", ":", "test", "test2"]) + assert len(matcher) == 2 + + b = srsly.pickle_dumps(matcher) + matcher_unpickled = srsly.pickle_loads(b) + + # call after pickling to avoid recursion error related to mock + matches = matcher(doc) + matches_unpickled = matcher_unpickled(doc) + + assert len(matcher) == len(matcher_unpickled) + assert matches == matches_unpickled + + # clunky way to vaguely check that callback is unpickled + (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1] + assert isinstance(callbacks.get("TEST2"), Mock) From 8b8efa1b42446b94e70ebd2cc2990f84167ae303 Mon Sep 17 00:00:00 2001 From: Rajat <22280243+R1j1t@users.noreply.github.com> Date: Mon, 25 May 2020 15:00:23 +0530 Subject: [PATCH 037/203] update spacy universe with my project (#5497) * added contextualSpellCheck in spacy universe meta * removed extra formatting by code * updated with permanent links * run json linter used by spacy * filled SCA * updated the description --- .github/contributors/R1j1t.md | 106 ++++++++++++++++++++++++++++++++++ website/meta/universe.json | 30 ++++++++++ 2 files changed, 136 insertions(+) create mode 100644 .github/contributors/R1j1t.md diff --git a/.github/contributors/R1j1t.md b/.github/contributors/R1j1t.md new file mode 100644 index 000000000..a92f1e092 --- /dev/null +++ b/.github/contributors/R1j1t.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). 
+The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Rajat | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 24 May 2020 | +| GitHub username | R1j1t | +| Website (optional) | | diff --git a/website/meta/universe.json b/website/meta/universe.json index 23aa42334..58be719ed 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2293,6 +2293,36 @@ }, "category": ["pipeline", "research"], "tags": ["term_extraction"] + }, + { + "id": "contextualSpellCheck", + "title": "Contextual Spell Check", + "slogan": "Contextual spell correction using BERT (bidirectional representations)", + "description": "This package currently focuses on Out of Vocabulary (OOV) word or non-word error (NWE) correction using BERT model. The idea of using BERT was to use the context when correcting NWE. In the coming days, I would like to focus on RWE and optimising the package by implementing it in cython.", + "github": "R1j1t/contextualSpellCheck", + "pip": "contextualSpellCheck", + "code_example": [ + "import spacy", + "import contextualSpellCheck", + "", + "nlp = spacy.load('en')", + "contextualSpellCheck.add_to_pipe(nlp)", + "doc = nlp('Income was $9.4 milion compared to the prior year of $2.7 milion.')", + "", + "print(doc._.performed_spellCheck) #Should be True", + "print(doc._.outcome_spellCheck) #Income was $9.4 million compared to the prior year of $2.7 million." 
+ ], + "code_language": "python", + "url": "https://github.com/R1j1t/contextualSpellCheck", + "thumb": "https://user-images.githubusercontent.com/22280243/82760949-98e68480-9e14-11ea-952e-4738620fd9e3.png", + "image": "https://user-images.githubusercontent.com/22280243/82138959-2852cd00-9842-11ea-918a-49b2a7873ef6.png", + "author": "Rajat Goel", + "author_links": { + "github": "r1j1t", + "website": "https://github.com/R1j1t" + }, + "category": ["pipeline", "conversational", "research"], + "tags": ["spell check", "correction", "preprocessing", "translation", "correction"] } ], From 736f3cb5af4ab2f77b4c7dd5e64331f433355a8d Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Mon, 25 May 2020 12:03:49 +0200 Subject: [PATCH 038/203] Bump version and deps for v2.3.0 * spacy to v2.3.0 * thinc to v7.4.1 * spacy-lookups-data to v0.3.2 --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 6 +++--- spacy/about.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 827e2a797..fe66494ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,6 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==7.4.0", + "thinc==7.4.1", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index ec30efc16..b93def651 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==7.4.0 +thinc==7.4.1 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index af3579f88..1e29f1ead 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,13 +38,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==7.4.0 + thinc==7.4.1 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==7.4.0 + thinc==7.4.1 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=1.0.2,<1.1.0 @@ -59,7 +59,7 @@ install_requires = [options.extras_require] lookups = - spacy_lookups_data>=0.3.1,<0.4.0 + spacy_lookups_data>=0.3.2,<0.4.0 cuda = cupy>=5.0.0b4,<9.0.0 cuda80 = diff --git a/spacy/about.py b/spacy/about.py index 84dc86aa8..91810fa68 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.4" +__version__ = "2.3.0" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 3f727bc539542e97a8a0d94299f69405eda96ad3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Mon, 25 May 2020 12:57:20 +0200 Subject: [PATCH 039/203] Switch to v2.3.0.dev0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 91810fa68..be1b3ae56 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.3.0" +__version__ = "2.3.0.dev0" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From c9c7b135c05fa8688202704e3cbbed80734f76af Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Mon, 25 May 2020 15:24:24 +0200 Subject: [PATCH 040/203] Update Makefile for v2.3.0 (#5502) --- Makefile | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index cf96d6294..1891692ec 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ VENV := ./env$(PYVER) version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 chmod a+rx $@ dist/pytest.pex : wheelhouse/pytest-*.whl @@ -14,7 +14,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . -w ./wheelhouse - $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse + $(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.22 -w ./wheelhouse touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex From 69897b45d89877d6b243d122ed7d13fca315503c Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Mon, 25 May 2020 16:39:22 +0200 Subject: [PATCH 041/203] Handle spacy.pex renaming in Makefile (#5503) --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 1891692ec..2764da118 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,7 @@ version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 chmod a+rx $@ + cp $@ dist/spacy.pex dist/pytest.pex : wheelhouse/pytest-*.whl $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o $@ pytest pytest-timeout mock From 1eed101be9adc8f94036761099d512f26439a2c5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Tue, 26 May 2020 09:56:12 +0200 Subject: [PATCH 042/203] Fix Polish lemmatizer for deserialized models Restructure Polish lemmatizer not to depend on lookups data in `__init__` since the lemmatizer is initialized before the lookups data is loaded from a saved model. The lookups tables are accessed first in `__call__` instead once the data is available. 
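A rough, hypothetical sketch of the ordering problem described above (plain Python stand-ins, not the actual spaCy classes): the component object is constructed before `from_disk` has populated its lookups, so any table cached in `__init__` stays empty in a deserialized model, while a table fetched lazily in `__call__` sees the loaded data.

```python
# Hypothetical illustration; `lookups` stands in for the lookups container
# and is modeled here as a plain dict of tables.
class BrokenLemmatizer:
    def __init__(self, lookups):
        # BUG: cached before deserialization has filled `lookups`,
        # so this table stays empty for models loaded from disk.
        self.table = lookups.get("lemma_lookup_noun", {})

    def __call__(self, string):
        return [self.table.get(string, string.lower())]


class FixedLemmatizer:
    def __init__(self, lookups):
        self.lookups = lookups  # keep only the reference

    def __call__(self, string, pos="noun"):
        # Fetch the table lazily, once the loaded data is actually there.
        table = self.lookups.get("lemma_lookup_" + pos, {})
        return [table.get(string, string.lower())]


lookups = {}                                    # empty at construction time
broken, fixed = BrokenLemmatizer(lookups), FixedLemmatizer(lookups)
lookups["lemma_lookup_noun"] = {"psy": "pies"}  # filled later, e.g. by from_disk
assert broken("psy") == ["psy"]                 # stale, empty table
assert fixed("psy") == ["pies"]                 # sees the loaded table
```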
--- spacy/lang/pl/lemmatizer.py | 87 +++++++++++++------------------------ 1 file changed, 31 insertions(+), 56 deletions(-) diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index d0d843b2a..8b8d7fe27 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -6,98 +6,73 @@ from ...parts_of_speech import NAMES class PolishLemmatizer(Lemmatizer): - # This lemmatizer implements lookup lemmatization based on - # the Morfeusz dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS - # It utilizes some prefix based improvements for - # verb and adjectives lemmatization, as well as case-sensitive - # lemmatization for nouns - def __init__(self, lookups, *args, **kwargs): - # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules - super(PolishLemmatizer, self).__init__(lookups) - self.lemma_lookups = {} - for tag in [ - "ADJ", - "ADP", - "ADV", - "AUX", - "NOUN", - "NUM", - "PART", - "PRON", - "VERB", - "X", - ]: - self.lemma_lookups[tag] = self.lookups.get_table( - "lemma_lookup_" + tag.lower(), {} - ) - self.lemma_lookups["DET"] = self.lemma_lookups["X"] - self.lemma_lookups["PROPN"] = self.lemma_lookups["NOUN"] - + # This lemmatizer implements lookup lemmatization based on the Morfeusz + # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS. + # It utilizes some prefix based improvements for verb and adjectives + # lemmatization, as well as case-sensitive lemmatization for nouns. def __call__(self, string, univ_pos, morphology=None): if isinstance(univ_pos, int): univ_pos = NAMES.get(univ_pos, "X") univ_pos = univ_pos.upper() + lookup_pos = univ_pos.lower() + if univ_pos == "PROPN": + lookup_pos = "noun" + lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {}) + if univ_pos == "NOUN": - return self.lemmatize_noun(string, morphology) + return self.lemmatize_noun(string, morphology, lookup_table) if univ_pos != "PROPN": string = string.lower() if univ_pos == "ADJ": - return self.lemmatize_adj(string, morphology) + return self.lemmatize_adj(string, morphology, lookup_table) elif univ_pos == "VERB": - return self.lemmatize_verb(string, morphology) + return self.lemmatize_verb(string, morphology, lookup_table) - lemma_dict = self.lemma_lookups.get(univ_pos, {}) - return [lemma_dict.get(string, string.lower())] + return [lookup_table.get(string, string.lower())] - def lemmatize_adj(self, string, morphology): + def lemmatize_adj(self, string, morphology, lookup_table): # this method utilizes different procedures for adjectives # with 'nie' and 'naj' prefixes - lemma_dict = self.lemma_lookups["ADJ"] - if string[:3] == "nie": search_string = string[3:] if search_string[:3] == "naj": naj_search_string = search_string[3:] - if naj_search_string in lemma_dict: - return [lemma_dict[naj_search_string]] - if search_string in lemma_dict: - return [lemma_dict[search_string]] + if naj_search_string in lookup_table: + return [lookup_table[naj_search_string]] + if search_string in lookup_table: + return [lookup_table[search_string]] if string[:3] == "naj": naj_search_string = string[3:] - if naj_search_string in lemma_dict: - return [lemma_dict[naj_search_string]] + if naj_search_string in lookup_table: + return [lookup_table[naj_search_string]] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] - def lemmatize_verb(self, string, morphology): + def lemmatize_verb(self, string, morphology, lookup_table): # this method utilizes a different procedure for 
verbs # with 'nie' prefix - lemma_dict = self.lemma_lookups["VERB"] - if string[:3] == "nie": search_string = string[3:] - if search_string in lemma_dict: - return [lemma_dict[search_string]] + if search_string in lookup_table: + return [lookup_table[search_string]] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] - def lemmatize_noun(self, string, morphology): + def lemmatize_noun(self, string, morphology, lookup_table): # this method is case-sensitive, in order to work # for incorrectly tagged proper names - lemma_dict = self.lemma_lookups["NOUN"] - if string != string.lower(): - if string.lower() in lemma_dict: - return [lemma_dict[string.lower()]] - elif string in lemma_dict: - return [lemma_dict[string]] + if string.lower() in lookup_table: + return [lookup_table[string.lower()]] + elif string in lookup_table: + return [lookup_table[string]] return [string.lower()] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] def lookup(self, string, orth=None): return string.lower() From b6b5908f5e9ca4e6a2e46ca42a2d370b00119d44 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Tue, 26 May 2020 14:50:53 +0200 Subject: [PATCH 043/203] Prefer _SP over SP for default tag map space attrs If `_SP` is already in the tag map, use the mapping from `_SP` instead of `SP` so that `SP` can be a valid non-space tag. (Chinese has a non-space tag `SP` which was overriding the mapping of `_SP` to `SPACE`.) --- spacy/morphology.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index c146094a9..a9bab38ed 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -152,7 +152,10 @@ cdef class Morphology: self.tags = PreshMap() # Add special space symbol. We prefix with underscore, to make sure it # always sorts to the end. - space_attrs = tag_map.get('SP', {POS: SPACE}) + if '_SP' in tag_map: + space_attrs = tag_map.get('_SP') + else: + space_attrs = tag_map.get('SP', {POS: SPACE}) if '_SP' not in tag_map: self.strings.add('_SP') tag_map = dict(tag_map) From f00488ab3078a57211f28dffee7588b669e5b4ca Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Date: Tue, 26 May 2020 16:41:39 +0200 Subject: [PATCH 044/203] Update train_intent_parser.py --- examples/training/train_intent_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py index d2472b6b9..a91102093 100644 --- a/examples/training/train_intent_parser.py +++ b/examples/training/train_intent_parser.py @@ -2,7 +2,7 @@ # coding: utf-8 """Using the parser to recognise your own semantics -spaCy's parser component can be used to trained to predict any type of tree +spaCy's parser component can be trained to predict any type of tree structure over your input text. You can also predict trees over whole documents or chat logs, with connections between the sentence-roots used to annotate discourse structure. 
In this example, we'll build a message parser for a common From aad0610a853b4731806397537f867878fec5efa8 Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Tue, 26 May 2020 22:30:53 +0200 Subject: [PATCH 045/203] Map NR to PROPN (#5512) --- spacy/lang/zh/tag_map.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py index 41e2d2158..f9b5389ac 100644 --- a/spacy/lang/zh/tag_map.py +++ b/spacy/lang/zh/tag_map.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X -from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE +from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE, PROPN # The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn # Treebank tag set. We also map the tags to the simpler Universal Dependencies @@ -28,7 +28,7 @@ TAG_MAP = { "URL": {POS: X}, "INF": {POS: X}, "NN": {POS: NOUN}, - "NR": {POS: NOUN}, + "NR": {POS: PROPN}, "NT": {POS: NOUN}, "VA": {POS: VERB}, "VC": {POS: VERB}, From 25de2a2191c168ce133d922c4e2e041684431228 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Wed, 27 May 2020 14:48:54 +0200 Subject: [PATCH 046/203] Improve vector name loading from model meta --- spacy/language.py | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 53a788f2a..2058def8a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -934,15 +934,26 @@ class Language(object): DOCS: https://spacy.io/api/language#from_disk """ + def deserialize_meta(path): + if path.exists(): + data = srsly.read_json(path) + self.meta.update(data) + # self.meta always overrides meta["vectors"] with the metadata + # from self.vocab.vectors, so set the name directly + self.vocab.vectors.name = data.get("vectors", {}).get("name") + + def deserialize_vocab(path): + if path.exists(): + self.vocab.from_disk(path) + _fix_pretrained_vectors_name(self) + if disable is not None: warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable path = util.ensure_path(path) deserializers = OrderedDict() - deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p)) - deserializers["vocab"] = lambda p: self.vocab.from_disk( - p - ) and _fix_pretrained_vectors_name(self) + deserializers["meta.json"] = deserialize_meta + deserializers["vocab"] = deserialize_vocab deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( p, exclude=["vocab"] ) @@ -996,14 +1007,23 @@ class Language(object): DOCS: https://spacy.io/api/language#from_bytes """ + def deserialize_meta(b): + data = srsly.json_loads(b) + self.meta.update(data) + # self.meta always overrides meta["vectors"] with the metadata + # from self.vocab.vectors, so set the name directly + self.vocab.vectors.name = data.get("vectors", {}).get("name") + + def deserialize_vocab(b): + self.vocab.from_bytes(b) + _fix_pretrained_vectors_name(self) + if disable is not None: warnings.warn(Warnings.W014, DeprecationWarning) exclude = disable deserializers = OrderedDict() - deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b)) - deserializers["vocab"] = lambda b: self.vocab.from_bytes( - b - ) and _fix_pretrained_vectors_name(self) + deserializers["meta.json"] = deserialize_meta + deserializers["vocab"] = deserialize_vocab deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( b, exclude=["vocab"] ) @@ -1069,7 
+1089,7 @@ class component(object): def _fix_pretrained_vectors_name(nlp): # TODO: Replace this once we handle vectors consistently as static # data - if "vectors" in nlp.meta and nlp.meta["vectors"].get("name"): + if "vectors" in nlp.meta and "name" in nlp.meta["vectors"]: nlp.vocab.vectors.name = nlp.meta["vectors"]["name"] elif not nlp.vocab.vectors.size: nlp.vocab.vectors.name = None From 5f0a91cf3771a96e6bcd0c63a9d70e3fc74020d1 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Fri, 29 May 2020 09:56:29 +0200 Subject: [PATCH 047/203] fix conv-depth parameter --- website/docs/api/cli.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 505977be9..b49a2fb08 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -455,7 +455,7 @@ improvement. ```bash $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] -[--width] [--depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth] +[--width] [--conv-depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth] [--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length] [--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save-every] [--init-tok2vec] [--epoch-start] @@ -467,7 +467,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] | `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | | `output_dir` | positional | Directory to write models to on each epoch. | | `--width`, `-cw` | option | Width of CNN layers. | -| `--depth`, `-cd` | option | Depth of CNN layers. | +| `--conv-depth`, `-cd` | option | Depth of CNN layers. | | `--cnn-window`, `-cW` <Tag variant="new">2.2.2</Tag> | option | Window size for CNN layers. | | `--cnn-pieces`, `-cP` <Tag variant="new">2.2.2</Tag> | option | Maxout size for CNN layers. `1` for [Mish](https://github.com/digantamisra98/Mish). | | `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag> | flag | Whether to use character-based embedding. | @@ -541,16 +541,16 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| ----------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. 
`en_core_web_md.vectors`. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. | +| Argument | Type | Description | +| -------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. | ## Evaluate {#evaluate new="2"} From 04ba37b667764c5b18825a5ee8ce513962e73bcd Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Fri, 29 May 2020 13:52:39 +0200 Subject: [PATCH 048/203] fix description --- examples/training/pretrain_textcat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index f3e493f6a..d29e20ad1 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -187,7 +187,7 @@ def evaluate_textcat(tokenizer, textcat, texts, cats): width=("Width of CNN layers", "positional", None, int), embed_size=("Embedding rows", "positional", None, int), pretrain_iters=("Number of iterations to pretrain", "option", "pn", int), - train_iters=("Number of iterations to pretrain", "option", "tn", int), + train_iters=("Number of iterations to train", "option", "tn", int), train_examples=("Number of labelled examples", "option", "eg", int), vectors_model=("Name or path to vectors model to learn from"), ) From e1b7cbd197954928974296c6a622ddb70211dd30 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Fri, 29 May 2020 14:33:47 +0200 Subject: [PATCH 049/203] Remove MorphAnalysis __str__ and __repr__ --- spacy/tokens/morphanalysis.pyx | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index e09870741..12f2f6cc3 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -46,12 +46,6 @@ cdef class MorphAnalysis: """The number of features in the analysis.""" return self.c.length - def __str__(self): - return self.to_json() - - def __repr__(self): - return self.to_json() - def __hash__(self): return self.key From 291483157dacfc80ecd6ba2f7e097fbe98a4395a Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Fri, 29 May 2020 17:38:33 +0200 Subject: [PATCH 050/203] prevent loading a pretrained Tok2Vec 
layer AND pretrained components --- spacy/cli/train.py | 9 +++++++-- spacy/errors.py | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6ce095c15..d4de9aeb4 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -15,6 +15,7 @@ import random from .._ml import create_default_optimizer from ..util import use_gpu as set_gpu +from ..errors import Errors from ..gold import GoldCorpus from ..compat import path2str from ..lookups import Lookups @@ -182,6 +183,7 @@ def train( msg.warn("Unable to activate GPU: {}".format(use_gpu)) msg.text("Using CPU only") use_gpu = -1 + base_components = [] if base_model: msg.text("Starting with base model '{}'".format(base_model)) nlp = util.load_model(base_model) @@ -227,6 +229,7 @@ def train( exits=1, ) msg.text("Extending component from base model '{}'".format(pipe)) + base_components.append(pipe) disabled_pipes = nlp.disable_pipes( [p for p in nlp.pipe_names if p not in pipeline] ) @@ -299,7 +302,7 @@ def train( # Load in pretrained weights if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) + components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # Verify textcat config @@ -642,7 +645,7 @@ def _load_vectors(nlp, vectors): util.load_model(vectors, vocab=nlp.vocab) -def _load_pretrained_tok2vec(nlp, loc): +def _load_pretrained_tok2vec(nlp, loc, base_components): """Load pretrained weights for the 'token-to-vector' part of the component models, which is typically a CNN. See 'spacy pretrain'. Experimental. """ @@ -651,6 +654,8 @@ def _load_pretrained_tok2vec(nlp, loc): loaded = [] for name, component in nlp.pipeline: if hasattr(component, "model") and hasattr(component.model, "tok2vec"): + if name in base_components: + raise ValueError(Errors.E200.format(component=name)) component.tok2vec.from_bytes(weights_data) loaded.append(name) return loaded diff --git a/spacy/errors.py b/spacy/errors.py index 6d92545d7..11b601e19 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -568,6 +568,8 @@ class Errors(object): E198 = ("Unable to return {n} most similar vectors for the current vectors " "table, which contains {n_rows} vectors.") E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") + E200 = ("Specifying a base model with a pretrained component '{component}' " + "can not be combined with adding a pretrained Tok2Vec layer.") @add_codes From 64adda32029b867b25bc6f3313863abfc70a6fd1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 29 May 2020 23:21:55 +0200 Subject: [PATCH 051/203] Revert "Remove peeking from Parser.begin_training (#5456)" This reverts commit 9393253b66b5f9fc6c5e58806cf261da5afd1778. The model shouldn't need to see all examples, and actually in v3 there's no equivalent step. All examples are provided to the component, for the component to do stuff like figuring out the labels. The model just needs to do stuff like shape inference. 
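To illustrate the sampling idea this revert restores — the model only needs enough examples to infer its shapes, not the whole corpus — here is a minimal standalone sketch. It uses only the standard library; the helper name, the toy corpus reader, and the 1000-example cap are illustrative and mirror the pattern in the diff below rather than spaCy's internal API.

from itertools import islice

def take_sample(example_stream, limit=1000):
    # Materialize at most `limit` examples from a potentially huge
    # generator; enough for a model to infer input/output shapes.
    return list(islice(example_stream, limit))

def gold_tuples():
    # hypothetical corpus reader yielding (text, annotations) pairs
    yield ("She ate the pizza", {"tags": ["PRON", "VERB", "DET", "NOUN"]})

doc_sample = take_sample(gold_tuples())
assert len(doc_sample) <= 1000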
--- spacy/syntax/nn_parser.pyx | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index fafa492c6..d5c6bf2a8 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -9,6 +9,7 @@ import numpy cimport cython.parallel import numpy.random cimport numpy as np +from itertools import islice from cpython.ref cimport PyObject, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from libc.math cimport exp @@ -620,15 +621,15 @@ cdef class Parser: self.model, cfg = self.Model(self.moves.n_moves, **cfg) if sgd is None: sgd = self.create_optimizer() - docs = [] - golds = [] - for raw_text, annots_brackets in get_gold_tuples(): + doc_sample = [] + gold_sample = [] + for raw_text, annots_brackets in islice(get_gold_tuples(), 1000): for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots - docs.append(Doc(self.vocab, words=words)) - golds.append(GoldParse(docs[-1], words=words, tags=tags, - heads=heads, deps=deps, entities=ents)) - self.model.begin_training(docs, golds) + doc_sample.append(Doc(self.vocab, words=words)) + gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags, + heads=heads, deps=deps, entities=ents)) + self.model.begin_training(doc_sample, gold_sample) if pipeline is not None: self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab) From 15134ef611f6d63ccf45afaacc0e6240d26576f9 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Sat, 30 May 2020 12:53:32 +0200 Subject: [PATCH 052/203] fix deserialization order --- spacy/vectors.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 51ddc3f9a..aec086e6c 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -425,9 +425,9 @@ cdef class Vectors: self.data = xp.load(str(path)) serializers = OrderedDict(( - ("key2row", load_key2row), - ("keys", load_keys), ("vectors", load_vectors), + ("keys", load_keys), + ("key2row", load_key2row), )) util.from_disk(path, serializers, []) self._sync_unset() From a005ccd6d7b0d62018481cd5f0ffe34d7fb51ab3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Sun, 31 May 2020 19:57:54 +0200 Subject: [PATCH 053/203] Preserve _SP when filtering tag map in Tagger To allow "SP" as a tag (for Chinese OntoNotes), preserve "_SP" if present as the reference `SPACE` POS in the tag map in `Tagger.begin_training()`. 
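The tag-map filtering behaviour this patch adjusts can be sketched with plain dictionaries: keep only tags observed in the training data, fall back to "X" for unknown tags, but always carry over the special "_SP" entry so whitespace keeps a SPACE POS. The function below is an illustration only — it uses plain strings where spaCy uses POS symbol constants, and the tag names are made up.

def filter_tag_map(orig_tag_map, observed_tags):
    # Restrict the tag map to tags seen in training, defaulting
    # unknown tags to "X", while always preserving "_SP".
    new_tag_map = {}
    for tag in observed_tags:
        if tag in orig_tag_map:
            new_tag_map[tag] = orig_tag_map[tag]
        else:
            new_tag_map[tag] = {"pos": "X"}
    if "_SP" in orig_tag_map:
        new_tag_map["_SP"] = orig_tag_map["_SP"]
    return new_tag_map

orig = {"NN": {"pos": "NOUN"}, "SP": {"pos": "SPACE"}, "_SP": {"pos": "SPACE"}}
print(filter_tag_map(orig, ["NN", "XYZ"]))
# {'NN': {'pos': 'NOUN'}, 'XYZ': {'pos': 'X'}, '_SP': {'pos': 'SPACE'}}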
--- spacy/pipeline/pipes.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index ccd847ef1..105ce00e6 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -526,6 +526,8 @@ class Tagger(Pipe): new_tag_map[tag] = orig_tag_map[tag] else: new_tag_map[tag] = {POS: X} + if "_SP" in orig_tag_map: + new_tag_map["_SP"] = orig_tag_map["_SP"] cdef Vocab vocab = self.vocab if new_tag_map: vocab.morphology = Morphology(vocab.strings, new_tag_map, From 7d5a89661e690473114e52d3ca1c27ef2ff733e9 Mon Sep 17 00:00:00 2001 From: Leo <3646521+leomrocha@users.noreply.github.com> Date: Sun, 31 May 2020 20:13:39 +0200 Subject: [PATCH 054/203] contributor agreement signed (#5525) --- .github/contributors/leomrocha.md | 106 ++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/leomrocha.md diff --git a/.github/contributors/leomrocha.md b/.github/contributors/leomrocha.md new file mode 100644 index 000000000..495654153 --- /dev/null +++ b/.github/contributors/leomrocha.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Leonardo M. Rocha | +| Company name (if applicable) | | +| Title or role (if applicable) | Eng. 
| +| Date | 31/05/2020 | +| GitHub username | leomrocha | +| Website (optional) | | From c21c308ecbf1021a180d2a6c201eda57e73078ca Mon Sep 17 00:00:00 2001 From: Leo <3646521+leomrocha@users.noreply.github.com> Date: Sun, 31 May 2020 22:08:12 +0200 Subject: [PATCH 055/203] corrected issue #5524 changed <U+009C> 'STRING TERMINATOR' for <U+0153> LATIN SMALL LIGATURE OE' (#5526) --- spacy/lang/fr/_tokenizer_exceptions_list.py | 65 ++++++++++----------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/spacy/lang/fr/_tokenizer_exceptions_list.py b/spacy/lang/fr/_tokenizer_exceptions_list.py index c9fcfff2d..0fcf02351 100644 --- a/spacy/lang/fr/_tokenizer_exceptions_list.py +++ b/spacy/lang/fr/_tokenizer_exceptions_list.py @@ -534,7 +534,6 @@ FR_BASE_EXCEPTIONS = [ "Beaumont-Hamel", "Beaumont-Louestault", "Beaumont-Monteux", - "Beaumont-Pied-de-Buf", "Beaumont-Pied-de-Bœuf", "Beaumont-Sardolles", "Beaumont-Village", @@ -951,7 +950,7 @@ FR_BASE_EXCEPTIONS = [ "Buxières-sous-les-Côtes", "Buzy-Darmont", "Byhleguhre-Byhlen", - "Burs-en-Othe", + "Bœurs-en-Othe", "Bâle-Campagne", "Bâle-Ville", "Béard-Géovreissiat", @@ -1589,11 +1588,11 @@ FR_BASE_EXCEPTIONS = [ "Cruci-Falgardiens", "Cruquius-Oost", "Cruviers-Lascours", - "Crèvecur-en-Auge", - "Crèvecur-en-Brie", - "Crèvecur-le-Grand", - "Crèvecur-le-Petit", - "Crèvecur-sur-l'Escaut", + "Crèvecœur-en-Auge", + "Crèvecœur-en-Brie", + "Crèvecœur-le-Grand", + "Crèvecœur-le-Petit", + "Crèvecœur-sur-l'Escaut", "Crécy-Couvé", "Créon-d'Armagnac", "Cubjac-Auvézère-Val-d'Ans", @@ -1619,7 +1618,7 @@ FR_BASE_EXCEPTIONS = [ "Cuxac-Cabardès", "Cuxac-d'Aude", "Cuyk-Sainte-Agathe", - "Cuvres-et-Valsery", + "Cœuvres-et-Valsery", "Céaux-d'Allègre", "Céleste-Empire", "Cénac-et-Saint-Julien", @@ -1682,7 +1681,7 @@ FR_BASE_EXCEPTIONS = [ "Devrai-Gondragnières", "Dhuys et Morin-en-Brie", "Diane-Capelle", - "Dieffenbach-lès-Wrth", + "Dieffenbach-lès-Wœrth", "Diekhusen-Fahrstedt", "Diennes-Aubigny", "Diensdorf-Radlow", @@ -1755,7 +1754,7 @@ FR_BASE_EXCEPTIONS = [ "Durdat-Larequille", "Durfort-Lacapelette", "Durfort-et-Saint-Martin-de-Sossenac", - "Duil-sur-le-Mignon", + "Dœuil-sur-le-Mignon", "Dão-Lafões", "Débats-Rivière-d'Orpra", "Décines-Charpieu", @@ -2690,8 +2689,8 @@ FR_BASE_EXCEPTIONS = [ "Kuhlen-Wendorf", "KwaZulu-Natal", "Kyzyl-Arvat", - "Kur-la-Grande", - "Kur-la-Petite", + "Kœur-la-Grande", + "Kœur-la-Petite", "Kölln-Reisiek", "Königsbach-Stein", "Königshain-Wiederau", @@ -4027,7 +4026,7 @@ FR_BASE_EXCEPTIONS = [ "Marcilly-d'Azergues", "Marcillé-Raoul", "Marcillé-Robert", - "Marcq-en-Barul", + "Marcq-en-Barœul", "Marcy-l'Etoile", "Marcy-l'Étoile", "Mareil-Marly", @@ -4261,7 +4260,7 @@ FR_BASE_EXCEPTIONS = [ "Monlezun-d'Armagnac", "Monléon-Magnoac", "Monnetier-Mornex", - "Mons-en-Barul", + "Mons-en-Barœul", "Monsempron-Libos", "Monsteroux-Milieu", "Montacher-Villegardin", @@ -4351,7 +4350,7 @@ FR_BASE_EXCEPTIONS = [ "Mornay-Berry", "Mortain-Bocage", "Morteaux-Couliboeuf", - "Morteaux-Coulibuf", + "Morteaux-Coulibœuf", "Morteaux-Coulibœuf", "Mortes-Frontières", "Mory-Montcrux", @@ -4394,7 +4393,7 @@ FR_BASE_EXCEPTIONS = [ "Muncq-Nieurlet", "Murtin-Bogny", "Murtin-et-le-Châtelet", - "Murs-Verdey", + "Mœurs-Verdey", "Ménestérol-Montignac", "Ménil'muche", "Ménil-Annelles", @@ -4615,7 +4614,7 @@ FR_BASE_EXCEPTIONS = [ "Neuves-Maisons", "Neuvic-Entier", "Neuvicq-Montguyon", - "Neuville-lès-Luilly", + "Neuville-lès-Lœuilly", "Neuvy-Bouin", "Neuvy-Deux-Clochers", "Neuvy-Grandchamp", @@ -4776,8 +4775,8 @@ FR_BASE_EXCEPTIONS = [ "Nuncq-Hautecôte", 
"Nurieux-Volognat", "Nuthe-Urstromtal", - "Nux-les-Mines", - "Nux-lès-Auxi", + "Nœux-les-Mines", + "Nœux-lès-Auxi", "Nâves-Parmelan", "Nézignan-l'Evêque", "Nézignan-l'Évêque", @@ -5346,7 +5345,7 @@ FR_BASE_EXCEPTIONS = [ "Quincy-Voisins", "Quincy-sous-le-Mont", "Quint-Fonsegrives", - "Quux-Haut-Maînil", + "Quœux-Haut-Maînil", "Quœux-Haut-Maînil", "Qwa-Qwa", "R.-V.", @@ -5634,12 +5633,12 @@ FR_BASE_EXCEPTIONS = [ "Saint Aulaye-Puymangou", "Saint Geniez d'Olt et d'Aubrac", "Saint Martin de l'If", - "Saint-Denux", - "Saint-Jean-de-Buf", - "Saint-Martin-le-Nud", - "Saint-Michel-Tubuf", + "Saint-Denœux", + "Saint-Jean-de-Bœuf", + "Saint-Martin-le-Nœud", + "Saint-Michel-Tubœuf", "Saint-Paul - Flaugnac", - "Saint-Pierre-de-Buf", + "Saint-Pierre-de-Bœuf", "Saint-Thegonnec Loc-Eguiner", "Sainte-Alvère-Saint-Laurent Les Bâtons", "Salignac-Eyvignes", @@ -6211,7 +6210,7 @@ FR_BASE_EXCEPTIONS = [ "Tite-Live", "Titisee-Neustadt", "Tobel-Tägerschen", - "Togny-aux-Bufs", + "Togny-aux-Bœufs", "Tongre-Notre-Dame", "Tonnay-Boutonne", "Tonnay-Charente", @@ -6339,7 +6338,7 @@ FR_BASE_EXCEPTIONS = [ "Vals-près-le-Puy", "Valverde-Enrique", "Valzin-en-Petite-Montagne", - "Vanduvre-lès-Nancy", + "Vandœuvre-lès-Nancy", "Varces-Allières-et-Risset", "Varenne-l'Arconce", "Varenne-sur-le-Doubs", @@ -6460,9 +6459,9 @@ FR_BASE_EXCEPTIONS = [ "Villenave-d'Ornon", "Villequier-Aumont", "Villerouge-Termenès", - "Villers-aux-Nuds", + "Villers-aux-Nœuds", "Villez-sur-le-Neubourg", - "Villiers-en-Désuvre", + "Villiers-en-Désœuvre", "Villieu-Loyes-Mollon", "Villingen-Schwenningen", "Villié-Morgon", @@ -6470,7 +6469,7 @@ FR_BASE_EXCEPTIONS = [ "Vilosnes-Haraumont", "Vilters-Wangs", "Vincent-Froideville", - "Vincy-Manuvre", + "Vincy-Manœuvre", "Vincy-Manœuvre", "Vincy-Reuil-et-Magny", "Vindrac-Alayrac", @@ -6514,8 +6513,8 @@ FR_BASE_EXCEPTIONS = [ "Vrigne-Meusiens", "Vrijhoeve-Capelle", "Vuisternens-devant-Romont", - "Vlfling-lès-Bouzonville", - "Vuil-et-Giget", + "Vœlfling-lès-Bouzonville", + "Vœuil-et-Giget", "Vélez-Blanco", "Vélez-Málaga", "Vélez-Rubio", @@ -6618,7 +6617,7 @@ FR_BASE_EXCEPTIONS = [ "Wust-Fischbeck", "Wutha-Farnroda", "Wy-dit-Joli-Village", - "Wlfling-lès-Sarreguemines", + "Wœlfling-lès-Sarreguemines", "Wünnewil-Flamatt", "X-SAMPA", "X-arbre", From 925e93857034c29c46a8b582db4969df7ba50c06 Mon Sep 17 00:00:00 2001 From: Leo <3646521+leomrocha@users.noreply.github.com> Date: Mon, 1 Jun 2020 18:18:34 +0200 Subject: [PATCH 056/203] Spanish tokenizer exception and examples improvement (#5531) * Spanish tokenizer exception additions. 
Added Spanish question examples * erased slang tokenization examples --- spacy/lang/es/examples.py | 6 +++++- spacy/lang/es/tokenizer_exceptions.py | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py index 0e31b56af..7ab0a7dfe 100644 --- a/spacy/lang/es/examples.py +++ b/spacy/lang/es/examples.py @@ -18,5 +18,9 @@ sentences = [ "El gato come pescado.", "Veo al hombre con el telescopio.", "La araña come moscas.", - "El pingüino incuba en su nido.", + "El pingüino incuba en su nido sobre el hielo.", + "¿Dónde estais?", + "¿Quién es el presidente Francés?", + "¿Dónde está encuentra la capital de Argentina?", + "¿Cuándo nació José de San Martín?", ] diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 2c2631086..891323705 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -4,15 +4,16 @@ from __future__ import unicode_literals from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA -_exc = { - "pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}], - "pala": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "la", LEMMA: "la", NORM: "la"}], -} +_exc = {} for exc_data in [ + {ORTH: "n°", LEMMA: "número"}, + {ORTH: "°C", LEMMA: "grados Celcius"}, {ORTH: "aprox.", LEMMA: "aproximadamente"}, {ORTH: "dna.", LEMMA: "docena"}, + {ORTH: "dpto.", LEMMA: "departamento"}, + {ORTH: "ej.", LEMMA: "ejemplo"}, {ORTH: "esq.", LEMMA: "esquina"}, {ORTH: "pág.", LEMMA: "página"}, {ORTH: "p.ej.", LEMMA: "por ejemplo"}, @@ -20,6 +21,8 @@ for exc_data in [ {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, + {ORTH: "vol.", NORM: "volúmen"}, + ]: _exc[exc_data[ORTH]] = [exc_data] @@ -39,10 +42,14 @@ for h in range(1, 12 + 1): for orth in [ "a.C.", "a.J.C.", + "d.C.", + "d.J.C.", "apdo.", "Av.", "Avda.", "Cía.", + "Dr.", + "Dra.", "EE.UU.", "etc.", "fig.", @@ -58,8 +65,10 @@ for orth in [ "Prof.", "Profa.", "q.e.p.d.", + "Q.E.P.D." "S.A.", "S.L.", + "S.R.L." "s.s.s.", "Sr.", "Sra.", From bbc1836581932d24818df064da8d64c7ec03ca23 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Tue, 2 Jun 2020 17:23:16 +0200 Subject: [PATCH 057/203] Add rudimentary version checks on model load --- spacy/errors.py | 12 ++++++++++++ spacy/tests/test_misc.py | 30 ++++++++++++++++++++++++++++++ spacy/util.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 11b601e19..baed574f8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -115,6 +115,18 @@ class Warnings(object): "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" " to check the alignment. Misaligned entities ('-') will be " "ignored during training.") + W031 = ("Model '{model}' ({model_version}) requires spaCy {version} and " + "is incompatible with the current spaCy version ({current}). This " + "may lead to unexpected results or runtime errors. To resolve " + "this, download a newer compatible model or retrain your custom " + "model with the current spaCy version. For more details and " + "available updates, run: python -m spacy validate") + W032 = ("Unable to determine model compatibility for model '{model}' " + "({model_version}) with the current spaCy version ({current}). " + "This may lead to unexpected results or runtime errors. 
To resolve " + "this, download a newer compatible model or retrain your custom " + "model with the current spaCy version. For more details and " + "available updates, run: python -m spacy validate") @add_codes diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 3ac621649..bb7ade35e 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -10,6 +10,7 @@ from spacy import prefer_gpu, require_gpu from spacy.compat import symlink_to, symlink_remove, path2str, is_windows from spacy._ml import PrecomputableAffine from subprocess import CalledProcessError +from .util import make_tempdir @pytest.fixture @@ -146,3 +147,32 @@ def test_load_model_blank_shortcut(): assert nlp.pipeline == [] with pytest.raises(ImportError): util.load_model("blank:fjsfijsdof") + + +def test_load_model_version_compat(): + """Test warnings for various spacy_version specifications in meta. Since + this is more of a hack for v2, manually specify the current major.minor + version to simplify test creation.""" + nlp = util.load_model("blank:en") + assert nlp.meta["spacy_version"].startswith(">=2.3") + with make_tempdir() as d: + # no change: compatible + nlp.to_disk(d) + nlp2 = util.load_model(d) + + # additional compatible upper pin + nlp.meta["spacy_version"] = ">=2.3.0,<2.4.0" + nlp.to_disk(d) + nlp2 = util.load_model(d) + + # incompatible older version + nlp.meta["spacy_version"] = ">=2.2.5" + nlp.to_disk(d) + with pytest.warns(UserWarning): + nlp_reloaded = util.load_model(d) + + # invalid version specification + nlp.meta["spacy_version"] = ">@#$%_invalid_version" + nlp.to_disk(d) + with pytest.warns(UserWarning): + nlp_reloaded = util.load_model(d) diff --git a/spacy/util.py b/spacy/util.py index 5fd296404..36df5725f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -17,6 +17,7 @@ import srsly import catalogue import sys import warnings +from . 
import about try: import jsonschema @@ -250,6 +251,33 @@ def get_model_meta(path): for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: raise ValueError(Errors.E054.format(setting=setting)) + if "spacy_version" in meta: + about_major_minor = ".".join(about.__version__.split(".")[:2]) + if about_major_minor is not None and not meta["spacy_version"].startswith( + ">=" + about_major_minor + ): + # try to simplify version requirements from model meta to vx.x + # for warning message + meta_spacy_version = "v" + ".".join( + meta["spacy_version"].replace(">=", "").split(".")[:2] + ) + # if the format is unexpected, supply the full version + if not re.match(r"v\d+\.\d+", meta_spacy_version): + meta_spacy_version = meta["spacy_version"] + warn_msg = Warnings.W031.format( + model=meta["lang"] + "_" + meta["name"], + model_version=meta["version"], + version=meta_spacy_version, + current=about.__version__, + ) + warnings.warn(warn_msg) + else: + warn_msg = Warnings.W032.format( + model=meta["lang"] + "_" + meta["name"], + model_version=meta["version"], + current=about.__version__, + ) + warnings.warn(warn_msg) return meta From 75f08ad62d9d0b9ea8ecf0454da332f99b00ec45 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Tue, 2 Jun 2020 17:41:25 +0200 Subject: [PATCH 058/203] Remove unnecessary check --- spacy/util.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 36df5725f..5362952e2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -253,9 +253,7 @@ def get_model_meta(path): raise ValueError(Errors.E054.format(setting=setting)) if "spacy_version" in meta: about_major_minor = ".".join(about.__version__.split(".")[:2]) - if about_major_minor is not None and not meta["spacy_version"].startswith( - ">=" + about_major_minor - ): + if not meta["spacy_version"].startswith(">=" + about_major_minor): # try to simplify version requirements from model meta to vx.x # for warning message meta_spacy_version = "v" + ".".join( From a57bdeecacb664e80e0c8408492e28eb9dd31a79 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Wed, 3 Jun 2020 12:10:12 +0200 Subject: [PATCH 059/203] Test util.get_model_meta instead of util.load_model --- spacy/tests/test_misc.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index bb7ade35e..a361d5c0f 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import pytest import os import ctypes +import srsly from pathlib import Path from spacy import util from spacy import prefer_gpu, require_gpu @@ -158,21 +159,22 @@ def test_load_model_version_compat(): with make_tempdir() as d: # no change: compatible nlp.to_disk(d) - nlp2 = util.load_model(d) + meta_path = Path(d / "meta.json") + util.get_model_meta(d) # additional compatible upper pin nlp.meta["spacy_version"] = ">=2.3.0,<2.4.0" - nlp.to_disk(d) - nlp2 = util.load_model(d) + srsly.write_json(Path(d / "meta.json"), nlp.meta) + util.get_model_meta(d) # incompatible older version nlp.meta["spacy_version"] = ">=2.2.5" - nlp.to_disk(d) + srsly.write_json(Path(d / "meta.json"), nlp.meta) with pytest.warns(UserWarning): - nlp_reloaded = util.load_model(d) + util.get_model_meta(d) # invalid version specification nlp.meta["spacy_version"] = ">@#$%_invalid_version" - nlp.to_disk(d) + srsly.write_json(Path(d / "meta.json"), nlp.meta) with 
pytest.warns(UserWarning): - nlp_reloaded = util.load_model(d) + util.get_model_meta(d) From 8c758ed1ebc3d35f03707e593b83b214d40f434b Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Wed, 3 Jun 2020 12:11:57 +0200 Subject: [PATCH 060/203] Fix meta path --- spacy/tests/test_misc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index a361d5c0f..d48ba24a2 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -164,17 +164,17 @@ def test_load_model_version_compat(): # additional compatible upper pin nlp.meta["spacy_version"] = ">=2.3.0,<2.4.0" - srsly.write_json(Path(d / "meta.json"), nlp.meta) + srsly.write_json(meta_path, nlp.meta) util.get_model_meta(d) # incompatible older version nlp.meta["spacy_version"] = ">=2.2.5" - srsly.write_json(Path(d / "meta.json"), nlp.meta) + srsly.write_json(meta_path, nlp.meta) with pytest.warns(UserWarning): util.get_model_meta(d) # invalid version specification nlp.meta["spacy_version"] = ">@#$%_invalid_version" - srsly.write_json(Path(d / "meta.json"), nlp.meta) + srsly.write_json(meta_path, nlp.meta) with pytest.warns(UserWarning): util.get_model_meta(d) From 410fb7ee437b649c8bd291da84db5dc7cd65db45 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann <polm@dampfkraft.com> Date: Fri, 5 Jun 2020 02:15:43 +0900 Subject: [PATCH 061/203] Add Japanese Model (#5544) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add more rules to deal with Japanese UD mappings Japanese UD rules sometimes give different UD tags to tokens with the same underlying POS tag. The UD spec indicates these cases should be disambiguated using the output of a tool called "comainu", but rules are enough to get the right result. These rules are taken from Ginza at time of writing, see #3756. * Add new tags from GSD This is a few rare tags that aren't in Unidic but are in the GSD data. * Add basic Japanese sentencization This code is taken from Ginza again. * Add sentenceizer quote handling Could probably add more paired characters but this will do for now. Also includes some tests. * Replace fugashi with SudachiPy * Modify tag format to match GSD annotations Some of the tests still need to be updated, but I want to get this up for testing training. * Deal with case with closing punct without opening * refactor resolve_pos() * change tag field separator from "," to "-" * add TAG_ORTH_MAP * add TAG_BIGRAM_MAP * revise rules for 連体詞 * revise rules for 連体詞 * improve POS about 2% * add syntax_iterator.py (not mature yet) * improve syntax_iterators.py * improve syntax_iterators.py * add phrases including nouns and drop NPs consist of STOP_WORDS * First take at noun chunks This works in many situations but still has issues in others. If the start of a subtree has no noun, then nested phrases can be generated. また行きたい、そんな気持ちにさせてくれるお店です。 [そんな気持ち, また行きたい、そんな気持ちにさせてくれるお店] For some reason て gets included sometimes. Not sure why. ゲンに連れ添って円盤生物を調査するパートナーとなる。 [て円盤生物, ...] Some phrases that look like they should be split are grouped together; not entirely sure that's wrong. This whole thing becomes one chunk: 道の駅遠山郷北側からかぐら大橋南詰現道交点までの1.060kmのみ開通済み * Use new generic get_words_and_spaces The new get_words_and_spaces function is simpler than what was used in Japanese, so it's good to be able to switch to it. However, there was an issue. The new function works just on text, so POS info could get out of sync. 
Fixing this required a small change to the way dtokens (tokens with POS and lemma info) were generated. Specifically, multiple extraneous spaces now become a single token, so when generating dtokens multiple space tokens should be created in a row. * Fix noun_chunks, should be working now * Fix some tests, add naughty strings tests Some of the existing tests changed because the tokenization mode of Sudachi changed to the more fine-grained A mode. Sudachi also has issues with some strings, so this adds a test against the naughty strings. * Remove empty Sudachi tokens Not doing this creates zero-length tokens and causes errors in the internal spaCy processing. * Add yield_bunsetu back in as a separate piece of code Co-authored-by: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com> Co-authored-by: hiroshi <hiroshi_matsuda@megagon.ai> --- spacy/lang/ja/__init__.py | 152 ++++++++++++++------- spacy/lang/ja/bunsetu.py | 144 ++++++++++++++++++++ spacy/lang/ja/syntax_iterators.py | 55 ++++++++ spacy/lang/ja/tag_bigram_map.py | 37 +++++ spacy/lang/ja/tag_map.py | 158 ++++++++++++---------- spacy/lang/ja/tag_orth_map.py | 30 ++++ spacy/tests/lang/ja/test_lemmatization.py | 2 +- spacy/tests/lang/ja/test_tokenizer.py | 35 +++-- 8 files changed, 486 insertions(+), 127 deletions(-) create mode 100644 spacy/lang/ja/bunsetu.py create mode 100644 spacy/lang/ja/syntax_iterators.py create mode 100644 spacy/lang/ja/tag_bigram_map.py create mode 100644 spacy/lang/ja/tag_orth_map.py diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 22590043f..09546467e 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -5,97 +5,148 @@ import re from collections import namedtuple from .stop_words import STOP_WORDS +from .syntax_iterators import SYNTAX_ITERATORS from .tag_map import TAG_MAP +from .tag_orth_map import TAG_ORTH_MAP +from .tag_bigram_map import TAG_BIGRAM_MAP from ...attrs import LANG -from ...language import Language -from ...tokens import Doc from ...compat import copy_reg -from ...util import DummyTokenizer +from ...language import Language +from ...symbols import POS +from ...tokens import Doc +from ...util import DummyTokenizer, get_words_and_spaces + +# Hold the attributes we need with convenient names +DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"]) # Handling for multiple spaces in a row is somewhat awkward, this simplifies # the flow by creating a dummy with the same interface. -DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"]) -DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"]) -DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" ")) +DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"]) +DummySpace = DummyNode(" ", " ", " ") -def try_fugashi_import(): - """Fugashi is required for Japanese support, so check for it. +def try_sudachi_import(): + """SudachiPy is required for Japanese support, so check for it. It it's not available blow up and explain how to fix it.""" try: - import fugashi + from sudachipy import dictionary, tokenizer - return fugashi + tok = dictionary.Dictionary().create( + mode=tokenizer.Tokenizer.SplitMode.A + ) + return tok except ImportError: raise ImportError( - "Japanese support requires Fugashi: " "https://github.com/polm/fugashi" + "Japanese support requires SudachiPy: " "https://github.com/WorksApplications/SudachiPy" ) -def resolve_pos(token): +def resolve_pos(token, next_token): """If necessary, add a field to the POS tag for UD mapping. 
Under Universal Dependencies, sometimes the same Unidic POS tag can be mapped differently depending on the literal token or its context - in the sentence. This function adds information to the POS tag to - resolve ambiguous mappings. + in the sentence. This function returns resolved POSs for both token + and next_token by tuple. """ - # this is only used for consecutive ascii spaces - if token.surface == " ": - return "空白" + # Some tokens have their UD tag decided based on the POS of the following + # token. - # TODO: This is a first take. The rules here are crude approximations. - # For many of these, full dependencies are needed to properly resolve - # PoS mappings. - if token.pos == "連体詞,*,*,*": - if re.match(r"[こそあど此其彼]の", token.surface): - return token.pos + ",DET" - if re.match(r"[こそあど此其彼]", token.surface): - return token.pos + ",PRON" - return token.pos + ",ADJ" - return token.pos + # orth based rules + if token.pos in TAG_ORTH_MAP: + orth_map = TAG_ORTH_MAP[token.pos[0]] + if token.surface in orth_map: + return orth_map[token.surface], None + + # tag bi-gram mapping + if next_token: + tag_bigram = token.pos[0], next_token.pos[0] + if tag_bigram in TAG_BIGRAM_MAP: + bipos = TAG_BIGRAM_MAP[tag_bigram] + if bipos[0] is None: + return TAG_MAP[token.pos[0]][POS], bipos[1] + else: + return bipos + + return TAG_MAP[token.pos[0]][POS], None -def get_words_and_spaces(tokenizer, text): - """Get the individual tokens that make up the sentence and handle white space. +# Use a mapping of paired punctuation to avoid splitting quoted sentences. +pairpunct = {'「':'」', '『': '』', '【': '】'} - Japanese doesn't usually use white space, and MeCab's handling of it for - multiple spaces in a row is somewhat awkward. + +def separate_sentences(doc): + """Given a doc, mark tokens that start sentences based on Unidic tags. """ - tokens = tokenizer.parseToNodeList(text) + stack = [] # save paired punctuation + for i, token in enumerate(doc[:-2]): + # Set all tokens after the first to false by default. This is necessary + # for the doc code to be aware we've done sentencization, see + # `is_sentenced`. + token.sent_start = (i == 0) + if token.tag_: + if token.tag_ == "補助記号-括弧開": + ts = str(token) + if ts in pairpunct: + stack.append(pairpunct[ts]) + elif stack and ts == stack[-1]: + stack.pop() + + if token.tag_ == "補助記号-句点": + next_token = doc[i+1] + if next_token.tag_ != token.tag_ and not stack: + next_token.sent_start = True + + +def get_dtokens(tokenizer, text): + tokens = tokenizer.tokenize(text) words = [] - spaces = [] - for token in tokens: - # If there's more than one space, spaces after the first become tokens - for ii in range(len(token.white_space) - 1): - words.append(DummySpace) - spaces.append(False) - - words.append(token) - spaces.append(bool(token.white_space)) - return words, spaces + for ti, token in enumerate(tokens): + tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']) + inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*']) + dtoken = DetailedToken( + token.surface(), + (tag, inf), + token.dictionary_form()) + if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白': + # don't add multiple space tokens in a row + continue + words.append(dtoken) + # remove empty tokens. These can be produced with characters like … that + # Sudachi normalizes internally. 
+ words = [ww for ww in words if len(ww.surface) > 0] + return words class JapaneseTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.tokenizer = try_fugashi_import().Tagger() - self.tokenizer.parseToNodeList("") # see #2901 + self.tokenizer = try_sudachi_import() def __call__(self, text): - dtokens, spaces = get_words_and_spaces(self.tokenizer, text) + dtokens = get_dtokens(self.tokenizer, text) + words = [x.surface for x in dtokens] + words, spaces = get_words_and_spaces(words, text) + unidic_tags = [",".join(x.pos) for x in dtokens] doc = Doc(self.vocab, words=words, spaces=spaces) - unidic_tags = [] - for token, dtoken in zip(doc, dtokens): - unidic_tags.append(dtoken.pos) - token.tag_ = resolve_pos(dtoken) + next_pos = None + for ii, (token, dtoken) in enumerate(zip(doc, dtokens)): + ntoken = dtokens[ii+1] if ii+1 < len(dtokens) else None + token.tag_ = dtoken.pos[0] + if next_pos: + token.pos = next_pos + next_pos = None + else: + token.pos, next_pos = resolve_pos(dtoken, ntoken) # if there's no lemma info (it's an unk) just use the surface - token.lemma_ = dtoken.feature.lemma or dtoken.surface + token.lemma_ = dtoken.lemma doc.user_data["unidic_tags"] = unidic_tags + + separate_sentences(doc) return doc @@ -104,6 +155,7 @@ class JapaneseDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda _text: "ja" stop_words = STOP_WORDS tag_map = TAG_MAP + syntax_iterators = SYNTAX_ITERATORS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} @classmethod diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py new file mode 100644 index 000000000..7c3eee336 --- /dev/null +++ b/spacy/lang/ja/bunsetu.py @@ -0,0 +1,144 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + + +POS_PHRASE_MAP = { + "NOUN": "NP", + "NUM": "NP", + "PRON": "NP", + "PROPN": "NP", + + "VERB": "VP", + + "ADJ": "ADJP", + + "ADV": "ADVP", + + "CCONJ": "CCONJP", +} + + +# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)] +def yield_bunsetu(doc, debug=False): + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + prev = None + prev_tag = None + prev_dep = None + prev_head = None + for t in doc: + pos = t.pos_ + pos_type = POS_PHRASE_MAP.get(pos, None) + tag = t.tag_ + dep = t.dep_ + head = t.head.i + if debug: + print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu) + + # DET is always an individual bunsetu + if pos == "DET": + if bunsetu: + yield bunsetu, phrase_type, phrase + yield [t], None, None + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + + # PRON or Open PUNCT always splits bunsetu + elif tag == "補助記号-括弧開": + if bunsetu: + yield bunsetu, phrase_type, phrase + bunsetu = [t] + bunsetu_may_end = True + phrase_type = None + phrase = None + + # bunsetu head not appeared + elif phrase_type is None: + if bunsetu and prev_tag == "補助記号-読点": + yield bunsetu, phrase_type, phrase + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + bunsetu.append(t) + if pos_type: # begin phrase + phrase = [t] + phrase_type = pos_type + if pos_type in {"ADVP", "CCONJP"}: + bunsetu_may_end = True + + # entering new bunsetu + elif pos_type and ( + pos_type != phrase_type or # different phrase type arises + bunsetu_may_end # same phrase type but bunsetu already ended + ): + # exceptional case: NOUN to VERB + if 
phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i: + bunsetu.append(t) + phrase_type = "VP" + phrase.append(t) + # exceptional case: VERB to NOUN + elif phrase_type == "VP" and pos_type == "NP" and ( + prev_dep == 'compound' and prev_head == t.i or + dep == 'compound' and prev == head or + prev_dep == 'nmod' and prev_head == t.i + ): + bunsetu.append(t) + phrase_type = "NP" + phrase.append(t) + else: + yield bunsetu, phrase_type, phrase + bunsetu = [t] + bunsetu_may_end = False + phrase_type = pos_type + phrase = [t] + + # NOUN bunsetu + elif phrase_type == "NP": + bunsetu.append(t) + if not bunsetu_may_end and (( + (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'} + ) or ( + pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' + )): + phrase.append(t) + else: + bunsetu_may_end = True + + # VERB bunsetu + elif phrase_type == "VP": + bunsetu.append(t) + if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound': + phrase.append(t) + else: + bunsetu_may_end = True + + # ADJ bunsetu + elif phrase_type == "ADJP" and tag != '連体詞': + bunsetu.append(t) + if not bunsetu_may_end and (( + pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'} + ) or ( + pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' + )): + phrase.append(t) + else: + bunsetu_may_end = True + + # other bunsetu + else: + bunsetu.append(t) + + prev = t.i + prev_tag = t.tag_ + prev_dep = t.dep_ + prev_head = head + + if bunsetu: + yield bunsetu, phrase_type, phrase diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py new file mode 100644 index 000000000..cd1e4fde7 --- /dev/null +++ b/spacy/lang/ja/syntax_iterators.py @@ -0,0 +1,55 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import NOUN, PROPN, PRON, VERB + +# XXX this can probably be pruned a bit +labels = [ + "nsubj", + "nmod", + "dobj", + "nsubjpass", + "pcomp", + "pobj", + "obj", + "obl", + "dative", + "appos", + "attr", + "ROOT", +] + +def noun_chunks(obj): + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + + doc = obj.doc # Ensure works on both Doc and Span. + np_deps = [doc.vocab.strings.add(label) for label in labels] + conj = doc.vocab.strings.add("conj") + np_label = doc.vocab.strings.add("NP") + seen = set() + for i, word in enumerate(obj): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.i in seen: + continue + if word.dep in np_deps: + unseen = [w.i for w in word.subtree if w.i not in seen] + if not unseen: + continue + + # this takes care of particles etc. 
+ seen.update(j.i for j in word.subtree) + # This avoids duplicating embedded clauses + seen.update(range(word.i + 1)) + + # if the head of this is a verb, mark that and rights seen + # Don't do the subtree as that can hide other phrases + if word.head.pos == VERB: + seen.add(word.head.i) + seen.update(w.i for w in word.head.rights) + yield unseen[0], word.i + 1, np_label + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ja/tag_bigram_map.py b/spacy/lang/ja/tag_bigram_map.py new file mode 100644 index 000000000..5ed9aec89 --- /dev/null +++ b/spacy/lang/ja/tag_bigram_map.py @@ -0,0 +1,37 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB + +# mapping from tag bi-gram to pos of previous token +TAG_BIGRAM_MAP = { + # This covers only small part of AUX. + ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None), + + ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None), + # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ), + + # This covers acl, advcl, obl and root, but has side effect for compound. + ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX), + # This covers almost all of the deps + ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX), + + ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB), + ("副詞", "動詞-非自立可能"): (None, VERB), + ("形容詞-一般", "動詞-非自立可能"): (None, VERB), + ("形容詞-非自立可能", "動詞-非自立可能"): (None, VERB), + ("接頭辞", "動詞-非自立可能"): (None, VERB), + ("助詞-係助詞", "動詞-非自立可能"): (None, VERB), + ("助詞-副助詞", "動詞-非自立可能"): (None, VERB), + ("助詞-格助詞", "動詞-非自立可能"): (None, VERB), + ("補助記号-読点", "動詞-非自立可能"): (None, VERB), + + ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART), + + ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN), + ("連体詞", "形状詞-助動詞語幹"): (None, NOUN), + + ("動詞-一般", "助詞-副助詞"): (None, PART), + ("動詞-非自立可能", "助詞-副助詞"): (None, PART), + ("助動詞", "助詞-副助詞"): (None, PART), +} diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py index 4ff0a35ee..ad416e109 100644 --- a/spacy/lang/ja/tag_map.py +++ b/spacy/lang/ja/tag_map.py @@ -1,82 +1,104 @@ # encoding: utf8 from __future__ import unicode_literals -from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN +from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, CCONJ, SCONJ, NOUN from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE TAG_MAP = { # Explanation of Unidic tags: # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf - # Universal Dependencies Mapping: + # Universal Dependencies Mapping: (Some of the entries in this mapping are updated to v2.6 in the list below) # http://universaldependencies.org/ja/overview/morphology.html # http://universaldependencies.org/ja/pos/all.html - "記号,一般,*,*": { - POS: PUNCT + "記号-一般": { + POS: NOUN }, # this includes characters used to represent sounds like ドレミ - "記号,文字,*,*": { - POS: PUNCT - }, # this is for Greek and Latin characters used as sumbols, as in math - "感動詞,フィラー,*,*": {POS: INTJ}, - "感動詞,一般,*,*": {POS: INTJ}, - # this is specifically for unicode full-width space - "空白,*,*,*": {POS: X}, - # This is used when sequential half-width spaces are present + "記号-文字": { + POS: NOUN + }, # this is for Greek and Latin characters having some meanings, or used as symbols, as in math + "感動詞-フィラー": {POS: INTJ}, + "感動詞-一般": {POS: INTJ}, + "空白": {POS: SPACE}, - "形状詞,一般,*,*": {POS: ADJ}, - "形状詞,タリ,*,*": {POS: ADJ}, - "形状詞,助動詞語幹,*,*": {POS: ADJ}, - "形容詞,一般,*,*": {POS: ADJ}, - "形容詞,非自立可能,*,*": {POS: AUX}, # XXX ADJ if alone, AUX otherwise - "助詞,格助詞,*,*": {POS: ADP}, - "助詞,係助詞,*,*": {POS: ADP}, - "助詞,終助詞,*,*": {POS: PART}, - "助詞,準体助詞,*,*": {POS: 
SCONJ}, # の as in 走るのが速い - "助詞,接続助詞,*,*": {POS: SCONJ}, # verb ending て - "助詞,副助詞,*,*": {POS: PART}, # ばかり, つつ after a verb - "助動詞,*,*,*": {POS: AUX}, - "接続詞,*,*,*": {POS: SCONJ}, # XXX: might need refinement - "接頭辞,*,*,*": {POS: NOUN}, - "接尾辞,形状詞的,*,*": {POS: ADJ}, # がち, チック - "接尾辞,形容詞的,*,*": {POS: ADJ}, # -らしい - "接尾辞,動詞的,*,*": {POS: NOUN}, # -じみ - "接尾辞,名詞的,サ変可能,*": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* - "接尾辞,名詞的,一般,*": {POS: NOUN}, - "接尾辞,名詞的,助数詞,*": {POS: NOUN}, - "接尾辞,名詞的,副詞可能,*": {POS: NOUN}, # -後, -過ぎ - "代名詞,*,*,*": {POS: PRON}, - "動詞,一般,*,*": {POS: VERB}, - "動詞,非自立可能,*,*": {POS: VERB}, # XXX VERB if alone, AUX otherwise - "動詞,非自立可能,*,*,AUX": {POS: AUX}, - "動詞,非自立可能,*,*,VERB": {POS: VERB}, - "副詞,*,*,*": {POS: ADV}, - "補助記号,AA,一般,*": {POS: SYM}, # text art - "補助記号,AA,顔文字,*": {POS: SYM}, # kaomoji - "補助記号,一般,*,*": {POS: SYM}, - "補助記号,括弧開,*,*": {POS: PUNCT}, # open bracket - "補助記号,括弧閉,*,*": {POS: PUNCT}, # close bracket - "補助記号,句点,*,*": {POS: PUNCT}, # period or other EOS marker - "補助記号,読点,*,*": {POS: PUNCT}, # comma - "名詞,固有名詞,一般,*": {POS: PROPN}, # general proper noun - "名詞,固有名詞,人名,一般": {POS: PROPN}, # person's name - "名詞,固有名詞,人名,姓": {POS: PROPN}, # surname - "名詞,固有名詞,人名,名": {POS: PROPN}, # first name - "名詞,固有名詞,地名,一般": {POS: PROPN}, # place name - "名詞,固有名詞,地名,国": {POS: PROPN}, # country name - "名詞,助動詞語幹,*,*": {POS: AUX}, - "名詞,数詞,*,*": {POS: NUM}, # includes Chinese numerals - "名詞,普通名詞,サ変可能,*": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun - "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN}, - "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB}, - "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN}, # ex: 下手 - "名詞,普通名詞,一般,*": {POS: NOUN}, - "名詞,普通名詞,形状詞可能,*": {POS: NOUN}, # XXX: sometimes ADJ in UDv2 - "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN}, - "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ}, - "名詞,普通名詞,助数詞可能,*": {POS: NOUN}, # counter / unit - "名詞,普通名詞,副詞可能,*": {POS: NOUN}, - "連体詞,*,*,*": {POS: ADJ}, # XXX this has exceptions based on literal token - "連体詞,*,*,*,ADJ": {POS: ADJ}, - "連体詞,*,*,*,PRON": {POS: PRON}, - "連体詞,*,*,*,DET": {POS: DET}, + + "形状詞-一般": {POS: ADJ}, + "形状詞-タリ": {POS: ADJ}, + "形状詞-助動詞語幹": {POS: AUX}, + + "形容詞-一般": {POS: ADJ}, + + "形容詞-非自立可能": {POS: ADJ}, # XXX ADJ if alone, AUX otherwise + + "助詞-格助詞": {POS: ADP}, + + "助詞-係助詞": {POS: ADP}, + + "助詞-終助詞": {POS: PART}, + "助詞-準体助詞": {POS: SCONJ}, # の as in 走るのが速い + "助詞-接続助詞": {POS: SCONJ}, # verb ending て0 + + "助詞-副助詞": {POS: ADP}, # ばかり, つつ after a verb + + "助動詞": {POS: AUX}, + + "接続詞": {POS: CCONJ}, # XXX: might need refinement + "接頭辞": {POS: NOUN}, + "接尾辞-形状詞的": {POS: PART}, # がち, チック + + "接尾辞-形容詞的": {POS: AUX}, # -らしい + + "接尾辞-動詞的": {POS: PART}, # -じみ + "接尾辞-名詞的-サ変可能": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* + "接尾辞-名詞的-一般": {POS: NOUN}, + "接尾辞-名詞的-助数詞": {POS: NOUN}, + "接尾辞-名詞的-副詞可能": {POS: NOUN}, # -後, -過ぎ + + "代名詞": {POS: PRON}, + + "動詞-一般": {POS: VERB}, + + "動詞-非自立可能": {POS: AUX}, # XXX VERB if alone, AUX otherwise + + "副詞": {POS: ADV}, + + "補助記号-AA-一般": {POS: SYM}, # text art + "補助記号-AA-顔文字": {POS: PUNCT}, # kaomoji + + "補助記号-一般": {POS: SYM}, + + "補助記号-括弧開": {POS: PUNCT}, # open bracket + "補助記号-括弧閉": {POS: PUNCT}, # close bracket + "補助記号-句点": {POS: PUNCT}, # period or other EOS marker + "補助記号-読点": {POS: PUNCT}, # comma + + "名詞-固有名詞-一般": {POS: PROPN}, # general proper noun + "名詞-固有名詞-人名-一般": {POS: PROPN}, # person's name + "名詞-固有名詞-人名-姓": {POS: PROPN}, # surname + "名詞-固有名詞-人名-名": {POS: PROPN}, # first name + "名詞-固有名詞-地名-一般": {POS: PROPN}, # place name + "名詞-固有名詞-地名-国": {POS: PROPN}, # country name + + "名詞-助動詞語幹": {POS: AUX}, + "名詞-数詞": {POS: NUM}, # includes Chinese 
numerals + + "名詞-普通名詞-サ変可能": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun + + "名詞-普通名詞-サ変形状詞可能": {POS: NOUN}, + + "名詞-普通名詞-一般": {POS: NOUN}, + + "名詞-普通名詞-形状詞可能": {POS: NOUN}, # XXX: sometimes ADJ in UDv2 + + "名詞-普通名詞-助数詞可能": {POS: NOUN}, # counter / unit + + "名詞-普通名詞-副詞可能": {POS: NOUN}, + + "連体詞": {POS: DET}, # XXX this has exceptions based on literal token + + # GSD tags. These aren't in Unidic, but we need them for the GSD data. + "外国語": {POS: PROPN}, # Foreign words + + "絵文字・記号等": {POS: SYM}, # emoji / kaomoji ^^; + } diff --git a/spacy/lang/ja/tag_orth_map.py b/spacy/lang/ja/tag_orth_map.py new file mode 100644 index 000000000..355cc655b --- /dev/null +++ b/spacy/lang/ja/tag_orth_map.py @@ -0,0 +1,30 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X + +# mapping from tag bi-gram to pos of previous token +TAG_ORTH_MAP = { + "空白": { + " ": SPACE, + " ": X, + }, + "助詞-副助詞": { + "たり": PART, + }, + "連体詞": { + "あの": DET, + "かの": DET, + "この": DET, + "その": DET, + "どの": DET, + "彼の": DET, + "此の": DET, + "其の": DET, + "ある": PRON, + "こんな": PRON, + "そんな": PRON, + "どんな": PRON, + "あらゆる": PRON, + }, +} diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py index cfff0fcfe..58cd3f3bf 100644 --- a/spacy/tests/lang/ja/test_lemmatization.py +++ b/spacy/tests/lang/ja/test_lemmatization.py @@ -6,7 +6,7 @@ import pytest @pytest.mark.parametrize( "word,lemma", - [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "凄い"), ("いただきました", "頂く"), ("なった", "成る")], + [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "すごい"), ("いただきました", "いただく"), ("なった", "なる")], ) def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma): test_lemma = ja_tokenizer(word)[0].lemma_ diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index ad8bfaa00..5213aed58 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -14,20 +14,26 @@ TOKENIZER_TESTS = [ ] TAG_TESTS = [ - ("日本語だよ", ['名詞,固有名詞,地名,国', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '助詞,終助詞,*,*']), - ("東京タワーの近くに住んでいます。", ['名詞,固有名詞,地名,一般', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '動詞,非自立可能,*,*', '助動詞,*,*,*', '補助記号,句点,*,*']), - ("吾輩は猫である。", ['代名詞,*,*,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '動詞,非自立可能,*,*', '補助記号,句点,*,*']), - ("月に代わって、お仕置きよ!", ['名詞,普通名詞,助数詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '補助記号,読点,*,*', '接頭辞,*,*,*', '名詞,普通名詞,一般,*', '助詞,終助詞,*,*', '補助記号,句点,*,*']), - ("すもももももももものうち", ['名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*']) + ("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']), + ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']), + ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']), + ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']), + ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能']) ] POS_TESTS = [ - ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']), + ('日本語だよ', ['fish', 'NOUN', 'AUX', 'PART']), ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']), ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']), ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 
'PART', 'PUNCT']), ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN']) ] + +SENTENCE_TESTS = [ + ('あれ。これ。', ['あれ。', 'これ。']), + ('「伝染るんです。」という漫画があります。', + ['「伝染るんです。」という漫画があります。']), + ] # fmt: on @@ -43,14 +49,27 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags): assert tags == expected_tags +#XXX This isn't working? Always passes @pytest.mark.parametrize("text,expected_pos", POS_TESTS) def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): pos = [token.pos_ for token in ja_tokenizer(text)] assert pos == expected_pos +@pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS) +def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents): + sents = [str(sent) for sent in ja_tokenizer(text).sents] + assert sents == expected_sents + def test_extra_spaces(ja_tokenizer): # note: three spaces after "I" tokens = ja_tokenizer("I like cheese.") - assert tokens[1].orth_ == " " - assert tokens[2].orth_ == " " + assert tokens[1].orth_ == " " + +from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS + +@pytest.mark.parametrize("text", NAUGHTY_STRINGS) +def test_tokenizer_naughty_strings(ja_tokenizer, text): + tokens = ja_tokenizer(text) + assert tokens.text_with_ws == text + From 4d1ba6feb414177457fcec5983038216e32f1a12 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Date: Thu, 4 Jun 2020 19:16:33 +0200 Subject: [PATCH 062/203] add tag variant for 2.3 (#5542) --- website/docs/api/cli.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index b49a2fb08..6f4b8bb73 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -541,16 +541,16 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| -------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. 
| +| Argument | Type | Description | +| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. | ## Evaluate {#evaluate new="2"} From 1ac43d78f9f8e1d0fea518be0c020888cf117bda Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Thu, 4 Jun 2020 20:02:05 +0200 Subject: [PATCH 063/203] Avoid libc.stdint for UINT64_MAX (#5545) --- spacy/lexeme.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index dec2993fa..1df516dcb 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -12,7 +12,6 @@ import numpy import warnings from thinc.neural.util import get_array_module -from libc.stdint cimport UINT64_MAX from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP @@ -23,7 +22,7 @@ from .attrs import intify_attrs from .errors import Errors, Warnings -OOV_RANK = UINT64_MAX +OOV_RANK = 0xffffffffffffffff # UINT64_MAX memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) EMPTY_LEXEME.id = OOV_RANK From 009119fa66c39f86fe500e35c087cb67dec5a4a8 Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Sat, 6 Jun 2020 00:22:18 +0200 Subject: [PATCH 064/203] Requirements/setup for Japanese (#5553) * Add sudachipy and sudachidict_core to Makefile * Switch ja requirements from fugashi to sudachipy --- Makefile | 4 ++-- setup.cfg | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 2764da118..865bf44c5 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ VENV := ./env$(PYVER) version := $(shell "bin/get-version.sh") dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core chmod a+rx $@ cp $@ dist/spacy.pex @@ -15,7 +15,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl 
wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . -w ./wheelhouse - $(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.22 -w ./wheelhouse + $(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core -w ./wheelhouse touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex diff --git a/setup.cfg b/setup.cfg index 1e29f1ead..e556ba19c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -78,7 +78,8 @@ cuda102 = cupy-cuda102>=5.0.0b4,<9.0.0 # Language tokenizers with external dependencies ja = - fugashi>=0.1.3 + sudachipy>=0.4.5 + sudachidict_core>=20200330 ko = natto-py==0.9.0 th = From 456bf47f5184127510e39aaef7135a8ed979bc86 Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com> Date: Mon, 8 Jun 2020 22:49:34 +0900 Subject: [PATCH 065/203] fix a bug causing mis-alignments (#5560) --- .github/contributors/hiroshi-matsuda-rit.md | 106 ++++++++++++++++++++ spacy/lang/ja/__init__.py | 82 +++++++++++---- 2 files changed, 169 insertions(+), 19 deletions(-) create mode 100644 .github/contributors/hiroshi-matsuda-rit.md diff --git a/.github/contributors/hiroshi-matsuda-rit.md b/.github/contributors/hiroshi-matsuda-rit.md new file mode 100644 index 000000000..bf19125fb --- /dev/null +++ b/.github/contributors/hiroshi-matsuda-rit.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Hiroshi Matsuda | +| Company name (if applicable) | Megagon Labs, Tokyo | +| Title or role (if applicable) | Research Scientist | +| Date | June 6, 2020 | +| GitHub username | hiroshi-matsuda-rit | +| Website (optional) | | diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 09546467e..a623c7bdd 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -1,7 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -import re from collections import namedtuple from .stop_words import STOP_WORDS @@ -14,7 +13,9 @@ from ...compat import copy_reg from ...language import Language from ...symbols import POS from ...tokens import Doc -from ...util import DummyTokenizer, get_words_and_spaces +from ...util import DummyTokenizer + +from ...errors import Errors # Hold the attributes we need with convenient names DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"]) @@ -41,7 +42,7 @@ def try_sudachi_import(): ) -def resolve_pos(token, next_token): +def resolve_pos(orth, pos, next_pos): """If necessary, add a field to the POS tag for UD mapping. Under Universal Dependencies, sometimes the same Unidic POS tag can be mapped differently depending on the literal token or its context @@ -53,22 +54,22 @@ def resolve_pos(token, next_token): # token. # orth based rules - if token.pos in TAG_ORTH_MAP: - orth_map = TAG_ORTH_MAP[token.pos[0]] - if token.surface in orth_map: - return orth_map[token.surface], None + if pos[0] in TAG_ORTH_MAP: + orth_map = TAG_ORTH_MAP[pos[0]] + if orth in orth_map: + return orth_map[orth], None # tag bi-gram mapping - if next_token: - tag_bigram = token.pos[0], next_token.pos[0] + if next_pos: + tag_bigram = pos[0], next_pos[0] if tag_bigram in TAG_BIGRAM_MAP: bipos = TAG_BIGRAM_MAP[tag_bigram] if bipos[0] is None: - return TAG_MAP[token.pos[0]][POS], bipos[1] + return TAG_MAP[pos[0]][POS], bipos[1] else: return bipos - return TAG_MAP[token.pos[0]][POS], None + return TAG_MAP[pos[0]][POS], None # Use a mapping of paired punctuation to avoid splitting quoted sentences. 
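# A minimal, self-contained sketch (not part of the commit above) of how the
# reworked resolve_pos(orth, pos, next_pos) applies the orth-based override
# before falling back to the plain tag map. The two toy mappings copy entries
# from the tag_map.py and tag_orth_map.py hunks; the function name and the
# example tokens are illustrative only, not spaCy API.
from spacy.symbols import POS, DET, PRON

TOY_TAG_MAP = {"連体詞": {POS: DET}}                        # default UD mapping for the tag
TOY_TAG_ORTH_MAP = {"連体詞": {"この": DET, "こんな": PRON}}  # literal-token overrides


def resolve_pos_sketch(orth, pos):
    # orth-based override first, mirroring the first branch of resolve_pos
    orth_map = TOY_TAG_ORTH_MAP.get(pos[0], {})
    if orth in orth_map:
        return orth_map[orth]
    # otherwise fall back to the tag map entry for the coarse Unidic tag
    return TOY_TAG_MAP[pos[0]][POS]


print(resolve_pos_sketch("こんな", ("連体詞",)))  # integer symbol ID for PRON
print(resolve_pos_sketch("大きな", ("連体詞",)))  # falls back to DET from the tag map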
@@ -120,6 +121,48 @@ def get_dtokens(tokenizer, text): words = [ww for ww in words if len(ww.surface) > 0] return words + +def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")): + words = [x.surface for x in dtokens] + if "".join("".join(words).split()) != "".join(text.split()): + raise ValueError(Errors.E194.format(text=text, words=words)) + text_words = [] + text_lemmas = [] + text_tags = [] + text_spaces = [] + text_pos = 0 + # normalize words to remove all whitespace tokens + norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()]) + # align words with text + for word, dtoken in zip(norm_words, norm_dtokens): + try: + word_start = text[text_pos:].index(word) + except ValueError: + raise ValueError(Errors.E194.format(text=text, words=words)) + if word_start > 0: + w = text[text_pos:text_pos + word_start] + text_words.append(w) + text_lemmas.append(w) + text_tags.append(gap_tag) + text_spaces.append(False) + text_pos += word_start + text_words.append(word) + text_lemmas.append(dtoken.lemma) + text_tags.append(dtoken.pos) + text_spaces.append(False) + text_pos += len(word) + if text_pos < len(text) and text[text_pos] == " ": + text_spaces[-1] = True + text_pos += 1 + if text_pos < len(text): + w = text[text_pos:] + text_words.append(w) + text_lemmas.append(w) + text_tags.append(gap_tag) + text_spaces.append(False) + return text_words, text_lemmas, text_tags, text_spaces + + class JapaneseTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) @@ -128,22 +171,23 @@ class JapaneseTokenizer(DummyTokenizer): def __call__(self, text): dtokens = get_dtokens(self.tokenizer, text) - words = [x.surface for x in dtokens] - words, spaces = get_words_and_spaces(words, text) - unidic_tags = [",".join(x.pos) for x in dtokens] + words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text) doc = Doc(self.vocab, words=words, spaces=spaces) next_pos = None - for ii, (token, dtoken) in enumerate(zip(doc, dtokens)): - ntoken = dtokens[ii+1] if ii+1 < len(dtokens) else None - token.tag_ = dtoken.pos[0] + for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)): + token.tag_ = unidic_tag[0] if next_pos: token.pos = next_pos next_pos = None else: - token.pos, next_pos = resolve_pos(dtoken, ntoken) + token.pos, next_pos = resolve_pos( + token.orth_, + unidic_tag, + unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None + ) # if there's no lemma info (it's an unk) just use the surface - token.lemma_ = dtoken.lemma + token.lemma_ = lemma doc.user_data["unidic_tags"] = unidic_tags separate_sentences(doc) From 3bf111585d251ceb6dc41ca5c097a85ca194fb3f Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Mon, 8 Jun 2020 16:29:05 +0200 Subject: [PATCH 066/203] Update Japanese tokenizer config and add serialization (#5562) * Use `config` dict for tokenizer settings * Add serialization of split mode setting * Add tests for tokenizer split modes and serialization of split mode setting Based on #5561 --- spacy/lang/ja/__init__.py | 77 +++++++++++++++++++++++---- spacy/tests/lang/ja/test_serialize.py | 37 +++++++++++++ spacy/tests/lang/ja/test_tokenizer.py | 26 +++++++-- 3 files changed, 127 insertions(+), 13 deletions(-) create mode 100644 spacy/tests/lang/ja/test_serialize.py diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index a623c7bdd..294c6b38d 100644 --- a/spacy/lang/ja/__init__.py +++ 
b/spacy/lang/ja/__init__.py @@ -1,7 +1,8 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from collections import namedtuple +import srsly +from collections import namedtuple, OrderedDict from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS @@ -10,12 +11,13 @@ from .tag_orth_map import TAG_ORTH_MAP from .tag_bigram_map import TAG_BIGRAM_MAP from ...attrs import LANG from ...compat import copy_reg +from ...errors import Errors from ...language import Language from ...symbols import POS from ...tokens import Doc from ...util import DummyTokenizer +from ... import util -from ...errors import Errors # Hold the attributes we need with convenient names DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"]) @@ -26,14 +28,20 @@ DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"]) DummySpace = DummyNode(" ", " ", " ") -def try_sudachi_import(): +def try_sudachi_import(split_mode="A"): """SudachiPy is required for Japanese support, so check for it. - It it's not available blow up and explain how to fix it.""" + It it's not available blow up and explain how to fix it. + split_mode should be one of these values: "A", "B", "C", None->"A".""" try: from sudachipy import dictionary, tokenizer - + split_mode = { + None: tokenizer.Tokenizer.SplitMode.A, + "A": tokenizer.Tokenizer.SplitMode.A, + "B": tokenizer.Tokenizer.SplitMode.B, + "C": tokenizer.Tokenizer.SplitMode.C, + }[split_mode] tok = dictionary.Dictionary().create( - mode=tokenizer.Tokenizer.SplitMode.A + mode=split_mode ) return tok except ImportError: @@ -164,9 +172,10 @@ def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")): class JapaneseTokenizer(DummyTokenizer): - def __init__(self, cls, nlp=None): + def __init__(self, cls, nlp=None, config={}): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.tokenizer = try_sudachi_import() + self.split_mode = config.get("split_mode", None) + self.tokenizer = try_sudachi_import(self.split_mode) def __call__(self, text): dtokens = get_dtokens(self.tokenizer, text) @@ -193,6 +202,54 @@ class JapaneseTokenizer(DummyTokenizer): separate_sentences(doc) return doc + def _get_config(self): + config = OrderedDict( + ( + ("split_mode", self.split_mode), + ) + ) + return config + + def _set_config(self, config={}): + self.split_mode = config.get("split_mode", None) + + def to_bytes(self, **kwargs): + serializers = OrderedDict( + ( + ("cfg", lambda: srsly.json_dumps(self._get_config())), + ) + ) + return util.to_bytes(serializers, []) + + def from_bytes(self, data, **kwargs): + deserializers = OrderedDict( + ( + ("cfg", lambda b: self._set_config(srsly.json_loads(b))), + ) + ) + util.from_bytes(data, deserializers, []) + self.tokenizer = try_sudachi_import(self.split_mode) + return self + + def to_disk(self, path, **kwargs): + path = util.ensure_path(path) + serializers = OrderedDict( + ( + ("cfg", lambda p: srsly.write_json(p, self._get_config())), + ) + ) + return util.to_disk(path, serializers, []) + + def from_disk(self, path, **kwargs): + path = util.ensure_path(path) + serializers = OrderedDict( + ( + ("cfg", lambda p: self._set_config(srsly.read_json(p))), + ) + ) + util.from_disk(path, serializers, []) + self.tokenizer = try_sudachi_import(self.split_mode) + class JapaneseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) @@ -203,8 +260,8 @@ class JapaneseDefaults(Language.Defaults): writing_system = {"direction": "ltr", "has_case": False, 
"has_letters": False} @classmethod - def create_tokenizer(cls, nlp=None): - return JapaneseTokenizer(cls, nlp) + def create_tokenizer(cls, nlp=None, config={}): + return JapaneseTokenizer(cls, nlp, config) class Japanese(Language): diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py new file mode 100644 index 000000000..018e645bb --- /dev/null +++ b/spacy/tests/lang/ja/test_serialize.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.ja import Japanese +from ...util import make_tempdir + + +def test_ja_tokenizer_serialize(ja_tokenizer): + tokenizer_bytes = ja_tokenizer.to_bytes() + nlp = Japanese() + nlp.tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + assert nlp.tokenizer.split_mode == None + + with make_tempdir() as d: + file_path = d / "tokenizer" + ja_tokenizer.to_disk(file_path) + nlp = Japanese() + nlp.tokenizer.from_disk(file_path) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + assert nlp.tokenizer.split_mode == None + + # split mode is (de)serialized correctly + nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) + nlp_r = Japanese() + nlp_bytes = nlp.to_bytes() + nlp_r.from_bytes(nlp_bytes) + assert nlp_bytes == nlp_r.to_bytes() + assert nlp_r.tokenizer.split_mode == "B" + + with make_tempdir() as d: + nlp.to_disk(d) + nlp_r = Japanese() + nlp_r.from_disk(d) + assert nlp_bytes == nlp_r.to_bytes() + assert nlp_r.tokenizer.split_mode == "B" diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 5213aed58..82c43fe4c 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import pytest +from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS +from spacy.lang.ja import Japanese # fmt: off TOKENIZER_TESTS = [ @@ -55,21 +57,39 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): pos = [token.pos_ for token in ja_tokenizer(text)] assert pos == expected_pos + @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS) def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents): sents = [str(sent) for sent in ja_tokenizer(text).sents] assert sents == expected_sents -def test_extra_spaces(ja_tokenizer): +def test_ja_tokenizer_extra_spaces(ja_tokenizer): # note: three spaces after "I" tokens = ja_tokenizer("I like cheese.") assert tokens[1].orth_ == " " -from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS @pytest.mark.parametrize("text", NAUGHTY_STRINGS) -def test_tokenizer_naughty_strings(ja_tokenizer, text): +def test_ja_tokenizer_naughty_strings(ja_tokenizer, text): tokens = ja_tokenizer(text) assert tokens.text_with_ws == text + +@pytest.mark.parametrize("text,len_a,len_b,len_c", + [ + ("選挙管理委員会", 4, 3, 1), + ("客室乗務員", 3, 2, 1), + ("労働者協同組合", 4, 3, 1), + ("機能性食品", 3, 2, 1), + ] +) +def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): + nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}}) + nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) + nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}}) + + assert len(ja_tokenizer(text)) == len_a + assert len(nlp_a(text)) == len_a + assert len(nlp_b(text)) == len_b + assert len(nlp_c(text)) == len_c From d1799da200782fb5f3b09bee58cf00092e5a05f0 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Date: Mon, 8 
Jun 2020 19:47:32 +0200 Subject: [PATCH 067/203] bot for answered issues (#5563) * add tiangolo's issue manager * fix formatting * spaces, tabs, who knows * formatting * I'll get this right at some point * maybe one more space ? --- .github/workflows/issue-manager.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/issue-manager.yml diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml new file mode 100644 index 000000000..8a5c1ee94 --- /dev/null +++ b/.github/workflows/issue-manager.yml @@ -0,0 +1,28 @@ +name: Issue Manager + +on: + schedule: + - cron: "0 0 * * *" + issue_comment: + types: + - created + - edited + issues: + types: + - labeled + +jobs: + issue-manager: + runs-on: ubuntu-latest + steps: + - uses: tiangolo/issue-manager@0.2.0 + with: + token: ${{ secrets.GITHUB_TOKEN }} + config: > + { + "answered": { + "delay": "P3D", + "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.", + "remove_label": true + } + } From de00f967ce5fd720633c717252aae83b6f2b1602 Mon Sep 17 00:00:00 2001 From: Martino Mensio <martino.mensio@open.ac.uk> Date: Mon, 8 Jun 2020 19:26:30 +0100 Subject: [PATCH 068/203] adding spacy-universal-sentence-encoder (#5534) * adding spacy-universal-sentence-encoder * update affiliation * updated code example --- .github/contributors/MartinoMensio.md | 4 ++-- website/meta/universe.json | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/contributors/MartinoMensio.md b/.github/contributors/MartinoMensio.md index 1cd32d622..27e453699 100644 --- a/.github/contributors/MartinoMensio.md +++ b/.github/contributors/MartinoMensio.md @@ -99,8 +99,8 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | | Name | Martino Mensio | -| Company name (if applicable) | Polytechnic University of Turin | -| Title or role (if applicable) | Student | +| Company name (if applicable) | The Open University | +| Title or role (if applicable) | PhD Student | | Date | 17 November 2017 | | GitHub username | MartinoMensio | | Website (optional) | https://martinomensio.github.io/ | diff --git a/website/meta/universe.json b/website/meta/universe.json index 58be719ed..2c74a2964 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,29 @@ { "resources": [ + { + "id": "spacy-universal-sentence-encoder", + "title": "SpaCy - Universal Sentence Encoder", + "slogan": "Make use of Google's Universal Sentence Encoder directly within SpaCy", + "description": "This library lets you use Universal Sentence Encoder embeddings of Docs, Spans and Tokens directly from TensorFlow Hub", + "github": "MartinoMensio/spacy-universal-sentence-encoder-tfhub", + "code_example": [ + "import spacy_universal_sentence_encoder", + "load one of the models: ['en_use_md', 'en_use_lg', 'xx_use_md', 'xx_use_lg']", + "nlp = spacy_universal_sentence_encoder.load_model('en_use_lg')", + "# get two documents", + "doc_1 = nlp('Hi there, how are you?')", + "doc_2 = nlp('Hello there, how are you doing today?')", + "# use the similarity method that is based on the vectors, on Doc, Span or Token", + "print(doc_1.similarity(doc_2[0:7]))" + ], + "category": ["models", "pipeline"], + "author": "Martino Mensio", + "author_links": { + "twitter": "MartinoMensio", + "github": "MartinoMensio", + "website": "https://martinomensio.github.io" + } + }, { "id": "whatlies", "title": 
"whatlies", From f162815f45c69dd71e194361284dbef3939fb9fc Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Mon, 8 Jun 2020 21:09:23 +0200 Subject: [PATCH 069/203] Handle empty and whitespace-only docs for Japanese (#5564) Handle empty and whitespace-only docs in the custom alignment method used by the Japanese tokenizer. --- spacy/lang/ja/__init__.py | 10 ++++++++++ spacy/tests/lang/ja/test_tokenizer.py | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 294c6b38d..39e0445c2 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -139,6 +139,16 @@ def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")): text_tags = [] text_spaces = [] text_pos = 0 + # handle empty and whitespace-only texts + if len(words) == 0: + return text_words, text_lemmas, text_tags, text_spaces + elif len([word for word in words if not word.isspace()]) == 0: + assert text.isspace() + text_words = [text] + text_lemmas = [text] + text_tags = [gap_tag] + text_spaces = [False] + return text_words, text_lemmas, text_tags, text_spaces # normalize words to remove all whitespace tokens norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()]) # align words with text diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 82c43fe4c..30cba42b1 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -93,3 +93,12 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): assert len(nlp_a(text)) == len_a assert len(nlp_b(text)) == len_b assert len(nlp_c(text)) == len_c + + +def test_ja_tokenizer_emptyish_texts(ja_tokenizer): + doc = ja_tokenizer("") + assert len(doc) == 0 + doc = ja_tokenizer(" ") + assert len(doc) == 1 + doc = ja_tokenizer("\n\n\n \t\t \n\n\n") + assert len(doc) == 1 From 86112d2168dc1d763a233a8c531f09002101818e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Date: Tue, 9 Jun 2020 08:57:38 +0200 Subject: [PATCH 070/203] update issue manager's version --- .github/workflows/issue-manager.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index 8a5c1ee94..b789494a2 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -15,7 +15,7 @@ jobs: issue-manager: runs-on: ubuntu-latest steps: - - uses: tiangolo/issue-manager@0.2.0 + - uses: tiangolo/issue-manager@0.2.1 with: token: ${{ secrets.GITHUB_TOKEN }} config: > From b7e6e1b9a75ea1301ea8253cd2c6a5d3740cef12 Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Tue, 9 Jun 2020 12:00:59 +0200 Subject: [PATCH 071/203] Disable sentence segmentation in ja tokenizer (#5566) --- spacy/lang/ja/__init__.py | 1 - spacy/tests/lang/ja/test_tokenizer.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 39e0445c2..371cc0f98 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -209,7 +209,6 @@ class JapaneseTokenizer(DummyTokenizer): token.lemma_ = lemma doc.user_data["unidic_tags"] = unidic_tags - separate_sentences(doc) return doc def _get_config(self): diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 30cba42b1..26be5cf59 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py 
+++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -58,6 +58,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): assert pos == expected_pos +@pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy") @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS) def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents): sents = [str(sent) for sent in ja_tokenizer(text).sents] From 0a70bd62811778b59429fa23871b6ca862678636 Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Tue, 9 Jun 2020 15:47:31 +0200 Subject: [PATCH 072/203] Bump version to 2.3.0.dev1 (#5567) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index be1b3ae56..90b5f9245 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.3.0.dev0" +__version__ = "2.3.0.dev1" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 12c1965070a1a8bbe80efaae5755116633d94886 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Date: Wed, 10 Jun 2020 10:46:12 +0200 Subject: [PATCH 073/203] set delay to 7 days --- .github/workflows/issue-manager.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index b789494a2..b52095fe8 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -21,7 +21,7 @@ jobs: config: > { "answered": { - "delay": "P3D", + "delay": "P7D", "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.", "remove_label": true } From 28db7dd5d9aaf53a3c4e9b13048415502d998aae Mon Sep 17 00:00:00 2001 From: Jones Martins <jonesmvc@gmail.com> Date: Wed, 10 Jun 2020 13:47:04 -0300 Subject: [PATCH 074/203] Add missing pronoums/determiners (#5569) * Add missing pronoums/determiners * Add test for missing pronoums * Add contributor file --- .github/contributors/jonesmartins.md | 106 +++++++++++++++++++++++++ spacy/lang/en/tokenizer_exceptions.py | 2 +- spacy/tests/lang/en/test_exceptions.py | 2 +- 3 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 .github/contributors/jonesmartins.md diff --git a/.github/contributors/jonesmartins.md b/.github/contributors/jonesmartins.md new file mode 100644 index 000000000..5663f6193 --- /dev/null +++ b/.github/contributors/jonesmartins.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jones Martins | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-10 | +| GitHub username | jonesmartins | +| Website (optional) | | diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 6a553052b..f8367c0f5 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -139,7 +139,7 @@ for pron in ["he", "she", "it"]: # W-words, relative pronouns, prepositions etc. -for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: +for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]: for orth in [word, word.title()]: _exc[orth + "'s"] = [ {ORTH: orth, LEMMA: word, NORM: word}, diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index a78e1815f..1ff64eff2 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -46,7 +46,7 @@ def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text): assert tokens[0].text == text -@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll"]) +@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll", "this'll", "those'll"]) def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 From bab30e4ad2ad35d7133b7f8027a3558a02e018e4 Mon Sep 17 00:00:00 2001 From: Jones Martins <jonesmvc@gmail.com> Date: Wed, 10 Jun 2020 16:54:06 -0300 Subject: [PATCH 075/203] Add "c'mon" token exception (#5570) * Add "c'mon" exception * Fix typo in "C'mon" exception --- spacy/lang/en/tokenizer_exceptions.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index f8367c0f5..964a714ae 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -399,6 +399,14 @@ _other_exc = { {ORTH: "Let", LEMMA: "let", NORM: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}, ], + "c'mon": [ + {ORTH: "c'm", NORM: "come", LEMMA: "come"}, + {ORTH: "on"} + ], + "C'mon": [ + {ORTH: "C'm", NORM: "come", LEMMA: "come"}, + {ORTH: "on"} + ] } _exc.update(_other_exc) From fe167fcf7d23ee6c73877a11351984221a9aacd5 Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Thu, 11 Jun 2020 10:23:50 +0200 Subject: [PATCH 076/203] Update pytest conf for sudachipy with Japanese (#5574) --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 63bbf2e0a..1f13da5d6 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -140,7 +140,7 @@ def it_tokenizer(): @pytest.fixture(scope="session") def ja_tokenizer(): - pytest.importorskip("fugashi") + pytest.importorskip("sudachipy") return 
get_lang_class("ja").Defaults.create_tokenizer() From 556895177edbc5d7dc64e0f95e36273a2fb16478 Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Thu, 11 Jun 2020 13:47:37 +0200 Subject: [PATCH 077/203] Expand Japanese requirements warning (#5572) Include explicit install instructions in Japanese requirements warning. --- spacy/lang/ja/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 371cc0f98..a7ad0846e 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -46,7 +46,10 @@ def try_sudachi_import(split_mode="A"): return tok except ImportError: raise ImportError( - "Japanese support requires SudachiPy: " "https://github.com/WorksApplications/SudachiPy" + "Japanese support requires SudachiPy and SudachiDict-core " + "(https://github.com/WorksApplications/SudachiPy). " + "Install with `pip install sudachipy sudachidict_core` or " + "install spaCy with `pip install spacy[ja]`." ) From 18c6dc8093df4e075f6168b98afd500a73a384e6 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Date: Thu, 11 Jun 2020 14:09:40 +0200 Subject: [PATCH 078/203] removing label both on comment and on close --- .github/workflows/issue-manager.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index b52095fe8..3fb42ed01 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -20,9 +20,10 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} config: > { - "answered": { + "resolved": { "delay": "P7D", "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.", - "remove_label": true + "remove_label_on_comment": true, + "remove_label_on_close": true } } From fa46e0bef2226d1ba673537d2097d92f151304c5 Mon Sep 17 00:00:00 2001 From: theudas <psodmann@gmail.com> Date: Fri, 12 Jun 2020 02:03:23 +0200 Subject: [PATCH 079/203] Added Parameter to NEL to take n sentences into account (#5548) * added setting for neighbour sentence in NEL * added spaCy contributor agreement * added multi sentence also for training * made the try-except block smaller --- .github/contributors/theudas.md | 106 ++++++++++++++++++++++++++ spacy/pipeline/pipes.pyx | 131 ++++++++++++++++++++------------ 2 files changed, 189 insertions(+), 48 deletions(-) create mode 100644 .github/contributors/theudas.md diff --git a/.github/contributors/theudas.md b/.github/contributors/theudas.md new file mode 100644 index 000000000..3d8a2bd95 --- /dev/null +++ b/.github/contributors/theudas.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Philipp Sodmann | +| Company name (if applicable) | Empolis | +| Title or role (if applicable) | | +| Date | 2017-05-06 | +| GitHub username | theudas | +| Website (optional) | | diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 105ce00e6..01472a6d0 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1170,6 +1170,9 @@ class EntityLinker(Pipe): self.model = True self.kb = None self.cfg = dict(cfg) + + # how many neightbour sentences to take into account + self.n_sents = cfg.get("n_sents", 0) def set_kb(self, kb): self.kb = kb @@ -1218,6 +1221,9 @@ class EntityLinker(Pipe): for doc, gold in zip(docs, golds): ents_by_offset = dict() + + sentences = [s for s in doc.sents] + for ent in doc.ents: ents_by_offset[(ent.start_char, ent.end_char)] = ent @@ -1228,17 +1234,34 @@ class EntityLinker(Pipe): # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt if not (start, end) in ents_by_offset: raise RuntimeError(Errors.E188) + ent = ents_by_offset[(start, end)] for kb_id, value in kb_dict.items(): # Currently only training on the positive instances if value: try: - sentence_docs.append(ent.sent.as_doc()) + # find the sentence in the list of sentences. + sent_index = sentences.index(ent.sent) + except AttributeError: # Catch the exception when ent.sent is None and provide a user-friendly warning raise RuntimeError(Errors.E030) + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - self.n_sents) + + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + + # append that span as a doc to training + sent_doc = doc[start_token:end_token].as_doc() + sentence_docs.append(sent_doc) + sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None) bp_context(d_scores, sgd=sgd) @@ -1309,69 +1332,81 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] + for i, doc in enumerate(docs): + sentences = [s for s in doc.sents] + if len(doc) > 0: # Looping through each sentence and each entity # This may go wrong if there are entities across sentences - which shouldn't happen normally. 
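# A small illustrative sketch (not part of the commit above) of the sentence
# window the new n_sents setting builds around the sentence that contains an
# entity, including the clipping at the document boundaries. Plain (start, end)
# token offsets stand in for the Span.start / Span.end values used in the
# patch; the numbers are made up.
n_sents = 1
sentences = [(0, 5), (5, 12), (12, 20), (20, 26)]  # token offsets of 4 sentences

sent_index = 0  # the entity sits in the first sentence
start_sentence = max(0, sent_index - n_sents)                 # 0 -> clipped at doc start
end_sentence = min(len(sentences) - 1, sent_index + n_sents)  # 1
start_token = sentences[start_sentence][0]
end_token = sentences[end_sentence][1]
print(start_token, end_token)  # 0 12 -> doc[0:12] (sentences 0-1) is fed to the encoder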
- for sent in doc.sents: - sent_doc = sent.as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) + for sent_index, sent in enumerate(sentences): + if sent.ents: + # get n_neightbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - for ent in sent_doc.ents: - entity_count += 1 + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - final_tensors.append(sentence_encoding) + sent_doc = doc[start_token:end_token].as_doc() - else: - candidates = self.kb.get_candidates(ent.text) - if not candidates: - # no prediction possible for this entity - setting to NIL + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model([sent_doc])[0] + xp = get_array_module(sentence_encoding) + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + + for ent in sent.ents: + entity_count += 1 + + to_discard = self.cfg.get("labels_discard", []) + if to_discard and ent.label_ in to_discard: + # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - - # TODO: thresholding - final_kb_ids.append(candidates[0].entity_) - final_tensors.append(sentence_encoding) - else: - random.shuffle(candidates) + candidates = self.kb.get_candidates(ent.text) + if not candidates: + # no prediction possible for this entity - setting to NIL + final_kb_ids.append(self.NIL) + final_tensors.append(sentence_encoding) - # this will set all prior probabilities to 0 if they should be excluded from the model - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.cfg.get("incl_prior", True): - prior_probs = xp.asarray([0.0 for c in candidates]) - scores = prior_probs + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate - # add in similarity from the context - if self.cfg.get("incl_context", True): - entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + final_tensors.append(sentence_encoding) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + else: + random.shuffle(candidates) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs*sims) + # this will set all prior probabilities to 0 if they should be excluded from the model + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.cfg.get("incl_prior", True): + prior_probs = xp.asarray([0.0 for c in candidates]) + scores = prior_probs - # TODO: thresholding - best_index = scores.argmax() - best_candidate = candidates[best_index] - 
final_kb_ids.append(best_candidate.entity_) - final_tensors.append(sentence_encoding) + # add in similarity from the context + if self.cfg.get("incl_context", True): + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + + if len(entity_encodings) != len(prior_probs): + raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs*sims) + + # TODO: thresholding + best_index = scores.argmax() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) + final_tensors.append(sentence_encoding) if not (len(final_tensors) == len(final_kb_ids) == entity_count): raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) From 44967a3f9cfc3e20375aac3782897325785e15a9 Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Thu, 11 Jun 2020 10:23:50 +0200 Subject: [PATCH 080/203] Update pytest conf for sudachipy with Japanese (#5574) --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 63bbf2e0a..1f13da5d6 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -140,7 +140,7 @@ def it_tokenizer(): @pytest.fixture(scope="session") def ja_tokenizer(): - pytest.importorskip("fugashi") + pytest.importorskip("sudachipy") return get_lang_class("ja").Defaults.create_tokenizer() From 4724fa4cf4b24be92a15c39c564d571eeae1470a Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Thu, 11 Jun 2020 13:47:37 +0200 Subject: [PATCH 081/203] Expand Japanese requirements warning (#5572) Include explicit install instructions in Japanese requirements warning. --- spacy/lang/ja/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 371cc0f98..a7ad0846e 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -46,7 +46,10 @@ def try_sudachi_import(split_mode="A"): return tok except ImportError: raise ImportError( - "Japanese support requires SudachiPy: " "https://github.com/WorksApplications/SudachiPy" + "Japanese support requires SudachiPy and SudachiDict-core " + "(https://github.com/WorksApplications/SudachiPy). " + "Install with `pip install sudachipy sudachidict_core` or " + "install spaCy with `pip install spacy[ja]`." 
) From 3f5e2f9d99bc8ad3b86c53b8c9eadcba56c5a1a7 Mon Sep 17 00:00:00 2001 From: theudas <psodmann@gmail.com> Date: Fri, 12 Jun 2020 02:03:23 +0200 Subject: [PATCH 082/203] Added Parameter to NEL to take n sentences into account (#5548) * added setting for neighbour sentence in NEL * added spaCy contributor agreement * added multi sentence also for training * made the try-except block smaller --- .github/contributors/theudas.md | 106 ++++++++++++++++++++++++++ spacy/pipeline/pipes.pyx | 131 ++++++++++++++++++++------------ 2 files changed, 189 insertions(+), 48 deletions(-) create mode 100644 .github/contributors/theudas.md diff --git a/.github/contributors/theudas.md b/.github/contributors/theudas.md new file mode 100644 index 000000000..3d8a2bd95 --- /dev/null +++ b/.github/contributors/theudas.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
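For context on the code change that accompanies this agreement: the substantive addition in this patch is a new `n_sents` setting, read in `EntityLinker.__init__` via `cfg.get("n_sents", 0)`, which pulls up to that many neighbouring sentences on either side of an entity's sentence into the context used for training and prediction. A minimal sketch of how the setting might be passed in, assuming spaCy v2.x's `create_pipe(name, config)` API; the model name, the value `2`, and the commented-out knowledge-base step are placeholders, not part of the patch:

```python
import spacy

# Sketch only: "en_core_web_md" and n_sents=2 are placeholder choices, and the
# knowledge base is assumed to have been built elsewhere.
nlp = spacy.load("en_core_web_md")

# create_pipe() passes the config dict through to EntityLinker.__init__ as **cfg,
# so "n_sents" lands in self.cfg and controls how many neighbouring sentences
# (before and after) are added to each entity's context.
entity_linker = nlp.create_pipe("entity_linker", config={"n_sents": 2})
# entity_linker.set_kb(kb)  # attach a previously built KnowledgeBase (not shown)
nlp.add_pipe(entity_linker, last=True)
```

With `n_sents` left at its default of `0`, the start and end sentence indices collapse to the entity's own sentence, so the behaviour matches the previous single-sentence context.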
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Philipp Sodmann | +| Company name (if applicable) | Empolis | +| Title or role (if applicable) | | +| Date | 2017-05-06 | +| GitHub username | theudas | +| Website (optional) | | diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 105ce00e6..01472a6d0 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1170,6 +1170,9 @@ class EntityLinker(Pipe): self.model = True self.kb = None self.cfg = dict(cfg) + + # how many neightbour sentences to take into account + self.n_sents = cfg.get("n_sents", 0) def set_kb(self, kb): self.kb = kb @@ -1218,6 +1221,9 @@ class EntityLinker(Pipe): for doc, gold in zip(docs, golds): ents_by_offset = dict() + + sentences = [s for s in doc.sents] + for ent in doc.ents: ents_by_offset[(ent.start_char, ent.end_char)] = ent @@ -1228,17 +1234,34 @@ class EntityLinker(Pipe): # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt if not (start, end) in ents_by_offset: raise RuntimeError(Errors.E188) + ent = ents_by_offset[(start, end)] for kb_id, value in kb_dict.items(): # Currently only training on the positive instances if value: try: - sentence_docs.append(ent.sent.as_doc()) + # find the sentence in the list of sentences. + sent_index = sentences.index(ent.sent) + except AttributeError: # Catch the exception when ent.sent is None and provide a user-friendly warning raise RuntimeError(Errors.E030) + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - self.n_sents) + + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + + # append that span as a doc to training + sent_doc = doc[start_token:end_token].as_doc() + sentence_docs.append(sent_doc) + sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None) bp_context(d_scores, sgd=sgd) @@ -1309,69 +1332,81 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] + for i, doc in enumerate(docs): + sentences = [s for s in doc.sents] + if len(doc) > 0: # Looping through each sentence and each entity # This may go wrong if there are entities across sentences - which shouldn't happen normally. 
- for sent in doc.sents: - sent_doc = sent.as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) + for sent_index, sent in enumerate(sentences): + if sent.ents: + # get n_neightbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - for ent in sent_doc.ents: - entity_count += 1 + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - final_tensors.append(sentence_encoding) + sent_doc = doc[start_token:end_token].as_doc() - else: - candidates = self.kb.get_candidates(ent.text) - if not candidates: - # no prediction possible for this entity - setting to NIL + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model([sent_doc])[0] + xp = get_array_module(sentence_encoding) + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + + for ent in sent.ents: + entity_count += 1 + + to_discard = self.cfg.get("labels_discard", []) + if to_discard and ent.label_ in to_discard: + # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - - # TODO: thresholding - final_kb_ids.append(candidates[0].entity_) - final_tensors.append(sentence_encoding) - else: - random.shuffle(candidates) + candidates = self.kb.get_candidates(ent.text) + if not candidates: + # no prediction possible for this entity - setting to NIL + final_kb_ids.append(self.NIL) + final_tensors.append(sentence_encoding) - # this will set all prior probabilities to 0 if they should be excluded from the model - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.cfg.get("incl_prior", True): - prior_probs = xp.asarray([0.0 for c in candidates]) - scores = prior_probs + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate - # add in similarity from the context - if self.cfg.get("incl_context", True): - entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + final_tensors.append(sentence_encoding) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + else: + random.shuffle(candidates) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs*sims) + # this will set all prior probabilities to 0 if they should be excluded from the model + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.cfg.get("incl_prior", True): + prior_probs = xp.asarray([0.0 for c in candidates]) + scores = prior_probs - # TODO: thresholding - best_index = scores.argmax() - best_candidate = candidates[best_index] - 
final_kb_ids.append(best_candidate.entity_) - final_tensors.append(sentence_encoding) + # add in similarity from the context + if self.cfg.get("incl_context", True): + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + + if len(entity_encodings) != len(prior_probs): + raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs*sims) + + # TODO: thresholding + best_index = scores.argmax() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) + final_tensors.append(sentence_encoding) if not (len(final_tensors) == len(final_kb_ids) == entity_count): raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) From aa5b40fa6423916ae79bf6e750a17c50020f4078 Mon Sep 17 00:00:00 2001 From: Arvind Srinivasan <arvind@cheenu.net> Date: Sat, 13 Jun 2020 19:26:26 +0530 Subject: [PATCH 083/203] Added Tamil Example Sentences (#5583) * Added Examples for Tamil Sentences #### Description This PR add example sentences for the Tamil language which were missing as per issue #1107 #### Type of Change This is an enhancement. * Accepting spaCy Contributor Agreement * Signed on my behalf as an individual --- .github/contributors/Arvindcheenu.md | 106 +++++++++++++++++++++++++++ spacy/lang/ta/examples.py | 5 ++ 2 files changed, 111 insertions(+) create mode 100644 .github/contributors/Arvindcheenu.md diff --git a/.github/contributors/Arvindcheenu.md b/.github/contributors/Arvindcheenu.md new file mode 100644 index 000000000..707a9821d --- /dev/null +++ b/.github/contributors/Arvindcheenu.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Arvind Srinivasan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-13 | +| GitHub username | arvindcheenu | +| Website (optional) | | diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 3ce3c3544..c34e77129 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -18,4 +18,9 @@ sentences = [ "இந்த ஃபோனுடன் சுமார் ரூ.2,990 மதிப்புள்ள போட் ராக்கர்ஸ் நிறுவனத்தின் ஸ்போர்ட் புளூடூத் ஹெட்போன்ஸ் இலவசமாக வழங்கப்படவுள்ளது.", "மட்டக்களப்பில் பல இடங்களில் வீட்டுத் திட்டங்களுக்கு இன்று அடிக்கல் நாட்டல்", "ஐ போன்க்கு முகத்தை வைத்து அன்லாக் செய்யும் முறை மற்றும் விரலால் தொட்டு அன்லாக் செய்யும் முறையை வாட்ஸ் ஆப் நிறுவனம் இதற்கு முன் கண்டுபிடித்தது", + "இது ஒரு வாக்கியம்.", + "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது", + "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன", + "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது", + "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்." ] From c482f20778f3464fefbc7aa57782de5fe713a77f Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Mon, 15 Jun 2020 14:56:04 +0200 Subject: [PATCH 084/203] Fix and add warnings related to spacy-lookups-data (#5588) * Fix warning message for lemmatization tables * Add a warning when the `lexeme_norm` table is empty. (Given the relatively lang-specific loading for `Lookups`, it seemed like too much overhead to dynamically extract the list of languages, so for now it's hard-coded.) --- spacy/errors.py | 13 ++++++++++--- spacy/pipeline/pipes.pyx | 2 ++ spacy/syntax/nn_parser.pyx | 5 ++++- spacy/tests/parser/test_ner.py | 17 +++++++++++++++++ spacy/tests/test_lemmatizer.py | 6 +++--- 5 files changed, 36 insertions(+), 7 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index baed574f8..a25661a20 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -92,9 +92,9 @@ class Warnings(object): W022 = ("Training a new part-of-speech tagger using a model with no " "lemmatization rules or data. This means that the trained model " "may not be able to lemmatize correctly. If this is intentional " - "or the language you're using doesn't have lemmatization data. " - "If this is surprising, make sure you have the spacy-lookups-data " - "package installed.") + "or the language you're using doesn't have lemmatization data, " + "please ignore this warning. If this is surprising, make sure you " + "have the spacy-lookups-data package installed.") W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " @@ -127,6 +127,13 @@ class Warnings(object): "this, download a newer compatible model or retrain your custom " "model with the current spaCy version. For more details and " "available updates, run: python -m spacy validate") + W033 = ("Training a new {model} using a model with no lexeme normalization " + "table. This may degrade the performance of the model to some " + "degree. If this is intentional or the language you're using " + "doesn't have a normalization table, please ignore this warning. " + "If this is surprising, make sure you have the spacy-lookups-data " + "package installed. 
The languages with lexeme normalization tables " + "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 01472a6d0..3f40cb545 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -516,6 +516,8 @@ class Tagger(Pipe): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): warnings.warn(Warnings.W022) + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="part-of-speech tagger")) orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() for raw_text, annots_brackets in get_gold_tuples(): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index d5c6bf2a8..6944e9113 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -26,6 +26,7 @@ from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module from thinc.linalg cimport Vec, VecVec import srsly +import warnings from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid @@ -37,7 +38,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse -from ..errors import Errors, TempErrors +from ..errors import Errors, TempErrors, Warnings from .. import util from .stateclass cimport StateClass from ._state cimport StateC @@ -601,6 +602,8 @@ cdef class Parser: **self.cfg.get('optimizer', {})) def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg): + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="parser or NER")) if 'model' in cfg: self.model = cfg['model'] if not hasattr(get_gold_tuples, '__call__'): diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 244e9fa25..dd623e07f 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -4,6 +4,8 @@ from __future__ import unicode_literals import pytest from spacy.lang.en import English +from spacy.language import Language +from spacy.lookups import Lookups from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown @@ -305,6 +307,21 @@ def test_change_number_features(): nlp("hello world") +def test_ner_warns_no_lookups(): + nlp = Language() + nlp.vocab.lookups = Lookups() + assert not len(nlp.vocab.lookups) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + with pytest.warns(UserWarning): + nlp.begin_training() + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" + with pytest.warns(None) as record: + nlp.begin_training() + assert not record.list + + class BlockerComponent1(object): name = "my_blocker" diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index bcda2999a..fce3772c4 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -33,17 +33,17 @@ def test_lemmatizer_reflects_lookups_changes(): assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world" -def test_tagger_warns_no_lemma_lookups(): +def test_tagger_warns_no_lookups(): nlp = Language() nlp.vocab.lookups = Lookups() assert not len(nlp.vocab.lookups) tagger = nlp.create_pipe("tagger") - with pytest.warns(UserWarning): - 
tagger.begin_training() nlp.add_pipe(tagger) with pytest.warns(UserWarning): nlp.begin_training() nlp.vocab.lookups.add_table("lemma_lookup") + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" with pytest.warns(None) as record: nlp.begin_training() assert not record.list From c94f7d0e75e9e4ce25b719edee3adb4ecd74ee50 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Mon, 15 Jun 2020 14:56:51 +0200 Subject: [PATCH 085/203] Updates to docstrings (#5589) --- spacy/gold.pyx | 1 + spacy/vocab.pyx | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index cf67a2ac7..e69ff5933 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -640,6 +640,7 @@ cdef class GoldParse: representing the external IDs in a knowledge base (KB) mapped to either 1.0 or 0.0, indicating positive and negative examples respectively. + make_projective (bool): Whether to projectivize the dependency tree. RETURNS (GoldParse): The newly constructed object. """ self.mem = Pool() diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 68f0ac0db..1b1b04e13 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -46,7 +46,8 @@ cdef class Vocab: vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. lookups_extra (Lookups): Container for optional lookup tables and dictionaries. - name (unicode): Optional name to identify the vectors table. + oov_prob (float): Default OOV probability. + vectors_name (unicode): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} From f698007907518833d853740a5add8cd2b2a253b1 Mon Sep 17 00:00:00 2001 From: Arvind Srinivasan <arvind@cheenu.net> Date: Sat, 13 Jun 2020 19:26:26 +0530 Subject: [PATCH 086/203] Added Tamil Example Sentences (#5583) * Added Examples for Tamil Sentences #### Description This PR add example sentences for the Tamil language which were missing as per issue #1107 #### Type of Change This is an enhancement. * Accepting spaCy Contributor Agreement * Signed on my behalf as an individual --- .github/contributors/Arvindcheenu.md | 106 +++++++++++++++++++++++++++ spacy/lang/ta/examples.py | 5 ++ 2 files changed, 111 insertions(+) create mode 100644 .github/contributors/Arvindcheenu.md diff --git a/.github/contributors/Arvindcheenu.md b/.github/contributors/Arvindcheenu.md new file mode 100644 index 000000000..707a9821d --- /dev/null +++ b/.github/contributors/Arvindcheenu.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. 
+ +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. 
Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Arvind Srinivasan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-13 | +| GitHub username | arvindcheenu | +| Website (optional) | | diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 3ce3c3544..c34e77129 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -18,4 +18,9 @@ sentences = [ "இந்த ஃபோனுடன் சுமார் ரூ.2,990 மதிப்புள்ள போட் ராக்கர்ஸ் நிறுவனத்தின் ஸ்போர்ட் புளூடூத் ஹெட்போன்ஸ் இலவசமாக வழங்கப்படவுள்ளது.", "மட்டக்களப்பில் பல இடங்களில் வீட்டுத் திட்டங்களுக்கு இன்று அடிக்கல் நாட்டல்", "ஐ போன்க்கு முகத்தை வைத்து அன்லாக் செய்யும் முறை மற்றும் விரலால் தொட்டு அன்லாக் செய்யும் முறையை வாட்ஸ் ஆப் நிறுவனம் இதற்கு முன் கண்டுபிடித்தது", + "இது ஒரு வாக்கியம்.", + "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது", + "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன", + "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது", + "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்." ] From e867e9fa8ffe8b7eec9185bb1d35c39c835458d1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Mon, 15 Jun 2020 14:56:04 +0200 Subject: [PATCH 087/203] Fix and add warnings related to spacy-lookups-data (#5588) * Fix warning message for lemmatization tables * Add a warning when the `lexeme_norm` table is empty. (Given the relatively lang-specific loading for `Lookups`, it seemed like too much overhead to dynamically extract the list of languages, so for now it's hard-coded.) --- spacy/errors.py | 13 ++++++++++--- spacy/pipeline/pipes.pyx | 2 ++ spacy/syntax/nn_parser.pyx | 5 ++++- spacy/tests/parser/test_ner.py | 17 +++++++++++++++++ spacy/tests/test_lemmatizer.py | 6 +++--- 5 files changed, 36 insertions(+), 7 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index baed574f8..a25661a20 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -92,9 +92,9 @@ class Warnings(object): W022 = ("Training a new part-of-speech tagger using a model with no " "lemmatization rules or data. This means that the trained model " "may not be able to lemmatize correctly. If this is intentional " - "or the language you're using doesn't have lemmatization data. " - "If this is surprising, make sure you have the spacy-lookups-data " - "package installed.") + "or the language you're using doesn't have lemmatization data, " + "please ignore this warning. If this is surprising, make sure you " + "have the spacy-lookups-data package installed.") W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " @@ -127,6 +127,13 @@ class Warnings(object): "this, download a newer compatible model or retrain your custom " "model with the current spaCy version. For more details and " "available updates, run: python -m spacy validate") + W033 = ("Training a new {model} using a model with no lexeme normalization " + "table. 
This may degrade the performance of the model to some " + "degree. If this is intentional or the language you're using " + "doesn't have a normalization table, please ignore this warning. " + "If this is surprising, make sure you have the spacy-lookups-data " + "package installed. The languages with lexeme normalization tables " + "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 01472a6d0..3f40cb545 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -516,6 +516,8 @@ class Tagger(Pipe): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): warnings.warn(Warnings.W022) + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="part-of-speech tagger")) orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() for raw_text, annots_brackets in get_gold_tuples(): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index d5c6bf2a8..6944e9113 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -26,6 +26,7 @@ from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module from thinc.linalg cimport Vec, VecVec import srsly +import warnings from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid @@ -37,7 +38,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse -from ..errors import Errors, TempErrors +from ..errors import Errors, TempErrors, Warnings from .. 
import util from .stateclass cimport StateClass from ._state cimport StateC @@ -601,6 +602,8 @@ cdef class Parser: **self.cfg.get('optimizer', {})) def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg): + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="parser or NER")) if 'model' in cfg: self.model = cfg['model'] if not hasattr(get_gold_tuples, '__call__'): diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 244e9fa25..dd623e07f 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -4,6 +4,8 @@ from __future__ import unicode_literals import pytest from spacy.lang.en import English +from spacy.language import Language +from spacy.lookups import Lookups from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown @@ -305,6 +307,21 @@ def test_change_number_features(): nlp("hello world") +def test_ner_warns_no_lookups(): + nlp = Language() + nlp.vocab.lookups = Lookups() + assert not len(nlp.vocab.lookups) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + with pytest.warns(UserWarning): + nlp.begin_training() + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" + with pytest.warns(None) as record: + nlp.begin_training() + assert not record.list + + class BlockerComponent1(object): name = "my_blocker" diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index bcda2999a..fce3772c4 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -33,17 +33,17 @@ def test_lemmatizer_reflects_lookups_changes(): assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world" -def test_tagger_warns_no_lemma_lookups(): +def test_tagger_warns_no_lookups(): nlp = Language() nlp.vocab.lookups = Lookups() assert not len(nlp.vocab.lookups) tagger = nlp.create_pipe("tagger") - with pytest.warns(UserWarning): - tagger.begin_training() nlp.add_pipe(tagger) with pytest.warns(UserWarning): nlp.begin_training() nlp.vocab.lookups.add_table("lemma_lookup") + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" with pytest.warns(None) as record: nlp.begin_training() assert not record.list From 0d8405aafac08353d91ead0cf060fd2962e540da Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Mon, 15 Jun 2020 14:56:51 +0200 Subject: [PATCH 088/203] Updates to docstrings (#5589) --- spacy/gold.pyx | 1 + spacy/vocab.pyx | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index cf67a2ac7..e69ff5933 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -640,6 +640,7 @@ cdef class GoldParse: representing the external IDs in a knowledge base (KB) mapped to either 1.0 or 0.0, indicating positive and negative examples respectively. + make_projective (bool): Whether to projectivize the dependency tree. RETURNS (GoldParse): The newly constructed object. """ self.mem = Pool() diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 68f0ac0db..1b1b04e13 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -46,7 +46,8 @@ cdef class Vocab: vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. lookups_extra (Lookups): Container for optional lookup tables and dictionaries. - name (unicode): Optional name to identify the vectors table. + oov_prob (float): Default OOV probability. 
+ vectors_name (unicode): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} From 7ff447c5a0198600bfb8f4a43b042a6ed8276126 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Mon, 15 Jun 2020 18:22:25 +0200 Subject: [PATCH 089/203] Set version to v2.3.0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 90b5f9245..91810fa68 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.3.0.dev1" +__version__ = "2.3.0" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From d5110ffbf2474339ffde948fc6d899873484285e Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Tue, 16 Jun 2020 15:37:35 +0200 Subject: [PATCH 090/203] Documentation updates for v2.3.0 (#5593) * Update website models for v2.3.0 * Add docs for Chinese word segmentation * Tighten up Chinese docs section * Merge branch 'master' into docs/v2.3.0 [ci skip] * Merge branch 'master' into docs/v2.3.0 [ci skip] * Auto-format and update version * Update matcher.md * Update languages and sorting * Typo in landing page * Infobox about token_match behavior * Add meta and basic docs for Japanese * POS -> TAG in models table * Add info about lookups for normalization * Updates to API docs for v2.3 * Update adding norm exceptions for adding languages * Add --omit-extra-lookups to CLI API docs * Add initial draft of "What's New in v2.3" * Add new in v2.3 tags to Chinese and Japanese sections * Add tokenizer to migration section * Add new in v2.3 flags to init-model * Typo * More what's new in v2.3 Co-authored-by: Ines Montani <ines@ines.io> --- README.md | 17 +- website/docs/api/cli.md | 21 ++- website/docs/api/cython-structs.md | 3 - website/docs/api/goldparse.md | 1 + website/docs/api/lexeme.md | 2 +- website/docs/api/matcher.md | 11 +- website/docs/api/sentencizer.md | 2 +- website/docs/api/token.md | 2 +- website/docs/api/vocab.md | 3 + website/docs/usage/adding-languages.md | 34 +++- website/docs/usage/linguistic-features.md | 23 ++- website/docs/usage/models.md | 117 ++++++++++++ website/docs/usage/v2-3.md | 213 ++++++++++++++++++++++ website/meta/languages.json | 149 ++++++++++----- website/meta/sidebars.json | 1 + website/src/templates/models.js | 2 +- website/src/widgets/landing.js | 2 +- website/src/widgets/languages.js | 4 +- 18 files changed, 519 insertions(+), 88 deletions(-) create mode 100644 website/docs/usage/v2-3.md diff --git a/README.md b/README.md index 31dc78d63..4b5f3d0fa 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,12 @@ spaCy is a library for advanced Natural Language Processing in Python and Cython. It's built on the very latest research, and was designed from day one to be used in real products. spaCy comes with [pretrained statistical models](https://spacy.io/models) and word vectors, and -currently supports tokenization for **50+ languages**. It features +currently supports tokenization for **60+ languages**. It features state-of-the-art speed, convolutional **neural network models** for tagging, parsing and **named entity recognition** and easy **deep learning** integration. It's commercial open-source software, released under the MIT license. 
-💫 **Version 2.2 out now!** +💫 **Version 2.3 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [>)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) @@ -32,7 +32,7 @@ It's commercial open-source software, released under the MIT license. | --------------- | -------------------------------------------------------------- | | [spaCy 101] | New to spaCy? Here's everything you need to know! | | [Usage Guides] | How to use spaCy and its features. | -| [New in v2.2] | New features, backwards incompatibilities and migration guide. | +| [New in v2.3] | New features, backwards incompatibilities and migration guide. | | [API Reference] | The detailed reference for spaCy's API. | | [Models] | Download statistical language models for spaCy. | | [Universe] | Libraries, extensions, demos, books and courses. | @@ -40,7 +40,7 @@ It's commercial open-source software, released under the MIT license. | [Contribute] | How to contribute to the spaCy project and code base. | [spacy 101]: https://spacy.io/usage/spacy-101 -[new in v2.2]: https://spacy.io/usage/v2-2 +[new in v2.3]: https://spacy.io/usage/v2-3 [usage guides]: https://spacy.io/usage/ [api reference]: https://spacy.io/api/ [models]: https://spacy.io/models @@ -113,12 +113,13 @@ of `v2.0.13`). pip install spacy ``` -To install additional data tables for lemmatization in **spaCy v2.2+** you can -run `pip install spacy[lookups]` or install +To install additional data tables for lemmatization and normalization in +**spaCy v2.2+** you can run `pip install spacy[lookups]` or install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) separately. The lookups package is needed to create blank models with -lemmatization data, and to lemmatize in languages that don't yet come with -pretrained models and aren't powered by third-party libraries. +lemmatization data for v2.2+ plus normalization data for v2.3+, and to +lemmatize in languages that don't yet come with pretrained models and aren't +powered by third-party libraries. When using pip it is generally recommended to install packages in a virtual environment to avoid modifying system state: diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 6f4b8bb73..fe8877c69 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -541,16 +541,17 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. 
| -| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. | +| Argument | Type | Description | +| ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | +| `--omit-extra-lookups`, `-OEL` <Tag variant="new">2.3</Tag> | flag | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. | ## Evaluate {#evaluate new="2"} diff --git a/website/docs/api/cython-structs.md b/website/docs/api/cython-structs.md index 935bce25d..8ee1f1b9a 100644 --- a/website/docs/api/cython-structs.md +++ b/website/docs/api/cython-structs.md @@ -171,9 +171,6 @@ struct. | `shape` | <Abbr title="uint64_t">`attr_t`</Abbr> | Transform of the lexeme's string, to show orthographic features. | | `prefix` | <Abbr title="uint64_t">`attr_t`</Abbr> | Length-N substring from the start of the lexeme. Defaults to `N=1`. | | `suffix` | <Abbr title="uint64_t">`attr_t`</Abbr> | Length-N substring from the end of the lexeme. Defaults to `N=3`. | -| `cluster` | <Abbr title="uint64_t">`attr_t`</Abbr> | Brown cluster ID. | -| `prob` | `float` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | -| `sentiment` | `float` | A scalar value indicating positivity or negativity. | ### Lexeme.get_struct_attr {#lexeme_get_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"} diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 443913311..5df625991 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -22,6 +22,7 @@ missing – the gradient for those labels will be zero. 
| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | | `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). | | `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). | +| `make_projective` | bool | Whether to projectivize the dependency tree. Defaults to `False.`. | | **RETURNS** | `GoldParse` | The newly constructed object. | ## GoldParse.\_\_len\_\_ {#len tag="method"} diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index feb167a9d..f7f6d654c 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -156,7 +156,7 @@ The L2 norm of the lexeme's vector representation. | `like_url` | bool | Does the lexeme resemble a URL? | | `like_num` | bool | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. | | `like_email` | bool | Does the lexeme resemble an email address? | -| `is_oov` | bool | Is the lexeme out-of-vocabulary? | +| `is_oov` | bool | Does the lexeme have a word vector? | | `is_stop` | bool | Is the lexeme part of a "stop list"? | | `lang` | int | Language of the parent vocabulary. | | `lang_` | unicode | Language of the parent vocabulary. | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index bfd4fb0ec..ac2f898e0 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -40,7 +40,8 @@ string where an integer is expected) or unexpected property names. ## Matcher.\_\_call\_\_ {#call tag="method"} -Find all token sequences matching the supplied patterns on the `Doc`. +Find all token sequences matching the supplied patterns on the `Doc`. As of +spaCy v2.3, the `Matcher` can also be called on `Span` objects. > #### Example > @@ -54,10 +55,10 @@ Find all token sequences matching the supplied patterns on the `Doc`. > matches = matcher(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `doc` | `Doc` | The document to match over. | -| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. | +| Name | Type | Description | +| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3).. | +| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. | <Infobox title="Important note" variant="warning"> diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index c9b935f22..5a1ea162a 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -42,7 +42,7 @@ Initialize the sentencizer. 
| Name | Type | Description | | ------------- | ------------- | ------------------------------------------------------------------------------------------------------ | -| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. Defaults to `[".", "!", "?"].` | +| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. Defaults to `['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。']`. | | **RETURNS** | `Sentencizer` | The newly constructed object. | ## Sentencizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 0fa86b7bc..9f8594c96 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -459,7 +459,7 @@ The L2 norm of the token's vector representation. | `like_url` | bool | Does the token resemble a URL? | | `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | | `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Is the token out-of-vocabulary? | +| `is_oov` | bool | Does the token have a word vector? | | `is_stop` | bool | Is the token part of a "stop list"? | | `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | | `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index e024ab54a..2be6d67ed 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -27,6 +27,9 @@ Create the vocabulary. | `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. | | `lemmatizer` | object | A lemmatizer. Defaults to `None`. | | `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | +| `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. | +| `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. | +| `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. | | `vectors_name` <Tag variant="new">2.2</Tag> | unicode | A name to identify the vectors table. | | **RETURNS** | `Vocab` | The newly constructed object. | diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 29de08266..d42aad705 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -297,9 +297,35 @@ though `$` and `€` are very different, spaCy normalizes them both to `$`. 
This way, they'll always be seen as similar, no matter how common they were in the training data. -Norm exceptions can be provided as a simple dictionary. For more examples, see -the English -[`norm_exceptions.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/norm_exceptions.py). +As of spaCy v2.3, language-specific norm exceptions are provided as a +JSON dictionary in the package +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) rather +than in the main library. For a full example, see +[`en_lexeme_norm.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_lexeme_norm.json). + +```json +### Example +{ + "cos": "because", + "fav": "favorite", + "accessorise": "accessorize", + "accessorised": "accessorized" +} +``` + +If you're adding tables for a new languages, be sure to add the tables to +[`spacy_lookups_data/__init__.py`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/__init__.py) +and register the entry point under `spacy_lookups` in +[`setup.cfg`](https://github.com/explosion/spacy-lookups-data/blob/master/setup.cfg). + +Alternatively, you can initialize your language [`Vocab`](/api/vocab) with a +[`Lookups`](/api/lookups) object that includes the table `lexeme_norm`. + +<Accordion title="Norm exceptions in spaCy v2.0-v2.2" id="norm-exceptions-v2.2"> + +Previously in spaCy v2.0-v2.2, norm exceptions were provided as a simple python +dictionary. For more examples, see the English +[`norm_exceptions.py`](https://github.com/explosion/spaCy/tree/v2.2.x/spacy/lang/en/norm_exceptions.py). ```python ### Example @@ -327,6 +353,8 @@ norm exceptions overwrite any of the global exceptions, they should be added first. Also note that the tokenizer exceptions will always have priority over the attribute getters. +</Accordion> + ### Lexical attributes {#lex-attrs new="2"} spaCy provides a range of [`Token` attributes](/api/token#attributes) that diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index bcc943436..84bb3d71b 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -732,7 +732,7 @@ rather than performance: ```python def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, - infix_finditer, token_match): + infix_finditer, token_match, url_match): tokens = [] for substring in text.split(): suffixes = [] @@ -829,7 +829,7 @@ for t in tok_exp: ### Customizing spaCy's Tokenizer class {#native-tokenizers} Let's imagine you wanted to create a tokenizer for a new language or specific -domain. There are five things you would need to define: +domain. There are six things you may need to define: 1. A dictionary of **special cases**. This handles things like contractions, units of measurement, emoticons, certain abbreviations, etc. @@ -840,9 +840,22 @@ domain. There are five things you would need to define: 4. A function `infixes_finditer`, to handle non-whitespace separators, such as hyphens etc. 5. An optional boolean function `token_match` matching strings that should never - be split, overriding the infix rules. Useful for things like URLs or numbers. + be split, overriding the infix rules. Useful for things like numbers. 6. An optional boolean function `url_match`, which is similar to `token_match` - except prefixes and suffixes are removed before applying the match. + except that prefixes and suffixes are removed before applying the match. 
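To make the practical difference between `token_match` and `url_match` concrete, here
is a minimal sketch of the v2.3 behavior. It assumes spaCy v2.3 with the
`en_core_web_sm` model installed and uses a deliberately simplified URL regex purely
for illustration (not spaCy's built-in URL pattern); the outputs in the comments are
the expected results under those assumptions, not guaranteed for every model version.

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_sm")
# Simplified URL pattern for illustration only – not the built-in URL_PATTERN
simple_url_re = re.compile(r"^https?://\S+$")

def make_tokenizer(**kwargs):
    # Reuse the loaded model's default prefix/suffix/infix rules and only
    # swap in the match function we want to demonstrate
    return Tokenizer(nlp.vocab,
                     prefix_search=nlp.tokenizer.prefix_search,
                     suffix_search=nlp.tokenizer.suffix_search,
                     infix_finditer=nlp.tokenizer.infix_finditer,
                     **kwargs)

text = "See https://spacy.io, the docs are great"

# token_match is checked before prefixes and suffixes are stripped, so the
# trailing comma stays attached to the URL token
nlp.tokenizer = make_tokenizer(token_match=simple_url_re.match)
print([t.text for t in nlp(text)])
# expected: ['See', 'https://spacy.io,', 'the', 'docs', 'are', 'great']

# url_match is only applied after prefixes and suffixes are removed, so the
# comma is split off first and the remaining string still matches
nlp.tokenizer = make_tokenizer(url_match=simple_url_re.match)
print([t.text for t in nlp(text)])
# expected: ['See', 'https://spacy.io', ',', 'the', 'docs', 'are', 'great']
```

In short, reach for `token_match` when a pattern should win even against the
punctuation rules, and for `url_match` when surrounding punctuation should still be
split off before the pattern is applied.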
+ +<Infobox title="Important note: token match in spaCy v2.2" variant="warning"> + +In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match` +above and there was no match pattern applied before prefixes and suffixes were +analyzed. As of spaCy v2.3.0, the `token_match` has been reverted to its +behavior in v2.2.1 and earlier with precedence over prefixes and suffixes. + +The `url_match` is introduced in v2.3.0 to handle cases like URLs where the +tokenizer should remove prefixes and suffixes (e.g., a comma at the end of a +URL) before applying the match. + +</Infobox> You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its @@ -865,7 +878,7 @@ def custom_tokenizer(nlp): prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer, - token_match=simple_url_re.match) + url_match=simple_url_re.match) nlp = spacy.load("en_core_web_sm") nlp.tokenizer = custom_tokenizer(nlp) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 5fd92f8f3..382193157 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -85,6 +85,123 @@ To load your model with the neutral, multi-language class, simply set `meta.json`. You can also import the class directly, or call [`util.get_lang_class()`](/api/top-level#util.get_lang_class) for lazy-loading. +### Chinese language support {#chinese new=2.3} + +The Chinese language class supports three word segmentation options: + +> ```python +> from spacy.lang.zh import Chinese +> +> # Disable jieba to use character segmentation +> Chinese.Defaults.use_jieba = False +> nlp = Chinese() +> +> # Disable jieba through tokenizer config options +> cfg = {"use_jieba": False} +> nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +> +> # Load with "default" model provided by pkuseg +> cfg = {"pkuseg_model": "default", "require_pkuseg": True} +> nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +> ``` + +1. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word + segmentation by default. It's enabled when you create a new `Chinese` + language class or call `spacy.blank("zh")`. +2. **Character segmentation:** Character segmentation is supported by disabling + `jieba` and setting `Chinese.Defaults.use_jieba = False` _before_ + initializing the language class. As of spaCy v2.3.0, the `meta` tokenizer + config options can be used to configure `use_jieba`. +3. **PKUSeg**: In spaCy v2.3.0, support for + [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support + better segmentation for Chinese OntoNotes and the new + [Chinese models](/models/zh). + +<Accordion title="Details on spaCy's PKUSeg API"> + +The `meta` argument of the `Chinese` language class supports the following +following tokenizer config settings: + +| Name | Type | Description | +| ------------------ | ------- | ---------------------------------------------------------------------------------------------------- | +| `pkuseg_model` | unicode | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory. | +| `pkuseg_user_dict` | unicode | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. | +| `require_pkuseg` | bool | Overrides all `jieba` settings (optional but strongly recommended). 
| + +```python +### Examples +# Load "default" model +cfg = {"pkuseg_model": "default", "require_pkuseg": True} +nlp = Chinese(meta={"tokenizer": {"config": cfg}}) + +# Load local model +cfg = {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True} +nlp = Chinese(meta={"tokenizer": {"config": cfg}}) + +# Override the user directory +cfg = {"pkuseg_model": "default", "require_pkuseg": True, "pkuseg_user_dict": "/path"} +nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +``` + +You can also modify the user dictionary on-the-fly: + +```python +# Append words to user dict +nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"]) + +# Remove all words from user dict and replace with new words +nlp.tokenizer.pkuseg_update_user_dict(["中国"], reset=True) + +# Remove all words from user dict +nlp.tokenizer.pkuseg_update_user_dict([], reset=True) +``` + +</Accordion> + +<Accordion title="Details on pretrained and custom Chinese models"> + +The [Chinese models](/models/zh) provided by spaCy include a custom `pkuseg` +model trained only on +[Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the +models provided by `pkuseg` include data restricted to research use. For +research use, `pkuseg` provides models for several different domains +(`"default"`, `"news"` `"web"`, `"medicine"`, `"tourism"`) and for other uses, +`pkuseg` provides a simple +[training API](https://github.com/lancopku/pkuseg-python/blob/master/readme/readme_english.md#usage): + +```python +import pkuseg +from spacy.lang.zh import Chinese + +# Train pkuseg model +pkuseg.train("train.utf8", "test.utf8", "/path/to/pkuseg_model") +# Load pkuseg model in spaCy Chinese tokenizer +nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}}}) +``` + +</Accordion> + +### Japanese language support {#japanese new=2.3} + +> ```python +> from spacy.lang.ja import Japanese +> +> # Load SudachiPy with split mode A (default) +> nlp = Japanese() +> +> # Load SudachiPy with split mode B +> cfg = {"split_mode": "B"} +> nlp = Japanese(meta={"tokenizer": {"config": cfg}}) +> ``` + +The Japanese language class uses +[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word +segmentation and part-of-speech tagging. The default Japanese language class +and the provided Japanese models use SudachiPy split mode `A`. + +The `meta` argument of the `Japanese` language class can be used to configure +the split mode to `A`, `B` or `C`. + ## Installing and using models {#download} > #### Downloading models in spaCy < v1.7 diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md new file mode 100644 index 000000000..ba75b01ab --- /dev/null +++ b/website/docs/usage/v2-3.md @@ -0,0 +1,213 @@ +--- +title: What's New in v2.3 +teaser: New features, backwards incompatibilities and migration guide +menu: + - ['New Features', 'features'] + - ['Backwards Incompatibilities', 'incompat'] + - ['Migrating from v2.2', 'migrating'] +--- + +## New Features {#features hidden="true"} + +spaCy v2.3 features new pretrained models for five languages, word vectors for +all language models, and decreased model size and loading times for models with +vectors. We've added pretrained models for **Chinese, Danish, Japanese, Polish +and Romanian** and updated the training data and vectors for most languages. +Model packages with vectors are about **2×** smaller on disk and load +**2-4×** faster. 
For the full changelog, see the [release notes on +GitHub](https://github.com/explosion/spaCy/releases/tag/v2.3.0). For more +details and a behind-the-scenes look at the new release, [see our blog +post](https://explosion.ai/blog/spacy-v2-3). + +### Expanded model families with vectors {#models} + +> #### Example +> +> ```bash +> python -m spacy download da_core_news_sm +> python -m spacy download ja_core_news_sm +> python -m spacy download pl_core_news_sm +> python -m spacy download ro_core_news_sm +> python -m spacy download zh_core_web_sm +> ``` + +With new model families for Chinese, Danish, Polish, Romanian and Chinese plus +`md` and `lg` models with word vectors for all languages, this release provides +a total of 46 model packages. For models trained using [Universal +Dependencies](https://universaldependencies.org) corpora, the training data has +been updated to UD v2.5 (v2.6 for Japanese, v2.3 for Polish) and Dutch has been +extended to include both UD Dutch Alpino and LassySmall. + +<Infobox> + +**Models:** [Models directory](/models) **Benchmarks: ** +[Release notes](https://github.com/explosion/spaCy/releases/tag/v2.3.0) + +</Infobox> + +### Chinese {#chinese} + +> #### Example +> ```python +> from spacy.lang.zh import Chinese +> +> # Load with "default" model provided by pkuseg +> cfg = {"pkuseg_model": "default", "require_pkuseg": True} +> nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +> +> # Append words to user dict +> nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"]) + +This release adds support for +[pkuseg](https://github.com/lancopku/pkuseg-python) for word segmentation and +the new Chinese models ship with a custom pkuseg model trained on OntoNotes. +The Chinese tokenizer can be initialized with both `pkuseg` and custom models +and the `pkuseg` user dictionary is easy to customize. + +<Infobox> + +**Chinese:** [Chinese tokenizer usage](/usage/models#chinese) + +</Infobox> + +### Japanese {#japanese} + +The updated Japanese language class switches to +[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word +segmentation and part-of-speech tagging. Using `sudachipy` greatly simplifies +installing spaCy for Japanese, which is now possible with a single command: +`pip install spacy[ja]`. + +<Infobox> + +**Japanese:** [Japanese tokenizer usage](/usage/models#japanese) + +</Infobox> + +### Small CLI updates + +- `spacy debug-data` provides the coverage of the vectors in a base model with + `spacy debug-data lang train dev -b base_model` +- `spacy evaluate` supports `blank:lg` (e.g. `spacy evaluate blank:en + dev.json`) to evaluate the tokenization accuracy without loading a model +- `spacy train` on GPU restricts the CPU timing evaluation to the first + iteration + +## Backwards incompatibilities {#incompat} + +<Infobox title="Important note on models" variant="warning"> + +If you've been training **your own models**, you'll need to **retrain** them +with the new version. Also don't forget to upgrade all models to the latest +versions. Models for earlier v2 releases (v2.0, v2.1, v2.2) aren't compatible +with models for v2.3. To check if all of your models are up to date, you can +run the [`spacy validate`](/api/cli#validate) command. + +</Infobox> + +> #### Install with lookups data +> +> ```bash +> $ pip install spacy[lookups] +> ``` +> +> You can also install +> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) +> directly. 
+ +- If you're training new models, you'll want to install the package + [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data), + which now includes both the lemmatization tables (as in v2.2) and the + normalization tables (new in v2.3). If you're using pretrained models, + **nothing changes**, because the relevant tables are included in the model + packages. +- Due to the updated Universal Dependencies training data, the fine-grained + part-of-speech tags will change for many provided language models. The + coarse-grained part-of-speech tagset remains the same, but the mapping from + particular fine-grained to coarse-grained tags may show minor differences. +- For French, Italian, Portuguese and Spanish, the fine-grained part-of-speech + tagsets contain new merged tags related to contracted forms, such as + `ADP_DET` for French `"au"`, which maps to UPOS `ADP` based on the head + `"à"`. This increases the accuracy of the models by improving the alignment + between spaCy's tokenization and Universal Dependencies multi-word tokens + used for contractions. + +### Migrating from spaCy 2.2 {#migrating} + +#### Tokenizer settings + +In spaCy v2.2.2-v2.2.4, there was a change to the precedence of `token_match` +that gave prefixes and suffixes priority over `token_match`, which caused +problems for many custom tokenizer configurations. This has been reverted in +v2.3 so that `token_match` has priority over prefixes and suffixes as in v2.2.1 +and earlier versions. + +A new tokenizer setting `url_match` has been introduced in v2.3.0 to handle +cases like URLs where the tokenizer should remove prefixes and suffixes (e.g., +a comma at the end of a URL) before applying the match. See the full [tokenizer +documentation](/usage/linguistic-features#tokenization) and try out +[`nlp.tokenizer.explain()`](/usage/linguistic-features#tokenizer-debug) when +debugging your tokenizer configuration. + +#### Warnings configuration + +spaCy's custom warnings have been replaced with native python +[`warnings`](https://docs.python.org/3/library/warnings.html). Instead of +setting `SPACY_WARNING_IGNORE`, use the [warnings +filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) +to manage warnings. + +#### Normalization tables + +The normalization tables have moved from the language data in +[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) to +the package +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). If +you're adding data for a new language, the normalization table should be added +to `spacy-lookups-data`. See [adding norm +exceptions](/usage/adding-languages#norm-exceptions). + +#### Probability and cluster features + +> #### Load and save extra prob lookups table +> +> ```python +> from spacy.lang.en import English +> nlp = English() +> doc = nlp("the") +> print(doc[0].prob) # lazily loads extra prob table +> nlp.to_disk("/path/to/model") # includes prob table +> ``` + +The `Token.prob` and `Token.cluster` features, which are no longer used by the +core pipeline components as of spaCy v2, are no longer provided in the +pretrained models to reduce the model size. To keep these features available +for users relying on them, the `prob` and `cluster` features for the most +frequent 1M tokens have been moved to +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) as +`extra` features for the relevant languages (English, German, Greek and +Spanish). 
+ +The extra tables are loaded lazily, so if you have `spacy-lookups-data` +installed and your code accesses `Token.prob`, the full table is loaded into +the model vocab, which will take a few seconds on initial loading. When you +save this model after loading the `prob` table, the full `prob` table will be +saved as part of the model vocab. + +If you'd like to include custom `cluster`, `prob`, or `sentiment` tables as +part of a new model, add the data to +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) under +the entry point `lg_extra`, e.g. `en_extra` for English. Alternatively, you can +initialize your [`Vocab`](/api/vocab) with the `lookups_extra` argument with a +[`Lookups`](/api/lookups) object that includes the tables `lexeme_cluster`, +`lexeme_prob`, `lexeme_sentiment` or `lexeme_settings`. `lexeme_settings` is +currently only used to provide a custom `oov_prob`. See examples in the [`data` +directory](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data) +in `spacy-lookups-data`. + +#### Initializing new models without extra lookups tables + +When you initialize a new model with [`spacy init-model`](/api/cli#init-model), +the `prob` table from `spacy-lookups-data` may be loaded as part of the +initialization. If you'd like to omit this extra data as in spaCy's provided +v2.3 models, use the new flag `--omit-extra-lookups`. diff --git a/website/meta/languages.json b/website/meta/languages.json index 41c1bce7f..facfc3541 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -1,5 +1,35 @@ { "languages": [ + { + "code": "zh", + "name": "Chinese", + "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"], + "dependencies": [ + { + "name": "Jieba", + "url": "https://github.com/fxsjy/jieba" + }, + { + "name": "PKUSeg", + "url": "https://github.com/lancopku/PKUSeg-python" + } + ], + "has_examples": true + }, + { + "code": "da", + "name": "Danish", + "example": "Dette er en sætning.", + "has_examples": true, + "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"] + }, + { + "code": "nl", + "name": "Dutch", + "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"], + "example": "Dit is een zin.", + "has_examples": true + }, { "code": "en", "name": "English", @@ -14,68 +44,91 @@ "example": "This is a sentence.", "has_examples": true }, + { + "code": "fr", + "name": "French", + "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"], + "example": "C'est une phrase.", + "has_examples": true + }, { "code": "de", "name": "German", - "models": ["de_core_news_sm", "de_core_news_md"], + "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"], "starters": ["de_trf_bertbasecased_lg"], "example": "Dies ist ein Satz.", "has_examples": true }, { - "code": "fr", - "name": "French", - "models": ["fr_core_news_sm", "fr_core_news_md"], - "example": "C'est une phrase.", - "has_examples": true - }, - { - "code": "es", - "name": "Spanish", - "models": ["es_core_news_sm", "es_core_news_md"], - "example": "Esto es una frase.", - "has_examples": true - }, - { - "code": "pt", - "name": "Portuguese", - "models": ["pt_core_news_sm"], - "example": "Esta é uma frase.", + "code": "el", + "name": "Greek", + "models": ["el_core_news_sm", "el_core_news_md", "el_core_news_lg"], + "example": "Αυτή είναι μια πρόταση.", "has_examples": true }, { "code": "it", "name": "Italian", - "models": ["it_core_news_sm"], + "models": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"], 
"example": "Questa è una frase.", "has_examples": true }, { - "code": "nl", - "name": "Dutch", - "models": ["nl_core_news_sm"], - "example": "Dit is een zin.", + "code": "ja", + "name": "Japanese", + "models": ["ja_core_news_sm", "ja_core_news_md", "ja_core_news_lg"], + "dependencies": [ + { + "name": "SudachiPy", + "url": "https://github.com/WorksApplications/SudachiPy" + } + ], "has_examples": true }, { - "code": "el", - "name": "Greek", - "models": ["el_core_news_sm", "el_core_news_md"], - "example": "Αυτή είναι μια πρόταση.", - "has_examples": true + "code": "lt", + "name": "Lithuanian", + "has_examples": true, + "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"] }, - { "code": "sv", "name": "Swedish", "has_examples": true }, - { "code": "fi", "name": "Finnish", "has_examples": true }, { "code": "nb", "name": "Norwegian Bokmål", "example": "Dette er en setning.", "has_examples": true, - "models": ["nb_core_news_sm"] + "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"] }, - { "code": "da", "name": "Danish", "example": "Dette er en sætning.", "has_examples": true }, + { + "code": "pl", + "name": "Polish", + "example": "To jest zdanie.", + "has_examples": true, + "models": ["pl_core_news_sm", "pl_core_news_md", "pl_core_news_lg"] + }, + { + "code": "pt", + "name": "Portuguese", + "models": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"], + "example": "Esta é uma frase.", + "has_examples": true + }, + { + "code": "ro", + "name": "Romanian", + "example": "Aceasta este o propoziție.", + "has_examples": true, + "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"] + }, + { + "code": "es", + "name": "Spanish", + "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"], + "example": "Esto es una frase.", + "has_examples": true + }, + { "code": "sv", "name": "Swedish", "has_examples": true }, + { "code": "fi", "name": "Finnish", "has_examples": true }, { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true }, - { "code": "pl", "name": "Polish", "example": "To jest zdanie.", "has_examples": true }, { "code": "ru", "name": "Russian", @@ -88,12 +141,6 @@ "has_examples": true, "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] }, - { - "code": "ro", - "name": "Romanian", - "example": "Aceasta este o propoziție.", - "has_examples": true - }, { "code": "hr", "name": "Croatian", "has_examples": true }, { "code": "eu", "name": "Basque", "has_examples": true }, { "code": "yo", "name": "Yoruba", "has_examples": true }, @@ -123,7 +170,6 @@ { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true }, { "code": "cs", "name": "Czech" }, { "code": "is", "name": "Icelandic" }, - { "code": "lt", "name": "Lithuanian", "has_examples": true, "models": ["lt_core_news_sm"] }, { "code": "lv", "name": "Latvian" }, { "code": "sr", "name": "Serbian" }, { "code": "sk", "name": "Slovak" }, @@ -145,12 +191,6 @@ "example": "นี่คือประโยค", "has_examples": true }, - { - "code": "zh", - "name": "Chinese", - "dependencies": [{ "name": "Jieba", "url": "https://github.com/fxsjy/jieba" }], - "has_examples": true - }, { "code": "ja", "name": "Japanese", @@ -187,6 +227,21 @@ "example": "Sta chì a l'é unna fraxe.", "has_examples": true }, + { + "code": "hy", + "name": "Armenian", + "has_examples": true + }, + { + "code": "gu", + "name": "Gujarati", + "has_examples": true + }, + { + "code": "ml", + "name": "Malayalam", + "has_examples": true + }, { "code": "xx", "name": 
"Multi-language", diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 3fafc52b0..d7129875f 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -9,6 +9,7 @@ { "text": "Models & Languages", "url": "/usage/models" }, { "text": "Facts & Figures", "url": "/usage/facts-figures" }, { "text": "spaCy 101", "url": "/usage/spacy-101" }, + { "text": "New in v2.3", "url": "/usage/v2-3" }, { "text": "New in v2.2", "url": "/usage/v2-2" }, { "text": "New in v2.1", "url": "/usage/v2-1" }, { "text": "New in v2.0", "url": "/usage/v2" } diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 845fec65d..5bba1922b 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -83,7 +83,7 @@ function formatVectors(data) { function formatAccuracy(data) { if (!data) return null - const labels = { tags_acc: 'POS', ents_f: 'NER F', ents_p: 'NER P', ents_r: 'NER R' } + const labels = { tags_acc: 'TAG', ents_f: 'NER F', ents_p: 'NER P', ents_r: 'NER R' } const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key) const isNer = key => key.startsWith('ents_') return Object.keys(data).map(key => ({ diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index c96905733..1f788877c 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -124,7 +124,7 @@ const Landing = ({ data }) => { {counts.modelLangs} languages </Li> <Li> - pretrained <strong>word vectors</strong> + Pretrained <strong>word vectors</strong> </Li> <Li>State-of-the-art speed</Li> <Li> diff --git a/website/src/widgets/languages.js b/website/src/widgets/languages.js index 55645f951..bb26e57cd 100644 --- a/website/src/widgets/languages.js +++ b/website/src/widgets/languages.js @@ -38,10 +38,10 @@ const Languages = () => ( const langs = site.siteMetadata.languages const withModels = langs .filter(({ models }) => models && !!models.length) - .sort((a, b) => a.code.localeCompare(b.code)) + .sort((a, b) => a.name.localeCompare(b.name)) const withoutModels = langs .filter(({ models }) => !models || !models.length) - .sort((a, b) => a.code.localeCompare(b.code)) + .sort((a, b) => a.name.localeCompare(b.name)) const withDeps = langs.filter(({ dependencies }) => dependencies && dependencies.length) return ( <> From bb54f54369be830651658191807c4e8625abb48c Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 16 Jun 2020 16:10:12 +0200 Subject: [PATCH 091/203] Fix model accuracy table [ci skip] --- website/src/templates/models.js | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 845fec65d..3c5e9d2a4 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -1,4 +1,4 @@ -import React, { useEffect, useState, useMemo } from 'react' +import React, { useEffect, useState, useMemo, Fragment } from 'react' import { StaticQuery, graphql } from 'gatsby' import { window } from 'browser-monads' @@ -83,15 +83,24 @@ function formatVectors(data) { function formatAccuracy(data) { if (!data) return null - const labels = { tags_acc: 'POS', ents_f: 'NER F', ents_p: 'NER P', ents_r: 'NER R' } + const labels = { + las: 'LAS', + uas: 'UAS', + tags_acc: 'TAG', + ents_f: 'NER F', + ents_p: 'NER P', + ents_r: 'NER R', + } const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key) const isNer = key => key.startsWith('ents_') - return Object.keys(data).map(key => ({ - 
label: labels[key] || key.toUpperCase(), - value: data[key].toFixed(2), - help: MODEL_META[key], - type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, - })) + return Object.keys(data) + .filter(key => labels[key]) + .map(key => ({ + label: labels[key], + value: data[key].toFixed(2), + help: MODEL_META[key], + type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, + })) } function formatModelMeta(data) { @@ -115,11 +124,11 @@ function formatModelMeta(data) { function formatSources(data = []) { const sources = data.map(s => (isString(s) ? { name: s } : s)) return sources.map(({ name, url, author }, i) => ( - <> + <Fragment key={i}> {i > 0 && <br />} {name && url ? <Link to={url}>{name}</Link> : name} {author && ` (${author})`} - </> + </Fragment> )) } @@ -308,12 +317,12 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl </Td> <Td> {labelNames.map((label, i) => ( - <> + <Fragment key={i}> {i > 0 && ', '} <InlineCode wrap key={label}> {label} </InlineCode> - </> + </Fragment> ))} </Td> </Tr> From a9e5b840ee43746cd39213da9d27a01188be1904 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 16 Jun 2020 16:38:45 +0200 Subject: [PATCH 092/203] Fix typos and auto-format [ci skip] --- website/docs/api/goldparse.md | 49 ++++++++++++++++++----------------- website/docs/api/matcher.md | 2 +- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 5df625991..bc33dd4e6 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -12,18 +12,18 @@ expects true examples of a label to have the value `1.0`, and negative examples of a label to have the value `0.0`. Labels not in the dictionary are treated as missing – the gradient for those labels will be zero. -| Name | Type | Description | -| ----------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document the annotations refer to. | -| `words` | iterable | A sequence of unicode word strings. | -| `tags` | iterable | A sequence of strings, representing tag annotations. | -| `heads` | iterable | A sequence of integers, representing syntactic head offsets. | -| `deps` | iterable | A sequence of strings, representing the syntactic relation types. | -| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | -| `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). | -| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). | -| `make_projective` | bool | Whether to projectivize the dependency tree. Defaults to `False.`. | -| **RETURNS** | `GoldParse` | The newly constructed object. 
| +| Name | Type | Description | +| ----------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The document the annotations refer to. | +| `words` | iterable | A sequence of unicode word strings. | +| `tags` | iterable | A sequence of strings, representing tag annotations. | +| `heads` | iterable | A sequence of integers, representing syntactic head offsets. | +| `deps` | iterable | A sequence of strings, representing the syntactic relation types. | +| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | +| `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). | +| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). | +| `make_projective` | bool | Whether to projectivize the dependency tree. Defaults to `False`. | +| **RETURNS** | `GoldParse` | The newly constructed object. | ## GoldParse.\_\_len\_\_ {#len tag="method"} @@ -43,17 +43,17 @@ Whether the provided syntactic annotations form a projective dependency tree. ## Attributes {#attributes} -| Name | Type | Description | -| ------------------------------------ | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `words` | list | The words. | -| `tags` | list | The part-of-speech tag annotations. | -| `heads` | list | The syntactic head annotations. | -| `labels` | list | The syntactic relation-type annotations. | -| `ner` | list | The named entity annotations as BILUO tags. | -| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. | -| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. | -| `cats` <Tag variant="new">2</Tag> | dict | Keys in the dictionary are string category labels with values `1.0` or `0.0`. | -| `links` <Tag variant="new">2.2</Tag> | dict | Keys in the dictionary are `(start_char, end_char)` triples, and the values are dictionaries with `kb_id:value` entries. | +| Name | Type | Description | +| ------------------------------------ | ---- | ------------------------------------------------------------------------------------------------------------------------ | +| `words` | list | The words. | +| `tags` | list | The part-of-speech tag annotations. | +| `heads` | list | The syntactic head annotations. | +| `labels` | list | The syntactic relation-type annotations. | +| `ner` | list | The named entity annotations as BILUO tags. | +| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. | +| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. | +| `cats` <Tag variant="new">2</Tag> | dict | Keys in the dictionary are string category labels with values `1.0` or `0.0`. 
| +| `links` <Tag variant="new">2.2</Tag> | dict | Keys in the dictionary are `(start_char, end_char)` triples, and the values are dictionaries with `kb_id:value` entries. | ## Utilities {#util} @@ -61,7 +61,8 @@ Whether the provided syntactic annotations form a projective dependency tree. Convert a list of Doc objects into the [JSON-serializable format](/api/annotation#json-input) used by the -[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc. +[`spacy train`](/api/cli#train) command. Each input doc will be treated as a +'paragraph' in the output doc. > #### Example > diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index ac2f898e0..7b195e352 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -57,7 +57,7 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects. | Name | Type | Description | | ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3).. | +| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). | | **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. | <Infobox title="Important note" variant="warning"> From 44af53bdd93713b24ac28459c5d2543f03c47a18 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 16 Jun 2020 17:13:35 +0200 Subject: [PATCH 093/203] Add pkuseg warnings and auto-format [ci skip] --- website/docs/usage/models.md | 16 ++++- website/docs/usage/v2-3.md | 121 ++++++++++++++++++----------------- 2 files changed, 78 insertions(+), 59 deletions(-) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 382193157..4549e8433 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -117,6 +117,18 @@ The Chinese language class supports three word segmentation options: better segmentation for Chinese OntoNotes and the new [Chinese models](/models/zh). +<Infobox variant="warning"> + +Note that [`pkuseg`](https://github.com/lancopku/pkuseg-python) doesn't yet ship +with pre-compiled wheels for Python 3.8. If you're running Python 3.8, you can +install it from our fork and compile it locally: + +```bash +$ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip +``` + +</Infobox> + <Accordion title="Details on spaCy's PKUSeg API"> The `meta` argument of the `Chinese` language class supports the following @@ -196,8 +208,8 @@ nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_mo The Japanese language class uses [SudachiPy](https://github.com/WorksApplications/SudachiPy) for word -segmentation and part-of-speech tagging. The default Japanese language class -and the provided Japanese models use SudachiPy split mode `A`. +segmentation and part-of-speech tagging. The default Japanese language class and +the provided Japanese models use SudachiPy split mode `A`. The `meta` argument of the `Japanese` language class can be used to configure the split mode to `A`, `B` or `C`. 
diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md index ba75b01ab..d59b50a6e 100644 --- a/website/docs/usage/v2-3.md +++ b/website/docs/usage/v2-3.md @@ -14,10 +14,10 @@ all language models, and decreased model size and loading times for models with vectors. We've added pretrained models for **Chinese, Danish, Japanese, Polish and Romanian** and updated the training data and vectors for most languages. Model packages with vectors are about **2×** smaller on disk and load -**2-4×** faster. For the full changelog, see the [release notes on -GitHub](https://github.com/explosion/spaCy/releases/tag/v2.3.0). For more -details and a behind-the-scenes look at the new release, [see our blog -post](https://explosion.ai/blog/spacy-v2-3). +**2-4×** faster. For the full changelog, see the +[release notes on GitHub](https://github.com/explosion/spaCy/releases/tag/v2.3.0). +For more details and a behind-the-scenes look at the new release, +[see our blog post](https://explosion.ai/blog/spacy-v2-3). ### Expanded model families with vectors {#models} @@ -33,10 +33,10 @@ post](https://explosion.ai/blog/spacy-v2-3). With new model families for Chinese, Danish, Polish, Romanian and Chinese plus `md` and `lg` models with word vectors for all languages, this release provides -a total of 46 model packages. For models trained using [Universal -Dependencies](https://universaldependencies.org) corpora, the training data has -been updated to UD v2.5 (v2.6 for Japanese, v2.3 for Polish) and Dutch has been -extended to include both UD Dutch Alpino and LassySmall. +a total of 46 model packages. For models trained using +[Universal Dependencies](https://universaldependencies.org) corpora, the +training data has been updated to UD v2.5 (v2.6 for Japanese, v2.3 for Polish) +and Dutch has been extended to include both UD Dutch Alpino and LassySmall. <Infobox> @@ -48,6 +48,7 @@ extended to include both UD Dutch Alpino and LassySmall. ### Chinese {#chinese} > #### Example +> > ```python > from spacy.lang.zh import Chinese > @@ -57,41 +58,49 @@ extended to include both UD Dutch Alpino and LassySmall. > > # Append words to user dict > nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"]) +> ``` This release adds support for -[pkuseg](https://github.com/lancopku/pkuseg-python) for word segmentation and -the new Chinese models ship with a custom pkuseg model trained on OntoNotes. -The Chinese tokenizer can be initialized with both `pkuseg` and custom models -and the `pkuseg` user dictionary is easy to customize. +[`pkuseg`](https://github.com/lancopku/pkuseg-python) for word segmentation and +the new Chinese models ship with a custom pkuseg model trained on OntoNotes. The +Chinese tokenizer can be initialized with both `pkuseg` and custom models and +the `pkuseg` user dictionary is easy to customize. Note that +[`pkuseg`](https://github.com/lancopku/pkuseg-python) doesn't yet ship with +pre-compiled wheels for Python 3.8. See the +[usage documentation](/usage/models#chinese) for details on how to install it on +Python 3.8. <Infobox> -**Chinese:** [Chinese tokenizer usage](/usage/models#chinese) +**Models:** [Chinese models](/models/zh) **Usage: ** +[Chinese tokenizer usage](/usage/models#chinese) </Infobox> ### Japanese {#japanese} The updated Japanese language class switches to -[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word -segmentation and part-of-speech tagging. 
Using `sudachipy` greatly simplifies +[`SudachiPy`](https://github.com/WorksApplications/SudachiPy) for word +segmentation and part-of-speech tagging. Using `SudachiPy` greatly simplifies installing spaCy for Japanese, which is now possible with a single command: `pip install spacy[ja]`. <Infobox> -**Japanese:** [Japanese tokenizer usage](/usage/models#japanese) +**Models:** [Japanese models](/models/ja) **Usage:** +[Japanese tokenizer usage](/usage/models#japanese) </Infobox> ### Small CLI updates -- `spacy debug-data` provides the coverage of the vectors in a base model with - `spacy debug-data lang train dev -b base_model` -- `spacy evaluate` supports `blank:lg` (e.g. `spacy evaluate blank:en - dev.json`) to evaluate the tokenization accuracy without loading a model -- `spacy train` on GPU restricts the CPU timing evaluation to the first - iteration +- [`spacy debug-data`](/api/cli#debug-data) provides the coverage of the vectors + in a base model with `spacy debug-data lang train dev -b base_model` +- [`spacy evaluate`](/api/cli#evaluate) supports `blank:lg` (e.g. + `spacy evaluate blank:en dev.json`) to evaluate the tokenization accuracy + without loading a model +- [`spacy train`](/api/cli#train) on GPU restricts the CPU timing evaluation to + the first iteration ## Backwards incompatibilities {#incompat} @@ -100,8 +109,8 @@ installing spaCy for Japanese, which is now possible with a single command: If you've been training **your own models**, you'll need to **retrain** them with the new version. Also don't forget to upgrade all models to the latest versions. Models for earlier v2 releases (v2.0, v2.1, v2.2) aren't compatible -with models for v2.3. To check if all of your models are up to date, you can -run the [`spacy validate`](/api/cli#validate) command. +with models for v2.3. To check if all of your models are up to date, you can run +the [`spacy validate`](/api/cli#validate) command. </Infobox> @@ -116,21 +125,20 @@ run the [`spacy validate`](/api/cli#validate) command. > directly. - If you're training new models, you'll want to install the package - [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data), - which now includes both the lemmatization tables (as in v2.2) and the - normalization tables (new in v2.3). If you're using pretrained models, - **nothing changes**, because the relevant tables are included in the model - packages. + [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data), which + now includes both the lemmatization tables (as in v2.2) and the normalization + tables (new in v2.3). If you're using pretrained models, **nothing changes**, + because the relevant tables are included in the model packages. - Due to the updated Universal Dependencies training data, the fine-grained part-of-speech tags will change for many provided language models. The coarse-grained part-of-speech tagset remains the same, but the mapping from particular fine-grained to coarse-grained tags may show minor differences. - For French, Italian, Portuguese and Spanish, the fine-grained part-of-speech - tagsets contain new merged tags related to contracted forms, such as - `ADP_DET` for French `"au"`, which maps to UPOS `ADP` based on the head - `"à"`. This increases the accuracy of the models by improving the alignment - between spaCy's tokenization and Universal Dependencies multi-word tokens - used for contractions. 
+ tagsets contain new merged tags related to contracted forms, such as `ADP_DET` + for French `"au"`, which maps to UPOS `ADP` based on the head `"à"`. This + increases the accuracy of the models by improving the alignment between + spaCy's tokenization and Universal Dependencies multi-word tokens used for + contractions. ### Migrating from spaCy 2.2 {#migrating} @@ -143,29 +151,28 @@ v2.3 so that `token_match` has priority over prefixes and suffixes as in v2.2.1 and earlier versions. A new tokenizer setting `url_match` has been introduced in v2.3.0 to handle -cases like URLs where the tokenizer should remove prefixes and suffixes (e.g., -a comma at the end of a URL) before applying the match. See the full [tokenizer -documentation](/usage/linguistic-features#tokenization) and try out +cases like URLs where the tokenizer should remove prefixes and suffixes (e.g., a +comma at the end of a URL) before applying the match. See the full +[tokenizer documentation](/usage/linguistic-features#tokenization) and try out [`nlp.tokenizer.explain()`](/usage/linguistic-features#tokenizer-debug) when debugging your tokenizer configuration. #### Warnings configuration -spaCy's custom warnings have been replaced with native python +spaCy's custom warnings have been replaced with native Python [`warnings`](https://docs.python.org/3/library/warnings.html). Instead of -setting `SPACY_WARNING_IGNORE`, use the [warnings -filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) +setting `SPACY_WARNING_IGNORE`, use the +[`warnings` filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) to manage warnings. #### Normalization tables The normalization tables have moved from the language data in -[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) to -the package -[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). If -you're adding data for a new language, the normalization table should be added -to `spacy-lookups-data`. See [adding norm -exceptions](/usage/adding-languages#norm-exceptions). +[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) to the +package [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). +If you're adding data for a new language, the normalization table should be +added to `spacy-lookups-data`. See +[adding norm exceptions](/usage/adding-languages#norm-exceptions). #### Probability and cluster features @@ -181,28 +188,28 @@ exceptions](/usage/adding-languages#norm-exceptions). The `Token.prob` and `Token.cluster` features, which are no longer used by the core pipeline components as of spaCy v2, are no longer provided in the -pretrained models to reduce the model size. To keep these features available -for users relying on them, the `prob` and `cluster` features for the most -frequent 1M tokens have been moved to +pretrained models to reduce the model size. To keep these features available for +users relying on them, the `prob` and `cluster` features for the most frequent +1M tokens have been moved to [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) as `extra` features for the relevant languages (English, German, Greek and Spanish). The extra tables are loaded lazily, so if you have `spacy-lookups-data` -installed and your code accesses `Token.prob`, the full table is loaded into -the model vocab, which will take a few seconds on initial loading. 
When you -save this model after loading the `prob` table, the full `prob` table will be -saved as part of the model vocab. +installed and your code accesses `Token.prob`, the full table is loaded into the +model vocab, which will take a few seconds on initial loading. When you save +this model after loading the `prob` table, the full `prob` table will be saved +as part of the model vocab. -If you'd like to include custom `cluster`, `prob`, or `sentiment` tables as -part of a new model, add the data to +If you'd like to include custom `cluster`, `prob`, or `sentiment` tables as part +of a new model, add the data to [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) under the entry point `lg_extra`, e.g. `en_extra` for English. Alternatively, you can initialize your [`Vocab`](/api/vocab) with the `lookups_extra` argument with a [`Lookups`](/api/lookups) object that includes the tables `lexeme_cluster`, `lexeme_prob`, `lexeme_sentiment` or `lexeme_settings`. `lexeme_settings` is -currently only used to provide a custom `oov_prob`. See examples in the [`data` -directory](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data) +currently only used to provide a custom `oov_prob`. See examples in the +[`data` directory](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data) in `spacy-lookups-data`. #### Initializing new models without extra lookups tables From fd89f44c0c81bd1f1a2c1ec396c0ff3a29ac6423 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 16 Jun 2020 17:34:26 +0200 Subject: [PATCH 094/203] Update Binder URL [ci skip] --- website/meta/site.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/site.json b/website/meta/site.json index 29d71048e..9083e98a0 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -23,7 +23,7 @@ "apiKey": "371e26ed49d29a27bd36273dfdaf89af", "indexName": "spacy" }, - "binderUrl": "ines/spacy-io-binder", + "binderUrl": "explosion/spacy-io-binder", "binderBranch": "live", "binderVersion": "2.2.0", "sections": [ From 41003a5117d23f519c99edddfb4fc3a80370d7d1 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 16 Jun 2020 17:41:23 +0200 Subject: [PATCH 095/203] Update Binder version [ci skip] --- website/meta/site.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/site.json b/website/meta/site.json index 9083e98a0..8b8424f82 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -25,7 +25,7 @@ }, "binderUrl": "explosion/spacy-io-binder", "binderBranch": "live", - "binderVersion": "2.2.0", + "binderVersion": "2.3.0", "sections": [ { "id": "usage", "title": "Usage Documentation", "theme": "blue" }, { "id": "models", "title": "Models Documentation", "theme": "blue" }, From 457babfa0c581d868fe16b418c0dcef357d78a97 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Tue, 16 Jun 2020 20:22:03 +0200 Subject: [PATCH 096/203] Update alignment example for new gold.align --- website/docs/usage/linguistic-features.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index bcc943436..a442cc7a0 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1130,9 +1130,9 @@ from spacy.gold import align other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."] spacy_tokens = ["i", 
"listened", "to", "obama", "'s", "podcasts", "."] cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens) -print("Misaligned tokens:", cost) # 2 +print("Edit distance:", cost) # 3 print("One-to-one mappings a -> b", a2b) # array([0, 1, 2, 3, -1, -1, 5, 6]) -print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, 5, 6, 7]) +print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, -1, 6, 7]) print("Many-to-one mappings a -> b", a2b_multi) # {4: 4, 5: 4} print("Many-to-one mappings b-> a", b2a_multi) # {} ``` @@ -1140,7 +1140,7 @@ print("Many-to-one mappings b-> a", b2a_multi) # {} Here are some insights from the alignment information generated in the example above: -- Two tokens are misaligned. +- The edit distance (cost) is `3`: two deletions and one insertion. - The one-to-one mappings for the first four tokens are identical, which means they map to each other. This makes sense because they're also identical in the input: `"i"`, `"listened"`, `"to"` and `"obama"`. From 9aff317ca788cc996da5125e7d9c4783c8ab9f7e Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Tue, 16 Jun 2020 20:26:57 +0200 Subject: [PATCH 097/203] Update POS in tagging example --- website/docs/usage/101/_pos-deps.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md index 1a438e424..1e8960edf 100644 --- a/website/docs/usage/101/_pos-deps.md +++ b/website/docs/usage/101/_pos-deps.md @@ -36,7 +36,7 @@ for token in doc: | Text | Lemma | POS | Tag | Dep | Shape | alpha | stop | | ------- | ------- | ------- | ----- | ---------- | ------- | ------- | ------- | | Apple | apple | `PROPN` | `NNP` | `nsubj` | `Xxxxx` | `True` | `False` | -| is | be | `VERB` | `VBZ` | `aux` | `xx` | `True` | `True` | +| is | be | `AUX` | `VBZ` | `aux` | `xx` | `True` | `True` | | looking | look | `VERB` | `VBG` | `ROOT` | `xxxx` | `True` | `False` | | at | at | `ADP` | `IN` | `prep` | `xx` | `True` | `True` | | buying | buy | `VERB` | `VBG` | `pcomp` | `xxxx` | `True` | `False` | From a6abdfbc3c5a298b9d0e547451701f6705fd09b7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Tue, 16 Jun 2020 20:35:45 +0200 Subject: [PATCH 098/203] Fix numpy.zeros() dtype for Doc.from_array --- website/docs/usage/linguistic-features.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index a442cc7a0..1e3b129ac 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -471,7 +471,7 @@ doc = nlp.make_doc("London is a big city in the United Kingdom.") print("Before", doc.ents) # [] header = [ENT_IOB, ENT_TYPE] -attr_array = numpy.zeros((len(doc), len(header))) +attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64") attr_array[0, 0] = 3 # B attr_array[0, 1] = doc.vocab.strings["GPE"] doc.from_array(header, attr_array) From f0fd77648fb488c26852cd1494b69073e5766b65 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Tue, 16 Jun 2020 20:36:21 +0200 Subject: [PATCH 099/203] Change example title to Dr. Change example title to Dr. so the current model does exclude the title in the initial example. 
--- website/docs/usage/rule-based-matching.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 1db2405d1..f7866fe31 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1158,17 +1158,17 @@ what you need for your application. > available corpus. For example, the corpus spaCy's [English models](/models/en) were trained on -defines a `PERSON` entity as just the **person name**, without titles like "Mr" -or "Dr". This makes sense, because it makes it easier to resolve the entity type -back to a knowledge base. But what if your application needs the full names, -_including_ the titles? +defines a `PERSON` entity as just the **person name**, without titles like "Mr." +or "Dr.". This makes sense, because it makes it easier to resolve the entity +type back to a knowledge base. But what if your application needs the full +names, _including_ the titles? ```python ### {executable="true"} import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.") +doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` @@ -1233,7 +1233,7 @@ def expand_person_entities(doc): # Add the component after the named entity recognizer nlp.add_pipe(expand_person_entities, after='ner') -doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.") +doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` From 02369f91d307a6ba43f1d9ad97efbb5e348cc599 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Tue, 16 Jun 2020 20:41:17 +0200 Subject: [PATCH 100/203] Fix spacy convert argument --- website/docs/usage/adding-languages.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 29de08266..98d4fdec9 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -634,7 +634,7 @@ One thing to keep in mind is that spaCy expects to train its models from **whole documents**, not just single sentences. If your corpus only contains single sentences, spaCy's models will never learn to expect multi-sentence documents, leading to low performance on real text. To mitigate this problem, you can use -the `-N` argument to the `spacy convert` command, to merge some of the sentences +the `-n` argument to the `spacy convert` command, to merge some of the sentences into longer pseudo-documents. ### Training the tagger and parser {#train-tagger-parser} From 931d80de72db45bb11d571e767d7062a45209182 Mon Sep 17 00:00:00 2001 From: Adriane Boyd <adrianeboyd@gmail.com> Date: Fri, 19 Jun 2020 12:43:41 +0200 Subject: [PATCH 101/203] Warning for sudachipy 0.4.5 (#5611) --- website/docs/usage/models.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 4549e8433..b11e6347a 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -214,6 +214,14 @@ the provided Japanese models use SudachiPy split mode `A`. The `meta` argument of the `Japanese` language class can be used to configure the split mode to `A`, `B` or `C`. 
+<Infobox variant="warning"> + +If you run into errors related to `sudachipy`, which is currently under active +development, we suggest downgrading to `sudachipy==0.4.5`, which is the version +used for training the current [Japanese models](/models/ja). + +</Infobox> + ## Installing and using models {#download} > #### Downloading models in spaCy < v1.7 From ccd7edf04bac4a8a29431433e73e1a2474acc0dd Mon Sep 17 00:00:00 2001 From: "Marat M. Yavrumyan" <myavrum@ysu.am> Date: Fri, 19 Jun 2020 20:34:27 +0400 Subject: [PATCH 102/203] Create myavrum.md (#5612) --- .github/contributors/myavrum.md | 106 ++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/myavrum.md diff --git a/.github/contributors/myavrum.md b/.github/contributors/myavrum.md new file mode 100644 index 000000000..dc8f1bb84 --- /dev/null +++ b/.github/contributors/myavrum.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Marat M. Yavrumyan | +| Company name (if applicable) | YSU, UD_Armenian Project | +| Title or role (if applicable) | Dr., Principal Investigator | +| Date | 2020-06-19 | +| GitHub username | myavrum | +| Website (optional) | http://armtreebank.yerevann.com/ | From 8120b641ccb66b088fa70c028e5be542bf561dfd Mon Sep 17 00:00:00 2001 From: "Marat M. 
Yavrumyan" <myavrum@ysu.am> Date: Fri, 19 Jun 2020 22:00:34 +0400 Subject: [PATCH 103/203] Update lex_attrs.py (#5608) --- spacy/lang/hy/lex_attrs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py index 910625fb8..b556d679c 100644 --- a/spacy/lang/hy/lex_attrs.py +++ b/spacy/lang/hy/lex_attrs.py @@ -5,8 +5,8 @@ from ...attrs import LIKE_NUM _num_words = [ - "զրօ", - "մէկ", + "զրո", + "մեկ", "երկու", "երեք", "չորս", @@ -28,10 +28,10 @@ _num_words = [ "քսան" "երեսուն", "քառասուն", "հիսուն", - "վաթցսուն", + "վաթսուն", "յոթանասուն", "ութսուն", - "ինիսուն", + "իննսուն", "հարյուր", "հազար", "միլիոն", From ff6a084e9cdf9114cbc8cb55fe0e9c69e4cabc34 Mon Sep 17 00:00:00 2001 From: Karen Hambardzumyan <mahnerak@gmail.com> Date: Sat, 20 Jun 2020 13:14:26 +0400 Subject: [PATCH 104/203] Create mahnerak.md (#5615) --- .github/contributors/mahnerak.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/mahnerak.md diff --git a/.github/contributors/mahnerak.md b/.github/contributors/mahnerak.md new file mode 100644 index 000000000..cc7739681 --- /dev/null +++ b/.github/contributors/mahnerak.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Karen Hambardzumyan | +| Company name (if applicable) | YerevaNN | +| Title or role (if applicable) | Researcher | +| Date | 2020-06-19 | +| GitHub username | mahnerak | +| Website (optional) | https://mahnerak.com/| From 8283df80e91d7fba385b12c42eb976ab30ca1e2a Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 20 Jun 2020 14:15:04 +0200 Subject: [PATCH 105/203] Tidy up and auto-format --- spacy/cli/pretrain.py | 4 +- spacy/cli/train_from_config.py | 86 ++++++++++--------- spacy/lemmatizer.py | 9 +- spacy/ml/__init__.py | 2 +- spacy/ml/_biluo.py | 23 +++-- spacy/ml/_iob.py | 22 +++-- spacy/ml/_precomputable_affine.py | 2 +- spacy/ml/models/__init__.py | 2 +- spacy/ml/models/multi_task.py | 17 +++- spacy/ml/models/parser.py | 8 +- spacy/ml/models/simple_ner.py | 29 ++++--- spacy/ml/models/tagger.py | 3 +- spacy/ml/models/textcat.py | 76 ++++++++++------ spacy/ml/models/tok2vec.py | 55 ++++++++---- spacy/ml/tb_framework.py | 16 ++-- spacy/pipeline/simple_ner.py | 36 ++++---- spacy/scorer.py | 41 ++++++--- spacy/tests/doc/test_add_entities.py | 14 ++- spacy/tests/parser/test_add_label.py | 17 +++- spacy/tests/parser/test_arc_eager_oracle.py | 7 +- spacy/tests/parser/test_ner.py | 21 ++++- spacy/tests/parser/test_neural_parser.py | 7 +- spacy/tests/parser/test_nn_beam.py | 7 +- spacy/tests/parser/test_preset_sbd.py | 7 +- spacy/tests/pipeline/test_entity_linker.py | 12 ++- spacy/tests/pipeline/test_morphologizer.py | 20 ++++- spacy/tests/pipeline/test_simple_ner.py | 27 +++--- spacy/tests/regression/test_issue1501-2000.py | 7 +- spacy/tests/regression/test_issue3001-3500.py | 7 +- spacy/tests/regression/test_issue3830.py | 14 ++- spacy/tests/regression/test_issue4042.py | 7 +- spacy/tests/regression/test_issue4313.py | 7 +- spacy/tests/regression/test_issue4924.py | 1 - .../tests/serialize/test_serialize_config.py | 4 +- .../serialize/test_serialize_pipeline.py | 7 +- .../serialize/test_serialize_vocab_strings.py | 6 +- spacy/tests/test_scorer.py | 3 +- spacy/tests/test_util.py | 16 ++-- spacy/util.py | 4 +- 39 files changed, 421 insertions(+), 232 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 4f4707b52..4f4029834 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -24,8 +24,8 @@ from ..gold import Example output_dir=("Directory to write models to on each epoch", "positional", None, Path), config_path=("Path to config file", "positional", None, Path), use_gpu=("Use GPU", "option", "g", int), - resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path), - epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int), + resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path), + epoch_resume=("The epoch to resume counting from when using '--resume_path'. 
Prevents unintended overwriting of existing weight files.", "option", "er", int), # fmt: on ) def pretrain( diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index f24feffab..6080b698b 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -3,7 +3,6 @@ from timeit import default_timer as timer import srsly from pydantic import BaseModel, FilePath -import plac import tqdm from pathlib import Path from wasabi import msg @@ -16,7 +15,9 @@ from ..gold import GoldCorpus from ..lookups import Lookups from .. import util from ..errors import Errors -from ..ml import models # don't remove - required to load the built-in architectures + +# Don't remove - required to load the built-in architectures +from ..ml import models # noqa: F401 registry = util.registry @@ -114,33 +115,19 @@ class ConfigSchema(BaseModel): extra = "allow" -@plac.annotations( - # fmt: off - train_path=("Location of JSON-formatted training data", "positional", None, Path), - dev_path=("Location of JSON-formatted development data", "positional", None, Path), - config_path=("Path to config file", "positional", None, Path), - output_path=("Output directory to store model in", "option", "o", Path), - init_tok2vec=( - "Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", - Path), - raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), - verbose=("Display more information for debugging purposes", "flag", "VV", bool), - use_gpu=("Use GPU", "option", "g", int), - tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), - omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), - # fmt: on -) def train_cli( - train_path, - dev_path, - config_path, - output_path=None, - init_tok2vec=None, - raw_text=None, - verbose=False, - use_gpu=-1, - tag_map_path=None, - omit_extra_lookups=False, + # fmt: off + train_path: ("Location of JSON-formatted training data", "positional", None, Path), + dev_path: ("Location of JSON-formatted development data", "positional", None, Path), + config_path: ("Path to config file", "positional", None, Path), + output_path: ("Output directory to store model in", "option", "o", Path) = None, + init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, + raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, + verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False, + use_gpu: ("Use GPU", "option", "g", int) = -1, + tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, + omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False, + # fmt: on ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's @@ -212,7 +199,7 @@ def train( config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) if config["training"].get("use_pytorch_for_gpu_memory"): - # It feels kind of weird to not have a default for this. + # It feels kind of weird to not have a default for this. 
use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) @@ -227,7 +214,9 @@ def train( # verify textcat config if "textcat" in nlp_config["pipeline"]: textcat_labels = set(nlp.get_pipe("textcat").labels) - textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"] + textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][ + "exclusive_classes" + ] # check whether the setting 'exclusive_classes' corresponds to the provided training data if textcat_multilabel: @@ -255,7 +244,9 @@ def train( "to 'false' in the config to train a classifier with classes " "that are not mutually exclusive." ) - msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") + msg.info( + f"Initialized textcat component for {len(textcat_labels)} unique labels" + ) nlp.get_pipe("textcat").labels = tuple(textcat_labels) # if 'positive_label' is provided: double check whether it's in the data and the task is binary @@ -281,9 +272,7 @@ def train( nlp.resume_training() else: msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") - nlp.begin_training( - lambda: corpus.train_examples - ) + nlp.begin_training(lambda: corpus.train_examples) # Update tag map with provided mapping nlp.vocab.morphology.tag_map.update(tag_map) @@ -310,8 +299,7 @@ def train( tok2vec = tok2vec.get(subpath) if not tok2vec: msg.fail( - f"Could not locate the tok2vec model at {tok2vec_path}.", - exits=1, + f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1, ) tok2vec.from_bytes(weights_data) @@ -376,7 +364,7 @@ def create_train_batches(nlp, corpus, cfg): train_examples = list( corpus.train_dataset( nlp, - noise_level=0.0, # I think this is deprecated? + noise_level=0.0, # I think this is deprecated? 
orth_variant_level=cfg["orth_variant_level"], gold_preproc=cfg["gold_preproc"], max_length=cfg["max_length"], @@ -429,7 +417,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): try: weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) except KeyError as e: - raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys()))) + raise KeyError( + Errors.E983.format( + dict_name="score_weights", key=str(e), keys=list(scores.keys()) + ) + ) scores["speed"] = wps return weighted_score, scores @@ -578,15 +570,25 @@ def setup_printer(training, nlp): ] except KeyError as e: raise KeyError( - Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys()))) + Errors.E983.format( + dict_name="scores (losses)", + key=str(e), + keys=list(info["losses"].keys()), + ) + ) try: scores = [ - "{0:.2f}".format(float(info["other_scores"][col])) - for col in score_cols + "{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols ] except KeyError as e: - raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys()))) + raise KeyError( + Errors.E983.format( + dict_name="scores (other)", + key=str(e), + keys=list(info["other_scores"].keys()), + ) + ) data = ( [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] ) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index c4944407f..7d6bfbc12 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,4 +1,3 @@ -from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN from .errors import Errors from .lookups import Lookups from .parts_of_speech import NAMES as UPOS_NAMES @@ -51,7 +50,13 @@ class Lemmatizer(object): index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) - if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))): + if not any( + ( + index_table.get(univ_pos), + exc_table.get(univ_pos), + rules_table.get(univ_pos), + ) + ): if univ_pos == "propn": return [string] else: diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index aed4fa323..c382d915b 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -1 +1 @@ -from .models import * +from .models import * # noqa: F401, F403 diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py index 28339089a..77a2a6a77 100644 --- a/spacy/ml/_biluo.py +++ b/spacy/ml/_biluo.py @@ -1,11 +1,8 @@ """Thinc layer to do simpler transition-based parsing, NER, etc.""" -from typing import List, Tuple, Dict, Optional +from typing import Dict, Optional import numpy -from thinc.api import Ops, Model, with_array, softmax_activation, padded2list -from thinc.api import to_numpy -from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d - -from ..tokens import Doc +from thinc.api import Model +from thinc.types import Padded, Floats3d def BILUO() -> Model[Padded, Padded]: @@ -14,11 +11,11 @@ def BILUO() -> Model[Padded, Padded]: forward, init=init, dims={"nO": None}, - attrs={"get_num_actions": get_num_actions} + attrs={"get_num_actions": get_num_actions}, ) -def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): +def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): if X is not None and Y is not None: if X.data.shape != Y.data.shape: # TODO: Fix error @@ -49,12 +46,12 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): masks = 
model.ops.alloc3f(*Y.shape) max_value = Xp.data.max() for t in range(Xp.data.shape[0]): - is_last = (Xp.lengths < (t+2)).astype("i") + is_last = (Xp.lengths < (t + 2)).astype("i") masks[t] = valid_transitions[is_last, prev_actions] # Don't train the out-of-bounds sequences. - masks[t, Xp.size_at_t[t]:] = 0 + masks[t, Xp.size_at_t[t] :] = 0 # Valid actions get 0*10e8, invalid get large negative value - Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10) + Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10) prev_actions = Y[t].argmax(axis=-1) def backprop_biluo(dY: Padded) -> Padded: @@ -83,13 +80,13 @@ def _get_transition_table( B_start, B_end = (0, n_labels) I_start, I_end = (B_end, B_end + n_labels) L_start, L_end = (I_end, I_end + n_labels) - U_start, U_end = (L_end, L_end + n_labels) + U_start, U_end = (L_end, L_end + n_labels) # noqa: F841 # Using ranges allows us to set specific cells, which is necessary to express # that only actions of the same label are valid continuations. B_range = numpy.arange(B_start, B_end) I_range = numpy.arange(I_start, I_end) L_range = numpy.arange(L_start, L_end) - O_action = U_end + O_action = U_end # noqa: F841 # If this is the last token and the previous action was B or I, only L # of that label is valid table[1, B_range, L_range] = 1 diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py index 0ce9a71e6..9f385ec0d 100644 --- a/spacy/ml/_iob.py +++ b/spacy/ml/_iob.py @@ -1,9 +1,7 @@ """Thinc layer to do simpler transition-based parsing, NER, etc.""" -from typing import List, Tuple, Dict, Optional -from thinc.api import Ops, Model, with_array, softmax_activation, padded2list -from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d - -from ..tokens import Doc +from typing import Dict, Optional +from thinc.api import Ops, Model +from thinc.types import Padded, Floats3d def IOB() -> Model[Padded, Padded]: @@ -12,11 +10,11 @@ def IOB() -> Model[Padded, Padded]: forward, init=init, dims={"nO": None}, - attrs={"get_num_actions": get_num_actions} + attrs={"get_num_actions": get_num_actions}, ) -def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): +def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): if X is not None and Y is not None: if X.data.shape != Y.data.shape: # TODO: Fix error @@ -48,14 +46,14 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): for t in range(Xp.data.shape[0]): masks[t] = valid_transitions[prev_actions] # Don't train the out-of-bounds sequences. - masks[t, Xp.size_at_t[t]:] = 0 + masks[t, Xp.size_at_t[t] :] = 0 # Valid actions get 0*10e8, invalid get -1*10e8 - Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8) + Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8) prev_actions = Y[t].argmax(axis=-1) def backprop_biluo(dY: Padded) -> Padded: # Masking the gradient seems to do poorly here. But why? 
- #dY.data *= masks + # dY.data *= masks return dY return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo @@ -83,10 +81,10 @@ def _get_transition_table( B_range = ops.xp.arange(B_start, B_end) I_range = ops.xp.arange(I_start, I_end) # B and O are always valid - table[:, B_start : B_end] = 1 + table[:, B_start:B_end] = 1 table[:, O_action] = 1 # I can only follow a matching B table[B_range, I_range] = 1 - + _cache[n_actions] = table return table diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index f4b5b16fe..215cdeda1 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -84,7 +84,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids): # # (ids < 0).T @ dY mask = model.ops.asarray(ids < 0, dtype="f") - d_pad = model.ops.gemm(mask, dY.reshape(nB, nO*nP), trans1=True) + d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True) return d_pad.reshape((1, nF, nO, nP)) diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index 40cde2437..dd58dab00 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -1,6 +1,6 @@ from .entity_linker import * # noqa from .parser import * # noqa -from .simple_ner import * +from .simple_ner import * # noqa from .tagger import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 4a360a9e6..b3a9e0815 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None): softmax = Softmax(nO=nO, nI=token_vector_width * 2) model = chain( tok2vec, - Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0), + Maxout( + nO=token_vector_width * 2, + nI=token_vector_width, + nP=maxout_pieces, + dropout=0.0, + ), LayerNorm(token_vector_width * 2), softmax, ) @@ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None): # nO = vocab.vectors.data.shape[1] output_layer = chain( Maxout( - nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0 + nO=nO, + nI=tok2vec.get_dim("nO"), + nP=maxout_pieces, + normalize=True, + dropout=0.0, ), Linear(nO=nO, nI=nO, init_W=zero_init), ) @@ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): def mlm_forward(model, docs, is_train): mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) - output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop + output, backprop = model.get_ref("wrapped-model").begin_update( + docs + ) # drop=drop def mlm_backward(d_output): d_output *= 1 - mask diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index bdcd709b1..47c94cfa1 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -16,18 +16,14 @@ def build_tb_parser_model( nO=None, ): t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec = chain( - tok2vec, - with_array(Linear(hidden_width, t2v_width)), - list2array(), - ) + tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),) tok2vec.set_dim("nO", hidden_width) lower = PrecomputableAffine( nO=hidden_width if use_upper else nO, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), - nP=maxout_pieces + nP=maxout_pieces, ) if use_upper: with use_ops("numpy"): diff --git 
a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py index 01661f55b..1fb5a71c0 100644 --- a/spacy/ml/models/simple_ner.py +++ b/spacy/ml/models/simple_ner.py @@ -1,9 +1,8 @@ -import functools -from typing import List, Tuple, Dict, Optional -from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list +from typing import List +from thinc.api import Model, Linear, with_array, softmax_activation, padded2list from thinc.api import chain, list2padded, configure_normal_init from thinc.api import Dropout -from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d +from thinc.types import Floats2d from ...tokens import Doc from .._biluo import BILUO @@ -12,12 +11,12 @@ from ...util import registry @registry.architectures.register("spacy.BiluoTagger.v1") -def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: +def BiluoTagger( + tok2vec: Model[List[Doc], List[Floats2d]] +) -> Model[List[Doc], List[Floats2d]]: biluo = BILUO() linear = Linear( - nO=None, - nI=tok2vec.get_dim("nO"), - init_W=configure_normal_init(mean=0.02) + nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02) ) model = chain( tok2vec, @@ -25,7 +24,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L with_array(chain(Dropout(0.1), linear)), biluo, with_array(softmax_activation()), - padded2list() + padded2list(), ) return Model( @@ -35,11 +34,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L layers=[model, linear], refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]} + attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, ) + @registry.architectures.register("spacy.IOBTagger.v1") -def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: +def IOBTagger( + tok2vec: Model[List[Doc], List[Floats2d]] +) -> Model[List[Doc], List[Floats2d]]: biluo = IOB() linear = Linear(nO=None, nI=tok2vec.get_dim("nO")) model = chain( @@ -48,7 +50,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis with_array(linear), biluo, with_array(softmax_activation()), - padded2list() + padded2list(), ) return Model( @@ -58,11 +60,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis layers=[model], refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]} + attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, ) - def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None: if model.get_dim("nO") is None and Y: model.set_dim("nO", Y[0].shape[1]) diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 00e268ede..7fe417321 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -1,5 +1,4 @@ -from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout -from thinc.api import glorot_uniform_init +from thinc.api import zero_init, with_array, Softmax, chain, Model from ...util import registry diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index a02e1a5a1..9db6f982f 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,11 +1,12 @@ -from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention -from thinc.api import chain, concatenate, clone, Dropout -from thinc.api import 
SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window -from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor +from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic +from thinc.api import ParametricAttention, chain, concatenate, clone, Dropout +from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout +from thinc.api import reduce_sum, Relu, residual, expand_window, HashEmbed +from thinc.api import with_ragged, with_array, with_cpu, uniqued, FeatureExtractor from ..spacy_vectors import SpacyVectors from ... import util -from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER +from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry from ..extract_ngrams import extract_ngrams @@ -50,14 +51,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO @registry.architectures.register("spacy.TextCat.v1") -def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size, - window_size, conv_depth, dropout, nO=None): +def build_text_classifier( + width, + embed_size, + pretrained_vectors, + exclusive_classes, + ngram_size, + window_size, + conv_depth, + dropout, + nO=None, +): cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout) - prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout) - suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout) - shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout) + lower = HashEmbed( + nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout + ) + prefix = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout + ) + suffix = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout + ) + shape = HashEmbed( + nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout + ) width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) trained_vectors = FeatureExtractor(cols) >> with_array( @@ -83,30 +101,38 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class vectors_width = width tok2vec = vector_layer >> with_array( Maxout(width, vectors_width, normalize=True) - >> residual((expand_window(window_size=window_size) - >> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth, + >> residual( + ( + expand_window(window_size=window_size) + >> Maxout( + nO=width, nI=width * ((window_size * 2) + 1), normalize=True + ) + ) + ) + ** conv_depth, pad=conv_depth, ) cnn_model = ( - tok2vec - >> list2ragged() - >> ParametricAttention(width) - >> reduce_sum() - >> residual(Maxout(nO=width, nI=width)) - >> Linear(nO=nO, nI=width) - >> Dropout(0.0) + tok2vec + >> list2ragged() + >> ParametricAttention(width) + >> reduce_sum() + >> residual(Maxout(nO=width, nI=width)) + >> Linear(nO=nO, nI=width) + >> Dropout(0.0) ) linear_model = build_bow_text_classifier( - nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False + nO=nO, + ngram_size=ngram_size, + exclusive_classes=exclusive_classes, + no_output_layer=False, ) - nO_double = nO*2 if nO else None + nO_double = nO * 2 if nO else None if exclusive_classes: output_layer = 
Softmax(nO=nO, nI=nO_double) else: - output_layer = ( - Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() - ) + output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() model = (linear_model | cnn_model) >> output_layer model.set_ref("tok2vec", tok2vec) if model.has_dim("nO") is not False: diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 53798e57c..b1bed1ea1 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -99,7 +99,13 @@ def hash_charembed_cnn( @registry.architectures.register("spacy.HashEmbedBiLSTM.v1") def hash_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout + pretrained_vectors, + width, + depth, + embed_size, + subword_features, + maxout_pieces, + dropout, ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1( @registry.architectures.register("spacy.LayerNormalizedMaxout.v1") def LayerNormalizedMaxout(width, maxout_pieces): - return Maxout( - nO=width, - nP=maxout_pieces, - dropout=0.0, - normalize=True, - ) + return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,) @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout): +def MultiHashEmbed( + columns, width, rows, use_subwords, pretrained_vectors, mix, dropout +): norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) if use_subwords: - prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout) - suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout) - shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout) + prefix = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout + ) + suffix = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout + ) + shape = HashEmbed( + nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout + ) if pretrained_vectors: glove = StaticVectors( @@ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth): cnn = chain( expand_window(window_size=window_size), - Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True), + Maxout( + nO=width, + nI=width * ((window_size * 2) + 1), + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ), ) model = clone(residual(cnn), depth) model.set_dim("nO", width) @@ -247,11 +262,19 @@ def build_Tok2Vec_model( subword_features = False cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout) + norm = HashEmbed( + nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout + ) if subword_features: - prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout) - suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout) - shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout) + prefix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout + ) + suffix = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), 
dropout=dropout + ) + shape = HashEmbed( + nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout + ) else: prefix, suffix, shape = (None, None, None) if pretrained_vectors is not None: diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 251189389..69b40cbcf 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()): attrs={ "has_upper": has_upper, "unseen_classes": set(unseen_classes), - "resize_output": resize_output - } + "resize_output": resize_output, + }, ) @@ -31,14 +31,14 @@ def forward(model, X, is_train): model.layers, unseen_classes=model.attrs["unseen_classes"], train=is_train, - has_upper=model.attrs["has_upper"] + has_upper=model.attrs["has_upper"], ) return step_model, step_model.finish_steps def init(model, X=None, Y=None): - tok2vec = model.get_ref("tok2vec").initialize(X=X) + tok2vec = model.get_ref("tok2vec").initialize(X=X) # noqa: F841 lower = model.get_ref("lower").initialize() if model.attrs["has_upper"]: statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) @@ -46,7 +46,7 @@ def init(model, X=None, Y=None): def resize_output(model, new_nO): - tok2vec = model.get_ref("tok2vec") + tok2vec = model.get_ref("tok2vec") # noqa: F841 lower = model.get_ref("lower") upper = model.get_ref("upper") if not model.attrs["has_upper"]: @@ -62,7 +62,7 @@ def resize_output(model, new_nO): nI = None if smaller.has_dim("nI"): nI = smaller.get_dim("nI") - with use_ops('numpy'): + with use_ops("numpy"): larger = Linear(nO=new_nO, nI=nI) larger.init = smaller.init # it could be that the model is not initialized yet, then skip this bit @@ -74,8 +74,8 @@ def resize_output(model, new_nO): # Weights are stored in (nr_out, nr_in) format, so we're basically # just adding rows here. 
if smaller.has_dim("nO"): - larger_W[:smaller.get_dim("nO")] = smaller_W - larger_b[:smaller.get_dim("nO")] = smaller_b + larger_W[: smaller.get_dim("nO")] = smaller_W + larger_b[: smaller.get_dim("nO")] = smaller_b for i in range(smaller.get_dim("nO"), new_nO): model.attrs["unseen_classes"].add(i) diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index c674046af..58f647b67 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -21,9 +21,7 @@ class SimpleNER(Pipe): self.model = model self.cfg = {"labels": []} self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), - normalize=True, - missing_value=None + names=self.get_tag_names(), normalize=True, missing_value=None ) assert self.model is not None @@ -38,21 +36,21 @@ class SimpleNER(Pipe): def add_label(self, label): if label not in self.cfg["labels"]: self.cfg["labels"].append(label) - + def get_tag_names(self): if self.is_biluo: return ( - [f"B-{label}" for label in self.labels] + - [f"I-{label}" for label in self.labels] + - [f"L-{label}" for label in self.labels] + - [f"U-{label}" for label in self.labels] + - ["O"] + [f"B-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels] + + [f"L-{label}" for label in self.labels] + + [f"U-{label}" for label in self.labels] + + ["O"] ) else: return ( - [f"B-{label}" for label in self.labels] + - [f"I-{label}" for label in self.labels] + - ["O"] + [f"B-{label}" for label in self.labels] + + [f"I-{label}" for label in self.labels] + + ["O"] ) def predict(self, docs: List[Doc]) -> List[Floats2d]: @@ -108,7 +106,7 @@ class SimpleNER(Pipe): def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): self.cfg.update(kwargs) - if not hasattr(get_examples, '__call__'): + if not hasattr(get_examples, "__call__"): gold_tuples = get_examples get_examples = lambda: gold_tuples labels = _get_labels(get_examples()) @@ -117,14 +115,12 @@ class SimpleNER(Pipe): labels = self.labels n_actions = self.model.attrs["get_num_actions"](len(labels)) self.model.set_dim("nO", n_actions) - self.model.initialize() + self.model.initialize() if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) link_vectors_to_models(self.vocab) self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), - normalize=True, - missing_value=None + names=self.get_tag_names(), normalize=True, missing_value=None ) return sgd @@ -135,7 +131,7 @@ class SimpleNER(Pipe): def _has_ner(eg): for ner_tag in eg.gold.ner: - if ner_tag != "-" and ner_tag != None: + if ner_tag != "-" and ner_tag is not None: return True else: return False @@ -145,7 +141,7 @@ def _get_labels(examples): labels = set() for eg in examples: for ner_tag in eg.token_annotation.entities: - if ner_tag != 'O' and ner_tag != '-': - _, label = ner_tag.split('-', 1) + if ner_tag != "O" and ner_tag != "-": + _, label = ner_tag.split("-", 1) labels.add(label) return list(sorted(labels)) diff --git a/spacy/scorer.py b/spacy/scorer.py index 288da23aa..af74db80e 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -98,7 +98,9 @@ class Scorer(object): for name, component in pipeline: if name == "textcat": self.textcat_multilabel = component.model.attrs["multi_label"] - self.textcat_positive_label = component.cfg.get("positive_label", None) + self.textcat_positive_label = component.cfg.get( + "positive_label", None + ) for label in component.cfg.get("labels", []): self.textcat_auc_per_cat[label] = ROCAUCScore() 
self.textcat_f_per_cat[label] = PRFScore() @@ -119,19 +121,19 @@ class Scorer(object): @property def morphs_acc(self): - """RETURNS (float): Morph tag accuracy (morphological features, + """RETURNS (float): Morph tag accuracy (morphological features, i.e. `Token.morph`). """ - return self.morphs.fscore * 100 + return self.morphs.fscore * 100 @property def morphs_per_type(self): - """RETURNS (dict): Scores per dependency label. + """RETURNS (dict): Scores per dependency label. """ - return { - k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} - for k, v in self.morphs_per_feat.items() - } + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.morphs_per_feat.items() + } @property def sent_p(self): @@ -302,7 +304,15 @@ class Scorer(object): gold_morphs_per_feat = {} gold_sent_starts = set() gold_ents = set(tags_to_entities(orig.entities)) - for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts): + for id_, tag, pos, morph, head, dep, sent_start in zip( + orig.ids, + orig.tags, + orig.pos, + orig.morphs, + orig.heads, + orig.deps, + orig.sent_starts, + ): gold_tags.add((id_, tag)) gold_pos.add((id_, pos)) gold_morphs.add((id_, morph)) @@ -400,7 +410,10 @@ class Scorer(object): self.pos.score_set(cand_pos, gold_pos) self.morphs.score_set(cand_morphs, gold_morphs) for field in self.morphs_per_feat: - self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set())) + self.morphs_per_feat[field].score_set( + cand_morphs_per_feat.get(field, set()), + gold_morphs_per_feat.get(field, set()), + ) self.sent_starts.score_set(cand_sent_starts, gold_sent_starts) self.labelled.score_set(cand_deps, gold_deps) for dep in self.labelled_per_dep: @@ -412,7 +425,9 @@ class Scorer(object): ) if ( len(gold.cats) > 0 - and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats) + and set(self.textcat_f_per_cat) + == set(self.textcat_auc_per_cat) + == set(gold.cats) and set(gold.cats) == set(doc.cats) ): goldcat = max(gold.cats, key=gold.cats.get) @@ -424,10 +439,10 @@ class Scorer(object): ) for label in set(gold.cats): self.textcat_auc_per_cat[label].score_set( - doc.cats[label], gold.cats[label] + doc.cats[label], gold.cats[label] ) self.textcat_f_per_cat[label].score_set( - set([label]) & set([candcat]), set([label]) & set([goldcat]) + set([label]) & set([candcat]), set([label]) & set([goldcat]) ) elif len(self.textcat_f_per_cat) > 0: model_labels = set(self.textcat_f_per_cat) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 879334056..b9c230516 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(en_vocab, default_ner(), **config) ner.begin_training([]) ner(doc) @@ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab): def test_ents_reset(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 
1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(en_vocab, default_ner(), **config) ner.begin_training([]) ner(doc) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index f9663ba32..893465b45 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -1,9 +1,8 @@ import pytest -from thinc.api import Adam, NumpyOps +from thinc.api import Adam from spacy.attrs import NORM from spacy.gold import GoldParse from spacy.vocab import Vocab - from spacy.pipeline.defaults import default_parser, default_ner from spacy.tokens import Doc from spacy.pipeline import DependencyParser, EntityRecognizer @@ -17,7 +16,12 @@ def vocab(): @pytest.fixture def parser(vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(vocab, default_parser(), **config) return parser @@ -58,7 +62,12 @@ def test_add_label(parser): def test_add_label_deserializes_correctly(): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner1 = EntityRecognizer(Vocab(), default_ner(), **config) ner1.add_label("C") ner1.add_label("B") diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 5d265261f..42b62251e 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -138,7 +138,12 @@ def test_get_oracle_actions(): deps.append(dep) ents.append(ent) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(doc.vocab, default_parser(), **config) parser.moves.add_action(0, "") parser.moves.add_action(1, "") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index b0a8109dc..e82de03bf 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -138,7 +138,12 @@ def test_accept_blocked_token(): # 1. test normal behaviour nlp1 = English() doc1 = nlp1("I live in New York") - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config) assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] @@ -157,7 +162,12 @@ def test_accept_blocked_token(): # 2. 
test blocking behaviour nlp2 = English() doc2 = nlp2("I live in New York") - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config) # set "New York" to a blocked entity @@ -215,7 +225,12 @@ def test_overwrite_token(): assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] # Check that a new ner can overwrite O - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(doc.vocab, default_ner(), **config) ner2.moves.add_action(5, "") ner2.add_label("GPE") diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 7f3e981ea..d88517fb5 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -28,7 +28,12 @@ def tok2vec(): @pytest.fixture def parser(vocab, arc_eager): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } return Parser(vocab, model=default_parser(), moves=arc_eager, **config) diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index fa5d59f9e..841eb058c 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -94,7 +94,12 @@ def test_beam_advance_too_few_scores(beam, scores): def test_beam_parse(): nlp = Language() - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser") nlp.parser.add_label("nsubj") nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index ccf7d3ba3..37a9136aa 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -16,7 +16,12 @@ def vocab(): @pytest.fixture def parser(vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(vocab, default_parser(), **config) parser.cfg["token_vector_width"] = 4 parser.cfg["hidden_width"] = 32 diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 32b434e04..62c7fbf17 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -264,11 +264,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] def test_overfitting_IO(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() - nlp.add_pipe(nlp.create_pipe('sentencizer')) + nlp.add_pipe(nlp.create_pipe("sentencizer")) # Add a custom component to recognize "Russ Cochran" as an entity for the example training data ruler = EntityRuler(nlp) - patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}] + 
patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]} + ] ruler.add_patterns(patterns) nlp.add_pipe(ruler) @@ -285,7 +287,11 @@ def test_overfitting_IO(): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) - mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) # Create the Entity Linker component and add it to the pipeline entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb}) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index f9307afc2..f052c4380 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -15,8 +15,17 @@ def test_label_types(): TRAIN_DATA = [ - ("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}), - ("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}), + ( + "I like green eggs", + { + "morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], + "pos": ["NOUN", "VERB", "ADJ", "NOUN"], + }, + ), + ( + "Eat blue ham", + {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}, + ), ] @@ -38,7 +47,12 @@ def test_overfitting_IO(): # test the trained model test_text = "I like blue eggs" doc = nlp(test_text) - gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"] + gold_morphs = [ + "Feat=N|POS=NOUN", + "Feat=V|POS=VERB", + "Feat=J|POS=ADJ", + "Feat=N|POS=NOUN", + ] assert gold_morphs == [t.morph_ for t in doc] # Also test the results are still the same after IO diff --git a/spacy/tests/pipeline/test_simple_ner.py b/spacy/tests/pipeline/test_simple_ner.py index 9d4acf2fd..024d7bd26 100644 --- a/spacy/tests/pipeline/test_simple_ner.py +++ b/spacy/tests/pipeline/test_simple_ner.py @@ -1,30 +1,31 @@ import pytest from collections import namedtuple - from thinc.api import NumpyOps from spacy.ml._biluo import BILUO, _get_transition_table -from spacy.pipeline.simple_ner import SimpleNER -import spacy -@pytest.fixture(params=[ - ["PER", "ORG", "LOC", "MISC"], - ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"] -]) +@pytest.fixture( + params=[ + ["PER", "ORG", "LOC", "MISC"], + ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"], + ] +) def labels(request): return request.param + @pytest.fixture def ops(): return NumpyOps() + def _get_actions(labels): action_names = ( - [f"B{label}" for label in labels] + \ - [f"I{label}" for label in labels] + \ - [f"L{label}" for label in labels] + \ - [f"U{label}" for label in labels] + \ - ["O"] + [f"B{label}" for label in labels] + + [f"I{label}" for label in labels] + + [f"L{label}" for label in labels] + + [f"U{label}" for label in labels] + + ["O"] ) A = namedtuple("actions", action_names) return A(**{name: i for i, name in enumerate(action_names)}) @@ -228,7 +229,7 @@ def test_transition_table(ops): assert table[0, a.O, a.Uloc] == 1 assert table[0, a.O, a.Uorg] == 1 assert table[0, a.O, a.O] == 1 - + # Last token, prev action was B assert table[1, a.Bper, a.Bper] == 0 assert table[1, a.Bper, a.Bloc] == 0 diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 177b6bb3d..6a2d16733 100644 --- 
a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -270,7 +270,12 @@ def test_issue1963(en_tokenizer): @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(Vocab(), default_ner(), **config) example = Example(doc=None) example.set_token_annotation( diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 6df437b3c..a37707379 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -196,7 +196,12 @@ def test_issue3345(): doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(doc.vocab, default_ner(), **config) # Add the OUT action. I wouldn't have thought this would be necessary... ner.moves.add_action(5, "") diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py index 15632bdf8..06b7893a7 100644 --- a/spacy/tests/regression/test_issue3830.py +++ b/spacy/tests/regression/test_issue3830.py @@ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(Vocab(), default_parser(), **config) parser.add_label("nsubj") assert "subtok" not in parser.labels @@ -16,7 +21,12 @@ def test_issue3830_no_subtok(): def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" - config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": True, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(Vocab(), default_parser(), **config) parser.add_label("nsubj") assert "subtok" not in parser.labels diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py index 4978aba44..f47290b92 100644 --- a/spacy/tests/regression/test_issue4042.py +++ b/spacy/tests/regression/test_issue4042.py @@ -74,7 +74,12 @@ def test_issue4042_bug2(): output_dir.mkdir() ner1.to_disk(output_dir) - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner2 = EntityRecognizer(vocab, default_ner(), **config) ner2.from_disk(output_dir) assert len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index 946316d85..5e2764618 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -12,7 +12,12 @@ def test_issue4313(): beam_width = 16 beam_density 
= 0.0001 nlp = English() - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } ner = EntityRecognizer(nlp.vocab, default_ner(), **config) ner.add_label("SOME_LABEL") ner.begin_training([]) diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py index b240f6d4a..10c7868a0 100644 --- a/spacy/tests/regression/test_issue4924.py +++ b/spacy/tests/regression/test_issue4924.py @@ -1,4 +1,3 @@ -import pytest from spacy.language import Language diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 870a980f2..cfb9d7381 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -112,7 +112,7 @@ def test_serialize_custom_nlp(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - tok2vec = model.get_ref("tok2vec") + tok2vec = model.get_ref("tok2vec") # noqa: F841 upper = model.get_ref("upper") # check that we have the correct settings, not the default ones @@ -132,7 +132,7 @@ def test_serialize_parser(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - tok2vec = model.get_ref("tok2vec") + tok2vec = model.get_ref("tok2vec") # noqa: F841 upper = model.get_ref("upper") # check that we have the correct settings, not the default ones diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 9c4e1f61e..abb5ccb27 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -12,7 +12,12 @@ test_parsers = [DependencyParser, EntityRecognizer] @pytest.fixture def parser(en_vocab): - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } parser = DependencyParser(en_vocab, default_parser(), **config) parser.add_label("nsubj") return parser diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index d3e82296e..e570b1025 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -35,8 +35,10 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): assert vocab1.to_bytes() == vocab1_b new_vocab1 = Vocab().from_bytes(vocab1_b) assert new_vocab1.to_bytes() == vocab1_b - assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE - assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings)) + assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE + assert sorted([s for s in new_vocab1.strings]) == sorted( + strings1 + list(default_strings) + ) @pytest.mark.parametrize("strings1,strings2", test_strings) diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index d750a8202..2e1cf2730 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -40,6 +40,7 @@ test_ner_apple = [ ] ] + @pytest.fixture def tagged_doc(): text = "Sarah's sister flew to Silicon Valley via London." 
@@ -184,7 +185,7 @@ def test_tag_score(tagged_doc): tagged_doc, tags=[t.tag_ for t in tagged_doc], pos=[t.pos_ for t in tagged_doc], - morphs=[t.morph_ for t in tagged_doc] + morphs=[t.morph_ for t in tagged_doc], ) scorer.score((tagged_doc, gold)) results = scorer.scores diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 1410755db..a7258449d 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -13,7 +13,7 @@ from spacy.util import minibatch_by_words ([400, 400, 199, 3], [4]), ([400, 400, 199, 3, 200], [3, 2]), ([400, 400, 199, 3, 1], [5]), - ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded + ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded ([400, 400, 199, 3, 1, 200], [3, 3]), ([400, 400, 199, 3, 1, 999], [3, 3]), ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), @@ -28,7 +28,11 @@ def test_util_minibatch(doc_sizes, expected_batches): examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 - batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True)) + batches = list( + minibatch_by_words( + examples=examples, size=batch_size, tolerance=tol, discard_oversize=True + ) + ) assert [len(batch) for batch in batches] == expected_batches max_size = batch_size + batch_size * tol @@ -53,7 +57,9 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches): examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 - batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False)) + batches = list( + minibatch_by_words( + examples=examples, size=batch_size, tolerance=tol, discard_oversize=False + ) + ) assert [len(batch) for batch in batches] == expected_batches - - diff --git a/spacy/util.py b/spacy/util.py index d2d87bef9..ad3dc3635 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -697,7 +697,9 @@ def decaying(start, stop, decay): curr -= decay -def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False): +def minibatch_by_words( + examples, size, count_words=len, tolerance=0.2, discard_oversize=False +): """Create minibatches of roughly a given number of words. If any examples are longer than the specified batch length, they will appear in a batch by themselves, or be discarded if discard_oversize=True.""" From f91e9e8c8437020505c8af07ff9e123ef5324293 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 20 Jun 2020 14:47:17 +0200 Subject: [PATCH 106/203] Remove F841 [ci skip] --- spacy/ml/_biluo.py | 4 ++-- spacy/ml/tb_framework.py | 4 ++-- spacy/tests/serialize/test_serialize_config.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py index 77a2a6a77..ab2bd9e10 100644 --- a/spacy/ml/_biluo.py +++ b/spacy/ml/_biluo.py @@ -80,13 +80,13 @@ def _get_transition_table( B_start, B_end = (0, n_labels) I_start, I_end = (B_end, B_end + n_labels) L_start, L_end = (I_end, I_end + n_labels) - U_start, U_end = (L_end, L_end + n_labels) # noqa: F841 + U_start, U_end = (L_end, L_end + n_labels) # Using ranges allows us to set specific cells, which is necessary to express # that only actions of the same label are valid continuations. 
B_range = numpy.arange(B_start, B_end) I_range = numpy.arange(I_start, I_end) L_range = numpy.arange(L_start, L_end) - O_action = U_end # noqa: F841 + O_action = U_end # If this is the last token and the previous action was B or I, only L # of that label is valid table[1, B_range, L_range] = 1 diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 69b40cbcf..f7dad565e 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -38,7 +38,7 @@ def forward(model, X, is_train): def init(model, X=None, Y=None): - tok2vec = model.get_ref("tok2vec").initialize(X=X) # noqa: F841 + tok2vec = model.get_ref("tok2vec").initialize(X=X) lower = model.get_ref("lower").initialize() if model.attrs["has_upper"]: statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) @@ -46,7 +46,7 @@ def init(model, X=None, Y=None): def resize_output(model, new_nO): - tok2vec = model.get_ref("tok2vec") # noqa: F841 + tok2vec = model.get_ref("tok2vec") lower = model.get_ref("lower") upper = model.get_ref("upper") if not model.attrs["has_upper"]: diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index cfb9d7381..870a980f2 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -112,7 +112,7 @@ def test_serialize_custom_nlp(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - tok2vec = model.get_ref("tok2vec") # noqa: F841 + tok2vec = model.get_ref("tok2vec") upper = model.get_ref("upper") # check that we have the correct settings, not the default ones @@ -132,7 +132,7 @@ def test_serialize_parser(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - tok2vec = model.get_ref("tok2vec") # noqa: F841 + tok2vec = model.get_ref("tok2vec") upper = model.get_ref("upper") # check that we have the correct settings, not the default ones From 0cdb631e6c328bdc985f631125dcbb3e5a55c673 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 20 Jun 2020 16:02:42 +0200 Subject: [PATCH 107/203] Fix merge errors --- spacy/pipeline/pipes.pyx | 2 +- spacy/tokenizer.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 946cd5366..7c800eed8 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1337,7 +1337,7 @@ class EntityLinker(Pipe): final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - sent_doc = doc[start_token:end_token].as_doc() + sent_doc = doc[sent.start:sent.end].as_doc() # currently, the context is the same for each entity in a sentence (should be refined) sentence_encoding = self.model([sent_doc])[0] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index ef5b14d87..b40113460 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -782,7 +782,7 @@ cdef class Tokenizer: "suffix_search": lambda b: data.setdefault("suffix_search", b), "infix_finditer": lambda b: data.setdefault("infix_finditer", b), "token_match": lambda b: data.setdefault("token_match", b), - "url_match": lambda b: data.setdefault("url_match", b) + "url_match": lambda b: data.setdefault("url_match", b), "exceptions": lambda b: data.setdefault("rules", b) } exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) @@ -795,7 +795,7 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], str): self.token_match = 
re.compile(data["token_match"]).match - if "url_match" in data and isinstance(data["url_match"], basestring_): + if "url_match" in data and isinstance(data["url_match"], str): self.url_match = re.compile(data["url_match"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions From 296b5d633b94ca51ed038b31d207edb5f53e0acb Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 20 Jun 2020 16:11:13 +0200 Subject: [PATCH 108/203] Remove references to Python 2 / is_python2 --- spacy/tests/regression/test_issue5230.py | 19 +++++++------------ .../serialize/test_serialize_vocab_strings.py | 2 -- spacy/tests/vocab_vectors/test_vectors.py | 2 -- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 2b14ff589..9e83d6818 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -8,7 +8,6 @@ from spacy.kb import KnowledgeBase, Writer from spacy.vectors import Vectors from spacy.language import Language from spacy.pipeline import Pipe -from spacy.compat import is_python2 from ..util import make_tempdir @@ -97,14 +96,12 @@ def write_obj_and_catch_warnings(obj): return list(filter(lambda x: isinstance(x, ResourceWarning), warnings_list)) -@pytest.mark.skipif(is_python2, reason="ResourceWarning needs Python 3.x") @pytest.mark.parametrize("obj", objects_to_test[0], ids=objects_to_test[1]) def test_to_disk_resource_warning(obj): warnings_list = write_obj_and_catch_warnings(obj) assert len(warnings_list) == 0 -@pytest.mark.skipif(is_python2, reason="ResourceWarning needs Python 3.x") def test_writer_with_path_py35(): writer = None with make_tempdir() as d: @@ -135,13 +132,11 @@ def test_save_and_load_knowledge_base(): pytest.fail(str(e)) -if not is_python2: +class TestToDiskResourceWarningUnittest(TestCase): + def test_resource_warning(self): + scenarios = zip(*objects_to_test) - class TestToDiskResourceWarningUnittest(TestCase): - def test_resource_warning(self): - scenarios = zip(*objects_to_test) - - for scenario in scenarios: - with self.subTest(msg=scenario[1]): - warnings_list = write_obj_and_catch_warnings(scenario[0]) - self.assertEqual(len(warnings_list), 0) + for scenario in scenarios: + with self.subTest(msg=scenario[1]): + warnings_list = write_obj_and_catch_warnings(scenario[0]) + self.assertEqual(len(warnings_list), 0) diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index f0bad9c10..e570b1025 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -2,7 +2,6 @@ import pytest import pickle from spacy.vocab import Vocab from spacy.strings import StringStore -from spacy.compat import is_python2 from ..util import make_tempdir @@ -135,7 +134,6 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2): assert list(sstore1_d) != list(sstore2_d) -@pytest.mark.skipif(is_python2, reason="Dict order? 
Not sure if worth investigating") @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_pickle_vocab(strings, lex_attr): vocab = Vocab(strings=strings) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 819338eeb..cc95252a6 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -6,7 +6,6 @@ from spacy.vectors import Vectors from spacy.tokenizer import Tokenizer from spacy.strings import hash_string from spacy.tokens import Doc -from spacy.compat import is_python2 from ..util import add_vecs_to_vocab, get_cosine, make_tempdir @@ -336,7 +335,6 @@ def test_vocab_prune_vectors(): assert_allclose(similarity, get_cosine(data[0], data[2]), atol=1e-4, rtol=1e-3) -@pytest.mark.skipif(is_python2, reason="Dict order? Not sure if worth investigating") def test_vectors_serialize(): data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") v = Vectors(data=data, keys=["A", "B", "C"]) From 63c22969f4cc73cbee577fd026b46f2c4ecff43e Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 20 Jun 2020 16:17:48 +0200 Subject: [PATCH 109/203] Update test_issue5230.py --- spacy/tests/regression/test_issue5230.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 9e83d6818..42b08eeff 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -76,6 +76,7 @@ def entity_linker(): # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + kb.add_entity("test", 0.0, zeros((1, 1), dtype="f")) entity_linker.set_kb(kb) entity_linker.begin_training(pipeline=nlp.pipeline) return entity_linker From 5424b70e51277049fac470a5c5458830202d03f0 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 20 Jun 2020 16:18:53 +0200 Subject: [PATCH 110/203] Remove v2 test --- spacy/tests/test_misc.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index cc6d3a57d..5f9e72f79 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -107,36 +107,6 @@ def test_load_model_blank_shortcut(): util.load_model("blank:fjsfijsdof") -def test_load_model_version_compat(): - """Test warnings for various spacy_version specifications in meta. 
Since - this is more of a hack for v2, manually specify the current major.minor - version to simplify test creation.""" - nlp = util.load_model("blank:en") - assert nlp.meta["spacy_version"].startswith(">=2.3") - with make_tempdir() as d: - # no change: compatible - nlp.to_disk(d) - meta_path = Path(d / "meta.json") - util.get_model_meta(d) - - # additional compatible upper pin - nlp.meta["spacy_version"] = ">=2.3.0,<2.4.0" - srsly.write_json(meta_path, nlp.meta) - util.get_model_meta(d) - - # incompatible older version - nlp.meta["spacy_version"] = ">=2.2.5" - srsly.write_json(meta_path, nlp.meta) - with pytest.warns(UserWarning): - util.get_model_meta(d) - - # invalid version specification - nlp.meta["spacy_version"] = ">@#$%_invalid_version" - srsly.write_json(meta_path, nlp.meta) - with pytest.warns(UserWarning): - util.get_model_meta(d) - - @pytest.mark.parametrize( "version,constraint,compatible", [ From 988d2a4edaaf654b3076367868781ede5cf4e0da Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 20 Jun 2020 09:43:12 -0700 Subject: [PATCH 111/203] Add --code-path option to train CLI (#5618) --- spacy/cli/train_from_config.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 6080b698b..14e6d5b56 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -121,6 +121,7 @@ def train_cli( dev_path: ("Location of JSON-formatted development data", "positional", None, Path), config_path: ("Path to config file", "positional", None, Path), output_path: ("Output directory to store model in", "option", "o", Path) = None, + code_path: ("Path to Python file with additional code (registered functions) to be imported", "option", "c", Path) = None, init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False, @@ -155,6 +156,13 @@ def train_cli( "the specified output path doesn't exist, the directory will be " "created for you.", ) + if code_path is not None: + if not code_path.exists(): + msg.fail("Path to Python code not found", code_path, exits=1) + try: + util.import_file("python_code", code_path) + except Exception as e: + msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) tag_map = {} From dc069e90b39ef3ca0604e950780a84482386973f Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Sat, 20 Jun 2020 21:13:11 +0200 Subject: [PATCH 112/203] fix token.morph_ for v.3 (cf PR #5517) --- spacy/tokens/morphanalysis.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index e0db52d5b..77e499968 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -73,3 +73,10 @@ cdef class MorphAnalysis: """Produce a dict representation. 
""" return self.vocab.morphology.feats_to_dict(self.to_json()) + + def __str__(self): + return self.to_json() + + def __repr__(self): + return self.to_json() + From c9242e9bf49f751907debfac92e80ae3f93057e8 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Sat, 20 Jun 2020 21:47:23 +0200 Subject: [PATCH 113/203] fix entity linker (cf PR #5548) --- spacy/pipeline/pipes.pyx | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 7c800eed8..536c2a8a5 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1219,13 +1219,11 @@ class EntityLinker(Pipe): sent_doc = doc[start_token:end_token].as_doc() sentence_docs.append(sent_doc) - sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) - loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None) - bp_context(d_scores, sgd=sgd) set_dropout_rate(self.model, drop) sentence_encodings, bp_context = self.model.begin_update(sentence_docs) loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds) bp_context(d_scores) + if sgd is not None: self.model.finish_update(sgd) @@ -1306,22 +1304,28 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] - for i, doc in enumerate(docs): sentences = [s for s in doc.sents] if len(doc) > 0: # Looping through each sentence and each entity # This may go wrong if there are entities across sentences - which shouldn't happen normally. - for sent in doc.sents: - sent_doc = sent.as_doc() + for sent_index, sent in enumerate(sentences): + # get n_neightbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + + sent_doc = doc[start_token:end_token].as_doc() # currently, the context is the same for each entity in a sentence (should be refined) sentence_encoding = self.model.predict([sent_doc])[0] xp = get_array_module(sentence_encoding) sentence_encoding_t = sentence_encoding.T sentence_norm = xp.linalg.norm(sentence_encoding_t) - for ent in sent_doc.ents: + for ent in sent.ents: entity_count += 1 to_discard = self.cfg.get("labels_discard", []) @@ -1337,21 +1341,11 @@ class EntityLinker(Pipe): final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - sent_doc = doc[sent.start:sent.end].as_doc() + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) - - for ent in sent.ents: - entity_count += 1 - - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) final_tensors.append(sentence_encoding) else: From 5cb812e0ab2f1cbddcc13b9cf442482112d28ced Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Sat, 20 Jun 2020 22:04:18 +0200 Subject: [PATCH 114/203] fix NER warn empty lookups (cf PR #5588) --- spacy/syntax/nn_parser.pyx | 2 ++ 1 file changed, 2 insertions(+) 
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 8218240f0..1dcb92016 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -608,6 +608,8 @@ cdef class Parser: def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): self.cfg.update(kwargs) + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="parser or NER")) if not hasattr(get_examples, '__call__'): gold_tuples = get_examples get_examples = lambda: gold_tuples From 256d4c27c838f0f995b3e5beb5712f649ecf9ba1 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Sat, 20 Jun 2020 22:38:00 +0200 Subject: [PATCH 115/203] fix tagger begin_training being called without examples --- spacy/pipeline/pipes.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 536c2a8a5..b3fa77732 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -394,12 +394,11 @@ class Tagger(Pipe): new_tag_map[tag] = orig_tag_map[tag] else: new_tag_map[tag] = {POS: X} - # TODO: do we still need this? - if "_SP" in orig_tag_map: - new_tag_map["_SP"] = orig_tag_map["_SP"] cdef Vocab vocab = self.vocab if new_tag_map: + if "_SP" in orig_tag_map: + new_tag_map["_SP"] = orig_tag_map["_SP"] vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology.lemmatizer, exc=vocab.morphology.exc) From 617977427897bd2c2bb1fce9ff190a3045169cf9 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Sat, 20 Jun 2020 22:49:37 +0200 Subject: [PATCH 116/203] fix test_build_dependencies by ignoring new libs --- spacy/language.py | 2 +- spacy/tests/package/test_requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 36ecad68b..94da63a1a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1166,7 +1166,7 @@ def _fix_pretrained_vectors_name(nlp): else: raise ValueError(Errors.E092) if nlp.vocab.vectors.size != 0: - link_vectors_to_models(nlp.vocab, skip_rank=True) + link_vectors_to_models(nlp.vocab) for name, proc in nlp.pipeline: if not hasattr(proc, "cfg"): continue diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index 0dc0f9d6c..a7c9a3ea4 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -10,7 +10,7 @@ def test_build_dependencies(): "mock", "flake8", ] - libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] + libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"] # check requirements.txt req_dict = {} From 12dc8ab208720e019ed02b3b63432f59982bc5bd Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Sat, 20 Jun 2020 23:07:42 +0200 Subject: [PATCH 117/203] remove redundant code from master in EntityLinker --- spacy/pipeline/pipes.pyx | 8 -------- spacy/tests/regression/test_issue5230.py | 7 +++---- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index b3fa77732..98414736b 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1136,14 +1136,6 @@ class EntityLinker(Pipe): # how many neightbour sentences to take into account self.n_sents = cfg.get("n_sents", 0) - def set_kb(self, kb): - self.kb = kb - - def require_model(self): - # Raise an error if the component's model is not initialized. 
- if getattr(self, "model", None) in (None, True, False): - raise ValueError(Errors.E109.format(name=self.name)) - def require_kb(self): # Raise an error if the knowledge base is not initialized. if len(self.kb) == 0: diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 42b08eeff..b46bf9063 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -70,14 +70,13 @@ def tagger(): def entity_linker(): nlp = Language() - nlp.add_pipe(nlp.create_pipe("entity_linker")) + kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + kb.add_entity("test", 0.0, zeros((1, 1), dtype="f")) + nlp.add_pipe(nlp.create_pipe("entity_linker", {"kb": kb})) entity_linker = nlp.get_pipe("entity_linker") # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization - kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) - kb.add_entity("test", 0.0, zeros((1, 1), dtype="f")) - entity_linker.set_kb(kb) entity_linker.begin_training(pipeline=nlp.pipeline) return entity_linker From 2f6062a8a4353a3ee8c0602acbd1dba22f857fe4 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Sat, 20 Jun 2020 23:14:45 +0200 Subject: [PATCH 118/203] add line that got removed from EntityLinker --- spacy/pipeline/pipes.pyx | 105 ++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 52 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 98414736b..4e04b96b5 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1302,71 +1302,72 @@ class EntityLinker(Pipe): # Looping through each sentence and each entity # This may go wrong if there are entities across sentences - which shouldn't happen normally. 
for sent_index, sent in enumerate(sentences): - # get n_neightbour sentences, clipped to the length of the document - start_sentence = max(0, sent_index - self.n_sents) - end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + if sent.ents: + # get n_neightbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end - sent_doc = doc[start_token:end_token].as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model.predict([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) + sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model.predict([sent_doc])[0] + xp = get_array_module(sentence_encoding) + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) - for ent in sent.ents: - entity_count += 1 + for ent in sent.ents: + entity_count += 1 - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - final_tensors.append(sentence_encoding) - - else: - candidates = self.kb.get_candidates(ent.text) - if not candidates: - # no prediction possible for this entity - setting to NIL + to_discard = self.cfg.get("labels_discard", []) + if to_discard and ent.label_ in to_discard: + # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - - # TODO: thresholding - final_kb_ids.append(candidates[0].entity_) - final_tensors.append(sentence_encoding) - else: - random.shuffle(candidates) + candidates = self.kb.get_candidates(ent.text) + if not candidates: + # no prediction possible for this entity - setting to NIL + final_kb_ids.append(self.NIL) + final_tensors.append(sentence_encoding) - # this will set all prior probabilities to 0 if they should be excluded from the model - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.cfg.get("incl_prior", True): - prior_probs = xp.asarray([0.0 for c in candidates]) - scores = prior_probs + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate - # add in similarity from the context - if self.cfg.get("incl_context", True): - entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + final_tensors.append(sentence_encoding) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + else: + random.shuffle(candidates) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs*sims) + # this will set all prior probabilities to 0 if they should be excluded from the model + 
prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.cfg.get("incl_prior", True): + prior_probs = xp.asarray([0.0 for c in candidates]) + scores = prior_probs - # TODO: thresholding - best_index = scores.argmax().item() - best_candidate = candidates[best_index] - final_kb_ids.append(best_candidate.entity_) - final_tensors.append(sentence_encoding) + # add in similarity from the context + if self.cfg.get("incl_context", True): + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + + if len(entity_encodings) != len(prior_probs): + raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs*sims) + + # TODO: thresholding + best_index = scores.argmax().item() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) + final_tensors.append(sentence_encoding) if not (len(final_tensors) == len(final_kb_ids) == entity_count): raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) From 689600e17d0e3734b29bf758e09068b7b4413437 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Sat, 20 Jun 2020 23:23:57 +0200 Subject: [PATCH 119/203] add additional test back in (it works now) --- spacy/tests/test_lemmatizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index 4f7c0a026..050206539 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -35,6 +35,8 @@ def test_tagger_warns_no_lookups(): nlp.vocab.lookups = Lookups() assert not len(nlp.vocab.lookups) tagger = nlp.create_pipe("tagger") + with pytest.warns(UserWarning): + tagger.begin_training() nlp.add_pipe(tagger) with pytest.warns(UserWarning): nlp.begin_training() From c12713a8be09b4c9c5bd7c02ccf2f853d8698881 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 21 Jun 2020 13:44:00 +0200 Subject: [PATCH 120/203] Port CLI to Typer and add project stubs --- spacy/__main__.py | 33 +---------- spacy/about.py | 1 + spacy/cli/__init__.py | 11 +--- spacy/cli/_app.py | 31 ++++++++++ spacy/cli/convert.py | 36 ++++++++---- spacy/cli/debug_data.py | 21 ++++--- spacy/cli/download.py | 14 ++++- spacy/cli/evaluate.py | 17 +++--- spacy/cli/info.py | 11 +++- spacy/cli/init_model.py | 27 +++++---- spacy/cli/package.py | 13 +++-- spacy/cli/pretrain.py | 31 ++++------ spacy/cli/profile.py | 9 ++- spacy/cli/project.py | 100 +++++++++++++++++++++++++++++++++ spacy/cli/train_from_config.py | 68 ++++++---------------- spacy/cli/validate.py | 2 + spacy/schemas.py | 72 +++++++++++++++++++++--- 17 files changed, 327 insertions(+), 170 deletions(-) create mode 100644 spacy/cli/_app.py create mode 100644 spacy/cli/project.py diff --git a/spacy/__main__.py b/spacy/__main__.py index beed3170d..f3b3a66f6 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -1,31 +1,4 @@ -if __name__ == "__main__": - import plac - import sys - from wasabi import msg - from spacy.cli import download, link, info, package, pretrain, convert - from spacy.cli import init_model, profile, evaluate, validate, debug_data - from spacy.cli import train_cli +from spacy.cli import app - commands = { - "download": download, - "link": link, - 
"info": info, - "train": train_cli, - "pretrain": pretrain, - "debug-data": debug_data, - "evaluate": evaluate, - "convert": convert, - "package": package, - "init-model": init_model, - "profile": profile, - "validate": validate, - } - if len(sys.argv) == 1: - msg.info("Available commands", ", ".join(commands), exits=1) - command = sys.argv.pop(1) - sys.argv[0] = f"spacy {command}" - if command in commands: - plac.call(commands[command], sys.argv[1:]) - else: - available = f"Available: {', '.join(commands)}" - msg.fail(f"Unknown command: {command}", available, exits=1) +if __name__ == "__main__": + app() diff --git a/spacy/about.py b/spacy/about.py index 04a660ad1..54753b5a1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -5,3 +5,4 @@ __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json" +__projects__ = "https://github.com/explosion/spacy-boilerplates" diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 2ffbe2d0c..59d099b34 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,5 +1,4 @@ -from wasabi import msg - +from ._app import app # noqa: F401 from .download import download # noqa: F401 from .info import info # noqa: F401 from .package import package # noqa: F401 @@ -11,10 +10,4 @@ from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 - - -def link(*args, **kwargs): - msg.warn( - "As of spaCy v3.0, model symlinks are deprecated. You can load models " - "using their full names or from a directory path." - ) +from .project import project_cli # noqa: F401 diff --git a/spacy/cli/_app.py b/spacy/cli/_app.py new file mode 100644 index 000000000..ccc50ff63 --- /dev/null +++ b/spacy/cli/_app.py @@ -0,0 +1,31 @@ +import typer +from wasabi import msg + + +def Arg(*args, help=None, **kwargs): + # Filter out help for now until it's officially supported + return typer.Argument(*args, **kwargs) + + +def Opt(*args, **kwargs): + return typer.Option(*args, show_default=True, **kwargs) + + +app = typer.Typer( + name="spacy", + help="""spaCy Command-line Interface + + +DOCS: https://spacy.io/api/cli +""", +) + + +@app.command("link", no_args_is_help=True, deprecated=True, hidden=True) +def link(*args, **kwargs): + """As of spaCy v3.0, model symlinks are deprecated. You can load models + using their full names or from a directory path.""" + msg.warn( + "As of spaCy v3.0, model symlinks are deprecated. You can load models " + "using their full names or from a directory path." 
+ ) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 2ffbeb458..95386e2b0 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -1,8 +1,11 @@ +from typing import Optional +from enum import Enum from pathlib import Path from wasabi import Printer import srsly import re +from ._app import app, Arg, Opt from .converters import conllu2json, iob2json, conll_ner2json from .converters import ner_jsonl2json @@ -21,23 +24,29 @@ CONVERTERS = { } # File types -FILE_TYPES = ("json", "jsonl", "msg") FILE_TYPES_STDOUT = ("json", "jsonl") +class FileTypes(str, Enum): + json = "json" + jsonl = "jsonl" + msg = "msg" + + +@app.command("convert") def convert( # fmt: off - input_file: ("Input file", "positional", None, str), - output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-", - file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json", - n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1, - seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False, - model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None, - morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False, - merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False, - converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto", - ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, - lang: ("Language (if tokenizer required)", "option", "l", str) = None, + input_file: str = Arg(..., help="Input file"), + output_dir: str = Arg("-", help="Output directory. '-' for stdout."), + file_type: FileTypes = Opt(FileTypes.json.value, "--file-type", "-t", help="Type of data to produce"), + n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"), + seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"), + model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"), + morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), + merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), + converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), + ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)"), + lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), # fmt: on ): """ @@ -46,6 +55,9 @@ def convert( is written to stdout, so you can pipe them forward to a JSON file: $ spacy convert some_file.conllu > some_file.json """ + if isinstance(file_type, FileTypes): + # We get an instance of the FileTypes from the CLI so we need its string value + file_type = file_type.value no_print = output_dir == "-" msg = Printer(no_print=no_print) input_path = Path(input_file) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 21f49956d..66a94845d 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -1,9 +1,11 @@ +from typing import Optional from pathlib import Path from collections import Counter import sys import srsly from wasabi import Printer, MESSAGES +from ._app import app, Arg, Opt from ..gold import GoldCorpus from ..syntax import nonproj from ..util import load_model, get_lang_class @@ -18,17 +20,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100 BLANK_MODEL_THRESHOLD 
= 2000 +@app.command("debug-data") def debug_data( # fmt: off - lang: ("Model language", "positional", None, str), - train_path: ("Location of JSON-formatted training data", "positional", None, Path), - dev_path: ("Location of JSON-formatted development data", "positional", None, Path), - tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, - base_model: ("Name of model to update (optional)", "option", "b", str) = None, - pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner", - ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False, - verbose: ("Print additional information and explanations", "flag", "V", bool) = False, - no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False, + lang: str = Arg(..., help="Model language"), + train_path: Path = Arg(..., help="Location of JSON-formatted training data"), + dev_path: Path = Arg(..., help="Location of JSON-formatted development data"), + tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"), + base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"), + pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"), + ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), + verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"), + no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"), # fmt: on ): """ diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 3d56822a5..0f8edc28f 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,17 +1,25 @@ +from typing import List import requests import os import subprocess import sys from wasabi import msg +from ._app import app, Arg, Opt from .. import about from ..util import is_package, get_base_version +@app.command( + "download", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) def download( - model: ("Model to download (shortcut or name)", "positional", None, str), - direct: ("Force direct download of name + version", "flag", "d", bool) = False, - *pip_args: ("Additional arguments to be passed to `pip install` on model install"), + # fmt: off + model: str = Arg(..., help="Model to download (shortcut or name)"), + direct: bool = Opt(False, "--direct", "-d", help="Force direct download of name + version"), + pip_args: List[str] = Arg(..., help="Additional arguments to be passed to `pip install` on model install"), + # fmt: on ): """ Download compatible model from default download path using pip. If --direct diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index bae252b1c..263e98b1b 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,20 +1,23 @@ +from typing import Optional from timeit import default_timer as timer from wasabi import msg +from ._app import app, Arg, Opt from ..gold import GoldCorpus from .. import util from .. 
import displacy +@app.command("evaluate") def evaluate( # fmt: off - model: ("Model name or path", "positional", None, str), - data_path: ("Location of JSON-formatted evaluation data", "positional", None, str), - gpu_id: ("Use GPU", "option", "g", int) = -1, - gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False, - displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None, - displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25, - return_scores: ("Return dict containing model scores", "flag", "R", bool) = False, + model: str = Arg(..., help="Model name or path"), + data_path: str = Arg(..., help="Location of JSON-formatted evaluation data"), + gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"), + gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), + displacy_path: Optional[str] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML"), + displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), + return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"), # fmt: on ): """ diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 98fd5cabf..8ed74d545 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,17 +1,22 @@ +from typing import Optional import platform from pathlib import Path from wasabi import msg import srsly +from ._app import app, Arg, Opt from .validate import get_model_pkgs from .. import util from .. import about +@app.command("info") def info( - model: ("Optional model name", "positional", None, str) = None, - markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False, - silent: ("Don't print anything (just return)", "flag", "s") = False, + # fmt: off + model: Optional[str] = Arg(None, help="Optional model name"), + markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"), + silent: bool = Opt(False, "--silent", "-s", help="Don't print anything (just return)"), + # fmt: on ): """ Print info about spaCy installation. If a model is speficied as an argument, diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 700fa43de..e0fadd865 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -1,3 +1,4 @@ +from typing import Optional import math from tqdm import tqdm import numpy @@ -11,6 +12,7 @@ import srsly import warnings from wasabi import msg +from ._app import app, Arg, Opt from ..vectors import Vectors from ..errors import Errors, Warnings from ..util import ensure_path, get_lang_class, load_model, OOV_RANK @@ -25,20 +27,21 @@ except ImportError: DEFAULT_OOV_PROB = -20 +@app.command("init-model") def init_model( # fmt: off - lang: ("Model language", "positional", None, str), - output_dir: ("Model output directory", "positional", None, Path), - freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None, - clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None, - jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None, - vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None, - prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1, - truncate_vectors: ("Optional number of vectors to truncate to when reading in vectors file", "option", "t", int) = 0, - vectors_name: ("Optional name for the word vectors, e.g. 
en_core_web_lg.vectors", "option", "vn", str) = None, - model_name: ("Optional name for the model meta", "option", "mn", str) = None, - omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False, - base_model: ("Base model (for languages with custom tokenizers)", "option", "b", str) = None + lang: str = Arg(..., help="Model language"), + output_dir: Path = Arg(..., help="Model output directory"), + freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file"), + clusters_loc: Optional[str] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data"), + jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file"), + vectors_loc: Optional[str] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format"), + prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"), + truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), + vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), + model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"), + omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"), + base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)") # fmt: on ): """ diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 153e61ba3..d304be086 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -1,19 +1,22 @@ +from typing import Optional import shutil from pathlib import Path from wasabi import msg, get_raw_input import srsly +from ._app import app, Arg, Opt from .. import util from .. import about +@app.command("package") def package( # fmt: off - input_dir: ("Directory with model data", "positional", None, str), - output_dir: ("Output parent directory", "positional", None, str), - meta_path: ("Path to meta.json", "option", "m", str) = None, - create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False, - force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False, + input_dir: str = Arg(..., help="Directory with model data"), + output_dir: str = Arg(..., help="Output parent directory"), + meta_path: Optional[str] = Opt(None, "--meta-path", "-m", help="Path to meta.json"), + create_meta: bool = Opt(False, "--create-meta", "-c", help="Create meta.json, even if one exists"), + force: bool = Opt(False, "--force", "-f", help="Force overwriting existing model in output directory"), # fmt: on ): """ diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 4f4029834..53afd750f 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -1,14 +1,15 @@ +from typing import Optional import random import numpy import time import re from collections import Counter -import plac from pathlib import Path from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory from wasabi import msg import srsly +from ._app import app, Arg, Opt from ..errors import Errors from ..ml.models.multi_task import build_masked_language_model from ..tokens import Doc @@ -17,25 +18,17 @@ from .. 
import util from ..gold import Example -@plac.annotations( - # fmt: off - texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), - vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str), - output_dir=("Directory to write models to on each epoch", "positional", None, Path), - config_path=("Path to config file", "positional", None, Path), - use_gpu=("Use GPU", "option", "g", int), - resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path), - epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int), - # fmt: on -) +@app.command("pretrain") def pretrain( - texts_loc, - vectors_model, - config_path, - output_dir, - use_gpu=-1, - resume_path=None, - epoch_resume=None, + # fmt: off + texts_loc: str =Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'"), + vectors_model: str = Arg(..., help="Name or path to spaCy model with vectors to learn from"), + output_dir: Path = Arg(..., help="Directory to write models to on each epoch"), + config_path: Path = Arg(..., help="Path to config file"), + use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"), + resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), + epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."), + # fmt: on ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 5b7a02212..fe3a4a2be 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -1,3 +1,4 @@ +from typing import Optional import tqdm from pathlib import Path import srsly @@ -8,14 +9,16 @@ import itertools import ml_datasets from wasabi import msg +from ._app import app, Arg, Opt from ..util import load_model +@app.command("profile") def profile( # fmt: off - model: ("Model to load", "positional", None, str), - inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None, - n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000, + model: str = Arg(..., help="Model to load"), + inputs: Optional[str] = Arg(None, help="Location of input file. '-' for stdin."), + n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"), # fmt: on ): """ diff --git a/spacy/cli/project.py b/spacy/cli/project.py new file mode 100644 index 000000000..ce60c0a21 --- /dev/null +++ b/spacy/cli/project.py @@ -0,0 +1,100 @@ +from typing import List, Dict +import typer +import srsly +from pathlib import Path +import os +import subprocess +import sys +from wasabi import msg +import shlex + +from ._app import app, Arg, Opt +from .. 
import about +from ..schemas import ProjectConfigSchema, validate + +CONFIG_FILE = "project.yml" +SUBDIRS = [ + "assets", + "configs", + "packages", + "metrics", + "scripts", + "notebooks", + "training", +] + + +project_cli = typer.Typer(help="Command-line interface for spaCy projects") + + +def load_project_config(path): + config_path = path / CONFIG_FILE + if not config_path.exists(): + msg.fail("Can't find project config", config_path, exits=1) + config = srsly.read_yaml(config_path) + errors = validate(ProjectConfigSchema, config) + if errors: + msg.fail(f"Invalid project config in {CONFIG_FILE}", "\n".join(errors), exits=1) + return config + + +def create_dirs(project_dir: Path): + for subdir in SUBDIRS: + (project_dir / subdir).mkdir(parents=True) + + +def run_cmd(command: str): + status = subprocess.call(shlex.split(command), env=os.environ.copy()) + if status != 0: + sys.exit(status) + + +def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}): + for command in commands: + # Substitute variables, e.g. "./{NAME}.json" + command = command.format(**variables) + msg.info(command) + run_cmd(command) + + +@project_cli.command("clone") +def project_clone( + # fmt: off + name: str = Arg(..., help="The name of the template to fetch"), + dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=True, file_okay=False), + repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), + # fmt: on +): + """Clone a project template from a repository.""" + print("Cloning", repo) + + +@project_cli.command("run") +def project_run( + # fmt: off + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + subcommand: str = Arg(None, help="Name of command defined in project config") + # fmt: on +): + """Run scripts defined in the project.""" + config = load_project_config(project_dir) + config_commands = config.get("commands", []) + variables = config.get("variables", {}) + commands = {cmd["name"]: cmd for cmd in config_commands} + if subcommand is None: + all_commands = config.get("run", []) + if not all_commands: + msg.warn("No run commands defined in project config", exits=0) + msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) + for command in all_commands: + if command not in commands: + msg.fail(f"Can't find command '{command}' in project config", exits=1) + msg.divider(command) + run_commands(commands[command]["script"], variables) + return + if subcommand not in commands: + msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) + run_commands(commands[subcommand]["script"], variables) + + +app.add_typer(project_cli, name="project") diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 14e6d5b56..983433c0c 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -1,16 +1,15 @@ -from typing import Optional, Dict, List, Union, Sequence +from typing import Optional from timeit import default_timer as timer - import srsly -from pydantic import BaseModel, FilePath import tqdm from pathlib import Path from wasabi import msg import thinc import thinc.schedules -from thinc.api import Model, use_pytorch_for_gpu_memory +from thinc.api import use_pytorch_for_gpu_memory import random +from ._app import app, Arg, Opt from ..gold import GoldCorpus from ..lookups import Lookups from .. 
import util @@ -19,6 +18,9 @@ from ..errors import Errors # Don't remove - required to load the built-in architectures from ..ml import models # noqa: F401 +# from ..schemas import ConfigSchema # TODO: include? + + registry = util.registry CONFIG_STR = """ @@ -80,54 +82,20 @@ subword_features = true """ -class PipelineComponent(BaseModel): - factory: str - model: Model - - class Config: - arbitrary_types_allowed = True - - -class ConfigSchema(BaseModel): - optimizer: Optional["Optimizer"] - - class training(BaseModel): - patience: int = 10 - eval_frequency: int = 100 - dropout: float = 0.2 - init_tok2vec: Optional[FilePath] = None - max_epochs: int = 100 - orth_variant_level: float = 0.0 - gold_preproc: bool = False - max_length: int = 0 - use_gpu: int = 0 - scores: List[str] = ["ents_p", "ents_r", "ents_f"] - score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0} - limit: int = 0 - batch_size: Union[Sequence[int], int] - - class nlp(BaseModel): - lang: str - vectors: Optional[str] - pipeline: Optional[Dict[str, PipelineComponent]] - - class Config: - extra = "allow" - - +@app.command("train") def train_cli( # fmt: off - train_path: ("Location of JSON-formatted training data", "positional", None, Path), - dev_path: ("Location of JSON-formatted development data", "positional", None, Path), - config_path: ("Path to config file", "positional", None, Path), - output_path: ("Output directory to store model in", "option", "o", Path) = None, - code_path: ("Path to Python file with additional code (registered functions) to be imported", "option", "c", Path) = None, - init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, - raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, - verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False, - use_gpu: ("Use GPU", "option", "g", int) = -1, - tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, - omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False, + train_path: Path = Arg(..., help="Location of JSON-formatted training data"), + dev_path: Path = Arg(..., help="Location of JSON-formatted development data"), + config_path: Path = Arg(..., help="Path to config file"), + output_path: Optional[Path] = Opt(None, "--output-path", "-o", help="Output directory to store model in"), + code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. 
Experimental."), + raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."), + verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"), + use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"), + tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"), + omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"), # fmt: on ): """ diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 080cd77e2..7f4129d4f 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -3,11 +3,13 @@ import sys import requests from wasabi import msg +from ._app import app from .. import about from ..util import get_package_version, get_installed_models, get_base_version from ..util import get_package_path, get_model_meta, is_compatible_version +@app.command("validate") def validate(): """ Validate that the currently installed version of spaCy is compatible diff --git a/spacy/schemas.py b/spacy/schemas.py index 3024326dd..a20bbf6ed 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,8 +1,9 @@ -from typing import Dict, List, Union, Optional +from typing import Dict, List, Union, Optional, Sequence from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator -from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool +from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath from collections import defaultdict +from thinc.api import Model from .attrs import NAMES @@ -169,18 +170,42 @@ class ModelMetaSchema(BaseModel): # fmt: on -# Training data object in "simple training style" +# JSON training format -class SimpleTrainingSchema(BaseModel): - # TODO: write +class PipelineComponent(BaseModel): + factory: str + model: Model class Config: - title = "Schema for training data dict in passed to nlp.update" - extra = "forbid" + arbitrary_types_allowed = True -# JSON training format +class ConfigSchema(BaseModel): + optimizer: Optional["Optimizer"] + + class training(BaseModel): + patience: int = 10 + eval_frequency: int = 100 + dropout: float = 0.2 + init_tok2vec: Optional[FilePath] = None + max_epochs: int = 100 + orth_variant_level: float = 0.0 + gold_preproc: bool = False + max_length: int = 0 + use_gpu: int = 0 + scores: List[str] = ["ents_p", "ents_r", "ents_f"] + score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0} + limit: int = 0 + batch_size: Union[Sequence[int], int] + + class nlp(BaseModel): + lang: str + vectors: Optional[str] + pipeline: Optional[Dict[str, PipelineComponent]] + + class Config: + extra = "allow" class TrainingSchema(BaseModel): @@ -189,3 +214,34 @@ class TrainingSchema(BaseModel): class Config: title = "Schema for training data in spaCy's JSON format" extra = "forbid" + + +# Project config Schema + + +class ProjectConfigAsset(BaseModel): + dest: StrictStr = Field(..., title="Destination of downloaded asset") + url: StrictStr = Field(..., title="URL of asset") + + +class ProjectConfigCommand(BaseModel): + # fmt: off + name: StrictStr = Field(..., title="Name of command") + help: Optional[StrictStr] = Field(None, title="Command description") + script: List[StrictStr] = Field([], title="List of CLI commands to run, in order") + dvc_deps: List[StrictStr] = Field([], title="Data Version Control dependencies") + dvc_outputs: List[StrictStr] = Field([], title="Data Version Control outputs") + 
dvc_outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)") + # fmt: on + + +class ProjectConfigSchema(BaseModel): + # fmt: off + variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands") + assets: List[ProjectConfigAsset] = Field([], title="Data assets") + run: List[StrictStr] = Field([], title="Names of project commands to execute, in order") + commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") + # fmt: on + + class Config: + title = "Schema for project configuration file" From 275bab62df5b9914b29bcb93ce5732966a8c6c82 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 21 Jun 2020 21:35:01 +0200 Subject: [PATCH 121/203] Refactor CLI --- spacy/__main__.py | 5 +- spacy/cli/convert.py | 50 +++++++++++++--- spacy/cli/debug_data.py | 56 ++++++++++++++---- spacy/cli/download.py | 45 +++++++------- spacy/cli/evaluate.py | 47 ++++++++++++--- spacy/cli/info.py | 105 ++++++++++++++++++++------------- spacy/cli/init_model.py | 85 ++++++++++++++++++++------ spacy/cli/package.py | 75 +++++++++++++++-------- spacy/cli/pretrain.py | 26 +++++++- spacy/cli/profile.py | 17 ++++-- spacy/cli/project.py | 79 +++++++++++-------------- spacy/cli/train_from_config.py | 24 ++++---- spacy/cli/validate.py | 14 +++-- spacy/schemas.py | 4 +- spacy/util.py | 28 ++++++++- 15 files changed, 451 insertions(+), 209 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index f3b3a66f6..6015894b6 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -1,4 +1,7 @@ from spacy.cli import app +from typer.main import get_command if __name__ == "__main__": - app() + command = get_command(app) + # Ensure that the help messages always display the correct prompt + command(prog_name="python -m spacy") diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 95386e2b0..24d266504 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -34,10 +34,10 @@ class FileTypes(str, Enum): @app.command("convert") -def convert( +def convert_cli( # fmt: off - input_file: str = Arg(..., help="Input file"), - output_dir: str = Arg("-", help="Output directory. '-' for stdout."), + input_file: str = Arg(..., help="Input file", exists=True), + output_dir: Path = Arg("-", help="Output directory. 
'-' for stdout.", allow_dash=True, exists=True), file_type: FileTypes = Opt(FileTypes.json.value, "--file-type", "-t", help="Type of data to produce"), n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"), seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"), @@ -45,7 +45,7 @@ def convert( morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), - ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)"), + ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), # fmt: on ): @@ -58,8 +58,39 @@ def convert( if isinstance(file_type, FileTypes): # We get an instance of the FileTypes from the CLI so we need its string value file_type = file_type.value - no_print = output_dir == "-" - msg = Printer(no_print=no_print) + silent = output_dir == "-" + convert( + input_file, + output_dir, + file_type=file_type, + n_sents=n_sents, + seg_sents=seg_sents, + model=model, + morphology=morphology, + merge_subtokens=merge_subtokens, + converter=converter, + ner_map_path=ner_map_path, + lang=lang, + silent=silent, + ) + + +def convert( + input_file: Path, + output_dir: Path, + *, + file_type: str = "json", + n_sents: int = 1, + seg_sents: bool = False, + model: Optional[str] = None, + morphology: bool = False, + merge_subtokens: bool = False, + converter: str = "auto", + ner_map_path: Optional[Path] = None, + lang: Optional[str] = None, + silent: bool = True, +) -> None: + msg = Printer(no_print=silent, pretty=not silent) input_path = Path(input_file) if file_type not in FILE_TYPES_STDOUT and output_dir == "-": # TODO: support msgpack via stdout in srsly? @@ -85,7 +116,8 @@ def convert( converter = converter_autodetect else: msg.warn( - "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert" + "Can't automatically detect NER format. Conversion may not " + "succeed. 
See https://spacy.io/api/cli#convert" ) if converter not in CONVERTERS: msg.fail(f"Can't find converter for {converter}", exits=1) @@ -102,7 +134,7 @@ def convert( merge_subtokens=merge_subtokens, lang=lang, model=model, - no_print=no_print, + no_print=silent, ner_map=ner_map, ) if output_dir != "-": @@ -124,7 +156,7 @@ def convert( srsly.write_jsonl("-", data) -def autodetect_ner_format(input_data): +def autodetect_ner_format(input_data: str) -> str: # guess format from the first 20 lines lines = input_data.split("\n")[:20] format_guesses = {"ner": 0, "iob": 0} diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 66a94845d..2cc3020e6 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, List, Sequence, Dict, Any, Tuple from pathlib import Path from collections import Counter import sys @@ -6,8 +6,9 @@ import srsly from wasabi import Printer, MESSAGES from ._app import app, Arg, Opt -from ..gold import GoldCorpus +from ..gold import GoldCorpus, Example from ..syntax import nonproj +from ..language import Language from ..util import load_model, get_lang_class @@ -21,12 +22,12 @@ BLANK_MODEL_THRESHOLD = 2000 @app.command("debug-data") -def debug_data( +def debug_data_cli( # fmt: off lang: str = Arg(..., help="Model language"), - train_path: Path = Arg(..., help="Location of JSON-formatted training data"), - dev_path: Path = Arg(..., help="Location of JSON-formatted development data"), - tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"), + train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True), + dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True), + tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map", exists=True, dir_okay=False), base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"), pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"), ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), @@ -39,8 +40,36 @@ def debug_data( stats, and find problems like invalid entity annotations, cyclic dependencies, low data labels and more. 
""" - msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings) + debug_data( + lang, + train_path, + dev_path, + tag_map_path=tag_map_path, + base_model=base_model, + pipeline=[p.strip() for p in pipeline.split(",")], + ignore_warnings=ignore_warnings, + verbose=verbose, + no_format=no_format, + silent=False, + ) + +def debug_data( + lang: str, + train_path: Path, + dev_path: Path, + *, + tag_map_path: Optional[Path] = None, + base_model: Optional[str] = None, + pipeline: List[str] = ["tagger", "parser", "ner"], + ignore_warnings: bool = False, + verbose: bool = False, + no_format: bool = True, + silent: bool = True, +): + msg = Printer( + no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings + ) # Make sure all files and paths exists if they are needed if not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) @@ -52,7 +81,6 @@ def debug_data( tag_map = srsly.read_json(tag_map_path) # Initialize the model and pipeline - pipeline = [p.strip() for p in pipeline.split(",")] if base_model: nlp = load_model(base_model) else: @@ -449,7 +477,7 @@ def debug_data( sys.exit(1) -def _load_file(file_path, msg): +def _load_file(file_path: Path, msg: Printer) -> None: file_name = file_path.parts[-1] if file_path.suffix == ".json": with msg.loading(f"Loading {file_name}..."): @@ -468,7 +496,9 @@ def _load_file(file_path, msg): ) -def _compile_gold(examples, pipeline, nlp): +def _compile_gold( + examples: Sequence[Example], pipeline: List[str], nlp: Language +) -> Dict[str, Any]: data = { "ner": Counter(), "cats": Counter(), @@ -540,13 +570,13 @@ def _compile_gold(examples, pipeline, nlp): return data -def _format_labels(labels, counts=False): +def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str: if counts: return ", ".join([f"'{l}' ({c})" for l, c in labels]) return ", ".join([f"'{l}'" for l in labels]) -def _get_examples_without_label(data, label): +def _get_examples_without_label(data: Sequence[Example], label: str) -> int: count = 0 for ex in data: labels = [ @@ -559,7 +589,7 @@ def _get_examples_without_label(data, label): return count -def _get_labels_from_model(nlp, pipe_name): +def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]: if pipe_name not in nlp.pipe_names: return set() pipe = nlp.get_pipe(pipe_name) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 0f8edc28f..920250a61 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,31 +1,36 @@ -from typing import List +from typing import Optional, Sequence, Union import requests -import os -import subprocess import sys from wasabi import msg +import typer from ._app import app, Arg, Opt from .. import about -from ..util import is_package, get_base_version +from ..util import is_package, get_base_version, run_command @app.command( "download", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, ) -def download( +def download_cli( # fmt: off + ctx: typer.Context, model: str = Arg(..., help="Model to download (shortcut or name)"), direct: bool = Opt(False, "--direct", "-d", help="Force direct download of name + version"), - pip_args: List[str] = Arg(..., help="Additional arguments to be passed to `pip install` on model install"), # fmt: on ): """ Download compatible model from default download path using pip. If --direct flag is set, the command expects the full model name with version. - For direct downloads, the compatibility check will be skipped. 
+ For direct downloads, the compatibility check will be skipped. All + additional arguments provided to this command will be passed to `pip install` + on model installation. """ + download(model, direct, *ctx.args) + + +def download(model: str, direct: bool = False, *pip_args) -> None: if not is_package("spacy") and "--no-deps" not in pip_args: msg.warn( "Skipping model package dependencies and setting `--no-deps`. " @@ -41,22 +46,20 @@ def download( components = model.split("-") model_name = "".join(components[:-1]) version = components[-1] - dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args) + download_model(dl_tpl.format(m=model_name, v=version), pip_args) else: shortcuts = get_json(about.__shortcuts__, "available shortcuts") model_name = shortcuts.get(model, model) compatibility = get_compatibility() version = get_version(model_name, compatibility) - dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args) - if dl != 0: # if download subprocess doesn't return 0, exit - sys.exit(dl) - msg.good( - "Download and installation successful", - f"You can now load the model via spacy.load('{model_name}')", - ) + download_model(dl_tpl.format(m=model_name, v=version), pip_args) + msg.good( + "Download and installation successful", + f"You can now load the model via spacy.load('{model_name}')", + ) -def get_json(url, desc): +def get_json(url: str, desc: str) -> Union[dict, list]: r = requests.get(url) if r.status_code != 200: msg.fail( @@ -70,7 +73,7 @@ def get_json(url, desc): return r.json() -def get_compatibility(): +def get_compatibility() -> dict: version = get_base_version(about.__version__) comp_table = get_json(about.__compatibility__, "compatibility table") comp = comp_table["spacy"] @@ -79,7 +82,7 @@ def get_compatibility(): return comp[version] -def get_version(model, comp): +def get_version(model: str, comp: dict) -> str: model = get_base_version(model) if model not in comp: msg.fail( @@ -89,10 +92,12 @@ def get_version(model, comp): return comp[model][0] -def download_model(filename, user_pip_args=None): +def download_model( + filename: str, user_pip_args: Optional[Sequence[str]] = None +) -> None: download_url = about.__download_url__ + "/" + filename pip_args = ["--no-cache-dir"] if user_pip_args: pip_args.extend(user_pip_args) cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] - return subprocess.call(cmd, env=os.environ.copy()) + run_command(cmd) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 263e98b1b..8d0f67316 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,29 +1,52 @@ -from typing import Optional +from typing import Optional, List from timeit import default_timer as timer -from wasabi import msg +from wasabi import Printer +from pathlib import Path from ._app import app, Arg, Opt +from ..tokens import Doc +from ..scorer import Scorer from ..gold import GoldCorpus from .. import util from .. 
import displacy @app.command("evaluate") -def evaluate( +def evaluate_cli( # fmt: off model: str = Arg(..., help="Model name or path"), - data_path: str = Arg(..., help="Location of JSON-formatted evaluation data"), + data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True), gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), - displacy_path: Optional[str] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML"), + displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), - return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"), # fmt: on ): """ Evaluate a model. To render a sample of parses in a HTML file, set an output directory as the displacy_path argument. """ + evaluate( + model, + data_path, + gpu_id=gpu_id, + gold_preproc=gold_preproc, + displacy_path=displacy_path, + displacy_limit=displacy_limit, + silent=False, + ) + + +def evaluate( + model: str, + data_path: Path, + gpu_id: int = -1, + gold_preproc: bool = False, + displacy_path: Optional[Path] = None, + displacy_limit: int = 25, + silent: bool = True, +) -> Scorer: + msg = Printer(no_print=silent, pretty=not silent) util.fix_random_seed() if gpu_id >= 0: util.use_gpu(gpu_id) @@ -78,11 +101,17 @@ def evaluate( ents=render_ents, ) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) - if return_scores: - return scorer.scores + return scorer.scores -def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True): +def render_parses( + docs: List[Doc], + output_path: Path, + model_name: str = "", + limit: int = 250, + deps: bool = True, + ents: bool = True, +): docs[0].user_data["title"] = model_name if ents: html = displacy.render(docs[:limit], style="ent", page=True) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 8ed74d545..e6156ee6d 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,7 +1,7 @@ -from typing import Optional +from typing import Optional, Dict, Any, Union import platform from pathlib import Path -from wasabi import msg +from wasabi import Printer import srsly from ._app import app, Arg, Opt @@ -11,7 +11,7 @@ from .. import about @app.command("info") -def info( +def info_cli( # fmt: off model: Optional[str] = Arg(None, help="Optional model name"), markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"), @@ -23,60 +23,83 @@ def info( print model information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. 
""" + info(model, markdown=markdown, silent=silent) + + +def info( + model: Optional[str], *, markdown: bool = False, silent: bool = True +) -> Union[str, dict]: + msg = Printer(no_print=silent, pretty=not silent) if model: - if util.is_package(model): - model_path = util.get_package_path(model) - else: - model_path = model - meta_path = model_path / "meta.json" - if not meta_path.is_file(): - msg.fail("Can't find model meta.json", meta_path, exits=1) - meta = srsly.read_json(meta_path) - if model_path.resolve() != model_path: - meta["link"] = str(model_path) - meta["source"] = str(model_path.resolve()) - else: - meta["source"] = str(model_path) + title = f"Info about model '{model}'" + data = info_model(model, silent=silent) + else: + title = "Info about spaCy" + data = info_spacy(silent=silent) + markdown_data = get_markdown(data, title=title) + if markdown: if not silent: - title = f"Info about model '{model}'" - model_meta = { - k: v for k, v in meta.items() if k not in ("accuracy", "speed") - } - if markdown: - print_markdown(model_meta, title=title) - else: - msg.table(model_meta, title=title) - return meta - all_models, _ = get_model_pkgs() - data = { + print(markdown_data) + return markdown_data + if not silent: + msg.table(data, title=title) + return data + + +def info_spacy(*, silent: bool = True) -> Dict[str, any]: + """Generate info about the current spaCy intallation. + + silent (bool): Don't print anything, just return. + RETURNS (dict): The spaCy info. + """ + all_models, _ = get_model_pkgs(silent=silent) + models = ", ".join(f"{m['name']} ({m['version']})" for m in all_models.values()) + return { "spaCy version": about.__version__, "Location": str(Path(__file__).parent.parent), "Platform": platform.platform(), "Python version": platform.python_version(), - "Models": ", ".join( - f"{m['name']} ({m['version']})" for m in all_models.values() - ), + "Models": models, } - if not silent: - title = "Info about spaCy" - if markdown: - print_markdown(data, title=title) - else: - msg.table(data, title=title) - return data -def print_markdown(data, title=None): - """Print data in GitHub-flavoured Markdown format for issues etc. +def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]: + """Generate info about a specific model. + + model (str): Model name of path. + silent (bool): Don't print anything, just return. + RETURNS (dict): The model meta. + """ + msg = Printer(no_print=silent, pretty=not silent) + if util.is_package(model): + model_path = util.get_package_path(model) + else: + model_path = model + meta_path = model_path / "meta.json" + if not meta_path.is_file(): + msg.fail("Can't find model meta.json", meta_path, exits=1) + meta = srsly.read_json(meta_path) + if model_path.resolve() != model_path: + meta["link"] = str(model_path) + meta["source"] = str(model_path.resolve()) + else: + meta["source"] = str(model_path) + return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")} + + +def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str: + """Get data in GitHub-flavoured Markdown format for issues etc. data (dict or list of tuples): Label/value pairs. title (str / None): Title, will be rendered as headline 2. + RETURNS (str): The Markdown string. 
""" markdown = [] for key, value in data.items(): if isinstance(value, str) and Path(value).exists(): continue markdown.append(f"* **{key}:** {value}") + result = "\n{}\n".format("\n".join(markdown)) if title: - print(f"\n## {title}") - print("\n{}\n".format("\n".join(markdown))) + result = f"\n## {title}\n{result}" + return result diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index e0fadd865..37f862ef2 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, List, Dict, Any, Union, IO import math from tqdm import tqdm import numpy @@ -10,11 +10,12 @@ import gzip import zipfile import srsly import warnings -from wasabi import msg +from wasabi import Printer from ._app import app, Arg, Opt from ..vectors import Vectors from ..errors import Errors, Warnings +from ..language import Language from ..util import ensure_path, get_lang_class, load_model, OOV_RANK from ..lookups import Lookups @@ -28,14 +29,14 @@ DEFAULT_OOV_PROB = -20 @app.command("init-model") -def init_model( +def init_model_cli( # fmt: off lang: str = Arg(..., help="Model language"), output_dir: Path = Arg(..., help="Model output directory"), - freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file"), - clusters_loc: Optional[str] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data"), - jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file"), - vectors_loc: Optional[str] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format"), + freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True), + clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True), + jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True), + vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True), prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"), truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), @@ -49,6 +50,38 @@ def init_model( and word vectors. If vectors are provided in Word2Vec format, they can be either a .txt or zipped as a .zip or .tar.gz. 
""" + init_model( + lang, + output_dir, + freqs_loc=freqs_loc, + clusters_loc=clusters_loc, + jsonl_loc=jsonl_loc, + prune_vectors=prune_vectors, + truncate_vectors=truncate_vectors, + vectors_name=vectors_name, + model_name=model_name, + omit_extra_lookups=omit_extra_lookups, + base_model=base_model, + silent=False, + ) + + +def init_model( + lang: str, + output_dir: Path, + freqs_loc: Optional[Path] = None, + clusters_loc: Optional[Path] = None, + jsonl_loc: Optional[Path] = None, + vectors_loc: Optional[Path] = None, + prune_vectors: int = -1, + truncate_vectors: int = 0, + vectors_name: Optional[str] = None, + model_name: Optional[str] = None, + omit_extra_lookups: bool = False, + base_model: Optional[str] = None, + silent: bool = True, +) -> Language: + msg = Printer(no_print=silent, pretty=not silent) if jsonl_loc is not None: if freqs_loc is not None or clusters_loc is not None: settings = ["-j"] @@ -71,7 +104,7 @@ def init_model( freqs_loc = ensure_path(freqs_loc) if freqs_loc is not None and not freqs_loc.exists(): msg.fail("Can't find words frequencies file", freqs_loc, exits=1) - lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc) + lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc) with msg.loading("Creating model..."): nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) @@ -86,7 +119,9 @@ def init_model( msg.good("Successfully created model") if vectors_loc is not None: - add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name) + add_vectors( + msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name + ) vec_added = len(nlp.vocab.vectors) lex_added = len(nlp.vocab) msg.good( @@ -98,7 +133,7 @@ def init_model( return nlp -def open_file(loc): +def open_file(loc: Union[str, Path]) -> IO: """Handle .gz, .tar.gz or unzipped files""" loc = ensure_path(loc) if tarfile.is_tarfile(str(loc)): @@ -114,7 +149,9 @@ def open_file(loc): return loc.open("r", encoding="utf8") -def read_attrs_from_deprecated(freqs_loc, clusters_loc): +def read_attrs_from_deprecated( + msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path] +) -> List[Dict[str, Any]]: if freqs_loc is not None: with msg.loading("Counting frequencies..."): probs, _ = read_freqs(freqs_loc) @@ -142,7 +179,12 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc): return lex_attrs -def create_model(lang, lex_attrs, name=None, base_model=None): +def create_model( + lang: str, + lex_attrs: List[Dict[str, Any]], + name: Optional[str] = None, + base_model: Optional[Union[str, Path]] = None, +) -> Language: if base_model: nlp = load_model(base_model) # keep the tokenizer but remove any existing pipeline components due to @@ -169,7 +211,14 @@ def create_model(lang, lex_attrs, name=None, base_model=None): return nlp -def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): +def add_vectors( + msg: Printer, + nlp: Language, + vectors_loc: Optional[Path], + truncate_vectors: int, + prune_vectors: int, + name: Optional[str] = None, +) -> None: vectors_loc = ensure_path(vectors_loc) if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) @@ -179,7 +228,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): else: if vectors_loc: with msg.loading(f"Reading vectors from {vectors_loc}"): - vectors_data, vector_keys = read_vectors(vectors_loc) + vectors_data, vector_keys = read_vectors(msg, vectors_loc) msg.good(f"Loaded 
vectors from {vectors_loc}") else: vectors_data, vector_keys = (None, None) @@ -198,7 +247,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): nlp.vocab.prune_vectors(prune_vectors) -def read_vectors(vectors_loc, truncate_vectors=0): +def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0): f = open_file(vectors_loc) shape = tuple(int(size) for size in next(f).split()) if truncate_vectors >= 1: @@ -218,7 +267,9 @@ def read_vectors(vectors_loc, truncate_vectors=0): return vectors_data, vectors_keys -def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): +def read_freqs( + freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50 +): counts = PreshCounter() total = 0 with freqs_loc.open() as f: @@ -247,7 +298,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): return probs, oov_prob -def read_clusters(clusters_loc): +def read_clusters(clusters_loc: Path) -> dict: clusters = {} if ftfy is None: warnings.warn(Warnings.W004) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index d304be086..6ba9b0386 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -1,22 +1,24 @@ -from typing import Optional +from typing import Optional, Union, Any, Dict import shutil from pathlib import Path -from wasabi import msg, get_raw_input +from wasabi import Printer, get_raw_input import srsly +import sys from ._app import app, Arg, Opt +from ..schemas import validate, ModelMetaSchema from .. import util from .. import about @app.command("package") -def package( +def package_cli( # fmt: off - input_dir: str = Arg(..., help="Directory with model data"), - output_dir: str = Arg(..., help="Output parent directory"), - meta_path: Optional[str] = Opt(None, "--meta-path", "-m", help="Path to meta.json"), + input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False), + output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), + meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False), create_meta: bool = Opt(False, "--create-meta", "-c", help="Create meta.json, even if one exists"), - force: bool = Opt(False, "--force", "-f", help="Force overwriting existing model in output directory"), + force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"), # fmt: on ): """ @@ -26,6 +28,25 @@ def package( set and a meta.json already exists in the output directory, the existing values will be used as the defaults in the command-line prompt. 
""" + package( + input_dir, + output_dir, + meta_path=meta_path, + create_meta=create_meta, + force=force, + silent=False, + ) + + +def package( + input_dir: Path, + output_dir: Path, + meta_path: Optional[Path] = None, + create_meta: bool = False, + force: bool = False, + silent: bool = True, +) -> None: + msg = Printer(no_print=silent, pretty=not silent) input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) meta_path = util.ensure_path(meta_path) @@ -36,23 +57,20 @@ def package( if meta_path and not meta_path.exists(): msg.fail("Can't find model meta.json", meta_path, exits=1) - meta_path = meta_path or input_path / "meta.json" - if meta_path.is_file(): - meta = srsly.read_json(meta_path) - if not create_meta: # only print if user doesn't want to overwrite - msg.good("Loaded meta.json from file", meta_path) - else: - meta = generate_meta(input_dir, meta, msg) - for key in ("lang", "name", "version"): - if key not in meta or meta[key] == "": - msg.fail( - f"No '{key}' setting found in meta.json", - "This setting is required to build your package.", - exits=1, - ) + meta_path = meta_path or input_dir / "meta.json" + if not meta_path.exists() or not meta_path.is_file(): + msg.fail("Can't load model meta.json", meta_path, exits=1) + meta = srsly.read_json(meta_path) + if not create_meta: # only print if user doesn't want to overwrite + msg.good("Loaded meta.json from file", meta_path) + else: + meta = generate_meta(input_dir, meta, msg) + errors = validate(ModelMetaSchema, meta) + if errors: + msg.fail("Invalid model meta.json", "\n".join(errors), exits=1) model_name = meta["lang"] + "_" + meta["name"] model_name_v = model_name + "-" + meta["version"] - main_path = output_path / model_name_v + main_path = output_dir / model_name_v package_path = main_path / model_name if package_path.exists(): @@ -66,21 +84,26 @@ def package( exits=1, ) Path.mkdir(package_path, parents=True) - shutil.copytree(str(input_path), str(package_path / model_name_v)) + shutil.copytree(str(input_dir), str(package_path / model_name_v)) create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2)) create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) create_file(package_path / "__init__.py", TEMPLATE_INIT) msg.good(f"Successfully created package '{model_name_v}'", main_path) - msg.text("To build the package, run `python setup.py sdist` in this directory.") + with util.working_dir(main_path): + util.run_command([sys.executable, "setup.py", "sdist"]) + zip_file = main_path / "dist" / f"{model_name_v}.tar.gz" + msg.good(f"Successfully created zipped Python package", zip_file) -def create_file(file_path, contents): +def create_file(file_path: Path, contents: str) -> None: file_path.touch() file_path.open("w", encoding="utf-8").write(contents) -def generate_meta(model_path, existing_meta, msg): +def generate_meta( + model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer +) -> Dict[str, Any]: meta = existing_meta or {} settings = [ ("lang", "Model language", meta.get("lang", "en")), diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 53afd750f..2962e5022 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -19,12 +19,12 @@ from ..gold import Example @app.command("pretrain") -def pretrain( +def pretrain_cli( # fmt: off - texts_loc: str =Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'"), + texts_loc: Path = 
Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True), vectors_model: str = Arg(..., help="Name or path to spaCy model with vectors to learn from"), output_dir: Path = Arg(..., help="Directory to write models to on each epoch"), - config_path: Path = Arg(..., help="Path to config file"), + config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False), use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."), @@ -45,6 +45,26 @@ def pretrain( all settings are the same between pretraining and training. Ideally, this is done by using the same config file for both commands. """ + pretrain( + texts_loc, + vectors_model, + output_dir, + config_path, + use_gpu=use_gpu, + resume_path=resume_path, + epoch_resume=epoch_resume, + ) + + +def pretrain( + texts_loc: Path, + vectors_model: str, + output_dir: Path, + config_path: Path, + use_gpu: int = -1, + resume_path: Optional[Path] = None, + epoch_resume: Optional[int] = None, +): if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index fe3a4a2be..f4c893864 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Sequence, Union, Iterator import tqdm from pathlib import Path import srsly @@ -7,17 +7,18 @@ import pstats import sys import itertools import ml_datasets -from wasabi import msg +from wasabi import msg, Printer from ._app import app, Arg, Opt +from ..language import Language from ..util import load_model @app.command("profile") -def profile( +def profile_cli( # fmt: off model: str = Arg(..., help="Model to load"), - inputs: Optional[str] = Arg(None, help="Location of input file. '-' for stdin."), + inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True), n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"), # fmt: on ): @@ -27,6 +28,10 @@ def profile( It can either be provided as a JSONL file, or be read from sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. 
""" + profile(model, inputs=inputs, n_texts=n_texts) + + +def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None: if inputs is not None: inputs = _read_inputs(inputs, msg) if inputs is None: @@ -46,12 +51,12 @@ def profile( s.strip_dirs().sort_stats("time").print_stats() -def parse_texts(nlp, texts): +def parse_texts(nlp: Language, texts: Sequence[str]) -> None: for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16): pass -def _read_inputs(loc, msg): +def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]: if loc == "-": msg.info("Reading input from sys.stdin") file_ = sys.stdin diff --git a/spacy/cli/project.py b/spacy/cli/project.py index ce60c0a21..45cb163af 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -1,64 +1,25 @@ -from typing import List, Dict +from typing import List, Dict, Any import typer import srsly from pathlib import Path -import os -import subprocess -import sys from wasabi import msg import shlex from ._app import app, Arg, Opt from .. import about from ..schemas import ProjectConfigSchema, validate +from ..util import run_command + CONFIG_FILE = "project.yml" -SUBDIRS = [ - "assets", - "configs", - "packages", - "metrics", - "scripts", - "notebooks", - "training", -] +DIRS = ["assets", "configs", "packages", "metrics", "scripts", "notebooks", "training"] project_cli = typer.Typer(help="Command-line interface for spaCy projects") -def load_project_config(path): - config_path = path / CONFIG_FILE - if not config_path.exists(): - msg.fail("Can't find project config", config_path, exits=1) - config = srsly.read_yaml(config_path) - errors = validate(ProjectConfigSchema, config) - if errors: - msg.fail(f"Invalid project config in {CONFIG_FILE}", "\n".join(errors), exits=1) - return config - - -def create_dirs(project_dir: Path): - for subdir in SUBDIRS: - (project_dir / subdir).mkdir(parents=True) - - -def run_cmd(command: str): - status = subprocess.call(shlex.split(command), env=os.environ.copy()) - if status != 0: - sys.exit(status) - - -def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}): - for command in commands: - # Substitute variables, e.g. "./{NAME}.json" - command = command.format(**variables) - msg.info(command) - run_cmd(command) - - @project_cli.command("clone") -def project_clone( +def project_clone_cli( # fmt: off name: str = Arg(..., help="The name of the template to fetch"), dest: Path = Arg(Path.cwd(), help="Where to download and work. 
Defaults to current working directory.", exists=True, file_okay=False), @@ -70,13 +31,17 @@ def project_clone( @project_cli.command("run") -def project_run( +def project_run_cli( # fmt: off project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), subcommand: str = Arg(None, help="Name of command defined in project config") # fmt: on ): """Run scripts defined in the project.""" + project_run(project_dir, subcommand) + + +def project_run(project_dir: Path, subcommand: str) -> None: config = load_project_config(project_dir) config_commands = config.get("commands", []) variables = config.get("variables", {}) @@ -98,3 +63,27 @@ def project_run( app.add_typer(project_cli, name="project") + + +def load_project_config(path: Path) -> Dict[str, Any]: + config_path = path / CONFIG_FILE + if not config_path.exists(): + msg.fail("Can't find project config", config_path, exits=1) + config = srsly.read_yaml(config_path) + errors = validate(ProjectConfigSchema, config) + if errors: + msg.fail(f"Invalid project config in {CONFIG_FILE}", "\n".join(errors), exits=1) + return config + + +def create_dirs(project_dir: Path) -> None: + for subdir in DIRS: + (project_dir / subdir).mkdir(parents=True) + + +def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) -> None: + for command in commands: + # Substitute variables, e.g. "./{NAME}.json" + command = command.format(**variables) + msg.info(command) + run_command(shlex.split(command)) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 983433c0c..79c3bf259 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Dict from timeit import default_timer as timer import srsly import tqdm @@ -85,9 +85,9 @@ subword_features = true @app.command("train") def train_cli( # fmt: off - train_path: Path = Arg(..., help="Location of JSON-formatted training data"), - dev_path: Path = Arg(..., help="Location of JSON-formatted development data"), - config_path: Path = Arg(..., help="Path to config file"), + train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True), + dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True), + config_path: Path = Arg(..., help="Path to config file", exists=True), output_path: Optional[Path] = Opt(None, "--output-path", "-o", help="Output directory to store model in"), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. 
Experimental."), @@ -162,14 +162,14 @@ def train_cli( def train( - config_path, - data_paths, - raw_text=None, - output_path=None, - tag_map=None, - weights_data=None, - omit_extra_lookups=False, -): + config_path: Path, + data_paths: Dict[str, Path], + raw_text: Optional[Path] = None, + output_path: Optional[Path] = None, + tag_map: Optional[Path] = None, + weights_data: Optional[bytes] = None, + omit_extra_lookups: bool = False, +) -> None: msg.info(f"Loading config from: {config_path}") # Read the config first without creating objects, to get to the original nlp_config config = util.load_config(config_path, create_objects=False) diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 7f4129d4f..4271817f1 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -1,7 +1,8 @@ +from typing import Tuple from pathlib import Path import sys import requests -from wasabi import msg +from wasabi import msg, Printer from ._app import app from .. import about @@ -10,11 +11,15 @@ from ..util import get_package_path, get_model_meta, is_compatible_version @app.command("validate") -def validate(): +def validate_cli(): """ Validate that the currently installed version of spaCy is compatible with the installed models. Should be run after `pip install -U spacy`. """ + validate() + + +def validate() -> None: model_pkgs, compat = get_model_pkgs() spacy_version = get_base_version(about.__version__) current_compat = compat.get(spacy_version, {}) @@ -57,7 +62,8 @@ def validate(): sys.exit(1) -def get_model_pkgs(): +def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]: + msg = Printer(no_print=silent, pretty=not silent) with msg.loading("Loading compatibility table..."): r = requests.get(about.__compatibility__) if r.status_code != 200: @@ -95,7 +101,7 @@ def get_model_pkgs(): return pkgs, compat -def reformat_version(version): +def reformat_version(version: str) -> str: """Hack to reformat old versions ending on '-alpha' to match pip format.""" if version.endswith("-alpha"): return version.replace("-alpha", "a0") diff --git a/spacy/schemas.py b/spacy/schemas.py index a20bbf6ed..04f9bbffa 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Union, Optional, Sequence +from typing import Dict, List, Union, Optional, Sequence, Any from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath @@ -164,7 +164,7 @@ class ModelMetaSchema(BaseModel): email: Optional[StrictStr] = Field(None, title="Model author email") url: Optional[StrictStr] = Field(None, title="Model author URL") sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources") - vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors") + vectors: Optional[Dict[str, Any]] = Field(None, title="Included word vectors") accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers") speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers") # fmt: on diff --git a/spacy/util.py b/spacy/util.py index ad3dc3635..7f27e9467 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,10 +1,10 @@ +from typing import List, Union import os import importlib import importlib.util import re from pathlib import Path import random -from typing import List import thinc from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config import functools @@ -17,6 +17,8 @@ import 
sys import warnings from packaging.specifiers import SpecifierSet, InvalidSpecifier from packaging.version import Version, InvalidVersion +import subprocess +from contextlib import contextmanager try: @@ -427,6 +429,30 @@ def get_package_path(name): return Path(pkg.__file__).parent +def run_command(command: List[str]) -> None: + """Run a command on the command line as a subprocess. + + command (list): The split command. + """ + status = subprocess.call(command, env=os.environ.copy()) + if status != 0: + sys.exit(status) + + +@contextmanager +def working_dir(path: Union[str, Path]) -> None: + """Change current working directory and returns to previous on exit. + + path (str / Path): The directory to navigate to. + """ + prev_cwd = Path.cwd() + os.chdir(str(path)) + try: + yield + finally: + os.chdir(prev_cwd) + + def is_in_jupyter(): """Check if user is running spaCy from a Jupyter notebook by detecting the IPython kernel. Mainly used for the displaCy visualizer. From e0c16c0577b3ccd48562f9e1692213ff7a068658 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 21 Jun 2020 22:25:34 +0200 Subject: [PATCH 122/203] Update wasabi pin --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a104b68ba..0d0715e24 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ thinc==8.0.0a9 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 -wasabi>=0.4.0,<1.1.0 +wasabi>=0.7.0,<1.1.0 srsly>=2.0.0,<3.0.0 catalogue>=0.0.7,<1.1.0 # Third party dependencies diff --git a/setup.cfg b/setup.cfg index c19b8d857..5a4b044b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,7 +44,7 @@ install_requires = preshed>=3.0.2,<3.1.0 thinc==8.0.0a9 blis>=0.4.0,<0.5.0 - wasabi>=0.4.0,<1.1.0 + wasabi>=0.7.0,<1.1.0 srsly>=2.0.0,<3.0.0 catalogue>=0.0.7,<1.1.0 ml_datasets>=0.1.1 From 40bb918a4c8507f5c54a722e0388eda1da1e2b7a Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 21 Jun 2020 22:34:10 +0200 Subject: [PATCH 123/203] Remove unicode declarations and tidy up --- spacy/lang/es/punctuation.py | 3 - spacy/lang/gu/__init__.py | 3 - spacy/lang/gu/examples.py | 4 -- spacy/lang/gu/stop_words.py | 3 - spacy/lang/hy/__init__.py | 3 - spacy/lang/hy/examples.py | 3 - spacy/lang/hy/lex_attrs.py | 3 - spacy/lang/hy/stop_words.py | 3 - spacy/lang/hy/tag_map.py | 3 - spacy/lang/ja/bunsetu.py | 92 ++++++++++++++++-------- spacy/lang/ja/syntax_iterators.py | 29 ++++---- spacy/lang/kn/examples.py | 4 -- spacy/lang/ml/__init__.py | 3 - spacy/lang/ml/examples.py | 4 -- spacy/lang/ml/lex_attrs.py | 3 - spacy/lang/ml/stop_words.py | 4 -- spacy/lang/pl/lemmatizer.py | 3 - spacy/lang/sv/lex_attrs.py | 3 - spacy/tests/lang/de/test_noun_chunks.py | 3 - spacy/tests/lang/el/test_noun_chunks.py | 3 - spacy/tests/lang/es/test_noun_chunks.py | 3 - spacy/tests/lang/fa/test_noun_chunks.py | 3 - spacy/tests/lang/fr/test_noun_chunks.py | 3 - spacy/tests/lang/gu/test_text.py | 3 - spacy/tests/lang/hy/test_text.py | 3 - spacy/tests/lang/hy/test_tokenizer.py | 3 - spacy/tests/lang/id/test_noun_chunks.py | 3 - spacy/tests/lang/ja/test_serialize.py | 4 -- spacy/tests/lang/ml/test_text.py | 3 - spacy/tests/lang/nb/test_noun_chunks.py | 3 - spacy/tests/lang/sv/test_lex_attrs.py | 3 - spacy/tests/lang/zh/test_serialize.py | 3 - spacy/tests/regression/test_issue5152.py | 3 - spacy/tests/regression/test_issue5230.py | 1 - spacy/tests/regression/test_issue5458.py | 3 - 35 files changed, 76 insertions(+), 147 deletions(-) 
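A quick illustration of the CLI refactor in PATCH 121/203 above: each command is now a thin *_cli typer wrapper around a plain, importable function, so the same logic can be called from Python without going through the command line. Below is a minimal sketch of calling the refactored evaluate helper directly, assuming only the signature added to spacy/cli/evaluate.py in that patch; the model name and data path are placeholders, not part of the patch.

    from pathlib import Path

    from spacy.cli.evaluate import evaluate

    # Call the refactored helper directly; with silent=True nothing is
    # printed and the evaluation scores are returned instead.
    scores = evaluate(
        "en_core_web_sm",   # placeholder model name or path
        Path("dev.json"),   # placeholder JSON-formatted evaluation data
        gold_preproc=False,
        silent=True,
    )
    print(scores)

The same wrapper-plus-helper pattern applies to convert, debug_data, download, info, init_model, package, pretrain, profile and the project commands: the *_cli function only declares and parses arguments, then delegates to the plain function of the same name.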
diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py index f989221c2..e9552371e 100644 --- a/spacy/lang/es/punctuation.py +++ b/spacy/lang/es/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py index 1f080c7c2..bc8fc260c 100644 --- a/spacy/lang/gu/__init__.py +++ b/spacy/lang/gu/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language diff --git a/spacy/lang/gu/examples.py b/spacy/lang/gu/examples.py index 202a8d022..1cf75fd32 100644 --- a/spacy/lang/gu/examples.py +++ b/spacy/lang/gu/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/gu/stop_words.py b/spacy/lang/gu/stop_words.py index 85d33763d..2c859681b 100644 --- a/spacy/lang/gu/stop_words.py +++ b/spacy/lang/gu/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ એમ diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py index 6aaa965bb..8928e52ae 100644 --- a/spacy/lang/hy/__init__.py +++ b/spacy/lang/hy/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py index 323f77b1c..69e354688 100644 --- a/spacy/lang/hy/examples.py +++ b/spacy/lang/hy/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
>>> from spacy.lang.hy.examples import sentences diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py index b556d679c..f84472d60 100644 --- a/spacy/lang/hy/lex_attrs.py +++ b/spacy/lang/hy/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py index d75aad6e2..46d0f6b51 100644 --- a/spacy/lang/hy/stop_words.py +++ b/spacy/lang/hy/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ նա diff --git a/spacy/lang/hy/tag_map.py b/spacy/lang/hy/tag_map.py index 722270110..09be1fd8d 100644 --- a/spacy/lang/hy/tag_map.py +++ b/spacy/lang/hy/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py index 7c3eee336..e8c802246 100644 --- a/spacy/lang/ja/bunsetu.py +++ b/spacy/lang/ja/bunsetu.py @@ -1,21 +1,11 @@ -# coding: utf8 -from __future__ import unicode_literals - -from .stop_words import STOP_WORDS - - POS_PHRASE_MAP = { "NOUN": "NP", "NUM": "NP", "PRON": "NP", "PROPN": "NP", - "VERB": "VP", - "ADJ": "ADJP", - "ADV": "ADVP", - "CCONJ": "CCONJP", } @@ -37,7 +27,18 @@ def yield_bunsetu(doc, debug=False): dep = t.dep_ head = t.head.i if debug: - print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu) + print( + t.i, + t.orth_, + pos, + pos_type, + dep, + head, + bunsetu_may_end, + phrase_type, + phrase, + bunsetu, + ) # DET is always an individual bunsetu if pos == "DET": @@ -75,19 +76,31 @@ def yield_bunsetu(doc, debug=False): # entering new bunsetu elif pos_type and ( - pos_type != phrase_type or # different phrase type arises - bunsetu_may_end # same phrase type but bunsetu already ended + pos_type != phrase_type + or bunsetu_may_end # different phrase type arises # same phrase type but bunsetu already ended ): # exceptional case: NOUN to VERB - if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i: + if ( + phrase_type == "NP" + and pos_type == "VP" + and prev_dep == "compound" + and prev_head == t.i + ): bunsetu.append(t) phrase_type = "VP" phrase.append(t) # exceptional case: VERB to NOUN - elif phrase_type == "VP" and pos_type == "NP" and ( - prev_dep == 'compound' and prev_head == t.i or - dep == 'compound' and prev == head or - prev_dep == 'nmod' and prev_head == t.i + elif ( + phrase_type == "VP" + and pos_type == "NP" + and ( + prev_dep == "compound" + and prev_head == t.i + or dep == "compound" + and prev == head + or prev_dep == "nmod" + and prev_head == t.i + ) ): bunsetu.append(t) phrase_type = "NP" @@ -102,11 +115,18 @@ def yield_bunsetu(doc, debug=False): # NOUN bunsetu elif phrase_type == "NP": bunsetu.append(t) - if not bunsetu_may_end and (( - (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'} - ) or ( - pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' - )): + if not bunsetu_may_end and ( + ( + (pos_type == "NP" or pos == "SYM") + and (prev_head == t.i or prev_head == head) + and prev_dep in {"compound", "nummod"} + ) + or ( + pos == "PART" + and (prev == head or prev_head == head) + and dep == "mark" + ) + ): phrase.append(t) else: bunsetu_may_end = True @@ -114,19 +134,31 @@ def 
yield_bunsetu(doc, debug=False): # VERB bunsetu elif phrase_type == "VP": bunsetu.append(t) - if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound': + if ( + not bunsetu_may_end + and pos == "VERB" + and prev_head == t.i + and prev_dep == "compound" + ): phrase.append(t) else: bunsetu_may_end = True # ADJ bunsetu - elif phrase_type == "ADJP" and tag != '連体詞': + elif phrase_type == "ADJP" and tag != "連体詞": bunsetu.append(t) - if not bunsetu_may_end and (( - pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'} - ) or ( - pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' - )): + if not bunsetu_may_end and ( + ( + pos == "NOUN" + and (prev_head == t.i or prev_head == head) + and prev_dep in {"amod", "compound"} + ) + or ( + pos == "PART" + and (prev == head or prev_head == head) + and dep == "mark" + ) + ): phrase.append(t) else: bunsetu_may_end = True diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py index cd1e4fde7..3f6e4bfa3 100644 --- a/spacy/lang/ja/syntax_iterators.py +++ b/spacy/lang/ja/syntax_iterators.py @@ -1,24 +1,22 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON, VERB # XXX this can probably be pruned a bit labels = [ - "nsubj", - "nmod", - "dobj", - "nsubjpass", - "pcomp", - "pobj", - "obj", - "obl", - "dative", - "appos", - "attr", - "ROOT", + "nsubj", + "nmod", + "dobj", + "nsubjpass", + "pcomp", + "pobj", + "obj", + "obl", + "dative", + "appos", + "attr", + "ROOT", ] + def noun_chunks(obj): """ Detect base noun phrases from a dependency parse. Works on both Doc and Span. @@ -52,4 +50,5 @@ def noun_chunks(obj): seen.update(w.i for w in word.head.rights) yield unseen[0], word.i + 1, np_label + SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/kn/examples.py b/spacy/lang/kn/examples.py index d82630432..3e055752e 100644 --- a/spacy/lang/kn/examples.py +++ b/spacy/lang/kn/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py index d052ded1b..e92a7617f 100644 --- a/spacy/lang/ml/__init__.py +++ b/spacy/lang/ml/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language diff --git a/spacy/lang/ml/examples.py b/spacy/lang/ml/examples.py index a2a0ed10e..9794eab29 100644 --- a/spacy/lang/ml/examples.py +++ b/spacy/lang/ml/examples.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/ml/lex_attrs.py b/spacy/lang/ml/lex_attrs.py index 468ad88f8..9ac19b6a7 100644 --- a/spacy/lang/ml/lex_attrs.py +++ b/spacy/lang/ml/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ml/stop_words.py b/spacy/lang/ml/stop_words.py index 8bd6a7e02..441e93586 100644 --- a/spacy/lang/ml/stop_words.py +++ b/spacy/lang/ml/stop_words.py @@ -1,7 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ അത് diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index 8b8d7fe27..b80a1a143 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ...lemmatizer import Lemmatizer from ...parts_of_speech import NAMES diff --git a/spacy/lang/sv/lex_attrs.py b/spacy/lang/sv/lex_attrs.py index 24d06a97a..f8ada9e2e 100644 --- a/spacy/lang/sv/lex_attrs.py +++ b/spacy/lang/sv/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py index 8d76ddd79..ff9f8d5e5 100644 --- a/spacy/tests/lang/de/test_noun_chunks.py +++ b/spacy/tests/lang/de/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py index 4f24865d0..38e72b0b2 100644 --- a/spacy/tests/lang/el/test_noun_chunks.py +++ b/spacy/tests/lang/el/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py index 66bbd8c3a..a7ec4e562 100644 --- a/spacy/tests/lang/es/test_noun_chunks.py +++ b/spacy/tests/lang/es/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py index a98aae061..767e91f6b 100644 --- a/spacy/tests/lang/fa/test_noun_chunks.py +++ b/spacy/tests/lang/fa/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index ea93a5a35..5fd6897f7 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/gu/test_text.py b/spacy/tests/lang/gu/test_text.py index aa8d442a2..2d251166f 100644 --- a/spacy/tests/lang/gu/test_text.py +++ b/spacy/tests/lang/gu/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/hy/test_text.py b/spacy/tests/lang/hy/test_text.py index cbdb77e4e..ac0f1e128 100644 --- a/spacy/tests/lang/hy/test_text.py +++ b/spacy/tests/lang/hy/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.lang.hy.lex_attrs import like_num diff --git a/spacy/tests/lang/hy/test_tokenizer.py b/spacy/tests/lang/hy/test_tokenizer.py index 3eeb8b54e..e9efb224a 100644 --- a/spacy/tests/lang/hy/test_tokenizer.py +++ b/spacy/tests/lang/hy/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: 
utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py index add76f9b9..445643933 100644 --- a/spacy/tests/lang/id/test_noun_chunks.py +++ b/spacy/tests/lang/id/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py index 018e645bb..9e703e63d 100644 --- a/spacy/tests/lang/ja/test_serialize.py +++ b/spacy/tests/lang/ja/test_serialize.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest from spacy.lang.ja import Japanese from ...util import make_tempdir diff --git a/spacy/tests/lang/ml/test_text.py b/spacy/tests/lang/ml/test_text.py index 2883cf5bb..aced78461 100644 --- a/spacy/tests/lang/ml/test_text.py +++ b/spacy/tests/lang/ml/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py index 653491a64..c6a00354b 100644 --- a/spacy/tests/lang/nb/test_noun_chunks.py +++ b/spacy/tests/lang/nb/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sv/test_lex_attrs.py b/spacy/tests/lang/sv/test_lex_attrs.py index abe6b0f7b..656c4706b 100644 --- a/spacy/tests/lang/sv/test_lex_attrs.py +++ b/spacy/tests/lang/sv/test_lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.sv.lex_attrs import like_num diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py index 56f092ed8..d84920c3e 100644 --- a/spacy/tests/lang/zh/test_serialize.py +++ b/spacy/tests/lang/zh/test_serialize.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.zh import Chinese from ...util import make_tempdir diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py index 758ac9c14..a9a57746d 100644 --- a/spacy/tests/regression/test_issue5152.py +++ b/spacy/tests/regression/test_issue5152.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index b46bf9063..9ffa3862c 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,4 +1,3 @@ -# coding: utf8 import warnings from unittest import TestCase import pytest diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py index 3281e2a8c..a7a2959df 100644 --- a/spacy/tests/regression/test_issue5458.py +++ b/spacy/tests/regression/test_issue5458.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.lang.en.syntax_iterators import noun_chunks from spacy.tests.util import get_doc From ef5f548fb0b8f4737a41a838c0d1123752e12346 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 21 Jun 2020 22:38:04 +0200 Subject: [PATCH 124/203] Tidy up and auto-format --- spacy/lang/en/tokenizer_exceptions.py | 24 +++++++++++++++--------- spacy/lang/ja/syntax_iterators.py | 2 +- spacy/lang/ja/tag_bigram_map.py | 11 +---------- spacy/lang/ja/tag_orth_map.py | 14 +++----------- 
spacy/lang/ta/examples.py | 2 +- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/tests/lang/ja/test_serialize.py | 4 ++-- spacy/tests/lang/ja/test_tokenizer.py | 16 ++++++++-------- spacy/tests/package/test_requirements.py | 8 +++++++- spacy/tests/test_misc.py | 3 --- 10 files changed, 39 insertions(+), 47 deletions(-) diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index e024dd483..dc8a5c04d 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -136,7 +136,19 @@ for pron in ["he", "she", "it"]: # W-words, relative pronouns, prepositions etc. -for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]: +for word in [ + "who", + "what", + "when", + "where", + "why", + "how", + "there", + "that", + "this", + "these", + "those", +]: for orth in [word, word.title()]: _exc[orth + "'s"] = [ {ORTH: orth, LEMMA: word, NORM: word}, @@ -396,14 +408,8 @@ _other_exc = { {ORTH: "Let", LEMMA: "let", NORM: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}, ], - "c'mon": [ - {ORTH: "c'm", NORM: "come", LEMMA: "come"}, - {ORTH: "on"} - ], - "C'mon": [ - {ORTH: "C'm", NORM: "come", LEMMA: "come"}, - {ORTH: "on"} - ] + "c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}], + "C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}], } _exc.update(_other_exc) diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py index 3f6e4bfa3..bb0554cf9 100644 --- a/spacy/lang/ja/syntax_iterators.py +++ b/spacy/lang/ja/syntax_iterators.py @@ -24,7 +24,7 @@ def noun_chunks(obj): doc = obj.doc # Ensure works on both Doc and Span. np_deps = [doc.vocab.strings.add(label) for label in labels] - conj = doc.vocab.strings.add("conj") + doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") seen = set() for i, word in enumerate(obj): diff --git a/spacy/lang/ja/tag_bigram_map.py b/spacy/lang/ja/tag_bigram_map.py index 5ed9aec89..9d15fc520 100644 --- a/spacy/lang/ja/tag_bigram_map.py +++ b/spacy/lang/ja/tag_bigram_map.py @@ -1,21 +1,15 @@ -# encoding: utf8 -from __future__ import unicode_literals - -from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB +from ...symbols import ADJ, AUX, NOUN, PART, VERB # mapping from tag bi-gram to pos of previous token TAG_BIGRAM_MAP = { # This covers only small part of AUX. ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None), - ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None), # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ), - # This covers acl, advcl, obl and root, but has side effect for compound. 
("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX), # This covers almost all of the deps ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX), - ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB), ("副詞", "動詞-非自立可能"): (None, VERB), ("形容詞-一般", "動詞-非自立可能"): (None, VERB), @@ -25,12 +19,9 @@ TAG_BIGRAM_MAP = { ("助詞-副助詞", "動詞-非自立可能"): (None, VERB), ("助詞-格助詞", "動詞-非自立可能"): (None, VERB), ("補助記号-読点", "動詞-非自立可能"): (None, VERB), - ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART), - ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN), ("連体詞", "形状詞-助動詞語幹"): (None, NOUN), - ("動詞-一般", "助詞-副助詞"): (None, PART), ("動詞-非自立可能", "助詞-副助詞"): (None, PART), ("助動詞", "助詞-副助詞"): (None, PART), diff --git a/spacy/lang/ja/tag_orth_map.py b/spacy/lang/ja/tag_orth_map.py index 355cc655b..9d32cdea7 100644 --- a/spacy/lang/ja/tag_orth_map.py +++ b/spacy/lang/ja/tag_orth_map.py @@ -1,17 +1,9 @@ -# encoding: utf8 -from __future__ import unicode_literals - -from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X +from ...symbols import DET, PART, PRON, SPACE, X # mapping from tag bi-gram to pos of previous token TAG_ORTH_MAP = { - "空白": { - " ": SPACE, - " ": X, - }, - "助詞-副助詞": { - "たり": PART, - }, + "空白": {" ": SPACE, " ": X}, + "助詞-副助詞": {"たり": PART}, "連体詞": { "あの": DET, "かの": DET, diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 245b8ba1a..c3c47e66e 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -18,5 +18,5 @@ sentences = [ "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது", "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன", "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது", - "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்." + "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்.", ] diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 28bc51228..f732a9097 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -1,6 +1,6 @@ import re -from .char_classes import ALPHA_LOWER, ALPHA +from .char_classes import ALPHA_LOWER from ..symbols import ORTH, POS, TAG, LEMMA, SPACE diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py index 9e703e63d..4d4174b03 100644 --- a/spacy/tests/lang/ja/test_serialize.py +++ b/spacy/tests/lang/ja/test_serialize.py @@ -7,7 +7,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer): nlp = Japanese() nlp.tokenizer.from_bytes(tokenizer_bytes) assert tokenizer_bytes == nlp.tokenizer.to_bytes() - assert nlp.tokenizer.split_mode == None + assert nlp.tokenizer.split_mode is None with make_tempdir() as d: file_path = d / "tokenizer" @@ -15,7 +15,7 @@ def test_ja_tokenizer_serialize(ja_tokenizer): nlp = Japanese() nlp.tokenizer.from_disk(file_path) assert tokenizer_bytes == nlp.tokenizer.to_bytes() - assert nlp.tokenizer.split_mode == None + assert nlp.tokenizer.split_mode is None # split mode is (de)serialized correctly nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index ee532cb81..f76a9067a 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -29,10 +29,9 @@ POS_TESTS = [ ] SENTENCE_TESTS = [ - ('あれ。これ。', ['あれ。', 'これ。']), - ('「伝染るんです。」という漫画があります。', - ['「伝染るんです。」という漫画があります。']), - ] + ("あれ。これ。", ["あれ。", "これ。"]), + ("「伝染るんです。」という漫画があります。", ["「伝染るんです。」という漫画があります。"]), +] # fmt: on @@ -48,7 +47,7 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags): assert tags == 
expected_tags -#XXX This isn't working? Always passes +# XXX This isn't working? Always passes @pytest.mark.parametrize("text,expected_pos", POS_TESTS) def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): pos = [token.pos_ for token in ja_tokenizer(text)] @@ -57,7 +56,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): @pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy") @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS) -def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents): +def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents): sents = [str(sent) for sent in ja_tokenizer(text).sents] assert sents == expected_sents @@ -74,13 +73,14 @@ def test_ja_tokenizer_naughty_strings(ja_tokenizer, text): assert tokens.text_with_ws == text -@pytest.mark.parametrize("text,len_a,len_b,len_c", +@pytest.mark.parametrize( + "text,len_a,len_b,len_c", [ ("選挙管理委員会", 4, 3, 1), ("客室乗務員", 3, 2, 1), ("労働者協同組合", 4, 3, 1), ("機能性食品", 3, 2, 1), - ] + ], ) def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}}) diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index a7c9a3ea4..6cc8fa6a8 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -10,7 +10,13 @@ def test_build_dependencies(): "mock", "flake8", ] - libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"] + libs_ignore_setup = [ + "fugashi", + "natto-py", + "pythainlp", + "sudachipy", + "sudachidict_core", + ] # check requirements.txt req_dict = {} diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 5f9e72f79..f6724f632 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -1,7 +1,6 @@ import pytest import os import ctypes -import srsly from pathlib import Path from spacy.about import __version__ as spacy_version from spacy import util @@ -9,8 +8,6 @@ from spacy import prefer_gpu, require_gpu from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding -from .util import make_tempdir - @pytest.fixture def is_admin(): From 5ba1df5e78de64ae123b7c3fb8bf401c906e4637 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 00:15:06 +0200 Subject: [PATCH 125/203] Update project CLI --- spacy/cli/project.py | 89 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 81 insertions(+), 8 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 45cb163af..8a97f67e0 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -4,20 +4,43 @@ import srsly from pathlib import Path from wasabi import msg import shlex +import os +import re from ._app import app, Arg, Opt from .. 
import about from ..schemas import ProjectConfigSchema, validate -from ..util import run_command +from ..util import ensure_path, run_command CONFIG_FILE = "project.yml" DIRS = ["assets", "configs", "packages", "metrics", "scripts", "notebooks", "training"] - +CACHES = [ + Path.home() / ".torch", + Path.home() / ".caches" / "torch", + os.environ.get("TORCH_HOME"), + Path.home() / ".keras", +] project_cli = typer.Typer(help="Command-line interface for spaCy projects") +@project_cli.callback(invoke_without_command=True) +def callback(): + # This runs before every project command and ensures DVC is installed + # TODO: check for "dvc" command instead of Python library? + try: + import dvc # noqa: F401 + except ImportError: + msg.fail( + "spaCy projects require DVC (Data Version Control)", + "You can install the Python package from pip (pip install dvc) or " + "conda (conda install -c conda-forge dvc). For more details, see the " + "documentation: https://dvc.org/doc/install", + exits=1, + ) + + @project_cli.command("clone") def project_clone_cli( # fmt: off @@ -27,7 +50,50 @@ def project_clone_cli( # fmt: on ): """Clone a project template from a repository.""" - print("Cloning", repo) + project_clone(name, dest, repo=repo) + + +def project_clone(name: str, dest: Path, repo: str = about.__projects__) -> None: + dest = ensure_path(dest) + if not dest or not dest.exists() or not dest.is_dir(): + msg.fail("Not a valid directory to clone project", dest, exits=1) + cmd = ["dvc", "get", repo, name, "-o", str(dest)] + msg.info(" ".join(cmd)) + run_command(cmd) + msg.good(f"Cloned project '{name}' from {repo}") + with msg.loading("Setting up directories..."): + for sub_dir in DIRS: + dir_path = dest / sub_dir + if not dir_path.exists(): + dir_path.mkdir(parents=True) + msg.good(f"Your project is now ready!", dest.resolve()) + + +@project_cli.command("get-assets") +def project_get_assets_cli( + path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False) +): + """Use Data Version Control to get the assets for the project.""" + project_get_assets(path) + + +def project_get_assets(project_path: Path) -> None: + project_path = ensure_path(project_path) + config = load_project_config(project_path) + assets = config.get("assets", {}) + if not assets: + msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) + msg.info(f"Getting {len(assets)} asset(s)") + variables = config.get("variables", {}) + for asset in assets: + url = asset["url"].format(**variables) + dest = asset["dest"].format(**variables) + dest_path = project_path / dest + check_asset(url) + cmd = ["dvc", "get-url", url, str(dest_path)] + msg.info(" ".join(cmd)) + run_command(cmd) + msg.good(f"Got asset {dest}") @project_cli.command("run") @@ -76,14 +142,21 @@ def load_project_config(path: Path) -> Dict[str, Any]: return config -def create_dirs(project_dir: Path) -> None: - for subdir in DIRS: - (project_dir / subdir).mkdir(parents=True) - - def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) -> None: for command in commands: # Substitute variables, e.g. "./{NAME}.json" command = command.format(**variables) msg.info(command) run_command(shlex.split(command)) + + +def check_asset(url: str) -> None: + # If the asset URL is a regular GitHub URL it's likely a mistake + # TODO: support loading from GitHub URLs? Automatically convert to raw? + if re.match("(http(s?)):\/\/github.com", url): + msg.warn( + "Downloading from a regular GitHub URL. 
This will only download " + "the source of the page, not the actual file. If you want to " + "download the raw file, click on 'Download' on the GitHub page " + "and copy the raw.githubusercontent.com URL instead." + ) From 1e5b4d85249ebdec6819df21663215fc6d04e4c0 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 00:30:05 +0200 Subject: [PATCH 126/203] Fix DVC check --- spacy/cli/project.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 8a97f67e0..c33f6a395 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -3,6 +3,7 @@ import typer import srsly from pathlib import Path from wasabi import msg +import subprocess import shlex import os import re @@ -28,12 +29,11 @@ project_cli = typer.Typer(help="Command-line interface for spaCy projects") @project_cli.callback(invoke_without_command=True) def callback(): # This runs before every project command and ensures DVC is installed - # TODO: check for "dvc" command instead of Python library? try: - import dvc # noqa: F401 - except ImportError: + subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) + except Exception: msg.fail( - "spaCy projects require DVC (Data Version Control)", + "spaCy projects require DVC (Data Version Control) and the 'dvc' command", "You can install the Python package from pip (pip install dvc) or " "conda (conda install -c conda-forge dvc). For more details, see the " "documentation: https://dvc.org/doc/install", From 79dd824906b517312086cf3606e8e1d27a78cd2f Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 00:45:40 +0200 Subject: [PATCH 127/203] Tidy up --- spacy/__main__.py | 9 +++------ spacy/cli/__init__.py | 21 ++++++++++++++++++--- spacy/cli/_app.py | 42 +++++++++++++++++++++--------------------- 3 files changed, 42 insertions(+), 30 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index 6015894b6..f6b5066b7 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -1,7 +1,4 @@ -from spacy.cli import app -from typer.main import get_command - if __name__ == "__main__": - command = get_command(app) - # Ensure that the help messages always display the correct prompt - command(prog_name="python -m spacy") + from spacy.cli import setup_cli + + setup_cli() diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 59d099b34..14623000a 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,13 +1,28 @@ -from ._app import app # noqa: F401 +from wasabi import msg + +from ._app import app, setup_cli # noqa: F401 + +# These are the actual functions, NOT the wrapped CLI commands. The CLI commands +# are registered automatically and won't have to be imported here. 
from .download import download # noqa: F401 from .info import info # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 -from .train_from_config import train_cli # noqa: F401 +from .train_from_config import train # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 -from .project import project_cli # noqa: F401 +from .project import project_clone, project_get_assets, project_run # noqa: F401 + + +@app.command("link", no_args_is_help=True, deprecated=True, hidden=True) +def link(*args, **kwargs): + """As of spaCy v3.0, model symlinks are deprecated. You can load models + using their full names or from a directory path.""" + msg.warn( + "As of spaCy v3.0, model symlinks are deprecated. You can load models " + "using their full names or from a directory path." + ) diff --git a/spacy/cli/_app.py b/spacy/cli/_app.py index ccc50ff63..d1c470b32 100644 --- a/spacy/cli/_app.py +++ b/spacy/cli/_app.py @@ -1,31 +1,31 @@ +from typing import Optional import typer -from wasabi import msg +from typer.main import get_command -def Arg(*args, help=None, **kwargs): +COMMAND = "python -m spacy" +NAME = "spacy" +HELP = """spaCy Command-line Interface + +DOCS: https://spacy.io/api/cli +""" + + +app = typer.Typer(name=NAME, help=HELP) + + +def Arg(*args, help: Optional[str] = None, **kwargs) -> typer.Argument: + """Wrapper for Typer's annotation to keep it short and set defaults.""" # Filter out help for now until it's officially supported return typer.Argument(*args, **kwargs) -def Opt(*args, **kwargs): +def Opt(*args, **kwargs) -> typer.Option: + """Wrapper for Typer's annotation to keep it short and set defaults.""" return typer.Option(*args, show_default=True, **kwargs) -app = typer.Typer( - name="spacy", - help="""spaCy Command-line Interface - - -DOCS: https://spacy.io/api/cli -""", -) - - -@app.command("link", no_args_is_help=True, deprecated=True, hidden=True) -def link(*args, **kwargs): - """As of spaCy v3.0, model symlinks are deprecated. You can load models - using their full names or from a directory path.""" - msg.warn( - "As of spaCy v3.0, model symlinks are deprecated. You can load models " - "using their full names or from a directory path." 
- ) +def setup_cli() -> None: + # Ensure that the help messages always display the correct prompt + command = get_command(app) + command(prog_name=COMMAND) From fca3907d4e761519e08b785aba958bf7846585ac Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 00:57:28 +0200 Subject: [PATCH 128/203] Add correct uppercase variants for boolean flags --- spacy/cli/download.py | 2 +- spacy/cli/info.py | 2 +- spacy/cli/package.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 920250a61..adc8d09fa 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -17,7 +17,7 @@ def download_cli( # fmt: off ctx: typer.Context, model: str = Arg(..., help="Model to download (shortcut or name)"), - direct: bool = Opt(False, "--direct", "-d", help="Force direct download of name + version"), + direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"), # fmt: on ): """ diff --git a/spacy/cli/info.py b/spacy/cli/info.py index e6156ee6d..3ac081c14 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -15,7 +15,7 @@ def info_cli( # fmt: off model: Optional[str] = Arg(None, help="Optional model name"), markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"), - silent: bool = Opt(False, "--silent", "-s", help="Don't print anything (just return)"), + silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"), # fmt: on ): """ diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 6ba9b0386..24d9a0a08 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -17,7 +17,7 @@ def package_cli( input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False), output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False), - create_meta: bool = Opt(False, "--create-meta", "-c", help="Create meta.json, even if one exists"), + create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"), # fmt: on ): From 189ed567777eeaa248a0eab1908553bfe018b9b5 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 01:07:48 +0200 Subject: [PATCH 129/203] Fix and simplify info --- spacy/__init__.py | 6 +----- spacy/cli/info.py | 12 +++++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index e4e1f6c8e..b525a5ba5 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -8,7 +8,7 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed") from thinc.api import prefer_gpu, require_gpu from . 
import pipeline -from .cli.info import info as cli_info +from .cli.info import info from .glossary import explain from .about import __version__ from .errors import Errors, Warnings @@ -34,7 +34,3 @@ def load(name, **overrides): def blank(name, **kwargs): LangClass = util.get_lang_class(name) return LangClass(**kwargs) - - -def info(model=None, markdown=False, silent=False): - return cli_info(model, markdown, silent) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 3ac081c14..2722e7e58 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -5,7 +5,6 @@ from wasabi import Printer import srsly from ._app import app, Arg, Opt -from .validate import get_model_pkgs from .. import util from .. import about @@ -27,7 +26,7 @@ def info_cli( def info( - model: Optional[str], *, markdown: bool = False, silent: bool = True + model: Optional[str] = None, *, markdown: bool = False, silent: bool = True ) -> Union[str, dict]: msg = Printer(no_print=silent, pretty=not silent) if model: @@ -43,7 +42,7 @@ def info( return markdown_data if not silent: msg.table(data, title=title) - return data + return {k.lower().replace(" ", "_"): v for k, v in data.items()} def info_spacy(*, silent: bool = True) -> Dict[str, any]: @@ -52,8 +51,11 @@ def info_spacy(*, silent: bool = True) -> Dict[str, any]: silent (bool): Don't print anything, just return. RETURNS (dict): The spaCy info. """ - all_models, _ = get_model_pkgs(silent=silent) - models = ", ".join(f"{m['name']} ({m['version']})" for m in all_models.values()) + all_models = {} + for pkg_name in util.get_installed_models(): + package = pkg_name.replace("-", "_") + all_models[package] = util.get_package_version(pkg_name) + models = ", ".join(f"{name} ({version})" for name, version in all_models.items()) return { "spaCy version": about.__version__, "Location": str(Path(__file__).parent.parent), From dc5d535659b5090d9c2de2c079a2d70567b9fca0 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 01:17:11 +0200 Subject: [PATCH 130/203] Tidy up info --- spacy/cli/info.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 2722e7e58..9f1ec3855 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -34,34 +34,36 @@ def info( data = info_model(model, silent=silent) else: title = "Info about spaCy" - data = info_spacy(silent=silent) + data = info_spacy() + raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()} + if "Models" in data and isinstance(data["Models"], dict): + data["Models"] = ", ".join(f"{n} ({v})" for n, v in data["Models"].items()) markdown_data = get_markdown(data, title=title) if markdown: if not silent: print(markdown_data) return markdown_data if not silent: - msg.table(data, title=title) - return {k.lower().replace(" ", "_"): v for k, v in data.items()} + table_data = dict(data) + msg.table(table_data, title=title) + return raw_data -def info_spacy(*, silent: bool = True) -> Dict[str, any]: +def info_spacy() -> Dict[str, any]: """Generate info about the current spaCy intallation. - silent (bool): Don't print anything, just return. RETURNS (dict): The spaCy info. 
""" all_models = {} for pkg_name in util.get_installed_models(): package = pkg_name.replace("-", "_") all_models[package] = util.get_package_version(pkg_name) - models = ", ".join(f"{name} ({version})" for name, version in all_models.items()) return { "spaCy version": about.__version__, "Location": str(Path(__file__).parent.parent), "Platform": platform.platform(), "Python version": platform.python_version(), - "Models": models, + "Models": all_models, } From 95cc9d657d4ac84d7599e47365132c19fb68802d Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 11:57:46 +0200 Subject: [PATCH 131/203] Update srsly pin [ci skip] --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0d0715e24..3b78c0688 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 wasabi>=0.7.0,<1.1.0 -srsly>=2.0.0,<3.0.0 +srsly>=2.1.0,<3.0.0 catalogue>=0.0.7,<1.1.0 # Third party dependencies numpy>=1.15.0 diff --git a/setup.cfg b/setup.cfg index 5a4b044b4..6df69cb15 100644 --- a/setup.cfg +++ b/setup.cfg @@ -45,7 +45,7 @@ install_requires = thinc==8.0.0a9 blis>=0.4.0,<0.5.0 wasabi>=0.7.0,<1.1.0 - srsly>=2.0.0,<3.0.0 + srsly>=2.1.0,<3.0.0 catalogue>=0.0.7,<1.1.0 ml_datasets>=0.1.1 # Third-party dependencies From ea9fd3abcd70c1a5ee1cf0cb1e989b993bec680b Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 12:04:41 +0200 Subject: [PATCH 132/203] Replace plac with typer [ci skip] --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3b78c0688..55b234073 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,10 +8,10 @@ murmurhash>=0.28.0,<1.1.0 wasabi>=0.7.0,<1.1.0 srsly>=2.1.0,<3.0.0 catalogue>=0.0.7,<1.1.0 +typer>=0.2.1,<1.0.0 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 -plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.3.0,<2.0.0 # Official Python utilities diff --git a/setup.cfg b/setup.cfg index 6df69cb15..20b2dfa1c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -48,10 +48,10 @@ install_requires = srsly>=2.1.0,<3.0.0 catalogue>=0.0.7,<1.1.0 ml_datasets>=0.1.1 + typer>=0.2.1,<1.0.0 # Third-party dependencies tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 - plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 # Official Python utilities From 3f2f5f9cb39a1fe183144b84f705ab3ade744a82 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 12:14:51 +0200 Subject: [PATCH 133/203] Remove ml_datasets from install dependencies --- setup.cfg | 1 - spacy/cli/profile.py | 9 ++++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 20b2dfa1c..5bda29c68 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,6 @@ install_requires = wasabi>=0.7.0,<1.1.0 srsly>=2.1.0,<3.0.0 catalogue>=0.0.7,<1.1.0 - ml_datasets>=0.1.1 typer>=0.2.1,<1.0.0 # Third-party dependencies tqdm>=4.38.0,<5.0.0 diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index f4c893864..ee9f3e707 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -6,7 +6,6 @@ import cProfile import pstats import sys import itertools -import ml_datasets from wasabi import msg, Printer from ._app import app, Arg, Opt @@ -32,6 +31,14 @@ def profile_cli( def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None: + try: + import ml_datasets + except 
ImportError: + msg.fail( + "This command requires the ml_datasets library to be installed:" + "pip install ml_datasets", + exits=1, + ) if inputs is not None: inputs = _read_inputs(inputs, msg) if inputs is None: From a6b76440b766a16afba8716118275ea79b918ff4 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 14:53:31 +0200 Subject: [PATCH 134/203] Update project CLI --- spacy/cli/project.py | 126 +++++++++++++++++++++++++++++++------------ spacy/schemas.py | 10 ++-- spacy/tests/util.py | 9 +--- spacy/util.py | 9 ++++ 4 files changed, 110 insertions(+), 44 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index c33f6a395..454b99d25 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -7,15 +7,26 @@ import subprocess import shlex import os import re +import shutil -from ._app import app, Arg, Opt +from ._app import app, Arg, Opt, COMMAND from .. import about from ..schemas import ProjectConfigSchema, validate -from ..util import ensure_path, run_command +from ..util import ensure_path, run_command, make_tempdir, working_dir CONFIG_FILE = "project.yml" -DIRS = ["assets", "configs", "packages", "metrics", "scripts", "notebooks", "training"] +DIRS = [ + "assets", + "metas", + "configs", + "packages", + "metrics", + "scripts", + "notebooks", + "training", + "corpus", +] CACHES = [ Path.home() / ".torch", Path.home() / ".caches" / "torch", @@ -45,28 +56,37 @@ def callback(): def project_clone_cli( # fmt: off name: str = Arg(..., help="The name of the template to fetch"), - dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=True, file_okay=False), + dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False), repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), + verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information") # fmt: on ): """Clone a project template from a repository.""" - project_clone(name, dest, repo=repo) + project_clone(name, dest, repo=repo, verbose=verbose) -def project_clone(name: str, dest: Path, repo: str = about.__projects__) -> None: +def project_clone( + name: str, dest: Path, *, repo: str = about.__projects__, verbose: bool = False +) -> None: dest = ensure_path(dest) - if not dest or not dest.exists() or not dest.is_dir(): - msg.fail("Not a valid directory to clone project", dest, exits=1) - cmd = ["dvc", "get", repo, name, "-o", str(dest)] - msg.info(" ".join(cmd)) - run_command(cmd) + check_clone_dest(dest) + # When cloning a subdirectory with DVC, it will create a folder of that name + # within the destination dir, so we use a tempdir and then copy it into the + # parent directory to create the cloned directory + with make_tempdir() as tmp_dir: + cmd = ["dvc", "get", repo, name, "-o", str(tmp_dir)] + if verbose: + cmd.append("-v") + print(" ".join(cmd)) + run_command(cmd) + shutil.move(str(tmp_dir / Path(name).name), str(dest)) msg.good(f"Cloned project '{name}' from {repo}") - with msg.loading("Setting up directories..."): - for sub_dir in DIRS: - dir_path = dest / sub_dir - if not dir_path.exists(): - dir_path.mkdir(parents=True) + for sub_dir in DIRS: + dir_path = dest / sub_dir + if not dir_path.exists(): + dir_path.mkdir(parents=True) msg.good(f"Your project is now ready!", dest.resolve()) + print(f"To get the assets, run:\npython -m spacy project get-assets {dest}") @project_cli.command("get-assets") @@ -91,7 +111,6 @@ def 
project_get_assets(project_path: Path) -> None: dest_path = project_path / dest check_asset(url) cmd = ["dvc", "get-url", url, str(dest_path)] - msg.info(" ".join(cmd)) run_command(cmd) msg.good(f"Got asset {dest}") @@ -100,11 +119,33 @@ def project_get_assets(project_path: Path) -> None: def project_run_cli( # fmt: off project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), - subcommand: str = Arg(None, help="Name of command defined in project config") + subcommand: str = Arg(None, help="Name of command defined in project config"), + show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") # fmt: on ): """Run scripts defined in the project.""" - project_run(project_dir, subcommand) + if show_help: + print_run_help(project_dir, subcommand) + else: + project_run(project_dir, subcommand) + + +def print_run_help(project_dir: Path, subcommand: str) -> None: + """Simulate a CLI help prompt using the info available in the project config.""" + config = load_project_config(project_dir) + config_commands = config.get("commands", []) + commands = {cmd["name"]: cmd for cmd in config_commands} + if subcommand: + if subcommand not in commands: + msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) + print(f"Usage: {COMMAND} project run {project_dir} {subcommand}") + help_text = commands[subcommand].get("help") + if help_text: + msg.text(f"\n{help_text}\n") + else: + print(f"\nAvailable commands in {CONFIG_FILE}") + print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]") + msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) def project_run(project_dir: Path, subcommand: str) -> None: @@ -112,20 +153,23 @@ def project_run(project_dir: Path, subcommand: str) -> None: config_commands = config.get("commands", []) variables = config.get("variables", {}) commands = {cmd["name"]: cmd for cmd in config_commands} - if subcommand is None: - all_commands = config.get("run", []) - if not all_commands: - msg.warn("No run commands defined in project config", exits=0) - msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - for command in all_commands: - if command not in commands: - msg.fail(f"Can't find command '{command}' in project config", exits=1) - msg.divider(command) - run_commands(commands[command]["script"], variables) - return - if subcommand not in commands: + if subcommand and subcommand not in commands: msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) - run_commands(commands[subcommand]["script"], variables) + with working_dir(project_dir): + if subcommand is None: + all_commands = config.get("run", []) + if not all_commands: + msg.warn("No run commands defined in project config", exits=0) + msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) + for command in all_commands: + if command not in commands: + msg.fail( + f"Can't find command '{command}' in project config", exits=1 + ) + msg.divider(command) + run_commands(commands[command]["script"], variables) + else: + run_commands(commands[subcommand]["script"], variables) app.add_typer(project_cli, name="project") @@ -146,7 +190,7 @@ def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) for command in commands: # Substitute variables, e.g. 
"./{NAME}.json" command = command.format(**variables) - msg.info(command) + print(command) run_command(shlex.split(command)) @@ -160,3 +204,19 @@ def check_asset(url: str) -> None: "download the raw file, click on 'Download' on the GitHub page " "and copy the raw.githubusercontent.com URL instead." ) + # url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/").replace("/tree/", "/") + + +def check_clone_dest(dest: Path) -> None: + """Check and validate that the destination path can be used to clone.""" + if not dest: + msg.fail(f"Not a valid directory to clone project: {dest}", exits=1) + if dest.exists(): + # Directory already exists (not allowed, clone needs to create it) + msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1) + if not dest.parent.exists(): + # We're not creating parents, parent dir should exist + msg.fail( + f"Can't clone project, parent directory doesn't exist: {dest.parent}", + exits=1, + ) diff --git a/spacy/schemas.py b/spacy/schemas.py index 04f9bbffa..43694b325 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -229,11 +229,15 @@ class ProjectConfigCommand(BaseModel): name: StrictStr = Field(..., title="Name of command") help: Optional[StrictStr] = Field(None, title="Command description") script: List[StrictStr] = Field([], title="List of CLI commands to run, in order") - dvc_deps: List[StrictStr] = Field([], title="Data Version Control dependencies") - dvc_outputs: List[StrictStr] = Field([], title="Data Version Control outputs") - dvc_outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)") + deps: List[StrictStr] = Field([], title="Data Version Control dependencies") + outputs: List[StrictStr] = Field([], title="Data Version Control outputs") + outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)") # fmt: on + class Config: + title = "A single named command specified in a project config" + extra = "forbid" + class ProjectConfigSchema(BaseModel): # fmt: off diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 3d0a023c9..01c4254c4 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -8,8 +8,8 @@ from pathlib import Path from spacy import Errors from spacy.tokens import Doc, Span from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA - from spacy.vocab import Vocab +from spacy.util import make_tempdir @contextlib.contextmanager @@ -19,13 +19,6 @@ def make_tempfile(mode="r"): f.close() -@contextlib.contextmanager -def make_tempdir(): - d = Path(tempfile.mkdtemp()) - yield d - shutil.rmtree(str(d)) - - def get_doc( vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None ): diff --git a/spacy/util.py b/spacy/util.py index 7f27e9467..feb863261 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -19,6 +19,8 @@ from packaging.specifiers import SpecifierSet, InvalidSpecifier from packaging.version import Version, InvalidVersion import subprocess from contextlib import contextmanager +import tempfile +import shutil try: @@ -453,6 +455,13 @@ def working_dir(path: Union[str, Path]) -> None: os.chdir(prev_cwd) +@contextmanager +def make_tempdir(): + d = Path(tempfile.mkdtemp()) + yield d + shutil.rmtree(str(d)) + + def is_in_jupyter(): """Check if user is running spaCy from a Jupyter notebook by detecting the IPython kernel. Mainly used for the displaCy visualizer. 
From 0ee6d7a4d1dea48547c8c78d59bbc3d3a2c4ff45 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 14:54:38 +0200 Subject: [PATCH 135/203] Remove project stuff from this branch --- spacy/cli/__init__.py | 1 - spacy/cli/project.py | 162 ------------------------------------------ 2 files changed, 163 deletions(-) delete mode 100644 spacy/cli/project.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 14623000a..206f8dd3b 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,7 +15,6 @@ from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 -from .project import project_clone, project_get_assets, project_run # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/project.py b/spacy/cli/project.py deleted file mode 100644 index c33f6a395..000000000 --- a/spacy/cli/project.py +++ /dev/null @@ -1,162 +0,0 @@ -from typing import List, Dict, Any -import typer -import srsly -from pathlib import Path -from wasabi import msg -import subprocess -import shlex -import os -import re - -from ._app import app, Arg, Opt -from .. import about -from ..schemas import ProjectConfigSchema, validate -from ..util import ensure_path, run_command - - -CONFIG_FILE = "project.yml" -DIRS = ["assets", "configs", "packages", "metrics", "scripts", "notebooks", "training"] -CACHES = [ - Path.home() / ".torch", - Path.home() / ".caches" / "torch", - os.environ.get("TORCH_HOME"), - Path.home() / ".keras", -] - -project_cli = typer.Typer(help="Command-line interface for spaCy projects") - - -@project_cli.callback(invoke_without_command=True) -def callback(): - # This runs before every project command and ensures DVC is installed - try: - subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) - except Exception: - msg.fail( - "spaCy projects require DVC (Data Version Control) and the 'dvc' command", - "You can install the Python package from pip (pip install dvc) or " - "conda (conda install -c conda-forge dvc). For more details, see the " - "documentation: https://dvc.org/doc/install", - exits=1, - ) - - -@project_cli.command("clone") -def project_clone_cli( - # fmt: off - name: str = Arg(..., help="The name of the template to fetch"), - dest: Path = Arg(Path.cwd(), help="Where to download and work. 
Defaults to current working directory.", exists=True, file_okay=False), - repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), - # fmt: on -): - """Clone a project template from a repository.""" - project_clone(name, dest, repo=repo) - - -def project_clone(name: str, dest: Path, repo: str = about.__projects__) -> None: - dest = ensure_path(dest) - if not dest or not dest.exists() or not dest.is_dir(): - msg.fail("Not a valid directory to clone project", dest, exits=1) - cmd = ["dvc", "get", repo, name, "-o", str(dest)] - msg.info(" ".join(cmd)) - run_command(cmd) - msg.good(f"Cloned project '{name}' from {repo}") - with msg.loading("Setting up directories..."): - for sub_dir in DIRS: - dir_path = dest / sub_dir - if not dir_path.exists(): - dir_path.mkdir(parents=True) - msg.good(f"Your project is now ready!", dest.resolve()) - - -@project_cli.command("get-assets") -def project_get_assets_cli( - path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False) -): - """Use Data Version Control to get the assets for the project.""" - project_get_assets(path) - - -def project_get_assets(project_path: Path) -> None: - project_path = ensure_path(project_path) - config = load_project_config(project_path) - assets = config.get("assets", {}) - if not assets: - msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) - msg.info(f"Getting {len(assets)} asset(s)") - variables = config.get("variables", {}) - for asset in assets: - url = asset["url"].format(**variables) - dest = asset["dest"].format(**variables) - dest_path = project_path / dest - check_asset(url) - cmd = ["dvc", "get-url", url, str(dest_path)] - msg.info(" ".join(cmd)) - run_command(cmd) - msg.good(f"Got asset {dest}") - - -@project_cli.command("run") -def project_run_cli( - # fmt: off - project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), - subcommand: str = Arg(None, help="Name of command defined in project config") - # fmt: on -): - """Run scripts defined in the project.""" - project_run(project_dir, subcommand) - - -def project_run(project_dir: Path, subcommand: str) -> None: - config = load_project_config(project_dir) - config_commands = config.get("commands", []) - variables = config.get("variables", {}) - commands = {cmd["name"]: cmd for cmd in config_commands} - if subcommand is None: - all_commands = config.get("run", []) - if not all_commands: - msg.warn("No run commands defined in project config", exits=0) - msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - for command in all_commands: - if command not in commands: - msg.fail(f"Can't find command '{command}' in project config", exits=1) - msg.divider(command) - run_commands(commands[command]["script"], variables) - return - if subcommand not in commands: - msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) - run_commands(commands[subcommand]["script"], variables) - - -app.add_typer(project_cli, name="project") - - -def load_project_config(path: Path) -> Dict[str, Any]: - config_path = path / CONFIG_FILE - if not config_path.exists(): - msg.fail("Can't find project config", config_path, exits=1) - config = srsly.read_yaml(config_path) - errors = validate(ProjectConfigSchema, config) - if errors: - msg.fail(f"Invalid project config in {CONFIG_FILE}", "\n".join(errors), exits=1) - return config - - -def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) -> None: - for command in commands: - # Substitute 
variables, e.g. "./{NAME}.json" - command = command.format(**variables) - msg.info(command) - run_command(shlex.split(command)) - - -def check_asset(url: str) -> None: - # If the asset URL is a regular GitHub URL it's likely a mistake - # TODO: support loading from GitHub URLs? Automatically convert to raw? - if re.match("(http(s?)):\/\/github.com", url): - msg.warn( - "Downloading from a regular GitHub URL. This will only download " - "the source of the page, not the actual file. If you want to " - "download the raw file, click on 'Download' on the GitHub page " - "and copy the raw.githubusercontent.com URL instead." - ) From 8131a65dee23283349a14858f1173d9ec476d748 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 22 Jun 2020 16:09:09 +0200 Subject: [PATCH 136/203] Update __init__.py --- spacy/cli/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 206f8dd3b..14623000a 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,6 +15,7 @@ from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 +from .project import project_clone, project_get_assets, project_run # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) From 82a03ee18e8f2ca3982646c1076c1edf02ce0698 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 25 Jun 2020 12:26:53 +0200 Subject: [PATCH 137/203] Replace python with sys.executable --- spacy/cli/project.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 454b99d25..3cced4057 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -8,6 +8,7 @@ import shlex import os import re import shutil +import sys from ._app import app, Arg, Opt, COMMAND from .. import about @@ -190,8 +191,12 @@ def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) for command in commands: # Substitute variables, e.g. "./{NAME}.json" command = command.format(**variables) - print(command) - run_command(shlex.split(command)) + command = shlex.split(command) + # TODO: is this needed / a good idea? 
+ if len(command) and command[0] == "python": + command[0] = sys.executable + print(" ".join(command)) + run_command(command) def check_asset(url: str) -> None: From 01c394eb230144b41df673f8ce54f39ed162fb8e Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Thu, 25 Jun 2020 12:27:19 +0200 Subject: [PATCH 138/203] Update to latest Typer and remove hacks --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/cli/_app.py | 14 ++++---------- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/requirements.txt b/requirements.txt index 55b234073..654c8e278 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ murmurhash>=0.28.0,<1.1.0 wasabi>=0.7.0,<1.1.0 srsly>=2.1.0,<3.0.0 catalogue>=0.0.7,<1.1.0 -typer>=0.2.1,<1.0.0 +typer>=0.3.0,<1.0.0 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 diff --git a/setup.cfg b/setup.cfg index 01b18ef29..83085340d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = wasabi>=0.7.0,<1.1.0 srsly>=2.1.0,<3.0.0 catalogue>=0.0.7,<1.1.0 - typer>=0.2.1,<1.0.0 + typer>=0.3.0,<1.0.0 # Third-party dependencies tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 diff --git a/spacy/cli/_app.py b/spacy/cli/_app.py index d1c470b32..6f64dcb59 100644 --- a/spacy/cli/_app.py +++ b/spacy/cli/_app.py @@ -13,16 +13,10 @@ DOCS: https://spacy.io/api/cli app = typer.Typer(name=NAME, help=HELP) - -def Arg(*args, help: Optional[str] = None, **kwargs) -> typer.Argument: - """Wrapper for Typer's annotation to keep it short and set defaults.""" - # Filter out help for now until it's officially supported - return typer.Argument(*args, **kwargs) - - -def Opt(*args, **kwargs) -> typer.Option: - """Wrapper for Typer's annotation to keep it short and set defaults.""" - return typer.Option(*args, show_default=True, **kwargs) +# Wrappers for Typer's annotations. Initially created to set defaults and to +# keep the names short, but not needed at the moment. +Arg = typer.Argument +Opt = typer.Option def setup_cli() -> None: From 8c29268749fc7ffc47ed662d5fb65dc8c57157f9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 26 Jun 2020 19:34:12 +0200 Subject: [PATCH 139/203] Improve spacy.gold (no GoldParse, no json format!) 
(#5555) * Update errors * Remove beam for now (maybe) Remove beam_utils Update setup.py Remove beam * Remove GoldParse WIP on removing goldparse Get ArcEager compiling after GoldParse excise Update setup.py Get spacy.syntax compiling after removing GoldParse Rename NewExample -> Example and clean up Clean html files Start updating tests Update Morphologizer * fix error numbers * fix merge conflict * informative error when calling to_array with wrong field * fix error catching * fixing language and scoring tests * start testing get_aligned * additional tests for new get_aligned function * Draft create_gold_state for arc_eager oracle * Fix import * Fix import * Remove TokenAnnotation code from nonproj * fixing NER one-to-many alignment * Fix many-to-one IOB codes * fix test for misaligned * attempt to fix cases with weird spaces * fix spaces * test_gold_biluo_different_tokenization works * allow None as BILUO annotation * fixed some tests + WIP roundtrip unit test * add spaces to json output format * minibatch utiltiy can deal with strings, docs or examples * fix augment (needs further testing) * various fixes in scripts - needs to be further tested * fix test_cli * cleanup * correct silly typo * add support for MORPH in to/from_array, fix morphologizer overfitting test * fix tagger * fix entity linker * ensure test keeps working with non-linked entities * pipe() takes docs, not examples * small bug fix * textcat bugfix * throw informative error when running the components with the wrong type of objects * fix parser tests to work with example (most still failing) * fix BiluoPushDown parsing entities * small fixes * bugfix tok2vec * fix renames and simple_ner labels * various small fixes * prevent writing dummy values like deps because that could interfer with sent_start values * fix the fix * implement split_sent with aligned SENT_START attribute * test for split sentences with various alignment issues, works * Return ArcEagerGoldParse from ArcEager * Update parser and NER gold stuff * Draft new GoldCorpus class * add links to to_dict * clean up * fix test checking for variants * Fix oracles * Start updating converters * Move converters under spacy.gold * Move things around * Fix naming * Fix name * Update converter to produce DocBin * Update converters * Allow DocBin to take list of Doc objects. * Make spacy convert output docbin * Fix import * Fix docbin * Fix compile in ArcEager * Fix import * Serialize all attrs by default * Update converter * Remove jsonl converter * Add json2docs converter * Draft Corpus class for DocBin * Work on train script * Update Corpus * Update DocBin * Allocate Doc before starting to add words * Make doc.from_array several times faster * Update train.py * Fix Corpus * Fix parser model * Start debugging arc_eager oracle * Update header * Fix parser declaration * Xfail some tests * Skip tests that cause crashes * Skip test causing segfault * Remove GoldCorpus * Update imports * Update after removing GoldCorpus * Fix module name of corpus * Fix mimport * Work on parser oracle * Update arc_eager oracle * Restore ArcEager.get_cost function * Update transition system * Update test_arc_eager_oracle * Remove beam test * Update test * Unskip * Unskip tests * add links to to_dict * clean up * fix test checking for variants * Allow DocBin to take list of Doc objects. 
* Fix compile in ArcEager * Serialize all attrs by default Move converters under spacy.gold Move things around Fix naming Fix name Update converter to produce DocBin Update converters Make spacy convert output docbin Fix import Fix docbin Fix import Update converter Remove jsonl converter Add json2docs converter * Allocate Doc before starting to add words * Make doc.from_array several times faster * Start updating converters * Work on train script * Draft Corpus class for DocBin Update Corpus Fix Corpus * Update DocBin Add missing strings when serializing * Update train.py * Fix parser model * Start debugging arc_eager oracle * Update header * Fix parser declaration * Xfail some tests Skip tests that cause crashes Skip test causing segfault * Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport * Work on parser oracle Update arc_eager oracle Restore ArcEager.get_cost function Update transition system * Update tests Remove beam test Update test Unskip Unskip tests * Add get_aligned_parse method in Example Fix Example.get_aligned_parse * Add kwargs to Corpus.dev_dataset to match train_dataset * Update nonproj * Use get_aligned_parse in ArcEager * Add another arc-eager oracle test * Remove Example.doc property Remove Example.doc Remove Example.doc Remove Example.doc Remove Example.doc * Update ArcEager oracle Fix Break oracle * Debugging * Fix Corpus * Fix eg.doc * Format * small fixes * limit arg for Corpus * fix test_roundtrip_docs_to_docbin * fix test_make_orth_variants * fix add_label test * Update tests * avoid writing temp dir in json2docs, fixing 4402 test * Update test * Add missing costs to NER oracle * Update test * Work on Example.get_aligned_ner method * Clean up debugging * Xfail tests * Remove prints * Remove print * Xfail some tests * Replace unseen labels for parser * Update test * Update test * Xfail test * Fix Corpus * fix imports * fix docs_to_json * various small fixes * cleanup * Support gold_preproc in Corpus * Support gold_preproc * Pass gold_preproc setting into corpus * Remove debugging * Fix gold_preproc * Fix json2docs converter * Fix convert command * Fix flake8 * Fix import * fix output_dir (converted to Path by typer) * fix var * bugfix: update states after creating golds to avoid out of bounds indexing * Improve efficiency of ArEager oracle * pull merge_sent into iob2docs to avoid Doc creation for each line * fix asserts * bugfix excl Span.end in iob2docs * Support max_length in Corpus * Fix arc_eager oracle * Filter out uannotated sentences in NER * Remove debugging in parser * Simplify NER alignment * Fix conversion of NER data * Fix NER init_gold_batch * Tweak efficiency of precomputable affine * Update onto-json default * Update gold test for NER * Fix parser test * Update test * Add NER data test * Fix convert for single file * Fix test * Hack scorer to avoid evaluating non-nered data * Fix handling of NER data in Example * Output unlabelled spans from O biluo tags in iob_utils * Fix unset variable * Return kept examples from init_gold_batch * Return examples from init_gold_batch * Dont return Example from init_gold_batch * Set spaces on gold doc after conversion * Add test * Fix spaces reading * Improve NER alignment * Improve handling of missing values in NER * Restore the 'cutting' in parser training * Add assertion * Print epochs * Restore random cuts in parser/ner training * Implement Doc.copy * Implement Example.copy * Copy examples at the start of Language.update * Don't unset example docs * Tweak 
parser model slightly * attempt to fix _guess_spaces * _add_entities_to_doc first, so that links don't get overwritten * fixing get_aligned_ner for one-to-many * fix indexing into x_text * small fix biluo_tags_from_offsets * Add onto-ner config * Simplify NER alignment * Fix NER scoring for partially annotated documents * fix indexing into x_text * fix test_cli failing tests by ignoring spans in doc.ents with empty label * Fix limit * Improve NER alignment * Fix count_train * Remove print statement * fix tests, we're not having nothing but None * fix clumsy fingers * Fix tests * Fix doc.ents * Remove empty docs in Corpus and improve limit * Update config Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> --- bin/ud/ud_train.py | 40 +- examples/experiments/onto-joint/defaults.cfg | 38 +- examples/experiments/onto-joint/pretrain.cfg | 1 - examples/experiments/onto-ner.cfg | 80 + .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 1 - .../ptb-joint-pos-dep/defaults.cfg | 1 - examples/training/conllu.py | 66 +- examples/training/create_kb.py | 2 +- examples/training/ner_multitask_objective.py | 32 +- examples/training/rehearsal.py | 13 +- examples/training/train_textcat.py | 7 +- setup.py | 7 +- spacy/about.py | 2 +- spacy/cli/__init__.py | 2 +- spacy/cli/convert.py | 268 ++-- spacy/cli/converters/__init__.py | 4 - spacy/cli/converters/iob2json.py | 65 - spacy/cli/converters/jsonl2json.py | 50 - spacy/cli/debug_data.py | 16 +- spacy/cli/evaluate.py | 15 +- spacy/cli/pretrain.py | 3 +- spacy/cli/{train_from_config.py => train.py} | 278 ++-- spacy/errors.py | 24 +- spacy/gold.pxd | 68 - spacy/gold.pyx | 1420 ----------------- spacy/gold/__init__.pxd | 0 spacy/gold/__init__.py | 11 + spacy/gold/align.pxd | 8 + spacy/gold/align.pyx | 101 ++ spacy/gold/augment.py | 111 ++ spacy/gold/converters/__init__.py | 6 + .../converters/conll_ner2docs.py} | 63 +- spacy/{cli => gold}/converters/conllu2json.py | 72 +- spacy/gold/converters/iob2docs.py | 64 + spacy/gold/converters/json2docs.py | 24 + spacy/gold/corpus.py | 122 ++ spacy/gold/example.pxd | 8 + spacy/gold/example.pyx | 432 +++++ spacy/gold/gold_io.pyx | 199 +++ spacy/gold/iob_utils.py | 209 +++ spacy/language.py | 57 +- spacy/ml/_biluo.py | 3 +- spacy/ml/_precomputable_affine.py | 6 +- spacy/ml/models/multi_task.py | 4 +- spacy/ml/models/parser.py | 7 +- spacy/ml/models/textcat.py | 32 +- spacy/ml/models/tok2vec.py | 2 +- spacy/ml/tb_framework.py | 5 +- spacy/pipeline/morphologizer.pyx | 19 +- spacy/pipeline/pipes.pyx | 410 ++--- spacy/pipeline/simple_ner.py | 8 +- spacy/pipeline/tok2vec.py | 17 +- spacy/scorer.py | 118 +- spacy/syntax/_beam_utils.pxd | 9 - spacy/syntax/_beam_utils.pyx | 329 ---- spacy/syntax/_parser_model.pyx | 5 +- spacy/syntax/arc_eager.pxd | 5 +- spacy/syntax/arc_eager.pyx | 642 +++++--- spacy/syntax/ner.pxd | 1 - spacy/syntax/ner.pyx | 168 +- spacy/syntax/nn_parser.pyx | 319 +--- spacy/syntax/nonproj.pyx | 57 +- spacy/syntax/transition_system.pxd | 15 +- spacy/syntax/transition_system.pyx | 84 +- spacy/tests/doc/test_array.py | 16 +- spacy/tests/parser/test_add_label.py | 15 +- spacy/tests/parser/test_arc_eager_oracle.py | 133 +- spacy/tests/parser/test_ner.py | 55 +- spacy/tests/parser/test_neural_parser.py | 8 +- spacy/tests/parser/test_nn_beam.py | 107 -- spacy/tests/parser/test_parse.py | 9 +- spacy/tests/parser/test_preset_sbd.py | 8 +- spacy/tests/pipeline/test_entity_linker.py | 16 +- spacy/tests/pipeline/test_morphologizer.py | 2 +- spacy/tests/pipeline/test_sentencizer.py | 2 +- spacy/tests/pipeline/test_textcat.py | 15 
+- spacy/tests/regression/test_issue1501-2000.py | 15 +- spacy/tests/regression/test_issue4313.py | 4 + spacy/tests/regression/test_issue4402.py | 26 +- spacy/tests/regression/test_issue4529.py | 5 +- spacy/tests/regression/test_issue4665.py | 10 +- spacy/tests/test_cli.py | 44 +- spacy/tests/test_gold.py | 520 +++--- spacy/tests/test_language.py | 20 +- spacy/tests/test_new_example.py | 242 +++ spacy/tests/test_scorer.py | 41 +- spacy/tests/test_util.py | 13 +- spacy/tests/util.py | 23 +- spacy/tokenizer.pyx | 2 +- spacy/tokens/_serialize.py | 18 +- spacy/tokens/doc.pyx | 133 +- spacy/tokens/token.pyx | 7 +- spacy/util.py | 51 +- 93 files changed, 3820 insertions(+), 3995 deletions(-) create mode 100644 examples/experiments/onto-ner.cfg delete mode 100644 spacy/cli/converters/__init__.py delete mode 100644 spacy/cli/converters/iob2json.py delete mode 100644 spacy/cli/converters/jsonl2json.py rename spacy/cli/{train_from_config.py => train.py} (78%) delete mode 100644 spacy/gold.pxd create mode 100644 spacy/gold/__init__.pxd create mode 100644 spacy/gold/__init__.py create mode 100644 spacy/gold/align.pxd create mode 100644 spacy/gold/align.pyx create mode 100644 spacy/gold/augment.py create mode 100644 spacy/gold/converters/__init__.py rename spacy/{cli/converters/conll_ner2json.py => gold/converters/conll_ner2docs.py} (80%) rename spacy/{cli => gold}/converters/conllu2json.py (86%) create mode 100644 spacy/gold/converters/iob2docs.py create mode 100644 spacy/gold/converters/json2docs.py create mode 100644 spacy/gold/corpus.py create mode 100644 spacy/gold/example.pxd create mode 100644 spacy/gold/example.pyx create mode 100644 spacy/gold/gold_io.pyx create mode 100644 spacy/gold/iob_utils.py delete mode 100644 spacy/syntax/_beam_utils.pxd delete mode 100644 spacy/syntax/_beam_utils.pyx create mode 100644 spacy/tests/test_new_example.py diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index aa5050f3a..7bf5dbb5e 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -14,7 +14,7 @@ import spacy import spacy.util from bin.ud import conll17_ud_eval from spacy.tokens import Token, Doc -from spacy.gold import GoldParse, Example +from spacy.gold import Example from spacy.util import compounding, minibatch, minibatch_by_words from spacy.syntax.nonproj import projectivize from spacy.matcher import Matcher @@ -83,11 +83,11 @@ def read_data( sent["heads"].append(head) sent["deps"].append("ROOT" if dep == "root" else dep) sent["spaces"].append(space_after == "_") - sent["entities"] = ["-"] * len(sent["words"]) + sent["entities"] = ["-"] * len(sent["words"]) # TODO: doc-level format sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"]) if oracle_segments: docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"])) - golds.append(GoldParse(docs[-1], **sent)) + golds.append(sent) assert golds[-1].morphology is not None sent_annots.append(sent) @@ -151,28 +151,27 @@ def read_conllu(file_): def _make_gold(nlp, text, sent_annots, drop_deps=0.0): # Flatten the conll annotations, and adjust the head indices - flat = defaultdict(list) + gold = defaultdict(list) sent_starts = [] for sent in sent_annots: - flat["heads"].extend(len(flat["words"])+head for head in sent["heads"]) + gold["heads"].extend(len(gold["words"])+head for head in sent["heads"]) for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]: - flat[field].extend(sent[field]) + gold[field].extend(sent[field]) sent_starts.append(True) sent_starts.extend([False] * (len(sent["words"]) - 1)) # 
Construct text if necessary - assert len(flat["words"]) == len(flat["spaces"]) + assert len(gold["words"]) == len(gold["spaces"]) if text is None: text = "".join( - word + " " * space for word, space in zip(flat["words"], flat["spaces"]) + word + " " * space for word, space in zip(gold["words"], gold["spaces"]) ) doc = nlp.make_doc(text) - flat.pop("spaces") - gold = GoldParse(doc, **flat) - gold.sent_starts = sent_starts + gold.pop("spaces") + gold["sent_starts"] = sent_starts for i in range(len(gold.heads)): if random.random() < drop_deps: - gold.heads[i] = None - gold.labels[i] = None + gold["heads"][i] = None + gold["labels"][i] = None return doc, gold @@ -183,15 +182,10 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0): def golds_to_gold_data(docs, golds): - """Get out the training data format used by begin_training, given the - GoldParse objects.""" + """Get out the training data format used by begin_training""" data = [] for doc, gold in zip(docs, golds): - example = Example(doc=doc) - example.add_doc_annotation(cats=gold.cats) - token_annotation_dict = gold.orig.to_dict() - example.add_token_annotation(**token_annotation_dict) - example.goldparse = gold + example = Example.from_dict(doc, gold) data.append(example) return data @@ -359,8 +353,8 @@ def initialize_pipeline(nlp, examples, config, device): nlp.parser.add_multitask_objective("tag") if config.multitask_sent: nlp.parser.add_multitask_objective("sent_start") - for ex in examples: - gold = ex.gold + for eg in examples: + gold = eg.gold for tag in gold.tags: if tag is not None: nlp.tagger.add_label(tag) @@ -541,7 +535,7 @@ def main( else: batches = minibatch(examples, size=batch_sizes) losses = {} - n_train_words = sum(len(ex.doc) for ex in examples) + n_train_words = sum(len(eg.doc) for eg in examples) with tqdm.tqdm(total=n_train_words, leave=False) as pbar: for batch in batches: pbar.update(sum(len(ex.doc) for ex in batch)) diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index f76336d84..e45758196 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -5,17 +5,16 @@ # data is passed in sentence-by-sentence via some prior preprocessing. gold_preproc = false # Limitations on training document length or number of examples. -max_length = 0 +max_length = 5000 limit = 0 # Data augmentation orth_variant_level = 0.0 -noise_level = 0.0 dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. 
patience = 1600 max_epochs = 0 max_steps = 20000 -eval_frequency = 400 +eval_frequency = 200 # Other settings seed = 0 accumulate_gradient = 1 @@ -30,7 +29,7 @@ omit_extra_lookups = false [training.batch_size] @schedules = "compounding.v1" -start = 1000 +start = 100 stop = 1000 compound = 1.001 @@ -58,15 +57,11 @@ vectors = null [nlp.pipeline.tok2vec] factory = "tok2vec" -[nlp.pipeline.senter] -factory = "senter" [nlp.pipeline.ner] factory = "ner" learn_tokens = false min_action_freq = 1 -beam_width = 1 -beam_update_prob = 1.0 [nlp.pipeline.tagger] factory = "tagger" @@ -74,16 +69,7 @@ factory = "tagger" [nlp.pipeline.parser] factory = "parser" learn_tokens = false -min_action_freq = 1 -beam_width = 1 -beam_update_prob = 1.0 - -[nlp.pipeline.senter.model] -@architectures = "spacy.Tagger.v1" - -[nlp.pipeline.senter.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} +min_action_freq = 30 [nlp.pipeline.tagger.model] @architectures = "spacy.Tagger.v1" @@ -96,8 +82,8 @@ width = ${nlp.pipeline.tok2vec.model:width} @architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 8 hidden_width = 128 -maxout_pieces = 3 -use_upper = false +maxout_pieces = 2 +use_upper = true [nlp.pipeline.parser.model.tok2vec] @architectures = "spacy.Tok2VecTensors.v1" @@ -107,8 +93,8 @@ width = ${nlp.pipeline.tok2vec.model:width} @architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 3 hidden_width = 128 -maxout_pieces = 3 -use_upper = false +maxout_pieces = 2 +use_upper = true [nlp.pipeline.ner.model.tok2vec] @architectures = "spacy.Tok2VecTensors.v1" @@ -117,10 +103,10 @@ width = ${nlp.pipeline.tok2vec.model:width} [nlp.pipeline.tok2vec.model] @architectures = "spacy.HashEmbedCNN.v1" pretrained_vectors = ${nlp:vectors} -width = 256 -depth = 6 +width = 128 +depth = 4 window_size = 1 -embed_size = 10000 +embed_size = 7000 maxout_pieces = 3 subword_features = true -dropout = null +dropout = ${training:dropout} diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index 40885b6e8..83991f888 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -9,7 +9,6 @@ max_length = 0 limit = 0 # Data augmentation orth_variant_level = 0.0 -noise_level = 0.0 dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 diff --git a/examples/experiments/onto-ner.cfg b/examples/experiments/onto-ner.cfg new file mode 100644 index 000000000..48fe25a67 --- /dev/null +++ b/examples/experiments/onto-ner.cfg @@ -0,0 +1,80 @@ +# Training hyper-parameters and additional features. +[training] +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length or number of examples. +max_length = 5000 +limit = 0 +# Data augmentation +orth_variant_level = 0.0 +dropout = 0.2 +# Controls early-stopping. 0 or -1 mean unlimited. +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 500 +# Other settings +seed = 0 +accumulate_gradient = 1 +use_pytorch_for_gpu_memory = false +# Control how scores are printed and checkpoints are evaluated. +scores = ["speed", "ents_p", "ents_r", "ents_f"] +score_weights = {"ents_f": 1.0} +# These settings are invalid for the transformer models. 
+init_tok2vec = null +discard_oversize = false +omit_extra_lookups = false + +[training.batch_size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = false +L2 = 1e-6 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +#[optimizer.learn_rate] +#@schedules = "warmup_linear.v1" +#warmup_steps = 250 +#total_steps = 20000 +#initial_rate = 0.001 + +[nlp] +lang = "en" +vectors = null + +[nlp.pipeline.ner] +factory = "ner" +learn_tokens = false +min_action_freq = 1 +beam_width = 1 +beam_update_prob = 1.0 + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 3 +hidden_width = 64 +maxout_pieces = 2 +use_upper = true + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = ${nlp:vectors} +width = 96 +depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true +dropout = ${training:dropout} diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index 905b5b4e0..f1b702a4e 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -6,7 +6,6 @@ init_tok2vec = null vectors = null max_epochs = 100 orth_variant_level = 0.0 -noise_level = 0.0 gold_preproc = true max_length = 0 use_gpu = 0 diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 7383116e7..1c946ac60 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -6,7 +6,6 @@ init_tok2vec = null vectors = null max_epochs = 100 orth_variant_level = 0.0 -noise_level = 0.0 gold_preproc = true max_length = 0 use_gpu = -1 diff --git a/examples/training/conllu.py b/examples/training/conllu.py index bf47be72a..0758775cf 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -12,7 +12,7 @@ import tqdm import spacy import spacy.util from spacy.tokens import Token, Doc -from spacy.gold import GoldParse, Example +from spacy.gold import Example from spacy.syntax.nonproj import projectivize from collections import defaultdict from spacy.matcher import Matcher @@ -33,31 +33,6 @@ random.seed(0) numpy.random.seed(0) -def minibatch_by_words(examples, size=5000): - random.shuffle(examples) - if isinstance(size, int): - size_ = itertools.repeat(size) - else: - size_ = size - examples = iter(examples) - while True: - batch_size = next(size_) - batch = [] - while batch_size >= 0: - try: - example = next(examples) - except StopIteration: - if batch: - yield batch - return - batch_size -= len(example.doc) - batch.append(example) - if batch: - yield batch - else: - break - - ################ # Data reading # ################ @@ -110,7 +85,7 @@ def read_data( sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"]) if oracle_segments: docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"])) - golds.append(GoldParse(docs[-1], **sent)) + golds.append(sent) sent_annots.append(sent) if raw_text and max_doc_length and len(sent_annots) >= max_doc_length: @@ -159,20 +134,19 @@ def read_conllu(file_): def _make_gold(nlp, text, sent_annots): # Flatten the conll annotations, and adjust the head indices - flat = defaultdict(list) + gold = defaultdict(list) for sent in sent_annots: - 
flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"]) + gold["heads"].extend(len(gold["words"]) + head for head in sent["heads"]) for field in ["words", "tags", "deps", "entities", "spaces"]: - flat[field].extend(sent[field]) + gold[field].extend(sent[field]) # Construct text if necessary - assert len(flat["words"]) == len(flat["spaces"]) + assert len(gold["words"]) == len(gold["spaces"]) if text is None: text = "".join( - word + " " * space for word, space in zip(flat["words"], flat["spaces"]) + word + " " * space for word, space in zip(gold["words"], gold["spaces"]) ) doc = nlp.make_doc(text) - flat.pop("spaces") - gold = GoldParse(doc, **flat) + gold.pop("spaces") return doc, gold @@ -182,15 +156,10 @@ def _make_gold(nlp, text, sent_annots): def golds_to_gold_data(docs, golds): - """Get out the training data format used by begin_training, given the - GoldParse objects.""" + """Get out the training data format used by begin_training.""" data = [] for doc, gold in zip(docs, golds): - example = Example(doc=doc) - example.add_doc_annotation(cats=gold.cats) - token_annotation_dict = gold.orig.to_dict() - example.add_token_annotation(**token_annotation_dict) - example.goldparse = gold + example = Example.from_dict(doc, gold) data.append(example) return data @@ -313,15 +282,15 @@ def initialize_pipeline(nlp, examples, config): nlp.parser.add_multitask_objective("sent_start") nlp.parser.moves.add_action(2, "subtok") nlp.add_pipe(nlp.create_pipe("tagger")) - for ex in examples: - for tag in ex.gold.tags: + for eg in examples: + for tag in eg.gold.tags: if tag is not None: nlp.tagger.add_label(tag) # Replace labels that didn't make the frequency cutoff actions = set(nlp.parser.labels) label_set = set([act.split("-")[1] for act in actions if "-" in act]) - for ex in examples: - gold = ex.gold + for eg in examples: + gold = eg.gold for i, label in enumerate(gold.labels): if label is not None and label not in label_set: gold.labels[i] = label.split("||")[0] @@ -415,13 +384,12 @@ def main(ud_dir, parses_dir, config, corpus, limit=0): optimizer = initialize_pipeline(nlp, examples, config) for i in range(config.nr_epoch): - docs = [nlp.make_doc(example.doc.text) for example in examples] - batches = minibatch_by_words(examples, size=config.batch_size) + batches = spacy.minibatch_by_words(examples, size=config.batch_size) losses = {} - n_train_words = sum(len(doc) for doc in docs) + n_train_words = sum(len(eg.reference.doc) for eg in examples) with tqdm.tqdm(total=n_train_words, leave=False) as pbar: for batch in batches: - pbar.update(sum(len(ex.doc) for ex in batch)) + pbar.update(sum(len(eg.reference.doc) for eg in batch)) nlp.update( examples=batch, sgd=optimizer, drop=config.dropout, losses=losses, ) diff --git a/examples/training/create_kb.py b/examples/training/create_kb.py index cbdb5c05b..5b17bb59e 100644 --- a/examples/training/create_kb.py +++ b/examples/training/create_kb.py @@ -30,7 +30,7 @@ ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)} model=("Model name, should have pretrained word embeddings", "positional", None, str), output_dir=("Optional output directory", "option", "o", Path), ) -def main(model=None, output_dir=None): +def main(model, output_dir=None): """Load the model and create the KB with pre-defined entity encodings. If an output_dir is provided, the KB will be stored there in a file 'kb'. 
The updated vocab will also be written to a directory in the output_dir.""" diff --git a/examples/training/ner_multitask_objective.py b/examples/training/ner_multitask_objective.py index 7561d4877..baa6d7f06 100644 --- a/examples/training/ner_multitask_objective.py +++ b/examples/training/ner_multitask_objective.py @@ -24,8 +24,10 @@ import random import plac import spacy import os.path + +from spacy.gold.example import Example from spacy.tokens import Doc -from spacy.gold import read_json_file, GoldParse +from spacy.gold import read_json_file random.seed(0) @@ -59,27 +61,25 @@ def main(n_iter=10): print(nlp.pipeline) print("Create data", len(TRAIN_DATA)) - optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA) + optimizer = nlp.begin_training() for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} - for example in TRAIN_DATA: - for token_annotation in example.token_annotations: - doc = Doc(nlp.vocab, words=token_annotation.words) - gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation) - - nlp.update( - examples=[(doc, gold)], # 1 example - drop=0.2, # dropout - make it harder to memorise data - sgd=optimizer, # callable to update weights - losses=losses, - ) + for example_dict in TRAIN_DATA: + doc = Doc(nlp.vocab, words=example_dict["words"]) + example = Example.from_dict(doc, example_dict) + nlp.update( + examples=[example], # 1 example + drop=0.2, # dropout - make it harder to memorise data + sgd=optimizer, # callable to update weights + losses=losses, + ) print(losses.get("nn_labeller", 0.0), losses["ner"]) # test the trained model - for example in TRAIN_DATA: - if example.text is not None: - doc = nlp(example.text) + for example_dict in TRAIN_DATA: + if "text" in example_dict: + doc = nlp(example_dict["text"]) print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py index 98a96643b..8c94ab14e 100644 --- a/examples/training/rehearsal.py +++ b/examples/training/rehearsal.py @@ -4,9 +4,10 @@ import random import warnings import srsly import spacy -from spacy.gold import GoldParse +from spacy.gold import Example from spacy.util import minibatch, compounding +# TODO: further fix & test this script for v.3 ? 
(read_gold_data is never called) LABEL = "ANIMAL" TRAIN_DATA = [ @@ -36,15 +37,13 @@ def read_raw_data(nlp, jsonl_loc): def read_gold_data(nlp, gold_loc): - docs = [] - golds = [] + examples = [] for json_obj in srsly.read_jsonl(gold_loc): doc = nlp.make_doc(json_obj["text"]) ents = [(ent["start"], ent["end"], ent["label"]) for ent in json_obj["spans"]] - gold = GoldParse(doc, entities=ents) - docs.append(doc) - golds.append(gold) - return list(zip(docs, golds)) + example = Example.from_dict(doc, {"entities": ents}) + examples.append(example) + return examples def main(model_name, unlabelled_loc): diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index c5e679467..cb65b8c8b 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -19,7 +19,7 @@ from ml_datasets import loaders import spacy from spacy import util from spacy.util import minibatch, compounding -from spacy.gold import Example, GoldParse +from spacy.gold import Example @plac.annotations( @@ -62,11 +62,10 @@ def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=Non train_examples = [] for text, cats in zip(train_texts, train_cats): doc = nlp.make_doc(text) - gold = GoldParse(doc, cats=cats) + example = Example.from_dict(doc, {"cats": cats}) for cat in cats: textcat.add_label(cat) - ex = Example.from_gold(gold, doc=doc) - train_examples.append(ex) + train_examples.append(example) with nlp.select_pipes(enable="textcat"): # only train textcat optimizer = nlp.begin_training() diff --git a/setup.py b/setup.py index d16615f5f..eacb2d35d 100755 --- a/setup.py +++ b/setup.py @@ -23,6 +23,8 @@ Options.docstrings = True PACKAGES = find_packages() MOD_NAMES = [ + "spacy.gold.align", + "spacy.gold.example", "spacy.parts_of_speech", "spacy.strings", "spacy.lexeme", @@ -37,11 +39,10 @@ MOD_NAMES = [ "spacy.tokenizer", "spacy.syntax.nn_parser", "spacy.syntax._parser_model", - "spacy.syntax._beam_utils", "spacy.syntax.nonproj", "spacy.syntax.transition_system", "spacy.syntax.arc_eager", - "spacy.gold", + "spacy.gold.gold_io", "spacy.tokens.doc", "spacy.tokens.span", "spacy.tokens.token", @@ -120,7 +121,7 @@ class build_ext_subclass(build_ext, build_ext_options): def clean(path): for path in path.glob("**/*"): - if path.is_file() and path.suffix in (".so", ".cpp"): + if path.is_file() and path.suffix in (".so", ".cpp", ".html"): print(f"Deleting {path.name}") path.unlink() diff --git a/spacy/about.py b/spacy/about.py index 54753b5a1..c3b2cb091 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev9" +__version__ = "3.0.0" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 206f8dd3b..1179c15dd 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -8,7 +8,7 @@ from .download import download # noqa: F401 from .info import info # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 -from .train_from_config import train # noqa: F401 +from .train import train_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 24d266504..b008e2f93 100644 --- a/spacy/cli/convert.py +++ 
b/spacy/cli/convert.py @@ -4,53 +4,56 @@ from pathlib import Path from wasabi import Printer import srsly import re +import sys from ._app import app, Arg, Opt -from .converters import conllu2json, iob2json, conll_ner2json -from .converters import ner_jsonl2json +from ..gold import docs_to_json +from ..tokens import DocBin +from ..gold.converters import iob2docs, conll_ner2docs, json2docs # Converters are matched by file extension except for ner/iob, which are # matched by file extension and content. To add a converter, add a new # entry to this dict with the file extension mapped to the converter function # imported from /converters. + CONVERTERS = { - "conllubio": conllu2json, - "conllu": conllu2json, - "conll": conllu2json, - "ner": conll_ner2json, - "iob": iob2json, - "jsonl": ner_jsonl2json, + # "conllubio": conllu2docs, TODO + # "conllu": conllu2docs, TODO + # "conll": conllu2docs, TODO + "ner": conll_ner2docs, + "iob": iob2docs, + "json": json2docs, } -# File types -FILE_TYPES_STDOUT = ("json", "jsonl") + +# File types that can be written to stdout +FILE_TYPES_STDOUT = ("json") class FileTypes(str, Enum): json = "json" - jsonl = "jsonl" - msg = "msg" + spacy = "spacy" @app.command("convert") def convert_cli( # fmt: off - input_file: str = Arg(..., help="Input file", exists=True), + input_path: str = Arg(..., help="Input file or directory", exists=True), output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True), - file_type: FileTypes = Opt(FileTypes.json.value, "--file-type", "-t", help="Type of data to produce"), + file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"), n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"), seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"), model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"), morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), - ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), + ner_map: Optional[Path] = Opt(None, "--ner-map", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), # fmt: on ): """ - Convert files into JSON format for use with train command and other + Convert files into json or DocBin format for use with train command and other experiment management functions. 
If no output_dir is specified, the data is written to stdout, so you can pipe them forward to a JSON file: $ spacy convert some_file.conllu > some_file.json @@ -58,9 +61,15 @@ def convert_cli( if isinstance(file_type, FileTypes): # We get an instance of the FileTypes from the CLI so we need its string value file_type = file_type.value + input_path = Path(input_path) + output_dir = "-" if output_dir == Path("-") else output_dir + cli_args = locals() silent = output_dir == "-" + msg = Printer(no_print=silent) + verify_cli_args(msg, **cli_args) + converter = _get_converter(msg, converter, input_path) convert( - input_file, + input_path, output_dir, file_type=file_type, n_sents=n_sents, @@ -69,92 +78,78 @@ def convert_cli( morphology=morphology, merge_subtokens=merge_subtokens, converter=converter, - ner_map_path=ner_map_path, + ner_map=ner_map, lang=lang, silent=silent, + msg=msg, ) def convert( - input_file: Path, - output_dir: Path, - *, - file_type: str = "json", - n_sents: int = 1, - seg_sents: bool = False, - model: Optional[str] = None, - morphology: bool = False, - merge_subtokens: bool = False, - converter: str = "auto", - ner_map_path: Optional[Path] = None, - lang: Optional[str] = None, - silent: bool = True, + input_path: Path, + output_dir: Path, + *, + file_type: str = "json", + n_sents: int = 1, + seg_sents: bool = False, + model: Optional[str] = None, + morphology: bool = False, + merge_subtokens: bool = False, + converter: str = "auto", + ner_map: Optional[Path] = None, + lang: Optional[str] = None, + silent: bool = True, + msg: Optional[Path] = None, ) -> None: - msg = Printer(no_print=silent, pretty=not silent) - input_path = Path(input_file) - if file_type not in FILE_TYPES_STDOUT and output_dir == "-": - # TODO: support msgpack via stdout in srsly? - msg.fail( - f"Can't write .{file_type} data to stdout", - "Please specify an output directory.", - exits=1, - ) - if not input_path.exists(): - msg.fail("Input file not found", input_path, exits=1) - if output_dir != "-" and not Path(output_dir).exists(): - msg.fail("Output directory not found", output_dir, exits=1) - input_data = input_path.open("r", encoding="utf-8").read() - if converter == "auto": - converter = input_path.suffix[1:] - if converter == "ner" or converter == "iob": - converter_autodetect = autodetect_ner_format(input_data) - if converter_autodetect == "ner": - msg.info("Auto-detected token-per-line NER format") - converter = converter_autodetect - elif converter_autodetect == "iob": - msg.info("Auto-detected sentence-per-line NER format") - converter = converter_autodetect - else: - msg.warn( - "Can't automatically detect NER format. Conversion may not " - "succeed. 
See https://spacy.io/api/cli#convert" - ) - if converter not in CONVERTERS: - msg.fail(f"Can't find converter for {converter}", exits=1) - ner_map = None - if ner_map_path is not None: - ner_map = srsly.read_json(ner_map_path) - # Use converter function to convert data - func = CONVERTERS[converter] - data = func( - input_data, - n_sents=n_sents, - seg_sents=seg_sents, - append_morphology=morphology, - merge_subtokens=merge_subtokens, - lang=lang, - model=model, - no_print=silent, - ner_map=ner_map, - ) - if output_dir != "-": - # Export data to a file - suffix = f".{file_type}" - output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix) - if file_type == "json": - srsly.write_json(output_file, data) - elif file_type == "jsonl": - srsly.write_jsonl(output_file, data) - elif file_type == "msg": - srsly.write_msgpack(output_file, data) - msg.good(f"Generated output file ({len(data)} documents): {output_file}") - else: - # Print to stdout - if file_type == "json": - srsly.write_json("-", data) - elif file_type == "jsonl": - srsly.write_jsonl("-", data) + if not msg: + msg = Printer(no_print=silent) + ner_map = srsly.read_json(ner_map) if ner_map is not None else None + for input_loc in walk_directory(input_path): + input_data = input_loc.open("r", encoding="utf-8").read() + # Use converter function to convert data + func = CONVERTERS[converter] + docs = func( + input_data, + n_sents=n_sents, + seg_sents=seg_sents, + append_morphology=morphology, + merge_subtokens=merge_subtokens, + lang=lang, + model=model, + no_print=silent, + ner_map=ner_map, + ) + if output_dir == "-": + _print_docs_to_stdout(docs, file_type) + else: + if input_loc != input_path: + subpath = input_loc.relative_to(input_path) + output_file = Path(output_dir) / subpath.with_suffix(f".{file_type}") + else: + output_file = Path(output_dir) / input_loc.parts[-1] + output_file = output_file.with_suffix(f".{file_type}") + _write_docs_to_file(docs, output_file, file_type) + msg.good(f"Generated output file ({len(docs)} documents): {output_file}") + + +def _print_docs_to_stdout(docs, output_type): + if output_type == "json": + srsly.write_json("-", docs_to_json(docs)) + else: + sys.stdout.buffer.write(DocBin(docs=docs).to_bytes()) + + +def _write_docs_to_file(docs, output_file, output_type): + if not output_file.parent.exists(): + output_file.parent.mkdir(parents=True) + if output_type == "json": + srsly.write_json(output_file, docs_to_json(docs)) + else: + data = DocBin(docs=docs).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + def autodetect_ner_format(input_data: str) -> str: # guess format from the first 20 lines @@ -173,3 +168,86 @@ def autodetect_ner_format(input_data: str) -> str: if format_guesses["ner"] == 0 and format_guesses["iob"] > 0: return "iob" return None + + +def walk_directory(path): + if not path.is_dir(): + return [path] + paths = [path] + locs = [] + seen = set() + for path in paths: + if str(path) in seen: + continue + seen.add(str(path)) + if path.parts[-1].startswith("."): + continue + elif path.is_dir(): + paths.extend(path.iterdir()) + else: + locs.append(path) + return locs + + +def verify_cli_args( + msg, + input_path, + output_dir, + file_type, + n_sents, + seg_sents, + model, + morphology, + merge_subtokens, + converter, + ner_map, + lang, +): + input_path = Path(input_path) + if file_type not in FILE_TYPES_STDOUT and output_dir == "-": + # TODO: support msgpack via stdout in srsly? 
+ msg.fail( + f"Can't write .{file_type} data to stdout", + "Please specify an output directory.", + exits=1, + ) + if not input_path.exists(): + msg.fail("Input file not found", input_path, exits=1) + if output_dir != "-" and not Path(output_dir).exists(): + msg.fail("Output directory not found", output_dir, exits=1) + if input_path.is_dir(): + input_locs = walk_directory(input_path) + if len(input_locs) == 0: + msg.fail("No input files in directory", input_path, exits=1) + file_types = list(set([loc.suffix[1:] for loc in input_locs])) + if len(file_types) >= 2: + file_types = ",".join(file_types) + msg.fail("All input files must be same type", file_types, exits=1) + converter = _get_converter(msg, converter, input_path) + if converter not in CONVERTERS: + msg.fail(f"Can't find converter for {converter}", exits=1) + return converter + + +def _get_converter(msg, converter, input_path): + if input_path.is_dir(): + input_path = walk_directory(input_path)[0] + if converter == "auto": + converter = input_path.suffix[1:] + if converter == "ner" or converter == "iob": + with input_path.open() as file_: + input_data = file_.read() + converter_autodetect = autodetect_ner_format(input_data) + if converter_autodetect == "ner": + msg.info("Auto-detected token-per-line NER format") + converter = converter_autodetect + elif converter_autodetect == "iob": + msg.info("Auto-detected sentence-per-line NER format") + converter = converter_autodetect + else: + msg.warn( + "Can't automatically detect NER format. " + "Conversion may not succeed. " + "See https://spacy.io/api/cli#convert" + ) + return converter diff --git a/spacy/cli/converters/__init__.py b/spacy/cli/converters/__init__.py deleted file mode 100644 index 9dcbf5b13..000000000 --- a/spacy/cli/converters/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .conllu2json import conllu2json # noqa: F401 -from .iob2json import iob2json # noqa: F401 -from .conll_ner2json import conll_ner2json # noqa: F401 -from .jsonl2json import ner_jsonl2json # noqa: F401 diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py deleted file mode 100644 index b6ac234fc..000000000 --- a/spacy/cli/converters/iob2json.py +++ /dev/null @@ -1,65 +0,0 @@ -from wasabi import Printer - -from ...gold import iob_to_biluo -from ...util import minibatch -from .conll_ner2json import n_sents_info - - -def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs): - """ - Convert IOB files with one sentence per line and tags separated with '|' - into JSON format for use with train cli. IOB and IOB2 are accepted. - - Sample formats: - - I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O - I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O - I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O - I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O - """ - msg = Printer(no_print=no_print) - docs = read_iob(input_data.split("\n")) - if n_sents > 0: - n_sents_info(msg, n_sents) - docs = merge_sentences(docs, n_sents) - return docs - - -def read_iob(raw_sents): - sentences = [] - for line in raw_sents: - if not line.strip(): - continue - tokens = [t.split("|") for t in line.split()] - if len(tokens[0]) == 3: - words, pos, iob = zip(*tokens) - elif len(tokens[0]) == 2: - words, iob = zip(*tokens) - pos = ["-"] * len(words) - else: - raise ValueError( - "The sentence-per-line IOB/IOB2 file is not formatted correctly. 
Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert" - ) - biluo = iob_to_biluo(iob) - sentences.append( - [ - {"orth": w, "tag": p, "ner": ent} - for (w, p, ent) in zip(words, pos, biluo) - ] - ) - sentences = [{"tokens": sent} for sent in sentences] - paragraphs = [{"sentences": [sent]} for sent in sentences] - docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)] - return docs - - -def merge_sentences(docs, n_sents): - merged = [] - for group in minibatch(docs, size=n_sents): - group = list(group) - first = group.pop(0) - to_extend = first["paragraphs"][0]["sentences"] - for sent in group: - to_extend.extend(sent["paragraphs"][0]["sentences"]) - merged.append(first) - return merged diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py deleted file mode 100644 index 525063b22..000000000 --- a/spacy/cli/converters/jsonl2json.py +++ /dev/null @@ -1,50 +0,0 @@ -import srsly - -from ...gold import docs_to_json -from ...util import get_lang_class, minibatch - - -def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_): - if lang is None: - raise ValueError("No --lang specified, but tokenization required") - json_docs = [] - input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")] - nlp = get_lang_class(lang)() - sentencizer = nlp.create_pipe("sentencizer") - for i, batch in enumerate(minibatch(input_examples, size=n_sents)): - docs = [] - for record in batch: - raw_text = record["text"] - if "entities" in record: - ents = record["entities"] - else: - ents = record["spans"] - ents = [(e["start"], e["end"], e["label"]) for e in ents] - doc = nlp.make_doc(raw_text) - sentencizer(doc) - spans = [doc.char_span(s, e, label=L) for s, e, L in ents] - doc.ents = _cleanup_spans(spans) - docs.append(doc) - json_docs.append(docs_to_json(docs, id=i)) - return json_docs - - -def _cleanup_spans(spans): - output = [] - seen = set() - for span in spans: - if span is not None: - # Trim whitespace - while len(span) and span[0].is_space: - span = span[1:] - while len(span) and span[-1].is_space: - span = span[:-1] - if not len(span): - continue - for i in range(span.start, span.end): - if i in seen: - break - else: - output.append(span) - seen.update(range(span.start, span.end)) - return output diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 2cc3020e6..09c513d89 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -6,7 +6,7 @@ import srsly from wasabi import Printer, MESSAGES from ._app import app, Arg, Opt -from ..gold import GoldCorpus, Example +from ..gold import Corpus, Example from ..syntax import nonproj from ..language import Language from ..util import load_model, get_lang_class @@ -99,7 +99,7 @@ def debug_data( loading_train_error_message = "" loading_dev_error_message = "" with msg.loading("Loading corpus..."): - corpus = GoldCorpus(train_path, dev_path) + corpus = Corpus(train_path, dev_path) try: train_dataset = list(corpus.train_dataset(nlp)) train_dataset_unpreprocessed = list( @@ -518,12 +518,12 @@ def _compile_gold( "texts": set(), } for example in examples: - gold = example.gold - doc = example.doc - valid_words = [x for x in gold.words if x is not None] + gold = example.reference + doc = example.predicted + valid_words = [x for x in gold if x is not None] data["words"].update(valid_words) data["n_words"] += len(valid_words) - data["n_misaligned_words"] += len(gold.words) - len(valid_words) + data["n_misaligned_words"] += 
len(gold) - len(valid_words) data["texts"].add(doc.text) if len(nlp.vocab.vectors): for word in valid_words: @@ -578,10 +578,10 @@ def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str: def _get_examples_without_label(data: Sequence[Example], label: str) -> int: count = 0 - for ex in data: + for eg in data: labels = [ label.split("-")[1] - for label in ex.gold.ner + for label in eg.gold.ner if label not in ("O", "-", None) ] if label not in labels: diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 8d0f67316..a18e51623 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -3,10 +3,10 @@ from timeit import default_timer as timer from wasabi import Printer from pathlib import Path -from ._app import app, Arg, Opt +from ..gold import Corpus from ..tokens import Doc +from ._app import app, Arg, Opt from ..scorer import Scorer -from ..gold import GoldCorpus from .. import util from .. import displacy @@ -20,7 +20,9 @@ def evaluate_cli( gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), - # fmt: on + return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"), + + # fmt: on ): """ Evaluate a model. To render a sample of parses in a HTML file, set an @@ -34,6 +36,7 @@ def evaluate_cli( displacy_path=displacy_path, displacy_limit=displacy_limit, silent=False, + return_scores=return_scores, ) @@ -45,6 +48,7 @@ def evaluate( displacy_path: Optional[Path] = None, displacy_limit: int = 25, silent: bool = True, + return_scores: bool = False, ) -> Scorer: msg = Printer(no_print=silent, pretty=not silent) util.fix_random_seed() @@ -57,7 +61,7 @@ def evaluate( msg.fail("Evaluation data not found", data_path, exits=1) if displacy_path and not displacy_path.exists(): msg.fail("Visualization output directory not found", displacy_path, exits=1) - corpus = GoldCorpus(data_path, data_path) + corpus = Corpus(data_path, data_path) if model.startswith("blank:"): nlp = util.get_lang_class(model.replace("blank:", ""))() else: @@ -101,7 +105,8 @@ def evaluate( ents=render_ents, ) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) - return scorer.scores + if return_scores: + return scorer.scores def render_parses( diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 2962e5022..18c429c60 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -179,8 +179,7 @@ def pretrain( skip_counter = 0 loss_func = pretrain_config["loss_func"] for epoch in range(epoch_resume, pretrain_config["max_epochs"]): - examples = [Example(doc=text) for text in texts] - batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"]) + batches = util.minibatch_by_words(texts, size=pretrain_config["batch_size"]) for batch_id, batch in enumerate(batches): docs, count = make_docs( nlp, diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train.py similarity index 78% rename from spacy/cli/train_from_config.py rename to spacy/cli/train.py index 79c3bf259..f3f0649e9 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train.py @@ -1,16 +1,18 @@ -from typing import Optional, Dict +from typing import Optional, Dict, List, Union, Sequence from timeit import default_timer as timer + import srsly import tqdm +from 
pydantic import BaseModel, FilePath from pathlib import Path from wasabi import msg import thinc import thinc.schedules -from thinc.api import use_pytorch_for_gpu_memory +from thinc.api import Model, use_pytorch_for_gpu_memory import random from ._app import app, Arg, Opt -from ..gold import GoldCorpus +from ..gold import Corpus from ..lookups import Lookups from .. import util from ..errors import Errors @@ -82,6 +84,41 @@ subword_features = true """ +class PipelineComponent(BaseModel): + factory: str + model: Model + + class Config: + arbitrary_types_allowed = True + + +class ConfigSchema(BaseModel): + optimizer: Optional["Optimizer"] + + class training(BaseModel): + patience: int = 10 + eval_frequency: int = 100 + dropout: float = 0.2 + init_tok2vec: Optional[FilePath] = None + max_epochs: int = 100 + orth_variant_level: float = 0.0 + gold_preproc: bool = False + max_length: int = 0 + use_gpu: int = 0 + scores: List[str] = ["ents_p", "ents_r", "ents_f"] + score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0} + limit: int = 0 + batch_size: Union[Sequence[int], int] + + class nlp(BaseModel): + lang: str + vectors: Optional[str] + pipeline: Optional[Dict[str, PipelineComponent]] + + class Config: + extra = "allow" + + @app.command("train") def train_cli( # fmt: off @@ -104,33 +141,8 @@ def train_cli( command. """ util.set_env_log(verbose) + verify_cli_args(**locals()) - # Make sure all files and paths exists if they are needed - if not config_path or not config_path.exists(): - msg.fail("Config file not found", config_path, exits=1) - if not train_path or not train_path.exists(): - msg.fail("Training data not found", train_path, exits=1) - if not dev_path or not dev_path.exists(): - msg.fail("Development data not found", dev_path, exits=1) - if output_path is not None: - if not output_path.exists(): - output_path.mkdir() - msg.good(f"Created output directory: {output_path}") - elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: - msg.warn( - "Output directory is not empty.", - "This can lead to unintended side effects when saving the model. " - "Please use an empty directory or a different path instead. 
If " - "the specified output path doesn't exist, the directory will be " - "created for you.", - ) - if code_path is not None: - if not code_path.exists(): - msg.fail("Path to Python code not found", code_path, exits=1) - try: - util.import_file("python_code", code_path) - except Exception as e: - msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) tag_map = {} @@ -139,8 +151,6 @@ def train_cli( weights_data = None if init_tok2vec is not None: - if not init_tok2vec.exists(): - msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) with init_tok2vec.open("rb") as file_: weights_data = file_.read() @@ -184,71 +194,20 @@ def train( nlp = util.load_model_from_config(nlp_config) optimizer = training["optimizer"] limit = training["limit"] - msg.info("Loading training corpus") - corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) - - # verify textcat config + corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit) if "textcat" in nlp_config["pipeline"]: - textcat_labels = set(nlp.get_pipe("textcat").labels) - textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][ - "exclusive_classes" - ] - - # check whether the setting 'exclusive_classes' corresponds to the provided training data - if textcat_multilabel: - multilabel_found = False - for ex in corpus.train_examples: - cats = ex.doc_annotation.cats - textcat_labels.update(cats.keys()) - if list(cats.values()).count(1.0) != 1: - multilabel_found = True - if not multilabel_found: - msg.warn( - "The textcat training instances look like they have " - "mutually exclusive classes. Set 'exclusive_classes' " - "to 'true' in the config to train a classifier with " - "mutually exclusive classes more accurately." - ) - else: - for ex in corpus.train_examples: - cats = ex.doc_annotation.cats - textcat_labels.update(cats.keys()) - if list(cats.values()).count(1.0) != 1: - msg.fail( - "Some textcat training instances do not have exactly " - "one positive label. Set 'exclusive_classes' " - "to 'false' in the config to train a classifier with classes " - "that are not mutually exclusive." 
- ) - msg.info( - f"Initialized textcat component for {len(textcat_labels)} unique labels" - ) - nlp.get_pipe("textcat").labels = tuple(textcat_labels) - - # if 'positive_label' is provided: double check whether it's in the data and the task is binary - if nlp_config["pipeline"]["textcat"].get("positive_label", None): - textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) - pos_label = nlp_config["pipeline"]["textcat"]["positive_label"] - if pos_label not in textcat_labels: - msg.fail( - f"The textcat's 'positive_label' config setting '{pos_label}' " - f"does not match any label in the training data.", - exits=1, - ) - if len(textcat_labels) != 2: - msg.fail( - f"A textcat 'positive_label' '{pos_label}' was " - f"provided for training data that does not appear to be a " - f"binary classification problem with two labels.", - exits=1, - ) - + verify_textcat_config(nlp, nlp_config) if training.get("resume", False): msg.info("Resuming training") nlp.resume_training() else: msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") - nlp.begin_training(lambda: corpus.train_examples) + train_examples = list(corpus.train_dataset( + nlp, + shuffle=False, + gold_preproc=training["gold_preproc"] + )) + nlp.begin_training(lambda: train_examples) # Update tag map with provided mapping nlp.vocab.morphology.tag_map.update(tag_map) @@ -279,6 +238,7 @@ def train( ) tok2vec.from_bytes(weights_data) + msg.info("Loading training corpus") train_batches = create_train_batches(nlp, corpus, training) evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) @@ -311,18 +271,15 @@ def train( update_meta(training, nlp, info) nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) - # Clean up the objects to faciliate garbage collection. - for eg in batch: - eg.doc = None - eg.goldparse = None - eg.doc_annotation = None - eg.token_annotation = None except Exception as e: - msg.warn( - f"Aborting and saving the final best model. " - f"Encountered exception: {str(e)}", - exits=1, - ) + if output_path is not None: + msg.warn( + f"Aborting and saving the final best model. " + f"Encountered exception: {str(e)}", + exits=1, + ) + else: + raise e finally: if output_path is not None: final_model_path = output_path / "model-final" @@ -335,21 +292,19 @@ def train( def create_train_batches(nlp, corpus, cfg): - epochs_todo = cfg.get("max_epochs", 0) + max_epochs = cfg.get("max_epochs", 0) + train_examples = list(corpus.train_dataset( + nlp, + shuffle=True, + gold_preproc=cfg["gold_preproc"], + max_length=cfg["max_length"] + )) + + epoch = 0 while True: - train_examples = list( - corpus.train_dataset( - nlp, - noise_level=0.0, # I think this is deprecated? - orth_variant_level=cfg["orth_variant_level"], - gold_preproc=cfg["gold_preproc"], - max_length=cfg["max_length"], - ignore_misaligned=True, - ) - ) if len(train_examples) == 0: raise ValueError(Errors.E988) - random.shuffle(train_examples) + epoch += 1 batches = util.minibatch_by_words( train_examples, size=cfg["batch_size"], @@ -358,15 +313,12 @@ def create_train_batches(nlp, corpus, cfg): # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop try: first = next(batches) - yield first + yield epoch, first except StopIteration: raise ValueError(Errors.E986) for batch in batches: - yield batch - epochs_todo -= 1 - # We intentionally compare exactly to 0 here, so that max_epochs < 1 - # will not break. 
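# NOTE (illustrative sketch, not part of this patch): the rewritten
# create_train_batches in this hunk builds the Example list once, then yields
# (epoch, batch) tuples and stops once max_epochs is reached (looping forever
# when max_epochs < 1). The same pattern stripped of spaCy specifics -- the
# name epoch_batches and the naive slicing batcher are assumptions for the
# example; spaCy itself batches with util.minibatch_by_words:
def epoch_batches(examples, batch_size, max_epochs=0):
    """Yield (epoch, batch) tuples; loop indefinitely if max_epochs < 1."""
    if not examples:
        raise ValueError("Cannot create batches from an empty dataset")
    epoch = 0
    while True:
        epoch += 1
        for i in range(0, len(examples), batch_size):
            yield epoch, examples[i:i + batch_size]
        if max_epochs >= 1 and epoch >= max_epochs:
            break
# e.g. list(epoch_batches([1, 2, 3, 4, 5], 2, max_epochs=1))
# -> [(1, [1, 2]), (1, [3, 4]), (1, [5])]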
- if epochs_todo == 0: + yield epoch, batch + if max_epochs >= 1 and epoch >= max_epochs: break @@ -377,7 +329,8 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True ) ) - n_words = sum(len(ex.doc) for ex in dev_examples) + + n_words = sum(len(ex.predicted) for ex in dev_examples) start_time = timer() if optimizer.averages: @@ -395,7 +348,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): except KeyError as e: raise KeyError( Errors.E983.format( - dict_name="score_weights", key=str(e), keys=list(scores.keys()) + dict="score_weights", key=str(e), keys=list(scores.keys()) ) ) @@ -438,7 +391,7 @@ def train_while_improving( Every iteration, the function yields out a tuple with: - * batch: A zipped sequence of Tuple[Doc, GoldParse] pairs. + * batch: A list of Example objects. * info: A dict with various information about the last update (see below). * is_best_checkpoint: A value in None, False, True, indicating whether this was the best evaluation so far. You should use this to save the model @@ -470,7 +423,7 @@ def train_while_improving( (nlp.make_doc(rt["text"]) for rt in raw_text), size=8 ) - for step, batch in enumerate(train_data): + for step, (epoch, batch) in enumerate(train_data): dropout = next(dropouts) with nlp.select_pipes(enable=to_enable): for subbatch in subdivide_batch(batch, accumulate_gradient): @@ -492,6 +445,7 @@ def train_while_improving( score, other_scores = (None, None) is_best_checkpoint = None info = { + "epoch": epoch, "step": step, "score": score, "other_scores": other_scores, @@ -512,7 +466,7 @@ def train_while_improving( def subdivide_batch(batch, accumulate_gradient): batch = list(batch) - batch.sort(key=lambda eg: len(eg.doc)) + batch.sort(key=lambda eg: len(eg.predicted)) sub_len = len(batch) // accumulate_gradient start = 0 for i in range(accumulate_gradient): @@ -530,9 +484,9 @@ def setup_printer(training, nlp): score_widths = [max(len(col), 6) for col in score_cols] loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] loss_widths = [max(len(col), 8) for col in loss_cols] - table_header = ["#"] + loss_cols + score_cols + ["Score"] + table_header = ["E", "#"] + loss_cols + score_cols + ["Score"] table_header = [col.upper() for col in table_header] - table_widths = [6] + loss_widths + score_widths + [6] + table_widths = [3, 6] + loss_widths + score_widths + [6] table_aligns = ["r" for _ in table_widths] msg.row(table_header, widths=table_widths) @@ -547,9 +501,7 @@ def setup_printer(training, nlp): except KeyError as e: raise KeyError( Errors.E983.format( - dict_name="scores (losses)", - key=str(e), - keys=list(info["losses"].keys()), + dict="scores (losses)", key=str(e), keys=list(info["losses"].keys()) ) ) @@ -560,13 +512,13 @@ def setup_printer(training, nlp): except KeyError as e: raise KeyError( Errors.E983.format( - dict_name="scores (other)", + dict="scores (other)", key=str(e), keys=list(info["other_scores"].keys()), ) ) data = ( - [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] + [info["epoch"], info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] ) msg.row(data, widths=table_widths, aligns=table_aligns) @@ -580,3 +532,67 @@ def update_meta(training, nlp, info): nlp.meta["performance"][metric] = info["other_scores"][metric] for pipe_name in nlp.pipe_names: nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] + + +def verify_cli_args( + train_path, + dev_path, + config_path, + output_path=None, + 
code_path=None, + init_tok2vec=None, + raw_text=None, + verbose=False, + use_gpu=-1, + tag_map_path=None, + omit_extra_lookups=False, +): + # Make sure all files and paths exists if they are needed + if not config_path or not config_path.exists(): + msg.fail("Config file not found", config_path, exits=1) + if not train_path or not train_path.exists(): + msg.fail("Training data not found", train_path, exits=1) + if not dev_path or not dev_path.exists(): + msg.fail("Development data not found", dev_path, exits=1) + if output_path is not None: + if not output_path.exists(): + output_path.mkdir() + msg.good(f"Created output directory: {output_path}") + elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: + msg.warn( + "Output directory is not empty.", + "This can lead to unintended side effects when saving the model. " + "Please use an empty directory or a different path instead. If " + "the specified output path doesn't exist, the directory will be " + "created for you.", + ) + if code_path is not None: + if not code_path.exists(): + msg.fail("Path to Python code not found", code_path, exits=1) + try: + util.import_file("python_code", code_path) + except Exception as e: + msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) + if init_tok2vec is not None and not init_tok2vec.exists(): + msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) + + +def verify_textcat_config(nlp, nlp_config): + # if 'positive_label' is provided: double check whether it's in the data and + # the task is binary + if nlp_config["pipeline"]["textcat"].get("positive_label", None): + textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) + pos_label = nlp_config["pipeline"]["textcat"]["positive_label"] + if pos_label not in textcat_labels: + msg.fail( + f"The textcat's 'positive_label' config setting '{pos_label}' " + f"does not match any label in the training data.", + exits=1, + ) + if len(textcat_labels) != 2: + msg.fail( + f"A textcat 'positive_label' '{pos_label}' was " + f"provided for training data that does not appear to be a " + f"binary classification problem with two labels.", + exits=1, + ) diff --git a/spacy/errors.py b/spacy/errors.py index c3c820987..e152bb1ff 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -132,6 +132,8 @@ class Warnings(object): "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") # TODO: fix numbering after merging develop into master + W093 = ("Could not find any data to train the {name} on. Is your " + "input data correctly formatted ?") W094 = ("Model '{model}' ({model_version}) specifies an under-constrained " "spaCy version requirement: {version}. 
This can lead to compatibility " "problems with older versions, or as new spaCy versions are " @@ -575,9 +577,6 @@ class Errors(object): "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.") E186 = ("'{tok_a}' and '{tok_b}' are different texts.") E187 = ("Only unicode strings are supported as labels.") - E188 = ("Could not match the gold entity links to entities in the doc - " - "make sure the gold EL data refers to valid results of the " - "named entity recognizer in the `nlp` pipeline.") E189 = ("Each argument to `get_doc` should be of equal length.") E190 = ("Token head out of range in `Doc.from_array()` for token index " "'{index}' with value '{value}' (equivalent to relative head " @@ -602,10 +601,17 @@ class Errors(object): "can not be combined with adding a pretrained Tok2Vec layer.") # TODO: fix numbering after merging develop into master - E983 = ("Invalid key for '{dict_name}': {key}. Available keys: " + E978 = ("The {method} method of component {name} takes a list of Example objects, " + "but found {types} instead.") + E979 = ("Cannot convert {type} to an Example object.") + E980 = ("Each link annotation should refer to a dictionary with at most one " + "identifier mapping to 1.0, and all others to 0.0.") + E981 = ("The offsets of the annotations for 'links' need to refer exactly " + "to the offsets of the 'entities' annotations.") + E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing " + "into {values}, but found {value}.") + E983 = ("Invalid key for '{dict}': {key}. Available keys: " "{keys}") - E984 = ("Could not parse the {input} - double check the data is written " - "in the correct format as expected by spaCy.") E985 = ("The pipeline component '{component}' is already available in the base " "model. The settings in the component block in the config file are " "being ignored. If you want to replace this component instead, set " @@ -637,11 +643,7 @@ class Errors(object): E997 = ("Tokenizer special cases are not allowed to modify the text. 
" "This would map '{chunk}' to '{orth}' given token attributes " "'{token_attrs}'.") - E998 = ("To create GoldParse objects from Example objects without a " - "Doc, get_gold_parses() should be called with a Vocab object.") - E999 = ("Encountered an unexpected format for the dictionary holding " - "gold annotations: {gold_dict}") - + @add_codes class TempErrors(object): diff --git a/spacy/gold.pxd b/spacy/gold.pxd deleted file mode 100644 index bf724868f..000000000 --- a/spacy/gold.pxd +++ /dev/null @@ -1,68 +0,0 @@ -from cymem.cymem cimport Pool - -from .typedefs cimport attr_t -from .syntax.transition_system cimport Transition - -from .tokens import Doc - - -cdef struct GoldParseC: - int* tags - int* heads - int* has_dep - int* sent_start - attr_t* labels - int** brackets - Transition* ner - - -cdef class GoldParse: - cdef Pool mem - - cdef GoldParseC c - cdef readonly TokenAnnotation orig - - cdef int length - cdef public int loss - cdef public list words - cdef public list tags - cdef public list pos - cdef public list morphs - cdef public list lemmas - cdef public list sent_starts - cdef public list heads - cdef public list labels - cdef public dict orths - cdef public list ner - cdef public dict brackets - cdef public dict cats - cdef public dict links - - cdef readonly list cand_to_gold - cdef readonly list gold_to_cand - - -cdef class TokenAnnotation: - cdef public list ids - cdef public list words - cdef public list tags - cdef public list pos - cdef public list morphs - cdef public list lemmas - cdef public list heads - cdef public list deps - cdef public list entities - cdef public list sent_starts - cdef public dict brackets_by_start - - -cdef class DocAnnotation: - cdef public object cats - cdef public object links - - -cdef class Example: - cdef public object doc - cdef public TokenAnnotation token_annotation - cdef public DocAnnotation doc_annotation - cdef public object goldparse diff --git a/spacy/gold.pyx b/spacy/gold.pyx index af98eda8b..e69de29bb 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1,1420 +0,0 @@ -# cython: profile=True -import re -import random -import numpy -import tempfile -import shutil -import itertools -from pathlib import Path -import srsly -import warnings - -from .syntax import nonproj -from .tokens import Doc, Span -from .errors import Errors, AlignmentError, Warnings -from . import util - - -punct_re = re.compile(r"\W") - - -def tags_to_entities(tags): - entities = [] - start = None - for i, tag in enumerate(tags): - if tag is None: - continue - if tag.startswith("O"): - # TODO: We shouldn't be getting these malformed inputs. Fix this. 
- if start is not None: - start = None - continue - elif tag == "-": - continue - elif tag.startswith("I"): - if start is None: - raise ValueError(Errors.E067.format(tags=tags[:i + 1])) - continue - if tag.startswith("U"): - entities.append((tag[2:], i, i)) - elif tag.startswith("B"): - start = i - elif tag.startswith("L"): - entities.append((tag[2:], start, i)) - start = None - else: - raise ValueError(Errors.E068.format(tag=tag)) - return entities - - -def merge_sents(sents): - m_deps = [[], [], [], [], [], []] - m_cats = {} - m_brackets = [] - i = 0 - for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents: - m_deps[0].extend(id_ + i for id_ in ids) - m_deps[1].extend(words) - m_deps[2].extend(tags) - m_deps[3].extend(head + i for head in heads) - m_deps[4].extend(labels) - m_deps[5].extend(ner) - m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) - for b in brackets) - m_cats.update(cats) - i += len(ids) - return [(m_deps, (m_cats, m_brackets))] - - -def _normalize_for_alignment(tokens): - return [w.replace(" ", "").lower() for w in tokens] - - -def align(tokens_a, tokens_b): - """Calculate alignment tables between two tokenizations. - - tokens_a (List[str]): The candidate tokenization. - tokens_b (List[str]): The reference tokenization. - RETURNS: (tuple): A 5-tuple consisting of the following information: - * cost (int): The number of misaligned tokens. - * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. - For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns - to `tokens_b[6]`. If there's no one-to-one alignment for a token, - it has the value -1. - * b2a (List[int]): The same as `a2b`, but mapping the other direction. - * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` - to indices in `tokens_b`, where multiple tokens of `tokens_a` align to - the same token of `tokens_b`. - * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other - direction. - """ - tokens_a = _normalize_for_alignment(tokens_a) - tokens_b = _normalize_for_alignment(tokens_b) - cost = 0 - a2b = numpy.empty(len(tokens_a), dtype="i") - b2a = numpy.empty(len(tokens_b), dtype="i") - a2b.fill(-1) - b2a.fill(-1) - a2b_multi = {} - b2a_multi = {} - i = 0 - j = 0 - offset_a = 0 - offset_b = 0 - while i < len(tokens_a) and j < len(tokens_b): - a = tokens_a[i][offset_a:] - b = tokens_b[j][offset_b:] - if a == b: - if offset_a == offset_b == 0: - a2b[i] = j - b2a[j] = i - elif offset_a == 0: - cost += 2 - a2b_multi[i] = j - elif offset_b == 0: - cost += 2 - b2a_multi[j] = i - offset_a = offset_b = 0 - i += 1 - j += 1 - elif a == "": - assert offset_a == 0 - cost += 1 - i += 1 - elif b == "": - assert offset_b == 0 - cost += 1 - j += 1 - elif b.startswith(a): - cost += 1 - if offset_a == 0: - a2b_multi[i] = j - i += 1 - offset_a = 0 - offset_b += len(a) - elif a.startswith(b): - cost += 1 - if offset_b == 0: - b2a_multi[j] = i - j += 1 - offset_b = 0 - offset_a += len(b) - else: - assert "".join(tokens_a) != "".join(tokens_b) - raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b)) - return cost, a2b, b2a, a2b_multi, b2a_multi - - -class GoldCorpus(object): - """An annotated corpus, using the JSON file format. Manages - annotations for tagging, dependency parsing and NER. - - DOCS: https://spacy.io/api/goldcorpus - """ - def __init__(self, train, dev, gold_preproc=False, limit=None): - """Create a GoldCorpus. - - train (str / Path): File or directory of training data. 
- dev (str / Path): File or directory of development data. - RETURNS (GoldCorpus): The newly created object. - """ - self.limit = limit - if isinstance(train, str) or isinstance(train, Path): - train = self.read_examples(self.walk_corpus(train)) - dev = self.read_examples(self.walk_corpus(dev)) - # Write temp directory with one doc per file, so we can shuffle and stream - self.tmp_dir = Path(tempfile.mkdtemp()) - self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) - self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) - - def __del__(self): - shutil.rmtree(self.tmp_dir) - - @staticmethod - def write_msgpack(directory, examples, limit=0): - if not directory.exists(): - directory.mkdir() - n = 0 - for i, example in enumerate(examples): - ex_dict = example.to_dict() - text = example.text - srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) - n += 1 - if limit and n >= limit: - break - - @staticmethod - def walk_corpus(path): - path = util.ensure_path(path) - if not path.is_dir(): - return [path] - paths = [path] - locs = [] - seen = set() - for path in paths: - if str(path) in seen: - continue - seen.add(str(path)) - if path.parts[-1].startswith("."): - continue - elif path.is_dir(): - paths.extend(path.iterdir()) - elif path.parts[-1].endswith((".json", ".jsonl")): - locs.append(path) - return locs - - @staticmethod - def read_examples(locs, limit=0): - """ Yield training examples """ - i = 0 - for loc in locs: - loc = util.ensure_path(loc) - file_name = loc.parts[-1] - if file_name.endswith("json"): - examples = read_json_file(loc) - elif file_name.endswith("jsonl"): - gold_tuples = srsly.read_jsonl(loc) - first_gold_tuple = next(gold_tuples) - gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) - # TODO: proper format checks with schemas - if isinstance(first_gold_tuple, dict): - if first_gold_tuple.get("paragraphs", None): - examples = read_json_object(gold_tuples) - elif first_gold_tuple.get("doc_annotation", None): - examples = [] - for ex_dict in gold_tuples: - doc = ex_dict.get("doc", None) - if doc is None: - doc = ex_dict.get("text", None) - if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)): - raise ValueError(Errors.E987.format(type=type(doc))) - examples.append(Example.from_dict(ex_dict, doc=doc)) - else: - raise ValueError(Errors.E984.format(input="JSONL format")) - else: - raise ValueError(Errors.E984.format(input="JSONL format")) - - elif file_name.endswith("msg"): - text, ex_dict = srsly.read_msgpack(loc) - examples = [Example.from_dict(ex_dict, doc=text)] - else: - supported = ("json", "jsonl", "msg") - raise ValueError(Errors.E124.format(path=loc, formats=supported)) - try: - for example in examples: - yield example - i += 1 - if limit and i >= limit: - return - except KeyError as e: - msg = "Missing key {}".format(e) - raise KeyError(Errors.E996.format(file=file_name, msg=msg)) - except UnboundLocalError as e: - msg = "Unexpected document structure" - raise ValueError(Errors.E996.format(file=file_name, msg=msg)) - - @property - def dev_examples(self): - locs = (self.tmp_dir / "dev").iterdir() - yield from self.read_examples(locs, limit=self.limit) - - @property - def train_examples(self): - locs = (self.tmp_dir / "train").iterdir() - yield from self.read_examples(locs, limit=self.limit) - - def count_train(self): - """Returns count of words in train examples""" - n = 0 - i = 0 - for example in self.train_examples: - n += len(example.token_annotation.words) - if self.limit and i >= self.limit: - break - i += 
1 - return n - - def train_dataset(self, nlp, gold_preproc=False, max_length=None, - noise_level=0.0, orth_variant_level=0.0, - ignore_misaligned=False): - locs = list((self.tmp_dir / 'train').iterdir()) - random.shuffle(locs) - train_examples = self.read_examples(locs, limit=self.limit) - gold_examples = self.iter_gold_docs(nlp, train_examples, gold_preproc, - max_length=max_length, - noise_level=noise_level, - orth_variant_level=orth_variant_level, - make_projective=True, - ignore_misaligned=ignore_misaligned) - yield from gold_examples - - def train_dataset_without_preprocessing(self, nlp, gold_preproc=False, - ignore_misaligned=False): - examples = self.iter_gold_docs(nlp, self.train_examples, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned) - yield from examples - - def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): - examples = self.iter_gold_docs(nlp, self.dev_examples, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned) - yield from examples - - @classmethod - def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None, - noise_level=0.0, orth_variant_level=0.0, - make_projective=False, ignore_misaligned=False): - """ Setting gold_preproc will result in creating a doc per sentence """ - for example in examples: - if gold_preproc: - split_examples = example.split_sents() - example_golds = [] - for split_example in split_examples: - split_example_docs = cls._make_docs(nlp, split_example, - gold_preproc, noise_level=noise_level, - orth_variant_level=orth_variant_level) - split_example_golds = cls._make_golds(split_example_docs, - vocab=nlp.vocab, make_projective=make_projective, - ignore_misaligned=ignore_misaligned) - example_golds.extend(split_example_golds) - else: - example_docs = cls._make_docs(nlp, example, - gold_preproc, noise_level=noise_level, - orth_variant_level=orth_variant_level) - example_golds = cls._make_golds(example_docs, vocab=nlp.vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned) - for ex in example_golds: - if ex.goldparse is not None: - if (not max_length) or len(ex.doc) < max_length: - yield ex - - @classmethod - def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0): - var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level) - # gold_preproc is not used ?! 
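# NOTE (illustrative sketch, not part of this patch): the gold_preproc option
# used throughout this corpus code amounts to splitting the gold annotation on
# sent_starts and treating every sentence as its own training doc (compare
# Example.split_sents further down in this file). A minimal standalone
# illustration; the helper name is an assumption for the example:
def split_on_sent_starts(words, sent_starts):
    sents, current = [], []
    for word, is_start in zip(words, sent_starts):
        if is_start == 1 and current:
            sents.append(current)
            current = []
        current.append(word)
    if current:
        sents.append(current)
    return sents
assert split_on_sent_starts(["Hi", ".", "Bye", "."], [1, 0, 1, 0]) == [["Hi", "."], ["Bye", "."]]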
- if example.text is not None: - var_text = add_noise(var_example.text, noise_level) - var_doc = nlp.make_doc(var_text) - var_example.doc = var_doc - else: - var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level)) - var_example.doc = var_doc - return [var_example] - - @classmethod - def _make_golds(cls, examples, vocab=None, make_projective=False, - ignore_misaligned=False): - filtered_examples = [] - for example in examples: - gold_parses = example.get_gold_parses(vocab=vocab, - make_projective=make_projective, - ignore_misaligned=ignore_misaligned) - assert len(gold_parses) == 1 - doc, gold = gold_parses[0] - if doc: - assert doc == example.doc - example.goldparse = gold - filtered_examples.append(example) - return filtered_examples - - -def make_orth_variants(nlp, example, orth_variant_level=0.0): - if random.random() >= orth_variant_level: - return example - if not example.token_annotation: - return example - raw = example.text - lower = False - if random.random() >= 0.5: - lower = True - if raw is not None: - raw = raw.lower() - ndsv = nlp.Defaults.single_orth_variants - ndpv = nlp.Defaults.paired_orth_variants - # modify words in paragraph_tuples - variant_example = Example(doc=raw) - token_annotation = example.token_annotation - words = token_annotation.words - tags = token_annotation.tags - if not words or not tags: - # add the unmodified annotation - token_dict = token_annotation.to_dict() - variant_example.set_token_annotation(**token_dict) - else: - if lower: - words = [w.lower() for w in words] - # single variants - punct_choices = [random.choice(x["variants"]) for x in ndsv] - for word_idx in range(len(words)): - for punct_idx in range(len(ndsv)): - if tags[word_idx] in ndsv[punct_idx]["tags"] \ - and words[word_idx] in ndsv[punct_idx]["variants"]: - words[word_idx] = punct_choices[punct_idx] - # paired variants - punct_choices = [random.choice(x["variants"]) for x in ndpv] - for word_idx in range(len(words)): - for punct_idx in range(len(ndpv)): - if tags[word_idx] in ndpv[punct_idx]["tags"] \ - and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): - # backup option: random left vs. 
right from pair - pair_idx = random.choice([0, 1]) - # best option: rely on paired POS tags like `` / '' - if len(ndpv[punct_idx]["tags"]) == 2: - pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) - # next best option: rely on position in variants - # (may not be unambiguous, so order of variants matters) - else: - for pair in ndpv[punct_idx]["variants"]: - if words[word_idx] in pair: - pair_idx = pair.index(words[word_idx]) - words[word_idx] = punct_choices[punct_idx][pair_idx] - - token_dict = token_annotation.to_dict() - token_dict["words"] = words - token_dict["tags"] = tags - variant_example.set_token_annotation(**token_dict) - # modify raw to match variant_paragraph_tuples - if raw is not None: - variants = [] - for single_variants in ndsv: - variants.extend(single_variants["variants"]) - for paired_variants in ndpv: - variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"]))) - # store variants in reverse length order to be able to prioritize - # longer matches (e.g., "---" before "--") - variants = sorted(variants, key=lambda x: len(x)) - variants.reverse() - variant_raw = "" - raw_idx = 0 - # add initial whitespace - while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): - variant_raw += raw[raw_idx] - raw_idx += 1 - for word in variant_example.token_annotation.words: - match_found = False - # skip whitespace words - if word.isspace(): - match_found = True - # add identical word - elif word not in variants and raw[raw_idx:].startswith(word): - variant_raw += word - raw_idx += len(word) - match_found = True - # add variant word - else: - for variant in variants: - if not match_found and \ - raw[raw_idx:].startswith(variant): - raw_idx += len(variant) - variant_raw += word - match_found = True - # something went wrong, abort - # (add a warning message?) - if not match_found: - return example - # add following whitespace - while raw_idx < len(raw) and re.match("\s", raw[raw_idx]): - variant_raw += raw[raw_idx] - raw_idx += 1 - variant_example.doc = variant_raw - return variant_example - return variant_example - - -def add_noise(orig, noise_level): - if random.random() >= noise_level: - return orig - elif type(orig) == list: - corrupted = [_corrupt(word, noise_level) for word in orig] - corrupted = [w for w in corrupted if w] - return corrupted - else: - return "".join(_corrupt(c, noise_level) for c in orig) - - -def _corrupt(c, noise_level): - if random.random() >= noise_level: - return c - elif c in [".", "'", "!", "?", ","]: - return "\n" - else: - return c.lower() - - -def read_json_object(json_corpus_section): - """Take a list of JSON-formatted documents (e.g. from an already loaded - training data file) and yield annotations in the GoldParse format. - - json_corpus_section (list): The data. - YIELDS (Example): The reformatted data - one training example per paragraph - """ - for json_doc in json_corpus_section: - examples = json_to_examples(json_doc) - for ex in examples: - yield ex - - -def json_to_examples(doc): - """Convert an item in the JSON-formatted training data to the format - used by GoldParse. - - doc (dict): One entry in the training data. 
- YIELDS (Example): The reformatted data - one training example per paragraph - """ - paragraphs = [] - for paragraph in doc["paragraphs"]: - example = Example(doc=paragraph.get("raw", None)) - words = [] - ids = [] - tags = [] - pos = [] - morphs = [] - lemmas = [] - heads = [] - labels = [] - ner = [] - sent_starts = [] - brackets = [] - for sent in paragraph["sentences"]: - sent_start_i = len(words) - for i, token in enumerate(sent["tokens"]): - words.append(token["orth"]) - ids.append(token.get('id', sent_start_i + i)) - tags.append(token.get('tag', "-")) - pos.append(token.get("pos", "")) - morphs.append(token.get("morph", "")) - lemmas.append(token.get("lemma", "")) - heads.append(token.get("head", 0) + sent_start_i + i) - labels.append(token.get("dep", "")) - # Ensure ROOT label is case-insensitive - if labels[-1].lower() == "root": - labels[-1] = "ROOT" - ner.append(token.get("ner", "-")) - if i == 0: - sent_starts.append(1) - else: - sent_starts.append(0) - if "brackets" in sent: - brackets.extend((b["first"] + sent_start_i, - b["last"] + sent_start_i, b["label"]) - for b in sent["brackets"]) - cats = {} - for cat in paragraph.get("cats", {}): - cats[cat["label"]] = cat["value"] - example.set_token_annotation(ids=ids, words=words, tags=tags, - pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, - deps=labels, entities=ner, sent_starts=sent_starts, - brackets=brackets) - example.set_doc_annotation(cats=cats) - yield example - - -def read_json_file(loc, docs_filter=None, limit=None): - loc = util.ensure_path(loc) - if loc.is_dir(): - parsed = False - for filename in loc.iterdir(): - parsed = True - yield from read_json_file(loc / filename, limit=limit) - if not parsed: - raise ValueError(Errors.E984.format(input="JSON directory")) - else: - parsed = False - for doc in _json_iterate(loc): - if docs_filter is not None and not docs_filter(doc): - continue - for json_data in json_to_examples(doc): - parsed = True - yield json_data - if not parsed: - raise ValueError(Errors.E984.format(input="JSON file")) - - -def _json_iterate(loc): - # We should've made these files jsonl...But since we didn't, parse out - # the docs one-by-one to reduce memory usage. - # It's okay to read in the whole file -- just don't parse it into JSON. 
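# NOTE (illustrative sketch, not part of this patch): the Cython _json_iterate
# being deleted here walks the raw text and tracks bracket depth so that each
# top-level {...} object in a large JSON array can be parsed on its own,
# keeping memory use low. A simplified pure-Python equivalent of that scan:
import json

def iter_json_objects(text):
    """Yield each top-level {...} object inside a JSON array string."""
    square = curly = 0
    in_string = escape = False
    start = -1
    for i, ch in enumerate(text):
        if escape:
            escape = False
        elif ch == "\\":
            escape = True
        elif ch == '"':
            in_string = not in_string
        elif in_string:
            continue
        elif ch == "[":
            square += 1
        elif ch == "]":
            square -= 1
        elif ch == "{":
            if square == 1 and curly == 0:
                start = i
            curly += 1
        elif ch == "}":
            curly -= 1
            if square == 1 and curly == 0:
                yield json.loads(text[start:i + 1])
# e.g. list(iter_json_objects('[{"id": 1}, {"id": 2}]')) -> [{'id': 1}, {'id': 2}]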
- cdef bytes py_raw - loc = util.ensure_path(loc) - with loc.open("rb") as file_: - py_raw = file_.read() - cdef long file_length = len(py_raw) - if file_length > 2 ** 30: - warnings.warn(Warnings.W027.format(size=file_length)) - - raw = <char*>py_raw - cdef int square_depth = 0 - cdef int curly_depth = 0 - cdef int inside_string = 0 - cdef int escape = 0 - cdef long start = -1 - cdef char c - cdef char quote = ord('"') - cdef char backslash = ord("\\") - cdef char open_square = ord("[") - cdef char close_square = ord("]") - cdef char open_curly = ord("{") - cdef char close_curly = ord("}") - for i in range(file_length): - c = raw[i] - if escape: - escape = False - continue - if c == backslash: - escape = True - continue - if c == quote: - inside_string = not inside_string - continue - if inside_string: - continue - if c == open_square: - square_depth += 1 - elif c == close_square: - square_depth -= 1 - elif c == open_curly: - if square_depth == 1 and curly_depth == 0: - start = i - curly_depth += 1 - elif c == close_curly: - curly_depth -= 1 - if square_depth == 1 and curly_depth == 0: - py_str = py_raw[start : i + 1].decode("utf8") - try: - yield srsly.json_loads(py_str) - except Exception: - print(py_str) - raise - start = -1 - - -def iob_to_biluo(tags): - out = [] - tags = list(tags) - while tags: - out.extend(_consume_os(tags)) - out.extend(_consume_ent(tags)) - return out - - -def biluo_to_iob(tags): - out = [] - for tag in tags: - tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1) - out.append(tag) - return out - - -def _consume_os(tags): - while tags and tags[0] == "O": - yield tags.pop(0) - - -def _consume_ent(tags): - if not tags: - return [] - tag = tags.pop(0) - target_in = "I" + tag[1:] - target_last = "L" + tag[1:] - length = 1 - while tags and tags[0] in {target_in, target_last}: - length += 1 - tags.pop(0) - label = tag[2:] - if length == 1: - if len(label) == 0: - raise ValueError(Errors.E177.format(tag=tag)) - return ["U-" + label] - else: - start = "B-" + label - end = "L-" + label - middle = [f"I-{label}" for _ in range(1, length - 1)] - return [start] + middle + [end] - - -cdef class TokenAnnotation: - def __init__(self, ids=None, words=None, tags=None, pos=None, morphs=None, - lemmas=None, heads=None, deps=None, entities=None, sent_starts=None, - brackets=None): - self.ids = ids if ids else [] - self.words = words if words else [] - self.tags = tags if tags else [] - self.pos = pos if pos else [] - self.morphs = morphs if morphs else [] - self.lemmas = lemmas if lemmas else [] - self.heads = heads if heads else [] - self.deps = deps if deps else [] - self.entities = entities if entities else [] - self.sent_starts = sent_starts if sent_starts else [] - self.brackets_by_start = {} - if brackets: - for b_start, b_end, b_label in brackets: - self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label)) - - @property - def brackets(self): - brackets = [] - for start, ends_labels in self.brackets_by_start.items(): - for end, label in ends_labels: - brackets.append((start, end, label)) - return brackets - - @classmethod - def from_dict(cls, token_dict): - return cls(ids=token_dict.get("ids", None), - words=token_dict.get("words", None), - tags=token_dict.get("tags", None), - pos=token_dict.get("pos", None), - morphs=token_dict.get("morphs", None), - lemmas=token_dict.get("lemmas", None), - heads=token_dict.get("heads", None), - deps=token_dict.get("deps", None), - entities=token_dict.get("entities", None), - sent_starts=token_dict.get("sent_starts", None), - 
brackets=token_dict.get("brackets", None)) - - def to_dict(self): - return {"ids": self.ids, - "words": self.words, - "tags": self.tags, - "pos": self.pos, - "morphs": self.morphs, - "lemmas": self.lemmas, - "heads": self.heads, - "deps": self.deps, - "entities": self.entities, - "sent_starts": self.sent_starts, - "brackets": self.brackets} - - def get_id(self, i): - return self.ids[i] if i < len(self.ids) else i - - def get_word(self, i): - return self.words[i] if i < len(self.words) else "" - - def get_tag(self, i): - return self.tags[i] if i < len(self.tags) else "-" - - def get_pos(self, i): - return self.pos[i] if i < len(self.pos) else "" - - def get_morph(self, i): - return self.morphs[i] if i < len(self.morphs) else "" - - def get_lemma(self, i): - return self.lemmas[i] if i < len(self.lemmas) else "" - - def get_head(self, i): - return self.heads[i] if i < len(self.heads) else i - - def get_dep(self, i): - return self.deps[i] if i < len(self.deps) else "" - - def get_entity(self, i): - return self.entities[i] if i < len(self.entities) else "-" - - def get_sent_start(self, i): - return self.sent_starts[i] if i < len(self.sent_starts) else None - - def __str__(self): - return str(self.to_dict()) - - def __repr__(self): - return self.__str__() - - -cdef class DocAnnotation: - def __init__(self, cats=None, links=None): - self.cats = cats if cats else {} - self.links = links if links else {} - - @classmethod - def from_dict(cls, doc_dict): - return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None)) - - def to_dict(self): - return {"cats": self.cats, "links": self.links} - - def __str__(self): - return str(self.to_dict()) - - def __repr__(self): - return self.__str__() - - -cdef class Example: - def __init__(self, doc_annotation=None, token_annotation=None, doc=None, - goldparse=None): - """ Doc can either be text, or an actual Doc """ - self.doc = doc - self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation() - self.token_annotation = token_annotation if token_annotation else TokenAnnotation() - self.goldparse = goldparse - - @classmethod - def from_gold(cls, goldparse, doc=None): - doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links) - token_annotation = goldparse.get_token_annotation() - return cls(doc_annotation, token_annotation, doc) - - @classmethod - def from_dict(cls, example_dict, doc=None): - token_dict = example_dict.get("token_annotation", {}) - token_annotation = TokenAnnotation.from_dict(token_dict) - doc_dict = example_dict.get("doc_annotation", {}) - doc_annotation = DocAnnotation.from_dict(doc_dict) - return cls(doc_annotation, token_annotation, doc) - - def to_dict(self): - """ Note that this method does NOT export the doc, only the annotations ! 
""" - token_dict = self.token_annotation.to_dict() - doc_dict = self.doc_annotation.to_dict() - return {"token_annotation": token_dict, "doc_annotation": doc_dict} - - @property - def text(self): - if self.doc is None: - return None - if isinstance(self.doc, Doc): - return self.doc.text - return self.doc - - @property - def gold(self): - if self.goldparse is None: - doc, gold = self.get_gold_parses()[0] - self.goldparse = gold - return self.goldparse - - def set_token_annotation(self, ids=None, words=None, tags=None, pos=None, - morphs=None, lemmas=None, heads=None, deps=None, - entities=None, sent_starts=None, brackets=None): - self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, - pos=pos, morphs=morphs, lemmas=lemmas, heads=heads, - deps=deps, entities=entities, - sent_starts=sent_starts, brackets=brackets) - - def set_doc_annotation(self, cats=None, links=None): - if cats: - self.doc_annotation.cats = cats - if links: - self.doc_annotation.links = links - - def split_sents(self): - """ Split the token annotations into multiple Examples based on - sent_starts and return a list of the new Examples""" - if not self.token_annotation.words: - return [self] - s_example = Example(doc=None, doc_annotation=self.doc_annotation) - s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], [] - s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], [] - s_brackets = [] - sent_start_i = 0 - cdef TokenAnnotation t = self.token_annotation - split_examples = [] - cdef int b_start, b_end - cdef unicode b_label - for i in range(len(t.words)): - if i > 0 and t.sent_starts[i] == 1: - s_example.set_token_annotation(ids=s_ids, - words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs, - lemmas=s_lemmas, heads=s_heads, deps=s_deps, - entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) - split_examples.append(s_example) - s_example = Example(doc=None, doc_annotation=self.doc_annotation) - s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], [] - s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] - s_sent_starts, s_brackets = [], [] - sent_start_i = i - s_ids.append(t.get_id(i)) - s_words.append(t.get_word(i)) - s_tags.append(t.get_tag(i)) - s_pos.append(t.get_pos(i)) - s_morphs.append(t.get_morph(i)) - s_lemmas.append(t.get_lemma(i)) - s_heads.append(t.get_head(i) - sent_start_i) - s_deps.append(t.get_dep(i)) - s_ents.append(t.get_entity(i)) - s_sent_starts.append(t.get_sent_start(i)) - for b_end, b_label in t.brackets_by_start.get(i, []): - s_brackets.append( - (i - sent_start_i, b_end - sent_start_i, b_label) - ) - i += 1 - s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, - pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads, - deps=s_deps, entities=s_ents, sent_starts=s_sent_starts, - brackets=s_brackets) - split_examples.append(s_example) - return split_examples - - - def get_gold_parses(self, merge=True, vocab=None, make_projective=False, - ignore_misaligned=False): - """Return a list of (doc, GoldParse) objects. 
- If merge is set to True, keep all Token annotations as one big list.""" - d = self.doc_annotation - # merge == do not modify Example - if merge: - t = self.token_annotation - doc = self.doc - if doc is None or not isinstance(doc, Doc): - if not vocab: - raise ValueError(Errors.E998) - doc = Doc(vocab, words=t.words) - try: - gp = GoldParse.from_annotation(doc, d, t, - make_projective=make_projective) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - return [(doc, gp)] - # not merging: one GoldParse per sentence, defining docs with the words - # from each sentence - else: - parses = [] - split_examples = self.split_sents() - for split_example in split_examples: - if not vocab: - raise ValueError(Errors.E998) - split_doc = Doc(vocab, words=split_example.token_annotation.words) - try: - gp = GoldParse.from_annotation(split_doc, d, - split_example.token_annotation, - make_projective=make_projective) - except AlignmentError: - if ignore_misaligned: - gp = None - else: - raise - if gp is not None: - parses.append((split_doc, gp)) - return parses - - @classmethod - def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False): - """ - Return a list of Example objects, from a variety of input formats. - make_doc needs to be provided when the examples contain text strings and keep_raw_text=False - """ - if isinstance(examples, Example): - return [examples] - if isinstance(examples, tuple): - examples = [examples] - converted_examples = [] - for ex in examples: - if isinstance(ex, Example): - converted_examples.append(ex) - # convert string to Doc to Example - elif isinstance(ex, str): - if keep_raw_text: - converted_examples.append(Example(doc=ex)) - else: - doc = make_doc(ex) - converted_examples.append(Example(doc=doc)) - # convert Doc to Example - elif isinstance(ex, Doc): - converted_examples.append(Example(doc=ex)) - # convert tuples to Example - elif isinstance(ex, tuple) and len(ex) == 2: - doc, gold = ex - gold_dict = {} - # convert string to Doc - if isinstance(doc, str) and not keep_raw_text: - doc = make_doc(doc) - # convert dict to GoldParse - if isinstance(gold, dict): - gold_dict = gold - if doc is not None or gold.get("words", None) is not None: - gold = GoldParse(doc, **gold) - else: - gold = None - if gold is not None: - converted_examples.append(Example.from_gold(goldparse=gold, doc=doc)) - else: - raise ValueError(Errors.E999.format(gold_dict=gold_dict)) - else: - converted_examples.append(ex) - return converted_examples - - -cdef class GoldParse: - """Collection for training annotations. 
- - DOCS: https://spacy.io/api/goldparse - """ - @classmethod - def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): - return cls(doc, words=token_annotation.words, - tags=token_annotation.tags, - pos=token_annotation.pos, - morphs=token_annotation.morphs, - lemmas=token_annotation.lemmas, - heads=token_annotation.heads, - deps=token_annotation.deps, - entities=token_annotation.entities, - sent_starts=token_annotation.sent_starts, - cats=doc_annotation.cats, - links=doc_annotation.links, - make_projective=make_projective) - - def get_token_annotation(self): - ids = None - if self.words: - ids = list(range(len(self.words))) - - return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, - pos=self.pos, morphs=self.morphs, - lemmas=self.lemmas, heads=self.heads, - deps=self.labels, entities=self.ner, - sent_starts=self.sent_starts) - - def __init__(self, doc, words=None, tags=None, pos=None, morphs=None, - lemmas=None, heads=None, deps=None, entities=None, - sent_starts=None, make_projective=False, cats=None, - links=None): - """Create a GoldParse. The fields will not be initialized if len(doc) is zero. - - doc (Doc): The document the annotations refer to. - words (iterable): A sequence of unicode word strings. - tags (iterable): A sequence of strings, representing tag annotations. - pos (iterable): A sequence of strings, representing UPOS annotations. - morphs (iterable): A sequence of strings, representing morph - annotations. - lemmas (iterable): A sequence of strings, representing lemma - annotations. - heads (iterable): A sequence of integers, representing syntactic - head offsets. - deps (iterable): A sequence of strings, representing the syntactic - relation types. - entities (iterable): A sequence of named entity annotations, either as - BILUO tag strings, or as `(start_char, end_char, label)` tuples, - representing the entity positions. - sent_starts (iterable): A sequence of sentence position tags, 1 for - the first word in a sentence, 0 for all others. - cats (dict): Labels for text classification. Each key in the dictionary - may be a string or an int, or a `(start_char, end_char, label)` - tuple, indicating that the label is applied to only part of the - document (usually a sentence). Unlike entity annotations, label - annotations can overlap, i.e. a single word can be covered by - multiple labelled spans. The TextCategorizer component expects - true examples of a label to have the value 1.0, and negative - examples of a label to have the value 0.0. Labels not in the - dictionary are treated as missing - the gradient for those labels - will be zero. - links (dict): A dict with `(start_char, end_char)` keys, - and the values being dicts with kb_id:value entries, - representing the external IDs in a knowledge base (KB) - mapped to either 1.0 or 0.0, indicating positive and - negative examples respectively. - make_projective (bool): Whether to projectivize the dependency tree. - RETURNS (GoldParse): The newly constructed object. 
- """ - self.mem = Pool() - self.loss = 0 - self.length = len(doc) - - self.cats = {} if cats is None else dict(cats) - self.links = {} if links is None else dict(links) - - # temporary doc for aligning entity annotation - entdoc = None - - # avoid allocating memory if the doc does not contain any tokens - if self.length == 0: - self.words = [] - self.tags = [] - self.heads = [] - self.labels = [] - self.ner = [] - self.morphs = [] - # set a minimal orig so that the scorer can score an empty doc - self.orig = TokenAnnotation(ids=[]) - else: - if not words: - words = [token.text for token in doc] - if not tags: - tags = [None for _ in words] - if not pos: - pos = [None for _ in words] - if not morphs: - morphs = [None for _ in words] - if not lemmas: - lemmas = [None for _ in words] - if not heads: - heads = [None for _ in words] - if not deps: - deps = [None for _ in words] - if not sent_starts: - sent_starts = [None for _ in words] - if entities is None: - entities = ["-" for _ in words] - elif len(entities) == 0: - entities = ["O" for _ in words] - else: - # Translate the None values to '-', to make processing easier. - # See Issue #2603 - entities = [(ent if ent is not None else "-") for ent in entities] - if not isinstance(entities[0], str): - # Assume we have entities specified by character offset. - # Create a temporary Doc corresponding to provided words - # (to preserve gold tokenization) and text (to preserve - # character offsets). - entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) - entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) - entdoc_entities = biluo_tags_from_offsets(entdoc, entities) - # There may be some additional whitespace tokens in the - # temporary doc, so check that the annotations align with - # the provided words while building a list of BILUO labels. - entities = [] - words_offset = 0 - for i in range(len(entdoc_words)): - if words[i + words_offset] == entdoc_words[i]: - entities.append(entdoc_entities[i]) - else: - words_offset -= 1 - if len(entities) != len(words): - warnings.warn(Warnings.W029.format(text=doc.text)) - entities = ["-" for _ in words] - - # These are filled by the tagger/parser/entity recogniser - self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t)) - self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int)) - self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition)) - - self.words = [None] * len(doc) - self.tags = [None] * len(doc) - self.pos = [None] * len(doc) - self.morphs = [None] * len(doc) - self.lemmas = [None] * len(doc) - self.heads = [None] * len(doc) - self.labels = [None] * len(doc) - self.ner = [None] * len(doc) - self.sent_starts = [None] * len(doc) - - # This needs to be done before we align the words - if make_projective and any(heads) and any(deps) : - heads, deps = nonproj.projectivize(heads, deps) - - # Do many-to-one alignment for misaligned tokens. - # If we over-segment, we'll have one gold word that covers a sequence - # of predicted words - # If we under-segment, we'll have one predicted word that covers a - # sequence of gold words. - # If we "mis-segment", we'll have a sequence of predicted words covering - # a sequence of gold words. That's many-to-many -- we don't do that - # except for NER spans where the start and end can be aligned. 
- cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) - - self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] - self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] - - self.orig = TokenAnnotation(ids=list(range(len(words))), - words=words, tags=tags, pos=pos, morphs=morphs, - lemmas=lemmas, heads=heads, deps=deps, entities=entities, - sent_starts=sent_starts, brackets=[]) - - for i, gold_i in enumerate(self.cand_to_gold): - if doc[i].text.isspace(): - self.words[i] = doc[i].text - self.tags[i] = "_SP" - self.pos[i] = "SPACE" - self.morphs[i] = None - self.lemmas[i] = None - self.heads[i] = None - self.labels[i] = None - self.ner[i] = None - self.sent_starts[i] = 0 - if gold_i is None: - if i in i2j_multi: - self.words[i] = words[i2j_multi[i]] - self.tags[i] = tags[i2j_multi[i]] - self.pos[i] = pos[i2j_multi[i]] - self.morphs[i] = morphs[i2j_multi[i]] - self.lemmas[i] = lemmas[i2j_multi[i]] - self.sent_starts[i] = sent_starts[i2j_multi[i]] - is_last = i2j_multi[i] != i2j_multi.get(i+1) - # Set next word in multi-token span as head, until last - if not is_last: - self.heads[i] = i+1 - self.labels[i] = "subtok" - else: - head_i = heads[i2j_multi[i]] - if head_i: - self.heads[i] = self.gold_to_cand[head_i] - self.labels[i] = deps[i2j_multi[i]] - ner_tag = entities[i2j_multi[i]] - # Assign O/- for many-to-one O/- NER tags - if ner_tag in ("O", "-"): - self.ner[i] = ner_tag - else: - self.words[i] = words[gold_i] - self.tags[i] = tags[gold_i] - self.pos[i] = pos[gold_i] - self.morphs[i] = morphs[gold_i] - self.lemmas[i] = lemmas[gold_i] - self.sent_starts[i] = sent_starts[gold_i] - if heads[gold_i] is None: - self.heads[i] = None - else: - self.heads[i] = self.gold_to_cand[heads[gold_i]] - self.labels[i] = deps[gold_i] - self.ner[i] = entities[gold_i] - # Assign O/- for one-to-many O/- NER tags - for j, cand_j in enumerate(self.gold_to_cand): - if cand_j is None: - if j in j2i_multi: - i = j2i_multi[j] - ner_tag = entities[j] - if ner_tag in ("O", "-"): - self.ner[i] = ner_tag - - # If there is entity annotation and some tokens remain unaligned, - # align all entities at the character level to account for all - # possible token misalignments within the entity spans - if any([e not in ("O", "-") for e in entities]) and None in self.ner: - # If the temporary entdoc wasn't created above, initialize it - if not entdoc: - entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) - entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) - # Get offsets based on gold words and BILUO entities - entdoc_offsets = offsets_from_biluo_tags(entdoc, entities) - aligned_offsets = [] - aligned_spans = [] - # Filter offsets to identify those that align with doc tokens - for offset in entdoc_offsets: - span = doc.char_span(offset[0], offset[1]) - if span and not span.text.isspace(): - aligned_offsets.append(offset) - aligned_spans.append(span) - # Convert back to BILUO for doc tokens and assign NER for all - # aligned spans - biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None) - for span in aligned_spans: - for i in range(span.start, span.end): - self.ner[i] = biluo_tags[i] - - # Prevent whitespace that isn't within entities from being tagged as - # an entity. 
- for i in range(len(self.ner)): - if self.tags[i] == "_SP": - prev_ner = self.ner[i-1] if i >= 1 else None - next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None - if prev_ner == "O" or next_ner == "O": - self.ner[i] = "O" - - cycle = nonproj.contains_cycle(self.heads) - if cycle is not None: - raise ValueError(Errors.E069.format(cycle=cycle, - cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]), - doc_tokens=" ".join(words[:50]))) - - def __len__(self): - """Get the number of gold-standard tokens. - - RETURNS (int): The number of gold-standard tokens. - """ - return self.length - - @property - def is_projective(self): - """Whether the provided syntactic annotations form a projective - dependency tree. - """ - return not nonproj.is_nonproj_tree(self.heads) - - -def docs_to_json(docs, id=0, ner_missing_tag="O"): - """Convert a list of Doc objects into the JSON-serializable format used by - the spacy train command. - - docs (iterable / Doc): The Doc object(s) to convert. - id (int): Id for the JSON. - RETURNS (dict): The data in spaCy's JSON format - - each input doc will be treated as a paragraph in the output doc - """ - if isinstance(docs, Doc): - docs = [docs] - json_doc = {"id": id, "paragraphs": []} - for i, doc in enumerate(docs): - json_para = {'raw': doc.text, "sentences": [], "cats": []} - for cat, val in doc.cats.items(): - json_cat = {"label": cat, "value": val} - json_para["cats"].append(json_cat) - ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] - biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag) - for j, sent in enumerate(doc.sents): - json_sent = {"tokens": [], "brackets": []} - for token in sent: - json_token = {"id": token.i, "orth": token.text} - if doc.is_tagged: - json_token["tag"] = token.tag_ - json_token["pos"] = token.pos_ - json_token["morph"] = token.morph_ - json_token["lemma"] = token.lemma_ - if doc.is_parsed: - json_token["head"] = token.head.i-token.i - json_token["dep"] = token.dep_ - json_token["ner"] = biluo_tags[token.i] - json_sent["tokens"].append(json_token) - json_para["sentences"].append(json_sent) - json_doc["paragraphs"].append(json_para) - return json_doc - - -def biluo_tags_from_offsets(doc, entities, missing="O"): - """Encode labelled spans into per-token tags, using the - Begin/In/Last/Unit/Out scheme (BILUO). - - doc (Doc): The document that the entity offsets refer to. The output tags - will refer to the token boundaries within the document. - entities (iterable): A sequence of `(start, end, label)` triples. `start` - and `end` should be character-offset integers denoting the slice into - the original string. - RETURNS (list): A list of unicode strings, describing the tags. Each tag - string will be of the form either "", "O" or "{action}-{label}", where - action is one of "B", "I", "L", "U". The string "-" is used where the - entity offsets don't align with the tokenization in the `Doc` object. - The training algorithm will view these as missing values. "O" denotes a - non-entity token. "B" denotes the beginning of a multi-token entity, - "I" the inside of an entity of three or more tokens, and "L" the end - of an entity of two or more tokens. "U" denotes a single-token entity. - - EXAMPLE: - >>> text = 'I like London.' 
- >>> entities = [(len('I like '), len('I like London'), 'LOC')] - >>> doc = nlp.tokenizer(text) - >>> tags = biluo_tags_from_offsets(doc, entities) - >>> assert tags == ["O", "O", 'U-LOC', "O"] - """ - # Ensure no overlapping entity labels exist - tokens_in_ents = {} - - starts = {token.idx: token.i for token in doc} - ends = {token.idx + len(token): token.i for token in doc} - biluo = ["-" for _ in doc] - # Handle entity cases - for start_char, end_char, label in entities: - for token_index in range(start_char, end_char): - if token_index in tokens_in_ents.keys(): - raise ValueError(Errors.E103.format( - span1=(tokens_in_ents[token_index][0], - tokens_in_ents[token_index][1], - tokens_in_ents[token_index][2]), - span2=(start_char, end_char, label))) - tokens_in_ents[token_index] = (start_char, end_char, label) - - start_token = starts.get(start_char) - end_token = ends.get(end_char) - # Only interested if the tokenization is correct - if start_token is not None and end_token is not None: - if start_token == end_token: - biluo[start_token] = f"U-{label}" - else: - biluo[start_token] = f"B-{label}" - for i in range(start_token+1, end_token): - biluo[i] = f"I-{label}" - biluo[end_token] = f"L-{label}" - # Now distinguish the O cases from ones where we miss the tokenization - entity_chars = set() - for start_char, end_char, label in entities: - for i in range(start_char, end_char): - entity_chars.add(i) - for token in doc: - for i in range(token.idx, token.idx + len(token)): - if i in entity_chars: - break - else: - biluo[token.i] = missing - if "-" in biluo: - ent_str = str(entities) - warnings.warn(Warnings.W030.format( - text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text, - entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str - )) - return biluo - - -def spans_from_biluo_tags(doc, tags): - """Encode per-token tags following the BILUO scheme into Span object, e.g. - to overwrite the doc.ents. - - doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one - token. Each tags string will be of the form of either "", "O" or - "{action}-{label}", where action is one of "B", "I", "L", "U". - RETURNS (list): A sequence of Span objects. - """ - token_offsets = tags_to_entities(tags) - spans = [] - for label, start_idx, end_idx in token_offsets: - span = Span(doc, start_idx, end_idx + 1, label=label) - spans.append(span) - return spans - - -def offsets_from_biluo_tags(doc, tags): - """Encode per-token tags following the BILUO scheme into entity offsets. - - doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one - token. Each tags string will be of the form of either "", "O" or - "{action}-{label}", where action is one of "B", "I", "L", "U". - RETURNS (list): A sequence of `(start, end, label)` triples. `start` and - `end` will be character-offset integers denoting the slice into the - original string. 
- """ - spans = spans_from_biluo_tags(doc, tags) - return [(span.start_char, span.end_char, span.label_) for span in spans] - - -def is_punct_label(label): - return label == "P" or label.lower() == "punct" diff --git a/spacy/gold/__init__.pxd b/spacy/gold/__init__.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py new file mode 100644 index 000000000..9416bdd81 --- /dev/null +++ b/spacy/gold/__init__.py @@ -0,0 +1,11 @@ +from .corpus import Corpus +from .example import Example +from .align import align + +from .iob_utils import iob_to_biluo, biluo_to_iob +from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags +from .iob_utils import spans_from_biluo_tags +from .iob_utils import tags_to_entities + +from .gold_io import docs_to_json +from .gold_io import read_json_file diff --git a/spacy/gold/align.pxd b/spacy/gold/align.pxd new file mode 100644 index 000000000..ea3615863 --- /dev/null +++ b/spacy/gold/align.pxd @@ -0,0 +1,8 @@ +cdef class Alignment: + cdef public object cost + cdef public object i2j + cdef public object j2i + cdef public object i2j_multi + cdef public object j2i_multi + cdef public object cand_to_gold + cdef public object gold_to_cand diff --git a/spacy/gold/align.pyx b/spacy/gold/align.pyx new file mode 100644 index 000000000..80ba0346a --- /dev/null +++ b/spacy/gold/align.pyx @@ -0,0 +1,101 @@ +import numpy +from ..errors import Errors, AlignmentError + + +cdef class Alignment: + def __init__(self, spacy_words, gold_words): + # Do many-to-one alignment for misaligned tokens. + # If we over-segment, we'll have one gold word that covers a sequence + # of predicted words + # If we under-segment, we'll have one predicted word that covers a + # sequence of gold words. + # If we "mis-segment", we'll have a sequence of predicted words covering + # a sequence of gold words. That's many-to-many -- we don't do that + # except for NER spans where the start and end can be aligned. + cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words) + self.cost = cost + self.i2j = i2j + self.j2i = j2i + self.i2j_multi = i2j_multi + self.j2i_multi = j2i_multi + self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] + self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] + + +def align(tokens_a, tokens_b): + """Calculate alignment tables between two tokenizations. + + tokens_a (List[str]): The candidate tokenization. + tokens_b (List[str]): The reference tokenization. + RETURNS: (tuple): A 5-tuple consisting of the following information: + * cost (int): The number of misaligned tokens. + * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`. + For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns + to `tokens_b[6]`. If there's no one-to-one alignment for a token, + it has the value -1. + * b2a (List[int]): The same as `a2b`, but mapping the other direction. + * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a` + to indices in `tokens_b`, where multiple tokens of `tokens_a` align to + the same token of `tokens_b`. + * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other + direction. 
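+
+    A hand-traced sketch (illustrative only; the values follow the logic below,
+    for a candidate tokenization that splits "'s" into two tokens):
+
+        >>> cost, a2b, b2a, a2b_multi, b2a_multi = align(
+        ...     ["i", "listened", "to", "obama", "'", "s", "podcasts", "."],
+        ...     ["i", "listened", "to", "obama", "'s", "podcasts", "."],
+        ... )
+        >>> a2b.tolist()
+        [0, 1, 2, 3, -1, -1, 5, 6]
+        >>> a2b_multi
+        {4: 4, 5: 4}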
+ """ + tokens_a = _normalize_for_alignment(tokens_a) + tokens_b = _normalize_for_alignment(tokens_b) + cost = 0 + a2b = numpy.empty(len(tokens_a), dtype="i") + b2a = numpy.empty(len(tokens_b), dtype="i") + a2b.fill(-1) + b2a.fill(-1) + a2b_multi = {} + b2a_multi = {} + i = 0 + j = 0 + offset_a = 0 + offset_b = 0 + while i < len(tokens_a) and j < len(tokens_b): + a = tokens_a[i][offset_a:] + b = tokens_b[j][offset_b:] + if a == b: + if offset_a == offset_b == 0: + a2b[i] = j + b2a[j] = i + elif offset_a == 0: + cost += 2 + a2b_multi[i] = j + elif offset_b == 0: + cost += 2 + b2a_multi[j] = i + offset_a = offset_b = 0 + i += 1 + j += 1 + elif a == "": + assert offset_a == 0 + cost += 1 + i += 1 + elif b == "": + assert offset_b == 0 + cost += 1 + j += 1 + elif b.startswith(a): + cost += 1 + if offset_a == 0: + a2b_multi[i] = j + i += 1 + offset_a = 0 + offset_b += len(a) + elif a.startswith(b): + cost += 1 + if offset_b == 0: + b2a_multi[j] = i + j += 1 + offset_b = 0 + offset_a += len(b) + else: + assert "".join(tokens_a) != "".join(tokens_b) + raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b)) + return cost, a2b, b2a, a2b_multi, b2a_multi + + +def _normalize_for_alignment(tokens): + return [w.replace(" ", "").lower() for w in tokens] diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py new file mode 100644 index 000000000..45cfc0abe --- /dev/null +++ b/spacy/gold/augment.py @@ -0,0 +1,111 @@ +import random +import itertools + + +def make_orth_variants_example(nlp, example, orth_variant_level=0.0): # TODO: naming + raw_text = example.text + orig_dict = example.to_dict() + variant_text, variant_token_annot = make_orth_variants( + nlp, raw_text, orig_dict["token_annotation"], orth_variant_level + ) + doc = nlp.make_doc(variant_text) + orig_dict["token_annotation"] = variant_token_annot + return example.from_dict(doc, orig_dict) + + +def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): + if random.random() >= orth_variant_level: + return raw_text, orig_token_dict + if not orig_token_dict: + return raw_text, orig_token_dict + raw = raw_text + token_dict = orig_token_dict + lower = False + if random.random() >= 0.5: + lower = True + if raw is not None: + raw = raw.lower() + ndsv = nlp.Defaults.single_orth_variants + ndpv = nlp.Defaults.paired_orth_variants + words = token_dict.get("words", []) + tags = token_dict.get("tags", []) + # keep unmodified if words or tags are not defined + if words and tags: + if lower: + words = [w.lower() for w in words] + # single variants + punct_choices = [random.choice(x["variants"]) for x in ndsv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndsv)): + if ( + tags[word_idx] in ndsv[punct_idx]["tags"] + and words[word_idx] in ndsv[punct_idx]["variants"] + ): + words[word_idx] = punct_choices[punct_idx] + # paired variants + punct_choices = [random.choice(x["variants"]) for x in ndpv] + for word_idx in range(len(words)): + for punct_idx in range(len(ndpv)): + if tags[word_idx] in ndpv[punct_idx]["tags"] and words[ + word_idx + ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]): + # backup option: random left vs. 
right from pair + pair_idx = random.choice([0, 1]) + # best option: rely on paired POS tags like `` / '' + if len(ndpv[punct_idx]["tags"]) == 2: + pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx]) + # next best option: rely on position in variants + # (may not be unambiguous, so order of variants matters) + else: + for pair in ndpv[punct_idx]["variants"]: + if words[word_idx] in pair: + pair_idx = pair.index(words[word_idx]) + words[word_idx] = punct_choices[punct_idx][pair_idx] + token_dict["words"] = words + token_dict["tags"] = tags + # modify raw + if raw is not None: + variants = [] + for single_variants in ndsv: + variants.extend(single_variants["variants"]) + for paired_variants in ndpv: + variants.extend( + list(itertools.chain.from_iterable(paired_variants["variants"])) + ) + # store variants in reverse length order to be able to prioritize + # longer matches (e.g., "---" before "--") + variants = sorted(variants, key=lambda x: len(x)) + variants.reverse() + variant_raw = "" + raw_idx = 0 + # add initial whitespace + while raw_idx < len(raw) and raw[raw_idx].isspace(): + variant_raw += raw[raw_idx] + raw_idx += 1 + for word in words: + match_found = False + # skip whitespace words + if word.isspace(): + match_found = True + # add identical word + elif word not in variants and raw[raw_idx:].startswith(word): + variant_raw += word + raw_idx += len(word) + match_found = True + # add variant word + else: + for variant in variants: + if not match_found and raw[raw_idx:].startswith(variant): + raw_idx += len(variant) + variant_raw += word + match_found = True + # something went wrong, abort + # (add a warning message?) + if not match_found: + return raw_text, orig_token_dict + # add following whitespace + while raw_idx < len(raw) and raw[raw_idx].isspace(): + variant_raw += raw[raw_idx] + raw_idx += 1 + raw = variant_raw + return raw, token_dict diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py new file mode 100644 index 000000000..3e366933a --- /dev/null +++ b/spacy/gold/converters/__init__.py @@ -0,0 +1,6 @@ +from .iob2docs import iob2docs # noqa: F401 +from .conll_ner2docs import conll_ner2docs # noqa: F401 +from .json2docs import json2docs + +# TODO: Update this one +# from .conllu2docs import conllu2docs # noqa: F401 diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/gold/converters/conll_ner2docs.py similarity index 80% rename from spacy/cli/converters/conll_ner2json.py rename to spacy/gold/converters/conll_ner2docs.py index b607d5913..0b348142a 100644 --- a/spacy/cli/converters/conll_ner2json.py +++ b/spacy/gold/converters/conll_ner2docs.py @@ -1,17 +1,18 @@ from wasabi import Printer +from .. import tags_to_entities from ...gold import iob_to_biluo from ...lang.xx import MultiLanguage -from ...tokens.doc import Doc +from ...tokens import Doc, Span from ...util import load_model -def conll_ner2json( +def conll_ner2docs( input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs ): """ Convert files in the CoNLL-2003 NER format and similar - whitespace-separated columns into JSON format for use with train cli. + whitespace-separated columns into Doc objects. The first column is the tokens, the final column is the IOB tags. If an additional second column is present, the second column is the tags. @@ -81,17 +82,25 @@ def conll_ner2json( "No document delimiters found. Use `-n` to automatically group " "sentences into documents." 
) + + if model: + nlp = load_model(model) + else: + nlp = MultiLanguage() output_docs = [] - for doc in input_data.strip().split(doc_delimiter): - doc = doc.strip() - if not doc: + for conll_doc in input_data.strip().split(doc_delimiter): + conll_doc = conll_doc.strip() + if not conll_doc: continue - output_doc = [] - for sent in doc.split("\n\n"): - sent = sent.strip() - if not sent: + words = [] + sent_starts = [] + pos_tags = [] + biluo_tags = [] + for conll_sent in conll_doc.split("\n\n"): + conll_sent = conll_sent.strip() + if not conll_sent: continue - lines = [line.strip() for line in sent.split("\n") if line.strip()] + lines = [line.strip() for line in conll_sent.split("\n") if line.strip()] cols = list(zip(*[line.split() for line in lines])) if len(cols) < 2: raise ValueError( @@ -99,25 +108,19 @@ def conll_ner2json( "Try checking whitespace and delimiters. See " "https://spacy.io/api/cli#convert" ) - words = cols[0] - iob_ents = cols[-1] - if len(cols) > 2: - tags = cols[1] - else: - tags = ["-"] * len(words) - biluo_ents = iob_to_biluo(iob_ents) - output_doc.append( - { - "tokens": [ - {"orth": w, "tag": tag, "ner": ent} - for (w, tag, ent) in zip(words, tags, biluo_ents) - ] - } - ) - output_docs.append( - {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]} - ) - output_doc = [] + length = len(cols[0]) + words.extend(cols[0]) + sent_starts.extend([True] + [False] * (length - 1)) + biluo_tags.extend(iob_to_biluo(cols[-1])) + pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length) + + doc = Doc(nlp.vocab, words=words) + for i, token in enumerate(doc): + token.tag_ = pos_tags[i] + token.is_sent_start = sent_starts[i] + entities = tags_to_entities(biluo_tags) + doc.ents = [Span(doc, start=s, end=e + 1, label=L) for L, s, e in entities] + output_docs.append(doc) return output_docs diff --git a/spacy/cli/converters/conllu2json.py b/spacy/gold/converters/conllu2json.py similarity index 86% rename from spacy/cli/converters/conllu2json.py rename to spacy/gold/converters/conllu2json.py index 1ece755b8..73fdf57e7 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/gold/converters/conllu2json.py @@ -1,10 +1,10 @@ import re +from .conll_ner2docs import n_sents_info from ...gold import Example -from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets +from ...gold import iob_to_biluo, spans_from_biluo_tags from ...language import Language from ...tokens import Doc, Token -from .conll_ner2json import n_sents_info from wasabi import Printer @@ -12,7 +12,6 @@ def conllu2json( input_data, n_sents=10, append_morphology=False, - lang=None, ner_map=None, merge_subtokens=False, no_print=False, @@ -44,10 +43,7 @@ def conllu2json( raw += example.text sentences.append( generate_sentence( - example.token_annotation, - has_ner_tags, - MISC_NER_PATTERN, - ner_map=ner_map, + example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map, ) ) # Real-sized documents could be extracted using the comments on the @@ -145,21 +141,22 @@ def get_entities(lines, tag_pattern, ner_map=None): return iob_to_biluo(iob) -def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None): +def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None): sentence = {} tokens = [] - for i, id_ in enumerate(token_annotation.ids): + token_annotation = example_dict["token_annotation"] + for i, id_ in enumerate(token_annotation["ids"]): token = {} token["id"] = id_ - token["orth"] = token_annotation.get_word(i) - token["tag"] = 
token_annotation.get_tag(i) - token["pos"] = token_annotation.get_pos(i) - token["lemma"] = token_annotation.get_lemma(i) - token["morph"] = token_annotation.get_morph(i) - token["head"] = token_annotation.get_head(i) - id_ - token["dep"] = token_annotation.get_dep(i) + token["orth"] = token_annotation["words"][i] + token["tag"] = token_annotation["tags"][i] + token["pos"] = token_annotation["pos"][i] + token["lemma"] = token_annotation["lemmas"][i] + token["morph"] = token_annotation["morphs"][i] + token["head"] = token_annotation["heads"][i] - i + token["dep"] = token_annotation["deps"][i] if has_ner_tags: - token["ner"] = token_annotation.get_entity(i) + token["ner"] = example_dict["doc_annotation"]["entities"][i] tokens.append(token) sentence["tokens"] = tokens return sentence @@ -267,40 +264,25 @@ def example_from_conllu_sentence( doc = merge_conllu_subtokens(lines, doc) # create Example from custom Doc annotation - ids, words, tags, heads, deps = [], [], [], [], [] - pos, lemmas, morphs, spaces = [], [], [], [] + words, spaces, tags, morphs, lemmas = [], [], [], [], [] for i, t in enumerate(doc): - ids.append(i) words.append(t._.merged_orth) + lemmas.append(t._.merged_lemma) + spaces.append(t._.merged_spaceafter) + morphs.append(t._.merged_morph) if append_morphology and t._.merged_morph: tags.append(t.tag_ + "__" + t._.merged_morph) else: tags.append(t.tag_) - pos.append(t.pos_) - morphs.append(t._.merged_morph) - lemmas.append(t._.merged_lemma) - heads.append(t.head.i) - deps.append(t.dep_) - spaces.append(t._.merged_spaceafter) - ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] - ents = biluo_tags_from_offsets(doc, ent_offsets) - raw = "" - for word, space in zip(words, spaces): - raw += word - if space: - raw += " " - example = Example(doc=raw) - example.set_token_annotation( - ids=ids, - words=words, - tags=tags, - pos=pos, - morphs=morphs, - lemmas=lemmas, - heads=heads, - deps=deps, - entities=ents, - ) + + doc_x = Doc(vocab, words=words, spaces=spaces) + ref_dict = Example(doc_x, reference=doc).to_dict() + ref_dict["words"] = words + ref_dict["lemmas"] = lemmas + ref_dict["spaces"] = spaces + ref_dict["tags"] = tags + ref_dict["morphs"] = morphs + example = Example.from_dict(doc_x, ref_dict) return example diff --git a/spacy/gold/converters/iob2docs.py b/spacy/gold/converters/iob2docs.py new file mode 100644 index 000000000..51321a470 --- /dev/null +++ b/spacy/gold/converters/iob2docs.py @@ -0,0 +1,64 @@ +from wasabi import Printer + +from .conll_ner2docs import n_sents_info +from ...gold import iob_to_biluo, tags_to_entities +from ...tokens import Doc, Span +from ...util import minibatch + + +def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs): + """ + Convert IOB files with one sentence per line and tags separated with '|' + into Doc objects so they can be saved. IOB and IOB2 are accepted. 
+ + Sample formats: + + I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O + I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O + I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O + I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O + """ + msg = Printer(no_print=no_print) + if n_sents > 0: + n_sents_info(msg, n_sents) + docs = read_iob(input_data.split("\n"), vocab, n_sents) + return docs + + +def read_iob(raw_sents, vocab, n_sents): + docs = [] + for group in minibatch(raw_sents, size=n_sents): + tokens = [] + words = [] + tags = [] + iob = [] + sent_starts = [] + for line in group: + if not line.strip(): + continue + sent_tokens = [t.split("|") for t in line.split()] + if len(sent_tokens[0]) == 3: + sent_words, sent_tags, sent_iob = zip(*sent_tokens) + elif len(sent_tokens[0]) == 2: + sent_words, sent_iob = zip(*sent_tokens) + sent_tags = ["-"] * len(sent_words) + else: + raise ValueError( + "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert" + ) + words.extend(sent_words) + tags.extend(sent_tags) + iob.extend(sent_iob) + tokens.extend(sent_tokens) + sent_starts.append(True) + sent_starts.extend([False for _ in sent_words[1:]]) + doc = Doc(vocab, words=words) + for i, tag in enumerate(tags): + doc[i].tag_ = tag + for i, sent_start in enumerate(sent_starts): + doc[i].is_sent_start = sent_start + biluo = iob_to_biluo(iob) + entities = tags_to_entities(biluo) + doc.ents = [Span(doc, start=s, end=e+1, label=L) for (L, s, e) in entities] + docs.append(doc) + return docs diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py new file mode 100644 index 000000000..50ad16faf --- /dev/null +++ b/spacy/gold/converters/json2docs.py @@ -0,0 +1,24 @@ +import srsly +from ..gold_io import json_iterate, json_to_annotations +from ..example import annotations2doc +from ..example import _fix_legacy_dict_data, _parse_example_dict_data +from ...util import load_model +from ...lang.xx import MultiLanguage + + +def json2docs(input_data, model=None, **kwargs): + nlp = load_model(model) if model is not None else MultiLanguage() + if not isinstance(input_data, bytes): + if not isinstance(input_data, str): + input_data = srsly.json_dumps(input_data) + input_data = input_data.encode("utf8") + docs = [] + for json_doc in json_iterate(input_data): + for json_para in json_to_annotations(json_doc): + example_dict = _fix_legacy_dict_data(json_para) + tok_dict, doc_dict = _parse_example_dict_data(example_dict) + if json_para.get("raw"): + assert tok_dict.get("SPACY") + doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) + docs.append(doc) + return docs diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py new file mode 100644 index 000000000..086c95fb2 --- /dev/null +++ b/spacy/gold/corpus.py @@ -0,0 +1,122 @@ +import random +from .. import util +from .example import Example +from ..tokens import DocBin, Doc + + +class Corpus: + """An annotated corpus, reading train and dev datasets from + the DocBin (.spacy) format. + + DOCS: https://spacy.io/api/goldcorpus + """ + + def __init__(self, train_loc, dev_loc, limit=0): + """Create a Corpus. + + train (str / Path): File or directory of training data. + dev (str / Path): File or directory of development data. + limit (int): Max. number of examples returned + RETURNS (Corpus): The newly created object. 
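+
+        A minimal usage sketch (the file paths and the `nlp` object are
+        placeholders, not defined in this module):
+
+            >>> corpus = Corpus("./train.spacy", "./dev.spacy", limit=0)
+            >>> train_examples = list(corpus.train_dataset(nlp, gold_preproc=False))
+            >>> dev_examples = list(corpus.dev_dataset(nlp))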
+ """ + self.train_loc = train_loc + self.dev_loc = dev_loc + self.limit = limit + + @staticmethod + def walk_corpus(path): + path = util.ensure_path(path) + if not path.is_dir(): + return [path] + paths = [path] + locs = [] + seen = set() + for path in paths: + if str(path) in seen: + continue + seen.add(str(path)) + if path.parts[-1].startswith("."): + continue + elif path.is_dir(): + paths.extend(path.iterdir()) + elif path.parts[-1].endswith(".spacy"): + locs.append(path) + return locs + + def make_examples(self, nlp, reference_docs, max_length=0): + for reference in reference_docs: + if max_length >= 1 and len(reference) >= max_length: + if reference.is_sentenced: + for ref_sent in reference.sents: + yield Example( + nlp.make_doc(ref_sent.text), + ref_sent.as_doc() + ) + else: + yield Example( + nlp.make_doc(reference.text), + reference + ) + + def make_examples_gold_preproc(self, nlp, reference_docs): + for reference in reference_docs: + if reference.is_sentenced: + ref_sents = [sent.as_doc() for sent in reference.sents] + else: + ref_sents = [reference] + for ref_sent in ref_sents: + yield Example( + Doc( + nlp.vocab, + words=[w.text for w in ref_sent], + spaces=[bool(w.whitespace_) for w in ref_sent] + ), + ref_sent + ) + + def read_docbin(self, vocab, locs): + """ Yield training examples as example dicts """ + i = 0 + for loc in locs: + loc = util.ensure_path(loc) + if loc.parts[-1].endswith(".spacy"): + with loc.open("rb") as file_: + doc_bin = DocBin().from_bytes(file_.read()) + docs = doc_bin.get_docs(vocab) + for doc in docs: + if len(doc): + yield doc + i += 1 + if self.limit >= 1 and i >= self.limit: + break + + def count_train(self, nlp): + """Returns count of words in train examples""" + n = 0 + i = 0 + for example in self.train_dataset(nlp): + n += len(example.predicted) + if self.limit >= 0 and i >= self.limit: + break + i += 1 + return n + + def train_dataset(self, nlp, *, shuffle=True, gold_preproc=False, + max_length=0, **kwargs): + ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) + if gold_preproc: + examples = self.make_examples_gold_preproc(nlp, ref_docs) + else: + examples = self.make_examples(nlp, ref_docs, max_length) + if shuffle: + examples = list(examples) + random.shuffle(examples) + yield from examples + + def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs): + ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc)) + if gold_preproc: + examples = self.make_examples_gold_preproc(nlp, ref_docs) + else: + examples = self.make_examples(nlp, ref_docs, max_length=0) + yield from examples diff --git a/spacy/gold/example.pxd b/spacy/gold/example.pxd new file mode 100644 index 000000000..736969ecd --- /dev/null +++ b/spacy/gold/example.pxd @@ -0,0 +1,8 @@ +from ..tokens.doc cimport Doc +from .align cimport Alignment + + +cdef class Example: + cdef readonly Doc x + cdef readonly Doc y + cdef readonly Alignment _alignment diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx new file mode 100644 index 000000000..febbf50fc --- /dev/null +++ b/spacy/gold/example.pyx @@ -0,0 +1,432 @@ +import warnings + +import numpy + +from ..tokens import Token +from ..tokens.doc cimport Doc +from ..tokens.span cimport Span +from ..tokens.span import Span +from ..attrs import IDS +from .align cimport Alignment +from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc +from .iob_utils import spans_from_biluo_tags +from .align import Alignment +from ..errors import Errors, AlignmentError +from ..syntax import 
nonproj +from ..util import get_words_and_spaces + + +cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): + """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """ + attrs, array = _annot2array(vocab, tok_annot, doc_annot) + output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) + if "entities" in doc_annot: + _add_entities_to_doc(output, doc_annot["entities"]) + if array.size: + output = output.from_array(attrs, array) + # links are currently added with ENT_KB_ID on the token level + output.cats.update(doc_annot.get("cats", {})) + return output + + +cdef class Example: + def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None): + """ Doc can either be text, or an actual Doc """ + msg = "Example.__init__ got None for '{arg}'. Requires Doc." + if predicted is None: + raise TypeError(msg.format(arg="predicted")) + if reference is None: + raise TypeError(msg.format(arg="reference")) + self.x = predicted + self.y = reference + self._alignment = alignment + + property predicted: + def __get__(self): + return self.x + + def __set__(self, doc): + self.x = doc + + property reference: + def __get__(self): + return self.y + + def __set__(self, doc): + self.y = doc + + def copy(self): + return Example( + self.x.copy(), + self.y.copy() + ) + + @classmethod + def from_dict(cls, Doc predicted, dict example_dict): + if example_dict is None: + raise ValueError("Example.from_dict expected dict, received None") + if not isinstance(predicted, Doc): + raise TypeError(f"Argument 1 should be Doc. Got {type(predicted)}") + example_dict = _fix_legacy_dict_data(example_dict) + tok_dict, doc_dict = _parse_example_dict_data(example_dict) + if "ORTH" not in tok_dict: + tok_dict["ORTH"] = [tok.text for tok in predicted] + tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] + if not _has_field(tok_dict, "SPACY"): + tok_dict["SPACY"] = _guess_spaces(predicted.text, tok_dict["ORTH"]) + return Example( + predicted, + annotations2doc(predicted.vocab, tok_dict, doc_dict) + ) + + @property + def alignment(self): + if self._alignment is None: + spacy_words = [token.orth_ for token in self.predicted] + gold_words = [token.orth_ for token in self.reference] + if gold_words == []: + gold_words = spacy_words + self._alignment = Alignment(spacy_words, gold_words) + return self._alignment + + def get_aligned(self, field, as_string=False): + """Return an aligned array for a token attribute.""" + i2j_multi = self.alignment.i2j_multi + cand_to_gold = self.alignment.cand_to_gold + + vocab = self.reference.vocab + gold_values = self.reference.to_array([field]) + output = [None] * len(self.predicted) + for i, gold_i in enumerate(cand_to_gold): + if self.predicted[i].text.isspace(): + output[i] = None + if gold_i is None: + if i in i2j_multi: + output[i] = gold_values[i2j_multi[i]] + else: + output[i] = None + else: + output[i] = gold_values[gold_i] + if as_string and field not in ["ENT_IOB", "SENT_START"]: + output = [vocab.strings[o] if o is not None else o for o in output] + return output + + def get_aligned_parse(self, projectivize=True): + cand_to_gold = self.alignment.cand_to_gold + gold_to_cand = self.alignment.gold_to_cand + aligned_heads = [None] * self.x.length + aligned_deps = [None] * self.x.length + heads = [token.head.i for token in self.y] + deps = [token.dep_ for token in self.y] + heads, deps = nonproj.projectivize(heads, deps) + for cand_i in range(self.x.length): + gold_i = cand_to_gold[cand_i] + if gold_i is not None: # 
Alignment found + gold_head = gold_to_cand[heads[gold_i]] + if gold_head is not None: + aligned_heads[cand_i] = gold_head + aligned_deps[cand_i] = deps[gold_i] + return aligned_heads, aligned_deps + + def get_aligned_ner(self): + if not self.y.is_nered: + return [None] * len(self.x) # should this be 'missing' instead of 'None' ? + x_text = self.x.text + # Get a list of entities, and make spans for non-entity tokens. + # We then work through the spans in order, trying to find them in + # the text and using that to get the offset. Any token that doesn't + # get a tag set this way is tagged None. + # This could maybe be improved? It at least feels easy to reason about. + y_spans = list(self.y.ents) + y_spans.sort() + x_text_offset = 0 + x_spans = [] + for y_span in y_spans: + if x_text.count(y_span.text) >= 1: + start_char = x_text.index(y_span.text) + x_text_offset + end_char = start_char + len(y_span.text) + x_span = self.x.char_span(start_char, end_char, label=y_span.label) + if x_span is not None: + x_spans.append(x_span) + x_text = self.x.text[end_char:] + x_text_offset = end_char + x_tags = biluo_tags_from_offsets( + self.x, + [(e.start_char, e.end_char, e.label_) for e in x_spans], + missing=None + ) + gold_to_cand = self.alignment.gold_to_cand + for token in self.y: + if token.ent_iob_ == "O": + cand_i = gold_to_cand[token.i] + if cand_i is not None and x_tags[cand_i] is None: + x_tags[cand_i] = "O" + i2j_multi = self.alignment.i2j_multi + for i, tag in enumerate(x_tags): + if tag is None and i in i2j_multi: + gold_i = i2j_multi[i] + if gold_i is not None and self.y[gold_i].ent_iob_ == "O": + x_tags[i] = "O" + return x_tags + + def to_dict(self): + return { + "doc_annotation": { + "cats": dict(self.reference.cats), + "entities": biluo_tags_from_doc(self.reference), + "links": self._links_to_dict() + }, + "token_annotation": { + "ids": [t.i+1 for t in self.reference], + "words": [t.text for t in self.reference], + "tags": [t.tag_ for t in self.reference], + "lemmas": [t.lemma_ for t in self.reference], + "pos": [t.pos_ for t in self.reference], + "morphs": [t.morph_ for t in self.reference], + "heads": [t.head.i for t in self.reference], + "deps": [t.dep_ for t in self.reference], + "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference] + } + } + + def _links_to_dict(self): + links = {} + for ent in self.reference.ents: + if ent.kb_id_: + links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0} + return links + + + def split_sents(self): + """ Split the token annotations into multiple Examples based on + sent_starts and return a list of the new Examples""" + if not self.reference.is_sentenced: + return [self] + + sent_starts = self.get_aligned("SENT_START") + sent_starts.append(1) # appending virtual start of a next sentence to facilitate search + + output = [] + pred_start = 0 + for sent in self.reference.sents: + new_ref = sent.as_doc() + pred_end = sent_starts.index(1, pred_start+1) # find where the next sentence starts + new_pred = self.predicted[pred_start : pred_end].as_doc() + output.append(Example(new_pred, new_ref)) + pred_start = pred_end + + return output + + property text: + def __get__(self): + return self.x.text + + def __str__(self): + return str(self.to_dict()) + + def __repr__(self): + return str(self.to_dict()) + + +def _annot2array(vocab, tok_annot, doc_annot): + attrs = [] + values = [] + + for key, value in doc_annot.items(): + if value: + if key == "entities": + pass + elif key == "links": + entities = doc_annot.get("entities", {}) + if not entities: 
+ raise ValueError(Errors.E981) + ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities) + tok_annot["ENT_KB_ID"] = ent_kb_ids + elif key == "cats": + pass + else: + raise ValueError(f"Unknown doc attribute: {key}") + + for key, value in tok_annot.items(): + if key not in IDS: + raise ValueError(f"Unknown token attribute: {key}") + elif key in ["ORTH", "SPACY"]: + pass + elif key == "HEAD": + attrs.append(key) + values.append([h-i for i, h in enumerate(value)]) + elif key == "SENT_START": + attrs.append(key) + values.append(value) + elif key == "MORPH": + attrs.append(key) + values.append([vocab.morphology.add(v) for v in value]) + else: + attrs.append(key) + values.append([vocab.strings.add(v) for v in value]) + + array = numpy.asarray(values, dtype="uint64") + return attrs, array.T + + +def _add_entities_to_doc(doc, ner_data): + if ner_data is None: + return + elif ner_data == []: + doc.ents = [] + elif isinstance(ner_data[0], tuple): + return _add_entities_to_doc( + doc, + biluo_tags_from_offsets(doc, ner_data) + ) + elif isinstance(ner_data[0], str) or ner_data[0] is None: + return _add_entities_to_doc( + doc, + spans_from_biluo_tags(doc, ner_data) + ) + elif isinstance(ner_data[0], Span): + # Ugh, this is super messy. Really hard to set O entities + doc.ents = ner_data + doc.ents = [span for span in ner_data if span.label_] + else: + raise ValueError("Unexpected type for NER data") + + +def _parse_example_dict_data(example_dict): + return ( + example_dict["token_annotation"], + example_dict["doc_annotation"] + ) + + +def _fix_legacy_dict_data(example_dict): + token_dict = example_dict.get("token_annotation", {}) + doc_dict = example_dict.get("doc_annotation", {}) + for key, value in example_dict.items(): + if value: + if key in ("token_annotation", "doc_annotation"): + pass + elif key == "ids": + pass + elif key in ("cats", "links"): + doc_dict[key] = value + elif key in ("ner", "entities"): + doc_dict["entities"] = value + else: + token_dict[key] = value + # Remap keys + remapping = { + "words": "ORTH", + "tags": "TAG", + "pos": "POS", + "lemmas": "LEMMA", + "deps": "DEP", + "heads": "HEAD", + "sent_starts": "SENT_START", + "morphs": "MORPH", + "spaces": "SPACY", + } + old_token_dict = token_dict + token_dict = {} + for key, value in old_token_dict.items(): + if key in ("text", "ids", "brackets"): + pass + elif key in remapping: + token_dict[remapping[key]] = value + else: + raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys())) + text = example_dict.get("text", example_dict.get("raw")) + if "HEAD" in token_dict and "SENT_START" in token_dict: + # If heads are set, we don't also redundantly specify SENT_START. + token_dict.pop("SENT_START") + warnings.warn("Ignoring annotations for sentence starts, as dependency heads are set") + return { + "token_annotation": token_dict, + "doc_annotation": doc_dict + } + +def _has_field(annot, field): + if field not in annot: + return False + elif annot[field] is None: + return False + elif len(annot[field]) == 0: + return False + elif all([value is None for value in annot[field]]): + return False + else: + return True + + +def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): + if isinstance(biluo_or_offsets[0], (list, tuple)): + # Convert to biluo if necessary + # This is annoying but to convert the offsets we need a Doc + # that has the target tokenization. 
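+        # For example, words ["I", "like", "London", "."] with the offset
+        # annotation [(7, 13, "LOC")] become ["O", "O", "U-LOC", "O"] here.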
+ reference = Doc(vocab, words=words, spaces=spaces) + biluo = biluo_tags_from_offsets(reference, biluo_or_offsets) + else: + biluo = biluo_or_offsets + ent_iobs = [] + ent_types = [] + for iob_tag in biluo_to_iob(biluo): + if iob_tag in (None, "-"): + ent_iobs.append("") + ent_types.append("") + else: + ent_iobs.append(iob_tag.split("-")[0]) + if iob_tag.startswith("I") or iob_tag.startswith("B"): + ent_types.append(iob_tag.split("-", 1)[1]) + else: + ent_types.append("") + return ent_iobs, ent_types + +def _parse_links(vocab, words, links, entities): + reference = Doc(vocab, words=words) + starts = {token.idx: token.i for token in reference} + ends = {token.idx + len(token): token.i for token in reference} + ent_kb_ids = ["" for _ in reference] + entity_map = [(ent[0], ent[1]) for ent in entities] + + # links annotations need to refer 1-1 to entity annotations - throw error otherwise + for index, annot_dict in links.items(): + start_char, end_char = index + if (start_char, end_char) not in entity_map: + raise ValueError(Errors.E981) + + for index, annot_dict in links.items(): + true_kb_ids = [] + for key, value in annot_dict.items(): + if value == 1.0: + true_kb_ids.append(key) + if len(true_kb_ids) > 1: + raise ValueError(Errors.E980) + + if len(true_kb_ids) == 1: + start_char, end_char = index + start_token = starts.get(start_char) + end_token = ends.get(end_char) + for i in range(start_token, end_token+1): + ent_kb_ids[i] = true_kb_ids[0] + + return ent_kb_ids + + +def _guess_spaces(text, words): + if text is None: + return [True] * len(words) + spaces = [] + text_pos = 0 + # align words with text + for word in words: + try: + word_start = text[text_pos:].index(word) + except ValueError: + spaces.append(True) + continue + text_pos += word_start + len(word) + if text_pos < len(text) and text[text_pos] == " ": + spaces.append(True) + else: + spaces.append(False) + return spaces diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx new file mode 100644 index 000000000..8dbb5f395 --- /dev/null +++ b/spacy/gold/gold_io.pyx @@ -0,0 +1,199 @@ +import warnings +import srsly +from .. import util +from ..errors import Warnings +from ..tokens import Doc +from .iob_utils import biluo_tags_from_offsets, tags_to_entities +import json + + +def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): + """Convert a list of Doc objects into the JSON-serializable format used by + the spacy train command. + + docs (iterable / Doc): The Doc object(s) to convert. + doc_id (int): Id for the JSON. 
+ RETURNS (dict): The data in spaCy's JSON format + - each input doc will be treated as a paragraph in the output doc + """ + if isinstance(docs, Doc): + docs = [docs] + json_doc = {"id": doc_id, "paragraphs": []} + for i, doc in enumerate(docs): + json_para = {'raw': doc.text, "sentences": [], "cats": [], "entities": [], "links": []} + for cat, val in doc.cats.items(): + json_cat = {"label": cat, "value": val} + json_para["cats"].append(json_cat) + for ent in doc.ents: + ent_tuple = (ent.start_char, ent.end_char, ent.label_) + json_para["entities"].append(ent_tuple) + if ent.kb_id_: + link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} + json_para["links"].append(link_dict) + ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] + biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag) + for j, sent in enumerate(doc.sents): + json_sent = {"tokens": [], "brackets": []} + for token in sent: + json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_} + if doc.is_tagged: + json_token["tag"] = token.tag_ + json_token["pos"] = token.pos_ + json_token["morph"] = token.morph_ + json_token["lemma"] = token.lemma_ + if doc.is_parsed: + json_token["head"] = token.head.i-token.i + json_token["dep"] = token.dep_ + json_sent["tokens"].append(json_token) + json_para["sentences"].append(json_sent) + json_doc["paragraphs"].append(json_para) + return json_doc + + +def read_json_file(loc, docs_filter=None, limit=None): + """Read Example dictionaries from a json file or directory.""" + loc = util.ensure_path(loc) + if loc.is_dir(): + for filename in loc.iterdir(): + yield from read_json_file(loc / filename, limit=limit) + else: + with loc.open("rb") as file_: + utf8_str = file_.read() + for json_doc in json_iterate(utf8_str): + if docs_filter is not None and not docs_filter(json_doc): + continue + for json_paragraph in json_to_annotations(json_doc): + yield json_paragraph + + +def json_to_annotations(doc): + """Convert an item in the JSON-formatted training data to the format + used by Example. + + doc (dict): One entry in the training data. 
+ YIELDS (tuple): The reformatted data - one training example per paragraph + """ + for paragraph in doc["paragraphs"]: + example = {"text": paragraph.get("raw", None)} + words = [] + spaces = [] + ids = [] + tags = [] + ner_tags = [] + pos = [] + morphs = [] + lemmas = [] + heads = [] + labels = [] + sent_starts = [] + brackets = [] + for sent in paragraph["sentences"]: + sent_start_i = len(words) + for i, token in enumerate(sent["tokens"]): + words.append(token["orth"]) + spaces.append(token.get("space", None)) + ids.append(token.get('id', sent_start_i + i)) + tags.append(token.get("tag", None)) + pos.append(token.get("pos", None)) + morphs.append(token.get("morph", None)) + lemmas.append(token.get("lemma", None)) + if "head" in token: + heads.append(token["head"] + sent_start_i + i) + else: + heads.append(None) + if "dep" in token: + labels.append(token["dep"]) + # Ensure ROOT label is case-insensitive + if labels[-1].lower() == "root": + labels[-1] = "ROOT" + else: + labels.append(None) + ner_tags.append(token.get("ner", None)) + if i == 0: + sent_starts.append(1) + else: + sent_starts.append(0) + if "brackets" in sent: + brackets.extend((b["first"] + sent_start_i, + b["last"] + sent_start_i, b["label"]) + for b in sent["brackets"]) + + example["token_annotation"] = dict( + ids=ids, + words=words, + spaces=spaces, + sent_starts=sent_starts, + brackets=brackets + ) + # avoid including dummy values that looks like gold info was present + if any(tags): + example["token_annotation"]["tags"] = tags + if any(pos): + example["token_annotation"]["pos"] = pos + if any(morphs): + example["token_annotation"]["morphs"] = morphs + if any(lemmas): + example["token_annotation"]["lemmas"] = lemmas + if any(head is not None for head in heads): + example["token_annotation"]["heads"] = heads + if any(labels): + example["token_annotation"]["deps"] = labels + + cats = {} + for cat in paragraph.get("cats", {}): + cats[cat["label"]] = cat["value"] + example["doc_annotation"] = dict( + cats=cats, + entities=ner_tags, + links=paragraph.get("links", []) # TODO: fix/test + ) + yield example + +def json_iterate(bytes utf8_str): + # We should've made these files jsonl...But since we didn't, parse out + # the docs one-by-one to reduce memory usage. + # It's okay to read in the whole file -- just don't parse it into JSON. 
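+    # For example, b'[{"id": 0, "paragraphs": [...]}, {"id": 1, "paragraphs": [...]}]'
+    # yields the dict for doc 0 and then the dict for doc 1, by tracking bracket
+    # and brace depth instead of json-loading the whole top-level list at once.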
+ cdef long file_length = len(utf8_str) + if file_length > 2 ** 30: + warnings.warn(Warnings.W027.format(size=file_length)) + + raw = <char*>utf8_str + cdef int square_depth = 0 + cdef int curly_depth = 0 + cdef int inside_string = 0 + cdef int escape = 0 + cdef long start = -1 + cdef char c + cdef char quote = ord('"') + cdef char backslash = ord("\\") + cdef char open_square = ord("[") + cdef char close_square = ord("]") + cdef char open_curly = ord("{") + cdef char close_curly = ord("}") + for i in range(file_length): + c = raw[i] + if escape: + escape = False + continue + if c == backslash: + escape = True + continue + if c == quote: + inside_string = not inside_string + continue + if inside_string: + continue + if c == open_square: + square_depth += 1 + elif c == close_square: + square_depth -= 1 + elif c == open_curly: + if square_depth == 1 and curly_depth == 0: + start = i + curly_depth += 1 + elif c == close_curly: + curly_depth -= 1 + if square_depth == 1 and curly_depth == 0: + substr = utf8_str[start : i + 1].decode("utf8") + yield srsly.json_loads(substr) + start = -1 diff --git a/spacy/gold/iob_utils.py b/spacy/gold/iob_utils.py new file mode 100644 index 000000000..cd606fecf --- /dev/null +++ b/spacy/gold/iob_utils.py @@ -0,0 +1,209 @@ +import warnings +from ..errors import Errors, Warnings +from ..tokens import Span + + +def iob_to_biluo(tags): + out = [] + tags = list(tags) + while tags: + out.extend(_consume_os(tags)) + out.extend(_consume_ent(tags)) + return out + + +def biluo_to_iob(tags): + out = [] + for tag in tags: + if tag is None: + out.append(tag) + else: + tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1) + out.append(tag) + return out + + +def _consume_os(tags): + while tags and tags[0] == "O": + yield tags.pop(0) + + +def _consume_ent(tags): + if not tags: + return [] + tag = tags.pop(0) + target_in = "I" + tag[1:] + target_last = "L" + tag[1:] + length = 1 + while tags and tags[0] in {target_in, target_last}: + length += 1 + tags.pop(0) + label = tag[2:] + if length == 1: + if len(label) == 0: + raise ValueError(Errors.E177.format(tag=tag)) + return ["U-" + label] + else: + start = "B-" + label + end = "L-" + label + middle = [f"I-{label}" for _ in range(1, length - 1)] + return [start] + middle + [end] + + +def biluo_tags_from_doc(doc, missing="O"): + return biluo_tags_from_offsets( + doc, + [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents], + missing=missing, + ) + + +def biluo_tags_from_offsets(doc, entities, missing="O"): + """Encode labelled spans into per-token tags, using the + Begin/In/Last/Unit/Out scheme (BILUO). + + doc (Doc): The document that the entity offsets refer to. The output tags + will refer to the token boundaries within the document. + entities (iterable): A sequence of `(start, end, label)` triples. `start` + and `end` should be character-offset integers denoting the slice into + the original string. + RETURNS (list): A list of unicode strings, describing the tags. Each tag + string will be of the form either "", "O" or "{action}-{label}", where + action is one of "B", "I", "L", "U". The string "-" is used where the + entity offsets don't align with the tokenization in the `Doc` object. + The training algorithm will view these as missing values. "O" denotes a + non-entity token. "B" denotes the beginning of a multi-token entity, + "I" the inside of an entity of three or more tokens, and "L" the end + of an entity of two or more tokens. "U" denotes a single-token entity. + + EXAMPLE: + >>> text = 'I like London.' 
+ >>> entities = [(len('I like '), len('I like London'), 'LOC')] + >>> doc = nlp.tokenizer(text) + >>> tags = biluo_tags_from_offsets(doc, entities) + >>> assert tags == ["O", "O", 'U-LOC', "O"] + """ + # Ensure no overlapping entity labels exist + tokens_in_ents = {} + + starts = {token.idx: token.i for token in doc} + ends = {token.idx + len(token): token.i for token in doc} + biluo = ["-" for _ in doc] + # Handle entity cases + for start_char, end_char, label in entities: + if not label: + for s in starts: # account for many-to-one + if s >= start_char and s < end_char: + biluo[starts[s]] = "O" + else: + for token_index in range(start_char, end_char): + if token_index in tokens_in_ents.keys(): + raise ValueError( + Errors.E103.format( + span1=( + tokens_in_ents[token_index][0], + tokens_in_ents[token_index][1], + tokens_in_ents[token_index][2], + ), + span2=(start_char, end_char, label), + ) + ) + tokens_in_ents[token_index] = (start_char, end_char, label) + + start_token = starts.get(start_char) + end_token = ends.get(end_char) + # Only interested if the tokenization is correct + if start_token is not None and end_token is not None: + if start_token == end_token: + biluo[start_token] = f"U-{label}" + else: + biluo[start_token] = f"B-{label}" + for i in range(start_token + 1, end_token): + biluo[i] = f"I-{label}" + biluo[end_token] = f"L-{label}" + # Now distinguish the O cases from ones where we miss the tokenization + entity_chars = set() + for start_char, end_char, label in entities: + for i in range(start_char, end_char): + entity_chars.add(i) + for token in doc: + for i in range(token.idx, token.idx + len(token)): + if i in entity_chars: + break + else: + biluo[token.i] = missing + if "-" in biluo and missing != "-": + ent_str = str(entities) + warnings.warn( + Warnings.W030.format( + text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text, + entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str, + ) + ) + return biluo + + +def spans_from_biluo_tags(doc, tags): + """Encode per-token tags following the BILUO scheme into Span object, e.g. + to overwrite the doc.ents. + + doc (Doc): The document that the BILUO tags refer to. + entities (iterable): A sequence of BILUO tags with each tag describing one + token. Each tags string will be of the form of either "", "O" or + "{action}-{label}", where action is one of "B", "I", "L", "U". + RETURNS (list): A sequence of Span objects. + """ + token_offsets = tags_to_entities(tags) + spans = [] + for label, start_idx, end_idx in token_offsets: + span = Span(doc, start_idx, end_idx + 1, label=label) + spans.append(span) + return spans + + +def offsets_from_biluo_tags(doc, tags): + """Encode per-token tags following the BILUO scheme into entity offsets. + + doc (Doc): The document that the BILUO tags refer to. + entities (iterable): A sequence of BILUO tags with each tag describing one + token. Each tags string will be of the form of either "", "O" or + "{action}-{label}", where action is one of "B", "I", "L", "U". + RETURNS (list): A sequence of `(start, end, label)` triples. `start` and + `end` will be character-offset integers denoting the slice into the + original string. + """ + spans = spans_from_biluo_tags(doc, tags) + return [(span.start_char, span.end_char, span.label_) for span in spans] + + +def tags_to_entities(tags): + """ Note that the end index returned by this function is inclusive. 
+ To use it for Span creation, increment the end by 1.""" + entities = [] + start = None + for i, tag in enumerate(tags): + if tag is None: + continue + if tag.startswith("O"): + # TODO: We shouldn't be getting these malformed inputs. Fix this. + if start is not None: + start = None + else: + entities.append(("", i, i)) + continue + elif tag == "-": + continue + elif tag.startswith("I"): + if start is None: + raise ValueError(Errors.E067.format(tags=tags[: i + 1])) + continue + if tag.startswith("U"): + entities.append((tag[2:], i, i)) + elif tag.startswith("B"): + start = i + elif tag.startswith("L"): + entities.append((tag[2:], start, i)) + start = None + else: + raise ValueError(Errors.E068.format(tag=tag)) + return entities diff --git a/spacy/language.py b/spacy/language.py index 94da63a1a..573b83e5f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -529,6 +529,22 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) + def _convert_examples(self, examples): + converted_examples = [] + if isinstance(examples, tuple): + examples = [examples] + for eg in examples: + if isinstance(eg, Example): + converted_examples.append(eg.copy()) + elif isinstance(eg, tuple): + doc, annot = eg + if isinstance(doc, str): + doc = self.make_doc(doc) + converted_examples.append(Example.from_dict(doc, annot)) + else: + raise ValueError(Errors.E979.format(type=type(eg))) + return converted_examples + def update( self, examples, @@ -556,7 +572,7 @@ class Language(object): if len(examples) == 0: return - examples = Example.to_example_objects(examples, make_doc=self.make_doc) + examples = self._convert_examples(examples) if sgd is None: if self._optimizer is None: @@ -604,7 +620,7 @@ class Language(object): # TODO: document if len(examples) == 0: return - examples = Example.to_example_objects(examples, make_doc=self.make_doc) + examples = self._convert_examples(examples) if sgd is None: if self._optimizer is None: self._optimizer = create_default_optimizer() @@ -632,19 +648,6 @@ class Language(object): sgd(W, dW, key=key) return losses - def preprocess_gold(self, examples): - """Can be called before training to pre-process gold data. By default, - it handles nonprojectivity and adds missing tags to the tag map. - - examples (iterable): `Example` objects. - YIELDS (tuple): `Example` objects. - """ - for name, proc in self.pipeline: - if hasattr(proc, "preprocess_gold"): - examples = proc.preprocess_gold(examples) - for ex in examples: - yield ex - def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg): """Allocate models, pre-process training data and acquire a trainer and optimizer. Used as a contextmanager. 
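For orientation, a minimal sketch of the conversion that `update()` now performs via `_convert_examples()` and `Example.from_dict()`; the blank English pipeline, the example text and the entity annotation are illustrative placeholders:

    import spacy
    from spacy.gold import Example

    nlp = spacy.blank("en")  # any Language object would do
    # update() accepts (text, annotations) tuples and converts each one
    # roughly like this:
    doc = nlp.make_doc("I like London.")
    example = Example.from_dict(doc, {"entities": [(7, 13, "LOC")]})
    # the reference Doc now carries the gold entity annotation
    assert example.reference.ents[0].label_ == "LOC"

Passing `[("I like London.", {"entities": [(7, 13, "LOC")]})]` directly to `nlp.update()` goes through the same `_convert_examples()` path, with `make_doc()` applied to the raw text first.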
@@ -662,7 +665,7 @@ class Language(object): # Populate vocab else: for example in get_examples(): - for word in example.token_annotation.words: + for word in [t.text for t in example.reference]: _ = self.vocab[word] # noqa: F841 if cfg.get("device", -1) >= 0: @@ -725,24 +728,26 @@ class Language(object): DOCS: https://spacy.io/api/language#evaluate """ - examples = Example.to_example_objects(examples, make_doc=self.make_doc) + examples = self._convert_examples(examples) if scorer is None: scorer = Scorer(pipeline=self.pipeline) if component_cfg is None: component_cfg = {} + docs = list(eg.predicted for eg in examples) for name, pipe in self.pipeline: kwargs = component_cfg.get(name, {}) kwargs.setdefault("batch_size", batch_size) if not hasattr(pipe, "pipe"): - examples = _pipe(examples, pipe, kwargs) + docs = _pipe(docs, pipe, kwargs) else: - examples = pipe.pipe(examples, as_example=True, **kwargs) - for ex in examples: + docs = pipe.pipe(docs, **kwargs) + for i, (doc, eg) in enumerate(zip(docs, examples)): if verbose: - print(ex.doc) + print(doc) + eg.predicted = doc kwargs = component_cfg.get("scorer", {}) kwargs.setdefault("verbose", verbose) - scorer.score(ex, **kwargs) + scorer.score(eg, **kwargs) return scorer @contextmanager @@ -787,7 +792,6 @@ class Language(object): cleanup=False, component_cfg=None, n_process=1, - as_example=False, ): """Process texts as a stream, and yield `Doc` objects in order. @@ -821,7 +825,6 @@ class Language(object): disable=disable, n_process=n_process, component_cfg=component_cfg, - as_example=as_example, ) for doc, context in zip(docs, contexts): yield (doc, context) @@ -1210,9 +1213,9 @@ def _pipe(examples, proc, kwargs): for arg in ["n_threads", "batch_size"]: if arg in kwargs: kwargs.pop(arg) - for ex in examples: - ex = proc(ex, **kwargs) - yield ex + for eg in examples: + eg = proc(eg, **kwargs) + yield eg def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py index ab2bd9e10..5a8f28dfe 100644 --- a/spacy/ml/_biluo.py +++ b/spacy/ml/_biluo.py @@ -80,13 +80,12 @@ def _get_transition_table( B_start, B_end = (0, n_labels) I_start, I_end = (B_end, B_end + n_labels) L_start, L_end = (I_end, I_end + n_labels) - U_start, U_end = (L_end, L_end + n_labels) + U_start, _ = (L_end, L_end + n_labels) # Using ranges allows us to set specific cells, which is necessary to express # that only actions of the same label are valid continuations. 
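# (Assumed action-index layout, inferred from the offsets above: 0..n-1 = B,
#  n..2n-1 = I, 2n..3n-1 = L, 3n..4n-1 = U, with O as the final action.)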
B_range = numpy.arange(B_start, B_end) I_range = numpy.arange(I_start, I_end) L_range = numpy.arange(L_start, L_end) - O_action = U_end # If this is the last token and the previous action was B or I, only L # of that label is valid table[1, B_range, L_range] = 1 diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index 215cdeda1..896f972c1 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -48,8 +48,7 @@ def forward(model, X, is_train): model.inc_grad("b", dY.sum(axis=0)) dY = dY.reshape((dY.shape[0], nO * nP)) - Wopfi = W.transpose((1, 2, 0, 3)) - Wopfi = model.ops.xp.ascontiguousarray(Wopfi) + Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3))) Wopfi = Wopfi.reshape((nO * nP, nF * nI)) dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi) @@ -59,7 +58,8 @@ def forward(model, X, is_train): model.ops.gemm(dY, Xf, out=dWopfi, trans1=True) dWopfi = dWopfi.reshape((nO, nP, nF, nI)) # (o, p, f, i) --> (f, o, p, i) - model.inc_grad("W", dWopfi.transpose((2, 0, 1, 3))) + dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3))) + model.inc_grad("W", dWopfi) return dXf.reshape((dXf.shape[0], nF, nI)) return Yf, backward diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index b3a9e0815..6f154bc81 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -48,9 +48,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): def mlm_forward(model, docs, is_train): mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) - output, backprop = model.get_ref("wrapped-model").begin_update( - docs - ) # drop=drop + output, backprop = model.get_ref("wrapped-model").begin_update(docs) def mlm_backward(d_output): d_output *= 1 - mask diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 47c94cfa1..d436b1cf6 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -1,5 +1,6 @@ from pydantic import StrictInt from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array +from thinc.api import LayerNorm, Maxout, Mish from ...util import registry from .._precomputable_affine import PrecomputableAffine @@ -16,7 +17,11 @@ def build_tb_parser_model( nO=None, ): t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),) + tok2vec = chain( + tok2vec, + list2array(), + Linear(hidden_width, t2v_width), + ) tok2vec.set_dim("nO", hidden_width) lower = PrecomputableAffine( diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 9db6f982f..0d6834f36 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -1,8 +1,30 @@ -from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic -from thinc.api import ParametricAttention, chain, concatenate, clone, Dropout -from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout -from thinc.api import reduce_sum, Relu, residual, expand_window, HashEmbed -from thinc.api import with_ragged, with_array, with_cpu, uniqued, FeatureExtractor +from thinc.api import ( + Model, + reduce_mean, + Linear, + list2ragged, + Logistic, + ParametricAttention, +) +from thinc.api import chain, concatenate, clone, Dropout +from thinc.api import ( + SparseLinear, + Softmax, + softmax_activation, + Maxout, + reduce_sum, + Relu, + residual, + expand_window, +) +from thinc.api 
import ( + HashEmbed, + with_ragged, + with_array, + with_cpu, + uniqued, + FeatureExtractor, +) from ..spacy_vectors import SpacyVectors from ... import util diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index b1bed1ea1..e329601da 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -147,7 +147,7 @@ def hash_char_embed_bilstm_v1( @registry.architectures.register("spacy.LayerNormalizedMaxout.v1") def LayerNormalizedMaxout(width, maxout_pieces): - return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,) + return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True) @registry.architectures.register("spacy.MultiHashEmbed.v1") diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index f7dad565e..88f27f0bf 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -38,8 +38,9 @@ def forward(model, X, is_train): def init(model, X=None, Y=None): - tok2vec = model.get_ref("tok2vec").initialize(X=X) - lower = model.get_ref("lower").initialize() + model.get_ref("tok2vec").initialize(X=X) + lower = model.get_ref("lower") + lower.initialize() if model.attrs["has_upper"]: statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) model.get_ref("upper").initialize(X=statevecs) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index c45a72b25..8ded3890f 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -51,9 +51,9 @@ class Morphologizer(Tagger): def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): for example in get_examples(): - for i, morph in enumerate(example.token_annotation.morphs): - pos = example.token_annotation.get_pos(i) - morph = Morphology.feats_to_dict(morph) + for i, token in enumerate(example.reference): + pos = token.pos_ + morph = token.morph norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)] if pos: morph["POS"] = pos @@ -91,11 +91,12 @@ class Morphologizer(Tagger): correct = numpy.zeros((scores.shape[0],), dtype="i") guesses = scores.argmax(axis=1) known_labels = numpy.ones((scores.shape[0], 1), dtype="f") - for ex in examples: - gold = ex.gold - for i in range(len(gold.morphs)): - pos = gold.pos[i] if i < len(gold.pos) else "" - morph = gold.morphs[i] + for eg in examples: + pos_tags = eg.get_aligned("POS", as_string=True) + morphs = eg.get_aligned("MORPH", as_string=True) + for i in range(len(morphs)): + pos = pos_tags[i] + morph = morphs[i] feats = Morphology.feats_to_dict(morph) if pos: feats["POS"] = pos @@ -115,7 +116,7 @@ class Morphologizer(Tagger): d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) d_scores *= self.model.ops.asarray(known_labels) loss = (d_scores**2).sum() - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 4e04b96b5..be28dcc85 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -2,7 +2,6 @@ import numpy import srsly import random -from ast import literal_eval from thinc.api import CosineDistance, to_categorical, get_array_module from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy @@ -20,7 +19,7 @@ from .defaults import default_nel, default_senter from .functions import merge_subtokens from ..language import Language, component from ..syntax import nonproj -from ..gold import Example +from 
..gold.example import Example from ..attrs import POS, ID from ..util import link_vectors_to_models, create_default_optimizer from ..parts_of_speech import X @@ -48,56 +47,39 @@ class Pipe(object): def from_nlp(cls, nlp, model, **cfg): return cls(nlp.vocab, model, **cfg) - def _get_doc(self, example): - """ Use this method if the `example` can be both a Doc or an Example """ - if isinstance(example, Doc): - return example - return example.doc - def __init__(self, vocab, model, **cfg): """Create a new pipe instance.""" raise NotImplementedError - def __call__(self, example): + def __call__(self, Doc doc): """Apply the pipe to one document. The document is modified in-place, and returned. Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ - doc = self._get_doc(example) predictions = self.predict([doc]) if isinstance(predictions, tuple) and len(predictions) == 2: scores, tensors = predictions self.set_annotations([doc], scores, tensors=tensors) else: self.set_annotations([doc], predictions) - if isinstance(example, Example): - example.doc = doc - return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + def pipe(self, stream, batch_size=128, n_threads=-1): """Apply the pipe to a stream of documents. Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + for docs in util.minibatch(stream, size=batch_size): predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions self.set_annotations(docs, scores, tensors=tensors) else: self.set_annotations(docs, predictions) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): """Apply the pipeline's model to a batch of docs, without @@ -109,16 +91,6 @@ class Pipe(object): """Modify a batch of documents, using pre-computed scores.""" raise NotImplementedError - def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None): - """Learn from a batch of documents and gold-standard information, - updating the pipe's model. - - Delegates to predict() and get_loss(). 
- """ - if set_annotations: - docs = (self._get_doc(ex) for ex in examples) - docs = list(self.pipe(docs)) - def rehearse(self, examples, sgd=None, losses=None, **config): pass @@ -255,29 +227,16 @@ class Tagger(Pipe): def labels(self): return tuple(self.vocab.morphology.tag_names) - def __call__(self, example): - doc = self._get_doc(example) + def __call__(self, doc): tags = self.predict([doc]) self.set_annotations([doc], tags) - if isinstance(example, Example): - example.doc = doc - return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in util.minibatch(stream, size=batch_size): tag_ids = self.predict(docs) - assert len(docs) == len(examples) - assert len(tag_ids) == len(examples) self.set_annotations(docs, tag_ids) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): if not any(len(doc) for doc in docs): @@ -327,15 +286,19 @@ class Tagger(Pipe): doc.is_tagged = True def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False): - examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. - if not any(len(ex.doc) if ex.doc else 0 for ex in examples): - # Handle cases where there are no tokens in any docs. - return + try: + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): + # Handle cases where there are no tokens in any docs. + return + except AttributeError: + types = set([type(eg) for eg in examples]) + raise ValueError(Errors.E978.format(name="Tagger", method="update", types=types)) set_dropout_rate(self.model, drop) - tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples]) + tag_scores, bp_tag_scores = self.model.begin_update( + [eg.predicted for eg in examples]) for sc in tag_scores: if self.model.ops.xp.isnan(sc.sum()): raise ValueError("nan value in scores") @@ -347,17 +310,20 @@ class Tagger(Pipe): if losses is not None: losses[self.name] += loss if set_annotations: - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] self.set_annotations(docs, self._scores2guesses(tag_scores)) def rehearse(self, examples, drop=0., sgd=None, losses=None): """Perform a 'rehearsal' update, where we try to match the output of an initial model. """ + try: + docs = [eg.predicted for eg in examples] + except AttributeError: + types = set([type(eg) for eg in examples]) + raise ValueError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) if self._rehearsal_model is None: return - examples = Example.to_example_objects(examples) - docs = [ex.doc for ex in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
             return
@@ -373,7 +339,7 @@ class Tagger(Pipe):
 
     def get_loss(self, examples, scores):
         loss_func = SequenceCategoricalCrossentropy(names=self.labels)
-        truths = [eg.gold.tags for eg in examples]
+        truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError("nan value when computing loss")
@@ -389,7 +355,12 @@ class Tagger(Pipe):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = {}
         for example in get_examples():
-            for tag in example.token_annotation.tags:
+            try:
+                y = example.y
+            except AttributeError:
+                raise ValueError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example)))
+            for token in y:
+                tag = token.tag_
                 if tag in orig_tag_map:
                     new_tag_map[tag] = orig_tag_map[tag]
                 else:
@@ -564,9 +535,9 @@ class SentenceRecognizer(Tagger):
         correct = numpy.zeros((scores.shape[0],), dtype="i")
         guesses = scores.argmax(axis=1)
         known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
-        for ex in examples:
-            gold = ex.gold
-            for sent_start in gold.sent_starts:
+        for eg in examples:
+            sent_starts = eg.get_aligned("sent_start")
+            for sent_start in sent_starts:
                 if sent_start is None:
                     correct[idx] = guesses[idx]
                 elif sent_start in tag_index:
@@ -579,7 +550,7 @@ class SentenceRecognizer(Tagger):
         d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
         d_scores *= self.model.ops.asarray(known_labels)
         loss = (d_scores**2).sum()
-        docs = [ex.doc for ex in examples]
+        docs = [eg.predicted for eg in examples]
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
 
@@ -690,8 +661,8 @@ class MultitaskObjective(Tagger):
         gold_examples = nonproj.preprocess_training_data(get_examples())
         # for raw_text, doc_annot in gold_tuples:
         for example in gold_examples:
-            for i in range(len(example.token_annotation.ids)):
-                label = self.make_label(i, example.token_annotation)
+            for token in example.y:
+                label = self.make_label(token)
                 if label is not None and label not in self.labels:
                     self.labels[label] = len(self.labels)
         self.model.initialize()
@@ -709,13 +680,13 @@ class MultitaskObjective(Tagger):
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype="i")
         guesses = scores.argmax(axis=1)
-        golds = [ex.gold for ex in examples]
-        docs = [ex.doc for ex in examples]
-        for i, gold in enumerate(golds):
-            for j in range(len(docs[i])):
-                # Handels alignment for tokenization differences
-                token_annotation = gold.get_token_annotation()
-                label = self.make_label(j, token_annotation)
+        docs = [eg.predicted for eg in examples]
+        for i, eg in enumerate(examples):
+            # Handles alignment for tokenization differences
+            doc_annots = eg.get_aligned()  # TODO
+            for j in range(len(eg.predicted)):
+                tok_annots = {key: values[j] for key, values in doc_annots.items()}
+                label = self.make_label(j, tok_annots)
                 if label is None or label not in self.labels:
                     correct[idx] = guesses[idx]
                 else:
@@ -727,83 +698,49 @@ class MultitaskObjective(Tagger):
         return float(loss), d_scores
 
     @staticmethod
-    def make_dep(i, token_annotation):
-        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
-            return None
-        return token_annotation.deps[i]
+    def make_dep(token):
+        return token.dep_
 
     @staticmethod
-    def make_tag(i, token_annotation):
-        return token_annotation.tags[i]
+    def make_tag(token):
+        return token.tag_
 
     @staticmethod
-    def make_ent(i, token_annotation):
-        if token_annotation.entities is None:
-            return None
-        return token_annotation.entities[i]
+
def make_ent(token): + if token.ent_iob_ == "O": + return "O" + else: + return token.ent_iob_ + "-" + token.ent_type_ @staticmethod - def make_dep_tag_offset(i, token_annotation): - if token_annotation.deps[i] is None or token_annotation.heads[i] is None: - return None - offset = token_annotation.heads[i] - i + def make_dep_tag_offset(token): + dep = token.dep_ + tag = token.tag_ + offset = token.head.i - token.i offset = min(offset, 2) offset = max(offset, -2) - return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}" + return f"{dep}-{tag}:{offset}" @staticmethod - def make_ent_tag(i, token_annotation): - if token_annotation.entities is None or token_annotation.entities[i] is None: - return None + def make_ent_tag(token): + if token.ent_iob_ == "O": + ent = "O" else: - return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}" + ent = token.ent_iob_ + "-" + token.ent_type_ + tag = token.tag_ + return f"{tag}-{ent}" @staticmethod - def make_sent_start(target, token_annotation, cache=True, _cache={}): + def make_sent_start(token): """A multi-task objective for representing sentence boundaries, using BILU scheme. (O is impossible) - - The implementation of this method uses an internal cache that relies - on the identity of the heads array, to avoid requiring a new piece - of gold data. You can pass cache=False if you know the cache will - do the wrong thing. """ - words = token_annotation.words - heads = token_annotation.heads - assert len(words) == len(heads) - assert target < len(words), (target, len(words)) - if cache: - if id(heads) in _cache: - return _cache[id(heads)][target] - else: - for key in list(_cache.keys()): - _cache.pop(key) - sent_tags = ["I-SENT"] * len(words) - _cache[id(heads)] = sent_tags + if token.is_sent_start and token.is_sent_end: + return "U-SENT" + elif token.is_sent_start: + return "B-SENT" else: - sent_tags = ["I-SENT"] * len(words) - - def _find_root(child): - seen = set([child]) - while child is not None and heads[child] != child: - seen.add(child) - child = heads[child] - return child - - sentences = {} - for i in range(len(words)): - root = _find_root(i) - if root is None: - sent_tags[i] = None - else: - sentences.setdefault(root, []).append(i) - for root, span in sorted(sentences.items()): - if len(span) == 1: - sent_tags[span[0]] = "U-SENT" - else: - sent_tags[span[0]] = "B-SENT" - sent_tags[span[-1]] = "L-SENT" - return sent_tags[target] + return "I-SENT" class ClozeMultitask(Pipe): @@ -836,7 +773,7 @@ class ClozeMultitask(Pipe): # token.vector values, but that's a bit inefficient, especially on GPU. # Instead we fetch the index into the vectors table for each of our tokens, # and look them up all at once. This prevents data copying. - ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) + ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples]) target = vectors[ids] gradient = self.distance.get_grad(prediction, target) loss = self.distance.get_loss(prediction, target) @@ -846,11 +783,14 @@ class ClozeMultitask(Pipe): pass def rehearse(self, examples, drop=0., sgd=None, losses=None): - examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. 
set_dropout_rate(self.model, drop) - predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples]) + try: + predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples]) + except AttributeError: + types = set([type(eg) for eg in examples]) + raise ValueError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) bp_predictions(d_predictions) if sgd is not None: @@ -885,18 +825,11 @@ class TextCategorizer(Pipe): def labels(self, value): self.cfg["labels"] = tuple(value) - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in util.minibatch(stream, size=batch_size): scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensors=tensors) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): tensors = [doc.tensor for doc in docs] @@ -917,12 +850,17 @@ class TextCategorizer(Pipe): doc.cats[label] = float(scores[i, j]) def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None): - examples = Example.to_example_objects(examples) - if not any(len(ex.doc) if ex.doc else 0 for ex in examples): - # Handle cases where there are no tokens in any docs. - return + try: + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): + # Handle cases where there are no tokens in any docs. + return + except AttributeError: + types = set([type(eg) for eg in examples]) + raise ValueError(Errors.E978.format(name="TextCategorizer", method="update", types=types)) set_dropout_rate(self.model, drop) - scores, bp_scores = self.model.begin_update([ex.doc for ex in examples]) + scores, bp_scores = self.model.begin_update( + [eg.predicted for eg in examples] + ) loss, d_scores = self.get_loss(examples, scores) bp_scores(d_scores) if sgd is not None: @@ -931,14 +869,17 @@ class TextCategorizer(Pipe): losses.setdefault(self.name, 0.0) losses[self.name] += loss if set_annotations: - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] self.set_annotations(docs, scores=scores) def rehearse(self, examples, drop=0., sgd=None, losses=None): if self._rehearsal_model is None: return - examples = Example.to_example_objects(examples) - docs=[ex.doc for ex in examples] + try: + docs = [eg.predicted for eg in examples] + except AttributeError: + types = set([type(eg) for eg in examples]) + raise ValueError(Errors.E978.format(name="TextCategorizer", method="rehearse", types=types)) if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
return @@ -954,13 +895,12 @@ class TextCategorizer(Pipe): losses[self.name] += (gradient**2).sum() def _examples_to_truth(self, examples): - gold_cats = [ex.doc_annotation.cats for ex in examples] - truths = numpy.zeros((len(gold_cats), len(self.labels)), dtype="f") - not_missing = numpy.ones((len(gold_cats), len(self.labels)), dtype="f") - for i, gold_cat in enumerate(gold_cats): + truths = numpy.zeros((len(examples), len(self.labels)), dtype="f") + not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f") + for i, eg in enumerate(examples): for j, label in enumerate(self.labels): - if label in gold_cat: - truths[i, j] = gold_cat[label] + if label in eg.reference.cats: + truths[i, j] = eg.reference.cats[label] else: not_missing[i, j] = 0. truths = self.model.ops.asarray(truths) @@ -997,7 +937,11 @@ class TextCategorizer(Pipe): # TODO: begin_training is not guaranteed to see all data / labels ? examples = list(get_examples()) for example in examples: - for cat in example.doc_annotation.cats: + try: + y = example.y + except AttributeError: + raise ValueError(Errors.E978.format(name="TextCategorizer", method="update", types=type(example))) + for cat in y.cats: self.add_label(cat) self.require_labels() docs = [Doc(Vocab(), words=["hello"])] @@ -1156,65 +1100,52 @@ class EntityLinker(Pipe): losses.setdefault(self.name, 0.0) if not examples: return 0 - examples = Example.to_example_objects(examples) sentence_docs = [] - docs = [ex.doc for ex in examples] + try: + docs = [eg.predicted for eg in examples] + except AttributeError: + types = set([type(eg) for eg in examples]) + raise ValueError(Errors.E978.format(name="EntityLinker", method="update", types=types)) if set_annotations: # This seems simpler than other ways to get that exact output -- but # it does run the model twice :( predictions = self.model.predict(docs) - golds = [ex.gold for ex in examples] - for doc, gold in zip(docs, golds): - ents_by_offset = dict() + for eg in examples: + sentences = [s for s in eg.predicted.sents] + kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) + for ent in eg.predicted.ents: + kb_id = kb_ids[ent.start] # KB ID of the first token is the same as the whole span + if kb_id: + try: + # find the sentence in the list of sentences. + sent_index = sentences.index(ent.sent) + except AttributeError: + # Catch the exception when ent.sent is None and provide a user-friendly warning + raise RuntimeError(Errors.E030) + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - self.n_sents) - sentences = [s for s in doc.sents] + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - for ent in doc.ents: - ents_by_offset[(ent.start_char, ent.end_char)] = ent - - for entity, kb_dict in gold.links.items(): - if isinstance(entity, str): - entity = literal_eval(entity) - start, end = entity - mention = doc.text[start:end] - - # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt - if not (start, end) in ents_by_offset: - raise RuntimeError(Errors.E188) - - ent = ents_by_offset[(start, end)] - - for kb_id, value in kb_dict.items(): - # Currently only training on the positive instances - we assume there is at least 1 per doc/gold - if value: - try: - # find the sentence in the list of sentences. 
- sent_index = sentences.index(ent.sent) - - except AttributeError: - # Catch the exception when ent.sent is None and provide a user-friendly warning - raise RuntimeError(Errors.E030) - - # get n previous sentences, if there are any - start_sentence = max(0, sent_index - self.n_sents) - - # get n posterior sentences, or as many < n as there are - end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - - # get token positions - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - - # append that span as a doc to training - sent_doc = doc[start_token:end_token].as_doc() - sentence_docs.append(sent_doc) + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + # append that span as a doc to training + sent_doc = eg.predicted[start_token:end_token].as_doc() + sentence_docs.append(sent_doc) set_dropout_rate(self.model, drop) + if not sentence_docs: + warnings.warn(Warnings.W093.format(name="Entity Linker")) + return 0.0 sentence_encodings, bp_context = self.model.begin_update(sentence_docs) - loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds) + loss, d_scores = self.get_similarity_loss( + scores=sentence_encodings, + examples=examples + ) bp_context(d_scores) - if sgd is not None: self.model.finish_update(sgd) @@ -1224,15 +1155,15 @@ class EntityLinker(Pipe): self.set_annotations(docs, predictions) return loss - def get_similarity_loss(self, golds, scores): + def get_similarity_loss(self, examples, scores): entity_encodings = [] - for gold in golds: - for entity, kb_dict in gold.links.items(): - for kb_id, value in kb_dict.items(): - # this loss function assumes we're only using positive examples - if value: - entity_encoding = self.kb.get_vector(kb_id) - entity_encodings.append(entity_encoding) + for eg in examples: + kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) + for ent in eg.predicted.ents: + kb_id = kb_ids[ent.start] + if kb_id: + entity_encoding = self.kb.get_vector(kb_id) + entity_encodings.append(entity_encoding) entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") @@ -1246,10 +1177,12 @@ class EntityLinker(Pipe): def get_loss(self, examples, scores): cats = [] - for ex in examples: - for entity, kb_dict in ex.gold.links.items(): - for kb_id, value in kb_dict.items(): - cats.append([value]) + for eg in examples: + kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) + for ent in eg.predicted.ents: + kb_id = kb_ids[ent.start] + if kb_id: + cats.append([1.0]) cats = self.model.ops.asarray(cats, dtype="float32") if len(scores) != len(cats): @@ -1260,27 +1193,16 @@ class EntityLinker(Pipe): loss = loss / len(cats) return loss, d_scores - def __call__(self, example): - doc = self._get_doc(example) + def __call__(self, doc): kb_ids, tensors = self.predict([doc]) self.set_annotations([doc], kb_ids, tensors=tensors) - if isinstance(example, Example): - example.doc = doc - return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in util.minibatch(stream, size=batch_size): kb_ids, tensors = self.predict(docs) self.set_annotations(docs, kb_ids, tensors=tensors) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): """ 
Return the KB IDs for each entity in each doc, including NIL if there is no prediction """ @@ -1466,7 +1388,7 @@ class Sentencizer(Pipe): ): pass - def __call__(self, example): + def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. example (Doc or Example): The document to process. @@ -1474,7 +1396,6 @@ class Sentencizer(Pipe): DOCS: https://spacy.io/api/sentencizer#call """ - doc = self._get_doc(example) start = 0 seen_period = False for i, token in enumerate(doc): @@ -1488,26 +1409,17 @@ class Sentencizer(Pipe): seen_period = True if start < len(doc): doc[start].is_sent_start = True - if isinstance(example, Example): - example.doc = doc - return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in util.minibatch(stream, size=batch_size): predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions self.set_annotations(docs, scores, tensors=tensors) else: self.set_annotations(docs, predictions) - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): """Apply the pipeline's model to a batch of docs, without diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index 58f647b67..9a8991557 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -70,8 +70,7 @@ class SimpleNER(Pipe): def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None): if not any(_has_ner(eg) for eg in examples): return 0 - examples = Example.to_example_objects(examples) - docs = [ex.doc for ex in examples] + docs = [eg.doc for eg in examples] set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update(docs) loss, d_scores = self.get_loss(examples, scores) @@ -140,8 +139,7 @@ def _has_ner(eg): def _get_labels(examples): labels = set() for eg in examples: - for ner_tag in eg.token_annotation.entities: + for ner_tag in eg.get_aligned("ENT_TYPE", as_string=True): if ner_tag != "O" and ner_tag != "-": - _, label = ner_tag.split("-", 1) - labels.add(label) + labels.add(ner_tag) return list(sorted(labels)) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index de30a55f0..047cf5caa 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -5,7 +5,7 @@ from ..gold import Example from ..tokens import Doc from ..vocab import Vocab from ..language import component -from ..util import link_vectors_to_models, minibatch, eg2doc +from ..util import link_vectors_to_models, minibatch from .defaults import default_tok2vec @@ -51,22 +51,18 @@ class Tok2Vec(Pipe): self.set_annotations([doc], tokvecses) return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + def pipe(self, stream, batch_size=128, n_threads=-1): """Process `Doc` objects as a stream. stream (iterator): A sequence of `Doc` objects to process. batch_size (int): Number of `Doc` objects to group. n_threads (int): Number of threads. YIELDS (iterator): A sequence of `Doc` objects, in order of input. 
""" - for batch in minibatch(stream, batch_size): - batch = list(batch) - if as_example: - docs = [eg2doc(doc) for doc in batch] - else: - docs = batch + for docs in minibatch(stream, batch_size): + docs = list(docs) tokvecses = self.predict(docs) self.set_annotations(docs, tokvecses) - yield from batch + yield from docs def predict(self, docs): """Return a single tensor for a batch of documents. @@ -97,8 +93,7 @@ class Tok2Vec(Pipe): """ if losses is None: losses = {} - examples = Example.to_example_objects(examples) - docs = [eg.doc for eg in examples] + docs = [eg.predicted for eg in examples] if isinstance(docs, Doc): docs = [docs] set_dropout_rate(self.model, drop) diff --git a/spacy/scorer.py b/spacy/scorer.py index af74db80e..87033d234 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,6 +1,5 @@ import numpy as np -from .gold import tags_to_entities, GoldParse, DocAnnotation from .errors import Errors @@ -275,7 +274,7 @@ class Scorer(object): } def score(self, example, verbose=False, punct_labels=("p", "punct")): - """Update the evaluation scores from a single Doc / GoldParse pair. + """Update the evaluation scores from a single Example. example (Example): The predicted annotations + correct annotations. verbose (bool): Print debugging information. @@ -285,17 +284,9 @@ class Scorer(object): DOCS: https://spacy.io/api/scorer#score """ - if isinstance(example, tuple) and len(example) == 2: - doc, gold = example - else: - gold = example.gold - doc = example.doc - - if len(doc) != len(gold): - doc_annotation = DocAnnotation(cats=gold.cats) - token_annotation = gold.orig - gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation) - orig = gold.orig + doc = example.predicted + gold_doc = example.reference + align = example.alignment gold_deps = set() gold_deps_per_dep = {} gold_tags = set() @@ -303,36 +294,28 @@ class Scorer(object): gold_morphs = set() gold_morphs_per_feat = {} gold_sent_starts = set() - gold_ents = set(tags_to_entities(orig.entities)) - for id_, tag, pos, morph, head, dep, sent_start in zip( - orig.ids, - orig.tags, - orig.pos, - orig.morphs, - orig.heads, - orig.deps, - orig.sent_starts, - ): - gold_tags.add((id_, tag)) - gold_pos.add((id_, pos)) - gold_morphs.add((id_, morph)) - if morph: - for feat in morph.split("|"): + for gold_i, token in enumerate(gold_doc): + gold_tags.add((gold_i, token.tag_)) + gold_pos.add((gold_i, token.pos_)) + gold_morphs.add((gold_i, token.morph_)) + if token.morph_: + for feat in token.morph_.split("|"): field, values = feat.split("=") if field not in self.morphs_per_feat: self.morphs_per_feat[field] = PRFScore() if field not in gold_morphs_per_feat: gold_morphs_per_feat[field] = set() - gold_morphs_per_feat[field].add((id_, feat)) - if sent_start: - gold_sent_starts.add(id_) - if dep not in (None, "") and dep.lower() not in punct_labels: - gold_deps.add((id_, head, dep.lower())) - if dep.lower() not in self.labelled_per_dep: - self.labelled_per_dep[dep.lower()] = PRFScore() - if dep.lower() not in gold_deps_per_dep: - gold_deps_per_dep[dep.lower()] = set() - gold_deps_per_dep[dep.lower()].add((id_, head, dep.lower())) + gold_morphs_per_feat[field].add((gold_i, feat)) + if token.sent_start: + gold_sent_starts.add(gold_i) + dep = token.dep_.lower() + if dep not in punct_labels: + gold_deps.add((gold_i, token.head.i, dep)) + if dep not in self.labelled_per_dep: + self.labelled_per_dep[dep] = PRFScore() + if dep not in gold_deps_per_dep: + gold_deps_per_dep[dep] = set() + gold_deps_per_dep[dep].add((gold_i, token.head.i, 
dep)) cand_deps = set() cand_deps_per_dep = {} cand_tags = set() @@ -343,7 +326,7 @@ class Scorer(object): for token in doc: if token.orth_.isspace(): continue - gold_i = gold.cand_to_gold[token.i] + gold_i = align.cand_to_gold[token.i] if gold_i is None: self.tokens.fp += 1 else: @@ -362,7 +345,7 @@ class Scorer(object): if token.is_sent_start: cand_sent_starts.add(gold_i) if token.dep_.lower() not in punct_labels and token.orth_.strip(): - gold_head = gold.cand_to_gold[token.head.i] + gold_head = align.cand_to_gold[token.head.i] # None is indistinct, so we can't just add it to the set # Multiple (None, None) deps are possible if gold_i is None or gold_head is None: @@ -377,23 +360,30 @@ class Scorer(object): cand_deps_per_dep[token.dep_.lower()].add( (gold_i, gold_head, token.dep_.lower()) ) - if "-" not in [token[-1] for token in orig.entities]: - # Find all NER labels in gold and doc - ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents]) - # Set up all labels for per type scoring and prepare gold per type - gold_per_ents = {ent_label: set() for ent_label in ent_labels} - for ent_label in ent_labels: - if ent_label not in self.ner_per_ents: - self.ner_per_ents[ent_label] = PRFScore() - gold_per_ents[ent_label].update( - [x for x in gold_ents if x[0] == ent_label] - ) - # Find all candidate labels, for all and per type - cand_ents = set() + # Find all NER labels in gold and doc + ent_labels = set( + [k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents] + ) + # Set up all labels for per type scoring and prepare gold per type + gold_per_ents = {ent_label: set() for ent_label in ent_labels} + for ent_label in ent_labels: + if ent_label not in self.ner_per_ents: + self.ner_per_ents[ent_label] = PRFScore() + # Find all candidate labels, for all and per type + gold_ents = set() + cand_ents = set() + # If we have missing values in the gold, we can't easily tell whether + # our NER predictions are true. + # It seems bad but it's what we've always done. 
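# (For context: Token.ent_iob is 0 when the NER annotation is missing/unset,
#  2 for an explicit "O" and 1/3 for I/B, so gold entities are only collected
#  below when every gold token carries an explicit annotation.)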
+ if all(token.ent_iob != 0 for token in gold_doc): + for ent in gold_doc.ents: + gold_ent = (ent.label_, ent.start, ent.end - 1) + gold_ents.add(gold_ent) + gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1)) cand_per_ents = {ent_label: set() for ent_label in ent_labels} for ent in doc.ents: - first = gold.cand_to_gold[ent.start] - last = gold.cand_to_gold[ent.end - 1] + first = align.cand_to_gold[ent.start] + last = align.cand_to_gold[ent.end - 1] if first is None or last is None: self.ner.fp += 1 self.ner_per_ents[ent.label_].fp += 1 @@ -424,40 +414,40 @@ class Scorer(object): set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps) ) if ( - len(gold.cats) > 0 + len(gold_doc.cats) > 0 and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) - == set(gold.cats) - and set(gold.cats) == set(doc.cats) + == set(gold_doc.cats) + and set(gold_doc.cats) == set(doc.cats) ): - goldcat = max(gold.cats, key=gold.cats.get) + goldcat = max(gold_doc.cats, key=gold_doc.cats.get) candcat = max(doc.cats, key=doc.cats.get) if self.textcat_positive_label: self.textcat.score_set( set([self.textcat_positive_label]) & set([candcat]), set([self.textcat_positive_label]) & set([goldcat]), ) - for label in set(gold.cats): + for label in set(gold_doc.cats): self.textcat_auc_per_cat[label].score_set( - doc.cats[label], gold.cats[label] + doc.cats[label], gold_doc.cats[label] ) self.textcat_f_per_cat[label].score_set( set([label]) & set([candcat]), set([label]) & set([goldcat]) ) elif len(self.textcat_f_per_cat) > 0: model_labels = set(self.textcat_f_per_cat) - eval_labels = set(gold.cats) + eval_labels = set(gold_doc.cats) raise ValueError( Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) ) elif len(self.textcat_auc_per_cat) > 0: model_labels = set(self.textcat_auc_per_cat) - eval_labels = set(gold.cats) + eval_labels = set(gold_doc.cats) raise ValueError( Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels) ) if verbose: - gold_words = orig.words + gold_words = gold_doc.words for w_id, h_id, dep in cand_deps - gold_deps: print("F", gold_words[w_id], dep, gold_words[h_id]) for w_id, h_id, dep in gold_deps - cand_deps: diff --git a/spacy/syntax/_beam_utils.pxd b/spacy/syntax/_beam_utils.pxd deleted file mode 100644 index cf99ac3d1..000000000 --- a/spacy/syntax/_beam_utils.pxd +++ /dev/null @@ -1,9 +0,0 @@ -from ..typedefs cimport hash_t, class_t - -# These are passed as callbacks to thinc.search.Beam -cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1 - -cdef int check_final_state(void* _state, void* extra_args) except -1 - - -cdef hash_t hash_state(void* _state, void* _) except 0 diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx deleted file mode 100644 index 03702e54e..000000000 --- a/spacy/syntax/_beam_utils.pyx +++ /dev/null @@ -1,329 +0,0 @@ -# cython: infer_types=True, profile=True -cimport numpy as np -from cpython.ref cimport PyObject, Py_XDECREF -from thinc.extra.search cimport Beam -from thinc.extra.search cimport MaxViolation - -from thinc.extra.search import MaxViolation -import numpy - -from ..typedefs cimport hash_t, class_t -from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParse -from .stateclass cimport StateC, StateClass - -from ..errors import Errors - - -# These are passed as callbacks to thinc.search.Beam -cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: - dest = <StateC*>_dest - 
src = <StateC*>_src - moves = <const Transition*>_moves - dest.clone(src) - moves[clas].do(dest, moves[clas].label) - dest.push_hist(clas) - - -cdef int check_final_state(void* _state, void* extra_args) except -1: - state = <StateC*>_state - return state.is_final() - - -cdef hash_t hash_state(void* _state, void* _) except 0: - state = <StateC*>_state - if state.is_final(): - return 1 - else: - return state.hash() - - -def collect_states(beams): - cdef StateClass state - cdef Beam beam - states = [] - for state_or_beam in beams: - if isinstance(state_or_beam, StateClass): - states.append(state_or_beam) - else: - beam = state_or_beam - state = StateClass.borrow(<StateC*>beam.at(0)) - states.append(state) - return states - - -cdef class ParserBeam(object): - cdef public TransitionSystem moves - cdef public object states - cdef public object golds - cdef public object beams - cdef public object dones - - def __init__(self, TransitionSystem moves, states, golds, - int width, float density=0.): - self.moves = moves - self.states = states - self.golds = golds - self.beams = [] - cdef Beam beam - cdef StateClass state - cdef StateC* st - for state in states: - beam = Beam(self.moves.n_moves, width, min_density=density) - beam.initialize(self.moves.init_beam_state, - self.moves.del_beam_state, state.c.length, - state.c._sent) - for i in range(beam.width): - st = <StateC*>beam.at(i) - st.offset = state.c.offset - self.beams.append(beam) - self.dones = [False] * len(self.beams) - - @property - def is_done(self): - return all(b.is_done or self.dones[i] - for i, b in enumerate(self.beams)) - - def __getitem__(self, i): - return self.beams[i] - - def __len__(self): - return len(self.beams) - - def advance(self, scores, follow_gold=False): - cdef Beam beam - for i, beam in enumerate(self.beams): - if beam.is_done or not scores[i].size or self.dones[i]: - continue - self._set_scores(beam, scores[i]) - if self.golds is not None: - self._set_costs(beam, self.golds[i], follow_gold=follow_gold) - beam.advance(transition_state, hash_state, <void*>self.moves.c) - beam.check_done(check_final_state, NULL) - # This handles the non-monotonic stuff for the parser. 
- if beam.is_done and self.golds is not None: - for j in range(beam.size): - state = StateClass.borrow(<StateC*>beam.at(j)) - if state.is_final(): - try: - if self.moves.is_gold_parse(state, self.golds[i]): - beam._states[j].loss = 0.0 - except NotImplementedError: - break - - def _set_scores(self, Beam beam, float[:, ::1] scores): - cdef float* c_scores = &scores[0, 0] - cdef int nr_state = min(scores.shape[0], beam.size) - cdef int nr_class = scores.shape[1] - for i in range(nr_state): - state = <StateC*>beam.at(i) - if not state.is_final(): - for j in range(nr_class): - beam.scores[i][j] = c_scores[i * nr_class + j] - self.moves.set_valid(beam.is_valid[i], state) - else: - for j in range(beam.nr_class): - beam.scores[i][j] = 0 - beam.costs[i][j] = 0 - - def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False): - for i in range(beam.size): - state = StateClass.borrow(<StateC*>beam.at(i)) - if not state.is_final(): - self.moves.set_costs(beam.is_valid[i], beam.costs[i], - state, gold) - if follow_gold: - min_cost = 0 - for j in range(beam.nr_class): - if beam.is_valid[i][j] and beam.costs[i][j] < min_cost: - min_cost = beam.costs[i][j] - for j in range(beam.nr_class): - if beam.costs[i][j] > min_cost: - beam.is_valid[i][j] = 0 - - -def get_token_ids(states, int n_tokens): - cdef StateClass state - cdef np.ndarray ids = numpy.zeros((len(states), n_tokens), - dtype='int32', order='C') - c_ids = <int*>ids.data - for i, state in enumerate(states): - if not state.is_final(): - state.c.set_context_tokens(c_ids, n_tokens) - else: - ids[i] = -1 - c_ids += ids.shape[1] - return ids - - -nr_update = 0 - - -def update_beam(TransitionSystem moves, int nr_feature, int max_steps, - states, golds, - state2vec, vec2scores, - int width, losses=None, drop=0., - early_update=True, beam_density=0.0): - global nr_update - cdef MaxViolation violn - nr_update += 1 - pbeam = ParserBeam(moves, states, golds, width=width, density=beam_density) - gbeam = ParserBeam(moves, states, golds, width=width, density=beam_density) - cdef StateClass state - beam_maps = [] - backprops = [] - violns = [MaxViolation() for _ in range(len(states))] - for t in range(max_steps): - if pbeam.is_done and gbeam.is_done: - break - # The beam maps let us find the right row in the flattened scores - # arrays for each state. States are identified by (example id, - # history). We keep a different beam map for each step (since we'll - # have a flat scores array for each step). The beam map will let us - # take the per-state losses, and compute the gradient for each (step, - # state, class). - beam_maps.append({}) - # Gather all states from the two beams in a list. Some stats may occur - # in both beams. To figure out which beam each state belonged to, - # we keep two lists of indices, p_indices and g_indices - states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], - nr_update) - if not states: - break - # Now that we have our flat list of states, feed them through the model - token_ids = get_token_ids(states, nr_feature) - vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) - scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) - - # Store the callbacks for the backward pass - backprops.append((token_ids, bp_vectors, bp_scores)) - - # Unpack the flat scores into lists for the two beams. The indices arrays - # tell us which example and state the scores-row refers to. 
- p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') - for indices in p_indices] - g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') - for indices in g_indices] - # Now advance the states in the beams. The gold beam is constrained to - # to follow only gold analyses. - pbeam.advance(p_scores) - gbeam.advance(g_scores, follow_gold=True) - # Track the "maximum violation", to use in the update. - for i, violn in enumerate(violns): - violn.check_crf(pbeam[i], gbeam[i]) - histories = [] - losses = [] - for violn in violns: - if violn.p_hist: - histories.append(violn.p_hist + violn.g_hist) - losses.append(violn.p_probs + violn.g_probs) - else: - histories.append([]) - losses.append([]) - states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses) - beams = list(pbeam.beams) + list(gbeam.beams) - return states_d_scores, backprops[:len(states_d_scores)], beams - - -def get_states(pbeams, gbeams, beam_map, nr_update): - seen = {} - states = [] - p_indices = [] - g_indices = [] - cdef Beam pbeam, gbeam - if len(pbeams) != len(gbeams): - raise ValueError(Errors.E079.format(pbeams=len(pbeams), gbeams=len(gbeams))) - for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): - p_indices.append([]) - g_indices.append([]) - for i in range(pbeam.size): - state = StateClass.borrow(<StateC*>pbeam.at(i)) - if not state.is_final(): - key = tuple([eg_id] + pbeam.histories[i]) - if key in seen: - raise ValueError(Errors.E080.format(key=key)) - seen[key] = len(states) - p_indices[-1].append(len(states)) - states.append(state) - beam_map.update(seen) - for i in range(gbeam.size): - state = StateClass.borrow(<StateC*>gbeam.at(i)) - if not state.is_final(): - key = tuple([eg_id] + gbeam.histories[i]) - if key in seen: - g_indices[-1].append(seen[key]) - else: - g_indices[-1].append(len(states)) - beam_map[key] = len(states) - states.append(state) - p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices] - g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices] - return states, p_idx, g_idx - - -def get_gradient(nr_class, beam_maps, histories, losses): - """The global model assigns a loss to each parse. The beam scores - are additive, so the same gradient is applied to each action - in the history. This gives the gradient of a single *action* - for a beam state -- so we have "the gradient of loss for taking - action i given history H." - - Histories: Each hitory is a list of actions - Each candidate has a history - Each beam has multiple candidates - Each batch has multiple beams - So history is list of lists of lists of ints - """ - grads = [] - nr_steps = [] - for eg_id, hists in enumerate(histories): - nr_step = 0 - for loss, hist in zip(losses[eg_id], hists): - if loss != 0.0 and not numpy.isnan(loss): - nr_step = max(nr_step, len(hist)) - nr_steps.append(nr_step) - for i in range(max(nr_steps)): - grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), - dtype='f')) - if len(histories) != len(losses): - raise ValueError(Errors.E081.format(n_hist=len(histories), losses=len(losses))) - for eg_id, hists in enumerate(histories): - for loss, hist in zip(losses[eg_id], hists): - if loss == 0.0 or numpy.isnan(loss): - continue - key = tuple([eg_id]) - # Adjust loss for length - # We need to do this because each state in a short path is scored - # multiple times, as we add in the average cost when we run out - # of actions. 
- avg_loss = loss / len(hist) - loss += avg_loss * (nr_steps[eg_id] - len(hist)) - for j, clas in enumerate(hist): - i = beam_maps[j][key] - # In step j, at state i action clas - # resulted in loss - grads[j][i, clas] += loss - key = key + tuple([clas]) - return grads - - -def cleanup_beam(Beam beam): - cdef StateC* state - # Once parsing has finished, states in beam may not be unique. Is this - # correct? - seen = set() - for i in range(beam.width): - addr = <size_t>beam._parents[i].content - if addr not in seen: - state = <StateC*>addr - del state - seen.add(addr) - else: - raise ValueError(Errors.E023.format(addr=addr, i=i)) - addr = <size_t>beam._states[i].content - if addr not in seen: - state = <StateC*>addr - del state - seen.add(addr) - else: - raise ValueError(Errors.E023.format(addr=addr, i=i)) diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 60d22a1ab..d3093d60d 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -16,7 +16,6 @@ from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops, noop from ..typedefs cimport weight_t, class_t, hash_t from ..tokens.doc cimport Doc -from ..gold cimport GoldParse from .stateclass cimport StateClass from .transition_system cimport Transition @@ -24,7 +23,6 @@ from ..compat import copy_array from ..errors import Errors, TempErrors from ..util import link_vectors_to_models, create_default_optimizer from .. import util -from . import _beam_utils from . import nonproj @@ -261,8 +259,7 @@ class ParserStepModel(Model): def mark_class_seen(self, class_): self._class_mask[class_] = 1 - def get_token_ids(self, batch): - states = _beam_utils.collect_states(batch) + def get_token_ids(self, states): cdef StateClass state states = [state for state in states if not state.is_final()] cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF), diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 14d706548..a59be716a 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -3,12 +3,11 @@ from cymem.cymem cimport Pool from .stateclass cimport StateClass from ..typedefs cimport weight_t, attr_t from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParseC cdef class ArcEager(TransitionSystem): pass -cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil -cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil +cdef weight_t push_cost(StateClass stcls, const void* _gold, int target) nogil +cdef weight_t arc_cost(StateClass stcls, const void* _gold, int head, int child) nogil diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 19be95f3f..fcc05de3f 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -1,19 +1,19 @@ # cython: profile=True, cdivision=True, infer_types=True from cpython.ref cimport Py_INCREF -from cymem.cymem cimport Pool -from thinc.extra.search cimport Beam +from cymem.cymem cimport Pool, Address +from libc.stdint cimport int32_t from collections import defaultdict, Counter import json from ..typedefs cimport hash_t, attr_t from ..strings cimport hash_string -from ..gold cimport GoldParse, GoldParseC from ..structs cimport TokenC from ..tokens.doc cimport Doc, set_children_from_heads from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport move_cost_func_t, label_cost_func_t +from ..gold.example cimport Example from ..errors import Errors from .nonproj 
import is_nonproj_tree @@ -49,53 +49,232 @@ MOVE_NAMES[RIGHT] = 'R' MOVE_NAMES[BREAK] = 'B' +cdef enum: + HEAD_IN_STACK = 0 + HEAD_IN_BUFFER + HEAD_UNKNOWN + IS_SENT_START + SENT_START_UNKNOWN + + +cdef struct GoldParseStateC: + char* state_bits + int32_t* n_kids_in_buffer + int32_t* n_kids_in_stack + int32_t* heads + attr_t* labels + int32_t** kids + int32_t* n_kids + int32_t length + int32_t stride + + +cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, + heads, labels, sent_starts) except *: + cdef GoldParseStateC gs + gs.length = len(heads) + gs.stride = 1 + gs.labels = <attr_t*>mem.alloc(gs.length, sizeof(gs.labels[0])) + gs.heads = <int32_t*>mem.alloc(gs.length, sizeof(gs.heads[0])) + gs.n_kids = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids[0])) + gs.state_bits = <char*>mem.alloc(gs.length, sizeof(gs.state_bits[0])) + gs.n_kids_in_buffer = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_buffer[0])) + gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0])) + + for i, is_sent_start in enumerate(sent_starts): + if is_sent_start == True: + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + IS_SENT_START, + 1 + ) + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + SENT_START_UNKNOWN, + 0 + ) + + elif is_sent_start is None: + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + SENT_START_UNKNOWN, + 1 + ) + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + IS_SENT_START, + 0 + ) + else: + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + SENT_START_UNKNOWN, + 0 + ) + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + IS_SENT_START, + 0 + ) + + for i, (head, label) in enumerate(zip(heads, labels)): + if head is not None: + gs.heads[i] = head + gs.labels[i] = label + if i != head: + gs.n_kids[head] += 1 + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_UNKNOWN, + 0 + ) + else: + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_UNKNOWN, + 1 + ) + # Make an array of pointers, pointing into the gs_kids_flat array. 
+ gs.kids = <int32_t**>mem.alloc(gs.length, sizeof(int32_t*)) + for i in range(gs.length): + if gs.n_kids[i] != 0: + gs.kids[i] = <int32_t*>mem.alloc(gs.n_kids[i], sizeof(int32_t)) + # This is a temporary buffer + js_addr = Address(gs.length, sizeof(int32_t)) + js = <int32_t*>js_addr.ptr + for i in range(gs.length): + if not is_head_unknown(&gs, i): + head = gs.heads[i] + if head != i: + gs.kids[head][js[head]] = i + js[head] += 1 + return gs + + +cdef void update_gold_state(GoldParseStateC* gs, StateClass stcls) nogil: + for i in range(gs.length): + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_BUFFER, + 0 + ) + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_STACK, + 0 + ) + gs.n_kids_in_stack[i] = 0 + gs.n_kids_in_buffer[i] = 0 + + for i in range(stcls.stack_depth()): + s_i = stcls.S(i) + if not is_head_unknown(gs, s_i): + gs.n_kids_in_stack[gs.heads[s_i]] += 1 + for kid in gs.kids[s_i][:gs.n_kids[s_i]]: + gs.state_bits[kid] = set_state_flag( + gs.state_bits[kid], + HEAD_IN_STACK, + 1 + ) + for i in range(stcls.buffer_length()): + b_i = stcls.B(i) + if not is_head_unknown(gs, b_i): + gs.n_kids_in_buffer[gs.heads[b_i]] += 1 + for kid in gs.kids[b_i][:gs.n_kids[b_i]]: + gs.state_bits[kid] = set_state_flag( + gs.state_bits[kid], + HEAD_IN_BUFFER, + 1 + ) + + +cdef class ArcEagerGold: + cdef GoldParseStateC c + cdef Pool mem + + def __init__(self, ArcEager moves, StateClass stcls, Example example): + self.mem = Pool() + heads, labels = example.get_aligned_parse(projectivize=True) + labels = [label if label is not None else "" for label in labels] + labels = [example.x.vocab.strings.add(label) for label in labels] + sent_starts = example.get_aligned("SENT_START") + assert len(heads) == len(labels) == len(sent_starts) + self.c = create_gold_state(self.mem, stcls, heads, labels, sent_starts) + + def update(self, StateClass stcls): + update_gold_state(&self.c, stcls) + + +cdef int check_state_gold(char state_bits, char flag) nogil: + cdef char one = 1 + return state_bits & (one << flag) + + +cdef int set_state_flag(char state_bits, char flag, int value) nogil: + cdef char one = 1 + if value: + return state_bits | (one << flag) + else: + return state_bits & ~(one << flag) + + +cdef int is_head_in_stack(const GoldParseStateC* gold, int i) nogil: + return check_state_gold(gold.state_bits[i], HEAD_IN_STACK) + + +cdef int is_head_in_buffer(const GoldParseStateC* gold, int i) nogil: + return check_state_gold(gold.state_bits[i], HEAD_IN_BUFFER) + + +cdef int is_head_unknown(const GoldParseStateC* gold, int i) nogil: + return check_state_gold(gold.state_bits[i], HEAD_UNKNOWN) + +cdef int is_sent_start(const GoldParseStateC* gold, int i) nogil: + return check_state_gold(gold.state_bits[i], IS_SENT_START) + +cdef int is_sent_start_unknown(const GoldParseStateC* gold, int i) nogil: + return check_state_gold(gold.state_bits[i], SENT_START_UNKNOWN) + + # Helper functions for the arc-eager oracle -cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: +cdef weight_t push_cost(StateClass stcls, const void* _gold, int target) nogil: + gold = <const GoldParseStateC*>_gold cdef weight_t cost = 0 - cdef int i, S_i - for i in range(stcls.stack_depth()): - S_i = stcls.S(i) - if gold.heads[target] == S_i: - cost += 1 - if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)): - cost += 1 - if BINARY_COSTS and cost >= 1: - return cost - cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0 - return cost - - -cdef 
weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: - cdef weight_t cost = 0 - cdef int i, B_i - for i in range(stcls.buffer_length()): - B_i = stcls.B(i) - cost += gold.heads[B_i] == target - cost += gold.heads[target] == B_i - if gold.heads[B_i] == B_i or gold.heads[B_i] < target: - break - if BINARY_COSTS and cost >= 1: - return cost + if is_head_in_stack(gold, target): + cost += 1 + cost += gold.n_kids_in_stack[target] if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0: cost += 1 return cost -cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil: +cdef weight_t pop_cost(StateClass stcls, const void* _gold, int target) nogil: + gold = <const GoldParseStateC*>_gold + cdef weight_t cost = 0 + if is_head_in_buffer(gold, target): + cost += 1 + cost += gold[0].n_kids_in_buffer[target] + if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0: + cost += 1 + return cost + + +cdef weight_t arc_cost(StateClass stcls, const void* _gold, int head, int child) nogil: + gold = <const GoldParseStateC*>_gold if arc_is_gold(gold, head, child): return 0 elif stcls.H(child) == gold.heads[child]: return 1 # Head in buffer - elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != 0: + elif is_head_in_buffer(gold, child): return 1 else: return 0 -cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil: - if not gold.has_dep[child]: +cdef bint arc_is_gold(const GoldParseStateC* gold, int head, int child) nogil: + if is_head_unknown(gold, child): return True elif gold.heads[child] == head: return True @@ -103,8 +282,8 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil: return False -cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil: - if not gold.has_dep[child]: +cdef bint label_is_gold(const GoldParseStateC* gold, int head, int child, attr_t label) nogil: + if is_head_unknown(gold, child): return True elif label == 0: return True @@ -114,8 +293,9 @@ cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t labe return False -cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: - return gold.heads[word] == word or not gold.has_dep[word] +cdef bint _is_gold_root(const GoldParseStateC* gold, int word) nogil: + return gold.heads[word] == word or is_head_unknown(gold, word) + cdef class Shift: @staticmethod @@ -129,15 +309,17 @@ cdef class Shift: st.fast_forward() @staticmethod - cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil: + cdef weight_t cost(StateClass st, const void* _gold, attr_t label) nogil: + gold = <const GoldParseStateC*>_gold return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) @staticmethod - cdef inline weight_t move_cost(StateClass s, const GoldParseC* gold) nogil: + cdef inline weight_t move_cost(StateClass s, const void* _gold) nogil: + gold = <const GoldParseStateC*>_gold return push_cost(s, gold, s.B(0)) @staticmethod - cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef inline weight_t label_cost(StateClass s, const void* _gold, attr_t label) nogil: return 0 @@ -155,26 +337,28 @@ cdef class Reduce: st.fast_forward() @staticmethod - cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: + gold = <const GoldParseStateC*>_gold return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) @staticmethod 
- cdef inline weight_t move_cost(StateClass st, const GoldParseC* gold) nogil: - cost = pop_cost(st, gold, st.S(0)) - if not st.has_head(st.S(0)): - # Decrement cost for the arcs e save - for i in range(1, st.stack_depth()): - S_i = st.S(i) - if gold.heads[st.S(0)] == S_i: - cost -= 1 - if gold.heads[S_i] == st.S(0): - cost -= 1 + cdef inline weight_t move_cost(StateClass st, const void* _gold) nogil: + gold = <const GoldParseStateC*>_gold + s0 = st.S(0) + cost = pop_cost(st, gold, s0) + return_to_buffer = not st.has_head(s0) + if return_to_buffer: + # Decrement cost for the arcs we save, as we'll be putting this + # back to the buffer + if is_head_in_stack(gold, s0): + cost -= 1 + cost -= gold.n_kids_in_stack[s0] if Break.is_valid(st.c, 0) and Break.move_cost(st, gold) == 0: cost -= 1 return cost @staticmethod - cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef inline weight_t label_cost(StateClass s, const void* gold, attr_t label) nogil: return 0 @@ -193,25 +377,28 @@ cdef class LeftArc: st.fast_forward() @staticmethod - cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef inline weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: + gold = <const GoldParseStateC*>_gold return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) @staticmethod - cdef inline weight_t move_cost(StateClass s, const GoldParseC* gold) nogil: + cdef inline weight_t move_cost(StateClass s, const GoldParseStateC* gold) nogil: cdef weight_t cost = 0 - if arc_is_gold(gold, s.B(0), s.S(0)): + s0 = s.S(0) + b0 = s.B(0) + if arc_is_gold(gold, b0, s0): # Have a negative cost if we 'recover' from the wrong dependency - return 0 if not s.has_head(s.S(0)) else -1 + return 0 if not s.has_head(s0) else -1 else: # Account for deps we might lose between S0 and stack - if not s.has_head(s.S(0)): - for i in range(1, s.stack_depth()): - cost += gold.heads[s.S(i)] == s.S(0) - cost += gold.heads[s.S(0)] == s.S(i) + if not s.has_head(s0): + cost += gold.n_kids_in_stack[s0] + if is_head_in_buffer(gold, s0): + cost += 1 return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) @staticmethod - cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef inline weight_t label_cost(StateClass s, const GoldParseStateC* gold, attr_t label) nogil: return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) @@ -223,7 +410,7 @@ cdef class RightArc: return 0 sent_start = st._sent[st.B_(0).l_edge].sent_start return sent_start != 1 and st.H(st.S(0)) != st.B(0) - + @staticmethod cdef int transition(StateC* st, attr_t label) nogil: st.add_arc(st.S(0), st.B(0), label) @@ -231,11 +418,13 @@ cdef class RightArc: st.fast_forward() @staticmethod - cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef inline weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: + gold = <const GoldParseStateC*>_gold return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) @staticmethod - cdef inline weight_t move_cost(StateClass s, const GoldParseC* gold) nogil: + cdef inline weight_t move_cost(StateClass s, const void* _gold) nogil: + gold = <const GoldParseStateC*>_gold if arc_is_gold(gold, s.S(0), s.B(0)): return 0 elif s.c.shifted[s.B(0)]: @@ -244,7 +433,8 @@ cdef class RightArc: return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) @staticmethod - cdef weight_t label_cost(StateClass s, const 
GoldParseC* gold, attr_t label) nogil: + cdef weight_t label_cost(StateClass s, const void* _gold, attr_t label) nogil: + gold = <const GoldParseStateC*>_gold return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) @@ -271,23 +461,22 @@ cdef class Break: st.fast_forward() @staticmethod - cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: + gold = <const GoldParseStateC*>_gold return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) @staticmethod - cdef inline weight_t move_cost(StateClass s, const GoldParseC* gold) nogil: - cdef weight_t cost = 0 - cdef int i, j, S_i, B_i + cdef inline weight_t move_cost(StateClass s, const void* _gold) nogil: + gold = <const GoldParseStateC*>_gold + cost = 0 for i in range(s.stack_depth()): S_i = s.S(i) - for j in range(s.buffer_length()): - B_i = s.B(j) - cost += gold.heads[S_i] == B_i - cost += gold.heads[B_i] == S_i - if cost != 0: - return cost - # Check for sentence boundary --- if it's here, we can't have any deps - # between stack and buffer, so rest of action is irrelevant. + cost += gold.n_kids_in_buffer[S_i] + if is_head_in_buffer(gold, S_i): + cost += 1 + # It's weird not to check the gold sentence boundaries but if we do, + # we can't account for "sunk costs", i.e. situations where we're already + # wrong. s0_root = _get_root(s.S(0), gold) b0_root = _get_root(s.B(0), gold) if s0_root != b0_root or s0_root == -1 or b0_root == -1: @@ -296,14 +485,16 @@ cdef class Break: return cost + 1 @staticmethod - cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef inline weight_t label_cost(StateClass s, const void* gold, attr_t label) nogil: return 0 -cdef int _get_root(int word, const GoldParseC* gold) nogil: - while gold.heads[word] != word and gold.has_dep[word] and word >= 0: - word = gold.heads[word] - if not gold.has_dep[word]: +cdef int _get_root(int word, const GoldParseStateC* gold) nogil: + if is_head_unknown(gold, word): return -1 + while gold.heads[word] != word and word >= 0: + word = gold.heads[word] + if is_head_unknown(gold, word): + return -1 else: return word @@ -330,8 +521,6 @@ cdef int _del_state(Pool mem, void* state, void* x) except -1: cdef class ArcEager(TransitionSystem): def __init__(self, *args, **kwargs): TransitionSystem.__init__(self, *args, **kwargs) - self.init_beam_state = _init_state - self.del_beam_state = _del_state @classmethod def get_actions(cls, **kwargs): @@ -345,10 +534,11 @@ cdef class ArcEager(TransitionSystem): for label in kwargs.get('right_labels', []): actions[RIGHT][label] = 1 actions[REDUCE][label] = 1 - for example in kwargs.get('gold_parses', []): - heads, labels = nonproj.projectivize(example.token_annotation.heads, - example.token_annotation.deps) - for child, head, label in zip(example.token_annotation.ids, heads, labels): + for example in kwargs.get('examples', []): + heads, labels = example.get_aligned_parse(projectivize=True) + for child, (head, label) in enumerate(zip(heads, labels)): + if head is None or label is None: + continue if label.upper() == 'ROOT' : label = 'ROOT' if head == child: @@ -378,102 +568,47 @@ cdef class ArcEager(TransitionSystem): def action_types(self): return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) - def get_cost(self, StateClass state, GoldParse gold, action): - cdef Transition t = self.lookup_transition(action) - if not t.is_valid(state.c, t.label): - return 9000 - else: - return 
t.get_cost(state, &gold.c, t.label) - def transition(self, StateClass state, action): cdef Transition t = self.lookup_transition(action) t.do(state.c, t.label) return state - def is_gold_parse(self, StateClass state, GoldParse gold): - predicted = set() - truth = set() - for i in range(gold.length): - if gold.cand_to_gold[i] is None: - continue - if state.safe_get(i).dep: - predicted.add((i, state.H(i), - self.strings[state.safe_get(i).dep])) - else: - predicted.add((i, state.H(i), 'ROOT')) - id_ = gold.orig.ids[gold.cand_to_gold[i]] - head = gold.orig.heads[gold.cand_to_gold[i]] - dep = gold.orig.deps[gold.cand_to_gold[i]] - truth.add((id_, head, dep)) - return truth == predicted + def is_gold_parse(self, StateClass state, gold): + raise NotImplementedError - def has_gold(self, GoldParse gold, start=0, end=None): - end = end or len(gold.heads) - if all([tag is None for tag in gold.heads[start:end]]): - return False - else: - return True - - def preprocess_gold(self, GoldParse gold): - if not self.has_gold(gold): - return None - # Figure out whether we're using subtok - use_subtok = False - for action, labels in self.labels.items(): - if SUBTOK_LABEL in labels: - use_subtok = True - break - for i, (head, dep) in enumerate(zip(gold.heads, gold.labels)): - # Missing values - if head is None or dep is None: - gold.c.heads[i] = i - gold.c.has_dep[i] = False - elif dep == SUBTOK_LABEL and not use_subtok: - # If we're not doing the joint tokenization and parsing, - # regard these subtok labels as missing - gold.c.heads[i] = i - gold.c.labels[i] = 0 - gold.c.has_dep[i] = False - else: - if head > i: - action = LEFT - elif head < i: - action = RIGHT - else: - action = BREAK - if dep not in self.labels[action]: - if action == BREAK: - dep = 'ROOT' - elif nonproj.is_decorated(dep): - backoff = nonproj.decompose(dep)[0] - if backoff in self.labels[action]: - dep = backoff - else: - dep = 'dep' - else: - dep = 'dep' - gold.c.has_dep[i] = True - if dep.upper() == 'ROOT': - dep = 'ROOT' - gold.c.heads[i] = head - gold.c.labels[i] = self.strings.add(dep) + def init_gold(self, StateClass state, Example example): + gold = ArcEagerGold(self, state, example) + self._replace_unseen_labels(gold) return gold - def get_beam_parses(self, Beam beam): - parses = [] - probs = beam.probs - for i in range(beam.size): - state = <StateC*>beam.at(i) - if state.is_final(): - self.finalize_state(state) - prob = probs[i] - parse = [] - for j in range(state.length): - head = state.H(j) - label = self.strings[state._sent[j].dep] - parse.append((head, j, label)) - parses.append((prob, parse)) - return parses + def init_gold_batch(self, examples): + all_states = self.init_batch([eg.predicted for eg in examples]) + golds = [] + states = [] + for state, eg in zip(all_states, examples): + if self.has_gold(eg) and not state.is_final(): + golds.append(self.init_gold(state, eg)) + states.append(state) + n_steps = sum([len(s.queue) for s in states]) + return states, golds, n_steps + + def _replace_unseen_labels(self, ArcEagerGold gold): + backoff_label = self.strings["dep"] + root_label = self.strings["ROOT"] + left_labels = self.labels[LEFT] + right_labels = self.labels[RIGHT] + break_labels = self.labels[BREAK] + for i in range(gold.c.length): + if not is_head_unknown(&gold.c, i): + head = gold.c.heads[i] + label = self.strings[gold.c.labels[i]] + if head > i and label not in left_labels: + gold.c.labels[i] = backoff_label + elif head < i and label not in right_labels: + gold.c.labels[i] = backoff_label + elif head == i and label 
not in break_labels: + gold.c.labels[i] = root_label + return gold cdef Transition lookup_transition(self, object name_or_id) except *: if isinstance(name_or_id, int): @@ -489,7 +624,7 @@ cdef class ArcEager(TransitionSystem): for i in range(self.n_moves): if self.c[i].move == move and self.c[i].label == label: return self.c[i] - return Transition(clas=0, move=MISSING, label=0) + raise KeyError(f"Unknown transition: {name}") def move_name(self, int move, attr_t label): label_str = self.strings[label] @@ -554,6 +689,13 @@ cdef class ArcEager(TransitionSystem): doc.is_parsed = True set_children_from_heads(doc.c, doc.length) + def has_gold(self, Example eg, start=0, end=None): + for word in eg.y[start:end]: + if word.dep != 0: + return True + else: + return False + cdef int set_valid(self, int* output, const StateC* st) nogil: cdef bint[N_MOVES] is_valid is_valid[SHIFT] = Shift.is_valid(st, 0) @@ -567,68 +709,110 @@ cdef class ArcEager(TransitionSystem): output[i] = self.c[i].is_valid(st, self.c[i].label) else: output[i] = is_valid[self.c[i].move] + + def get_cost(self, StateClass stcls, gold, int i): + if not isinstance(gold, ArcEagerGold): + raise TypeError("Expected ArcEagerGold") + cdef ArcEagerGold gold_ = gold + gold_state = gold_.c + n_gold = 0 + if self.c[i].is_valid(stcls.c, self.c[i].label): + cost = self.c[i].get_cost(stcls, &gold_state, self.c[i].label) + else: + cost = 9000 + return cost cdef int set_costs(self, int* is_valid, weight_t* costs, - StateClass stcls, GoldParse gold) except -1: - cdef int i, move - cdef attr_t label - cdef label_cost_func_t[N_MOVES] label_cost_funcs - cdef move_cost_func_t[N_MOVES] move_cost_funcs - cdef weight_t[N_MOVES] move_costs - for i in range(N_MOVES): - move_costs[i] = 9000 - move_cost_funcs[SHIFT] = Shift.move_cost - move_cost_funcs[REDUCE] = Reduce.move_cost - move_cost_funcs[LEFT] = LeftArc.move_cost - move_cost_funcs[RIGHT] = RightArc.move_cost - move_cost_funcs[BREAK] = Break.move_cost - - label_cost_funcs[SHIFT] = Shift.label_cost - label_cost_funcs[REDUCE] = Reduce.label_cost - label_cost_funcs[LEFT] = LeftArc.label_cost - label_cost_funcs[RIGHT] = RightArc.label_cost - label_cost_funcs[BREAK] = Break.label_cost - - cdef attr_t* labels = gold.c.labels - cdef int* heads = gold.c.heads - + StateClass stcls, gold) except -1: + if not isinstance(gold, ArcEagerGold): + raise TypeError("Expected ArcEagerGold") + cdef ArcEagerGold gold_ = gold + gold_.update(stcls) + gold_state = gold_.c n_gold = 0 for i in range(self.n_moves): if self.c[i].is_valid(stcls.c, self.c[i].label): is_valid[i] = True - move = self.c[i].move - label = self.c[i].label - if move_costs[move] == 9000: - move_costs[move] = move_cost_funcs[move](stcls, &gold.c) - costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label) n_gold += costs[i] <= 0 else: is_valid[i] = False costs[i] = 9000 if n_gold < 1: - # Check projectivity --- leading cause - if is_nonproj_tree(gold.heads): - raise ValueError(Errors.E020) - else: - failure_state = stcls.print_state(gold.words) - raise ValueError(Errors.E021.format(n_actions=self.n_moves, - state=failure_state)) + raise ValueError - def get_beam_annot(self, Beam beam): - length = (<StateC*>beam.at(0)).length - heads = [{} for _ in range(length)] - deps = [{} for _ in range(length)] - probs = beam.probs - for i in range(beam.size): - state = <StateC*>beam.at(i) - self.finalize_state(state) - if state.is_final(): - prob = probs[i] - for j in 
range(state.length): - head = j + state._sent[j].head - dep = state._sent[j].dep - heads[j].setdefault(head, 0.0) - heads[j][head] += prob - deps[j].setdefault(dep, 0.0) - deps[j][dep] += prob - return heads, deps + def get_oracle_sequence(self, Example example): + cdef StateClass state + cdef ArcEagerGold gold + states, golds, n_steps = self.init_gold_batch([example]) + if not golds: + return [] + + cdef Pool mem = Pool() + # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc + assert self.n_moves > 0 + costs = <float*>mem.alloc(self.n_moves, sizeof(float)) + is_valid = <int*>mem.alloc(self.n_moves, sizeof(int)) + + state = states[0] + gold = golds[0] + history = [] + debug_log = [] + failed = False + while not state.is_final(): + try: + self.set_costs(is_valid, costs, state, gold) + except ValueError: + failed = True + break + for i in range(self.n_moves): + if is_valid[i] and costs[i] <= 0: + action = self.c[i] + history.append(i) + s0 = state.S(0) + b0 = state.B(0) + debug_log.append(" ".join(( + self.get_class_name(i), + "S0=", (example.x[s0].text if s0 >= 0 else "__"), + "B0=", (example.x[b0].text if b0 >= 0 else "__"), + "S0 head?", str(state.has_head(state.S(0))), + ))) + action.do(state.c, action.label) + break + else: + failed = False + break + if failed: + print("Actions") + for i in range(self.n_moves): + print(self.get_class_name(i)) + print("Gold") + for token in example.y: + print(token.i, token.text, token.dep_, token.head.text) + aligned_heads, aligned_labels = example.get_aligned_parse() + print("Aligned heads") + for i, head in enumerate(aligned_heads): + print(example.x[i], example.x[head] if head is not None else "__") + + print("Predicted tokens") + print([(w.i, w.text) for w in example.x]) + s0 = state.S(0) + b0 = state.B(0) + debug_log.append(" ".join(( + "?", + "S0=", (example.x[s0].text if s0 >= 0 else "-"), + "B0=", (example.x[b0].text if b0 >= 0 else "-"), + "S0 head?", str(state.has_head(state.S(0))), + ))) + s0 = state.S(0) + b0 = state.B(0) + print("\n".join(debug_log)) + print("Arc is gold B0, S0?", arc_is_gold(&gold.c, b0, s0)) + print("Arc is gold S0, B0?", arc_is_gold(&gold.c, s0, b0)) + print("is_head_unknown(s0)", is_head_unknown(&gold.c, s0)) + print("is_head_unknown(b0)", is_head_unknown(&gold.c, b0)) + print("b0", b0, "gold.heads[s0]", gold.c.heads[s0]) + print("Stack", [example.x[i] for i in state.stack]) + print("Buffer", [example.x[i] for i in state.queue]) + raise ValueError(Errors.E024) + return history diff --git a/spacy/syntax/ner.pxd b/spacy/syntax/ner.pxd index 647f98fc0..989593a92 100644 --- a/spacy/syntax/ner.pxd +++ b/spacy/syntax/ner.pxd @@ -1,6 +1,5 @@ from .transition_system cimport TransitionSystem from .transition_system cimport Transition -from ..gold cimport GoldParseC from ..typedefs cimport attr_t diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index ff74be601..c4125bbdf 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -1,15 +1,16 @@ -from thinc.extra.search cimport Beam - from collections import Counter +from libc.stdint cimport int32_t +from cymem.cymem cimport Pool from ..typedefs cimport weight_t from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition from .transition_system cimport do_func_t -from ..gold cimport GoldParseC, GoldParse from ..lexeme cimport Lexeme from ..attrs cimport IS_SPACE +from ..gold.iob_utils import biluo_tags_from_offsets +from ..gold.example cimport Example from ..errors import Errors @@ 
-35,6 +36,43 @@ MOVE_NAMES[OUT] = 'O' MOVE_NAMES[ISNT] = 'x' +cdef struct GoldNERStateC: + Transition* ner + int32_t length + + +cdef class BiluoGold: + cdef Pool mem + cdef GoldNERStateC c + + def __init__(self, BiluoPushDown moves, StateClass stcls, Example example): + self.mem = Pool() + self.c = create_gold_state(self.mem, moves, stcls, example) + + def update(self, StateClass stcls): + update_gold_state(&self.c, stcls) + + + +cdef GoldNERStateC create_gold_state( + Pool mem, + BiluoPushDown moves, + StateClass stcls, + Example example +) except *: + cdef GoldNERStateC gs + gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition)) + ner_tags = example.get_aligned_ner() + for i, ner_tag in enumerate(ner_tags): + gs.ner[i] = moves.lookup_transition(ner_tag) + return gs + + +cdef void update_gold_state(GoldNERStateC* gs, StateClass stcls) except *: + # We don't need to update each time, unlike the parser. + pass + + cdef do_func_t[N_MOVES] do_funcs @@ -71,12 +109,12 @@ cdef class BiluoPushDown(TransitionSystem): for action in (BEGIN, IN, LAST, UNIT): actions[action][entity_type] = 1 moves = ('M', 'B', 'I', 'L', 'U') - for example in kwargs.get('gold_parses', []): - for i, ner_tag in enumerate(example.token_annotation.entities): - if ner_tag != 'O' and ner_tag != '-': - _, label = ner_tag.split('-', 1) + for example in kwargs.get('examples', []): + for token in example.y: + ent_type = token.ent_type_ + if ent_type: for action in (BEGIN, IN, LAST, UNIT): - actions[action][label] += 1 + actions[action][ent_type] += 1 return actions @property @@ -91,52 +129,16 @@ cdef class BiluoPushDown(TransitionSystem): else: return MOVE_NAMES[move] + '-' + self.strings[label] - def has_gold(self, GoldParse gold, start=0, end=None): - end = end or len(gold.ner) - if all([tag in ('-', None) for tag in gold.ner[start:end]]): - return False - else: - return True - - def preprocess_gold(self, GoldParse gold): - if not self.has_gold(gold): - return None - for i in range(gold.length): - gold.c.ner[i] = self.lookup_transition(gold.ner[i]) - return gold - - def get_beam_annot(self, Beam beam): - entities = {} - probs = beam.probs - for i in range(beam.size): - state = <StateC*>beam.at(i) - if state.is_final(): - self.finalize_state(state) - prob = probs[i] - for j in range(state._e_i): - start = state._ents[j].start - end = state._ents[j].end - label = state._ents[j].label - entities.setdefault((start, end, label), 0.0) - entities[(start, end, label)] += prob - return entities - - def get_beam_parses(self, Beam beam): - parses = [] - probs = beam.probs - for i in range(beam.size): - state = <StateC*>beam.at(i) - if state.is_final(): - self.finalize_state(state) - prob = probs[i] - parse = [] - for j in range(state._e_i): - start = state._ents[j].start - end = state._ents[j].end - label = state._ents[j].label - parse.append((start, end, self.strings[label])) - parses.append((prob, parse)) - return parses + def init_gold_batch(self, examples): + all_states = self.init_batch([eg.predicted for eg in examples]) + golds = [] + states = [] + for state, eg in zip(all_states, examples): + if self.has_gold(eg) and not state.is_final(): + golds.append(self.init_gold(state, eg)) + states.append(state) + n_steps = sum([len(s.queue) for s in states]) + return states, golds, n_steps cdef Transition lookup_transition(self, object name) except *: cdef attr_t label @@ -237,6 +239,47 @@ cdef class BiluoPushDown(TransitionSystem): self.add_action(UNIT, st._sent[i].ent_type) self.add_action(LAST, st._sent[i].ent_type) + 
def init_gold(self, StateClass state, Example example): + return BiluoGold(self, state, example) + + def has_gold(self, Example eg, start=0, end=None): + for word in eg.y[start:end]: + if word.ent_iob != 0: + return True + else: + return False + + def get_cost(self, StateClass stcls, gold, int i): + if not isinstance(gold, BiluoGold): + raise TypeError("Expected BiluoGold") + cdef BiluoGold gold_ = gold + gold_state = gold_.c + n_gold = 0 + if self.c[i].is_valid(stcls.c, self.c[i].label): + cost = self.c[i].get_cost(stcls, &gold_state, self.c[i].label) + else: + cost = 9000 + return cost + + cdef int set_costs(self, int* is_valid, weight_t* costs, + StateClass stcls, gold) except -1: + if not isinstance(gold, BiluoGold): + raise TypeError("Expected BiluoGold") + cdef BiluoGold gold_ = gold + gold_.update(stcls) + gold_state = gold_.c + n_gold = 0 + for i in range(self.n_moves): + if self.c[i].is_valid(stcls.c, self.c[i].label): + is_valid[i] = 1 + costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label) + n_gold += costs[i] <= 0 + else: + is_valid[i] = 0 + costs[i] = 9000 + if n_gold < 1: + raise ValueError + cdef class Missing: @staticmethod @@ -248,7 +291,7 @@ cdef class Missing: pass @staticmethod - cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: return 9000 @@ -300,7 +343,8 @@ cdef class Begin: st.pop() @staticmethod - cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: + gold = <GoldNERStateC*>_gold cdef int g_act = gold.ner[s.B(0)].move cdef attr_t g_tag = gold.ner[s.B(0)].label @@ -363,7 +407,8 @@ cdef class In: st.pop() @staticmethod - cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: + gold = <GoldNERStateC*>_gold move = IN cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT cdef int g_act = gold.ner[s.B(0)].move @@ -429,7 +474,8 @@ cdef class Last: st.pop() @staticmethod - cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: + gold = <GoldNERStateC*>_gold move = LAST cdef int g_act = gold.ner[s.B(0)].move @@ -497,7 +543,8 @@ cdef class Unit: st.pop() @staticmethod - cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: + gold = <GoldNERStateC*>_gold cdef int g_act = gold.ner[s.B(0)].move cdef attr_t g_tag = gold.ner[s.B(0)].label @@ -537,7 +584,8 @@ cdef class Out: st.pop() @staticmethod - cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: + cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: + gold = <GoldNERStateC*>_gold cdef int g_act = gold.ner[s.B(0)].move cdef attr_t g_tag = gold.ner[s.B(0)].label diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 1dcb92016..23dca79e3 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -9,7 +9,6 @@ from libcpp.vector cimport vector from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free from cymem.cymem cimport Pool -from thinc.extra.search cimport Beam from thinc.backends.linalg cimport Vec, VecVec from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops @@ -21,7 +20,6 @@ 
import numpy import warnings from ..tokens.doc cimport Doc -from ..gold cimport GoldParse from ..typedefs cimport weight_t, class_t, hash_t from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid @@ -30,14 +28,12 @@ from ._parser_model cimport get_c_weights, get_c_sizes from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition -from . cimport _beam_utils +from ..gold.example cimport Example -from ..gold import Example from ..util import link_vectors_to_models, create_default_optimizer, registry from ..compat import copy_array from ..errors import Errors, Warnings from .. import util -from . import _beam_utils from . import nonproj @@ -144,71 +140,46 @@ cdef class Parser: ''' pass - def preprocess_gold(self, examples): - for ex in examples: - yield ex - def use_params(self, params): # Can't decorate cdef class :(. Workaround. with self.model.use_params(params): yield - def __call__(self, Doc doc, beam_width=None): + def __call__(self, Doc doc): """Apply the parser or entity recognizer, setting the annotations onto the `Doc` object. doc (Doc): The document to be processed. """ - if beam_width is None: - beam_width = self.cfg['beam_width'] - beam_density = self.cfg.get('beam_density', 0.) - states = self.predict([doc], beam_width=beam_width, - beam_density=beam_density) + states = self.predict([doc]) self.set_annotations([doc], states, tensors=None) return doc - def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None, - as_example=False): + def pipe(self, docs, int batch_size=256, int n_threads=-1): """Process a stream of documents. stream: The sequence of documents to process. batch_size (int): Number of documents to accumulate into a working set. YIELDS (Doc): Documents, in order. """ - if beam_width is None: - beam_width = self.cfg['beam_width'] - beam_density = self.cfg.get('beam_density', 0.) 
cdef Doc doc for batch in util.minibatch(docs, size=batch_size): batch_in_order = list(batch) - docs = [self._get_doc(ex) for ex in batch_in_order] - by_length = sorted(docs, key=lambda doc: len(doc)) + by_length = sorted(batch, key=lambda doc: len(doc)) for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)): subbatch = list(subbatch) - parse_states = self.predict(subbatch, beam_width=beam_width, - beam_density=beam_density) + parse_states = self.predict(subbatch) self.set_annotations(subbatch, parse_states, tensors=None) - if as_example: - annotated_examples = [] - for ex, doc in zip(batch_in_order, docs): - ex.doc = doc - annotated_examples.append(ex) - yield from annotated_examples - else: - yield from batch_in_order + yield from batch_in_order - def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.): + def predict(self, docs): if isinstance(docs, Doc): docs = [docs] if not any(len(doc) for doc in docs): result = self.moves.init_batch(docs) self._resize() return result - if beam_width < 2: - return self.greedy_parse(docs, drop=drop) - else: - return self.beam_parse(docs, beam_width=beam_width, - beam_density=beam_density, drop=drop) + return self.greedy_parse(docs, drop=0.0) def greedy_parse(self, docs, drop=0.): cdef vector[StateC*] states @@ -230,44 +201,6 @@ cdef class Parser: weights, sizes) return batch - def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.): - cdef Beam beam - cdef Doc doc - cdef np.ndarray token_ids - set_dropout_rate(self.model, drop) - beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density) - # This is pretty dirty, but the NER can resize itself in init_batch, - # if labels are missing. We therefore have to check whether we need to - # expand our model output. 
- self._resize() - cdef int nr_feature = self.model.get_ref("lower").get_dim("nF") - model = self.model.predict(docs) - token_ids = numpy.zeros((len(docs) * beam_width, nr_feature), - dtype='i', order='C') - cdef int* c_ids - cdef int n_states - model = self.model.predict(docs) - todo = [beam for beam in beams if not beam.is_done] - while todo: - token_ids.fill(-1) - c_ids = <int*>token_ids.data - n_states = 0 - for beam in todo: - for i in range(beam.size): - state = <StateC*>beam.at(i) - # This way we avoid having to score finalized states - # We do have to take care to keep indexes aligned, though - if not state.is_final(): - state.set_context_tokens(c_ids, nr_feature) - c_ids += nr_feature - n_states += 1 - if n_states == 0: - break - vectors = model.state2vec.predict(token_ids[:n_states]) - scores = model.vec2scores.predict(vectors) - todo = self.transition_beams(todo, scores) - return beams - cdef void _parseC(self, StateC** states, WeightsC weights, SizesC sizes) nogil: cdef int i, j @@ -288,20 +221,9 @@ cdef class Parser: unfinished.clear() free_activations(&activations) - def set_annotations(self, docs, states_or_beams, tensors=None): + def set_annotations(self, docs, states, tensors=None): cdef StateClass state - cdef Beam beam cdef Doc doc - states = [] - beams = [] - for state_or_beam in states_or_beams: - if isinstance(state_or_beam, StateClass): - states.append(state_or_beam) - else: - beam = state_or_beam - state = StateClass.borrow(<StateC*>beam.at(0)) - states.append(state) - beams.append(beam) for i, (state, doc) in enumerate(zip(states, docs)): self.moves.finalize_state(state.c) for j in range(doc.length): @@ -309,8 +231,6 @@ cdef class Parser: self.moves.finalize_doc(doc) for hook in self.postprocesses: hook(doc) - for beam in beams: - _beam_utils.cleanup_beam(beam) def transition_states(self, states, float[:, ::1] scores): cdef StateClass state @@ -342,50 +262,25 @@ cdef class Parser: states[i].push_hist(guess) free(is_valid) - def transition_beams(self, beams, float[:, ::1] scores): - cdef Beam beam - cdef float* c_scores = &scores[0, 0] - for beam in beams: - for i in range(beam.size): - state = <StateC*>beam.at(i) - if not state.is_final(): - self.moves.set_valid(beam.is_valid[i], state) - memcpy(beam.scores[i], c_scores, scores.shape[1] * sizeof(float)) - c_scores += scores.shape[1] - beam.advance(_beam_utils.transition_state, _beam_utils.hash_state, <void*>self.moves.c) - beam.check_done(_beam_utils.check_final_state, NULL) - return [b for b in beams if not b.is_done] - def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None): - examples = Example.to_example_objects(examples) - if losses is None: losses = {} losses.setdefault(self.name, 0.) for multitask in self._multitasks: multitask.update(examples, drop=drop, sgd=sgd) - # The probability we use beam update, instead of falling back to - # a greedy update - beam_update_prob = self.cfg['beam_update_prob'] - if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob: - return self.update_beam(examples, self.cfg['beam_width'], - drop=drop, sgd=sgd, losses=losses, set_annotations=set_annotations, - beam_density=self.cfg.get('beam_density', 0.001)) - set_dropout_rate(self.model, drop) - cut_gold = True - if cut_gold: - # Chop sequences into lengths of this many transitions, to make the - # batch uniform length. 
- cut_gold = numpy.random.choice(range(20, 100)) - states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold) - else: - states, golds, max_steps = self._init_gold_batch_no_cut(examples) - states_golds = [(s, g) for (s, g) in zip(states, golds) - if not s.is_final() and g is not None] # Prepare the stepwise model, and get the callback for finishing the batch - model, backprop_tok2vec = self.model.begin_update([ex.doc for ex in examples]) + model, backprop_tok2vec = self.model.begin_update( + [eg.predicted for eg in examples]) + # Chop sequences into lengths of this many transitions, to make the + # batch uniform length. We randomize this to overfit less. + cut_gold = numpy.random.choice(range(20, 100)) + states, golds, max_steps = self._init_gold_batch( + examples, + max_length=cut_gold + ) all_states = list(states) + states_golds = zip(states, golds) for _ in range(max_steps): if not states_golds: break @@ -395,18 +290,18 @@ cdef class Parser: backprop(d_scores) # Follow the predicted action self.transition_states(states, scores) - states_golds = [eg for eg in states_golds if not eg[0].is_final()] + states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()] + backprop_tok2vec(golds) - if sgd is not None: + if sgd not in (None, False): self.model.finish_update(sgd) if set_annotations: - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] self.set_annotations(docs, all_states) return losses def rehearse(self, examples, sgd=None, losses=None, **cfg): """Perform a "rehearsal" update, to prevent catastrophic forgetting.""" - examples = Example.to_example_objects(examples) if losses is None: losses = {} for multitask in self._multitasks: @@ -416,7 +311,7 @@ cdef class Parser: return None losses.setdefault(self.name, 0.) - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] states = self.moves.init_batch(docs) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. 
We therefore have to check whether we need to @@ -448,52 +343,6 @@ cdef class Parser: losses[self.name] += loss / n_scores return losses - def update_beam(self, examples, width, drop=0., sgd=None, losses=None, - set_annotations=False, beam_density=0.0): - examples = Example.to_example_objects(examples) - docs = [ex.doc for ex in examples] - golds = [ex.gold for ex in examples] - new_golds = [] - lengths = [len(d) for d in docs] - states = self.moves.init_batch(docs) - for gold in golds: - self.moves.preprocess_gold(gold) - new_golds.append(gold) - set_dropout_rate(self.model, drop) - model, backprop_tok2vec = self.model.begin_update(docs) - states_d_scores, backprops, beams = _beam_utils.update_beam( - self.moves, - self.model.get_ref("lower").get_dim("nF"), - 10000, - states, - golds, - model.state2vec, - model.vec2scores, - width, - losses=losses, - beam_density=beam_density - ) - for i, d_scores in enumerate(states_d_scores): - losses[self.name] += (d_scores**2).mean() - ids, bp_vectors, bp_scores = backprops[i] - d_vector = bp_scores(d_scores) - if isinstance(model.ops, CupyOps) \ - and not isinstance(ids, model.state2vec.ops.xp.ndarray): - model.backprops.append(( - util.get_async(model.cuda_stream, ids), - util.get_async(model.cuda_stream, d_vector), - bp_vectors)) - else: - model.backprops.append((ids, d_vector, bp_vectors)) - backprop_tok2vec(golds) - if sgd is not None: - self.model.finish_update(sgd) - if set_annotations: - self.set_annotations(docs, beams) - cdef Beam beam - for beam in beams: - _beam_utils.cleanup_beam(beam) - def get_gradients(self): """Get non-zero gradients of the model's parameters, as a dictionary keyed by the parameter ID. The values are (weights, gradients) tuples. @@ -511,66 +360,8 @@ cdef class Parser: queue.extend(node._layers) return gradients - def _init_gold_batch_no_cut(self, whole_examples): - states = self.moves.init_batch([eg.doc for eg in whole_examples]) - good_docs = [] - good_golds = [] - good_states = [] - for i, eg in enumerate(whole_examples): - doc = eg.doc - gold = self.moves.preprocess_gold(eg.gold) - if gold is not None and self.moves.has_gold(gold): - good_docs.append(doc) - good_golds.append(gold) - good_states.append(states[i]) - n_moves = [] - for doc, gold in zip(good_docs, good_golds): - oracle_actions = self.moves.get_oracle_sequence(doc, gold) - n_moves.append(len(oracle_actions)) - return good_states, good_golds, max(n_moves, default=0) * 2 - - def _init_gold_batch(self, whole_examples, min_length=5, max_length=500): - """Make a square batch, of length equal to the shortest doc. A long - doc will get multiple states. Let's say we have a doc of length 2*N, - where N is the shortest doc. 
We'll make two states, one representing - long_doc[:N], and another representing long_doc[N:].""" - cdef: - StateClass state - Transition action - whole_docs = [ex.doc for ex in whole_examples] - whole_golds = [ex.gold for ex in whole_examples] - whole_states = self.moves.init_batch(whole_docs) - max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs]))) - max_moves = 0 - states = [] - golds = [] - for doc, state, gold in zip(whole_docs, whole_states, whole_golds): - gold = self.moves.preprocess_gold(gold) - if gold is None: - continue - oracle_actions = self.moves.get_oracle_sequence(doc, gold) - start = 0 - while start < len(doc): - state = state.copy() - n_moves = 0 - while state.B(0) < start and not state.is_final(): - action = self.moves.c[oracle_actions.pop(0)] - action.do(state.c, action.label) - state.c.push_hist(action.clas) - n_moves += 1 - has_gold = self.moves.has_gold(gold, start=start, - end=start+max_length) - if not state.is_final() and has_gold: - states.append(state) - golds.append(gold) - max_moves = max(max_moves, n_moves) - start += min(max_length, len(doc)-start) - max_moves = max(max_moves, len(oracle_actions)) - return states, golds, max_moves - def get_batch_loss(self, states, golds, float[:, ::1] scores, losses): cdef StateClass state - cdef GoldParse gold cdef Pool mem = Pool() cdef int i @@ -613,9 +404,11 @@ cdef class Parser: if not hasattr(get_examples, '__call__'): gold_tuples = get_examples get_examples = lambda: gold_tuples - actions = self.moves.get_actions(gold_parses=get_examples(), - min_freq=self.cfg['min_action_freq'], - learn_tokens=self.cfg["learn_tokens"]) + actions = self.moves.get_actions( + examples=get_examples(), + min_freq=self.cfg['min_action_freq'], + learn_tokens=self.cfg["learn_tokens"] + ) for action, labels in self.moves.labels.items(): actions.setdefault(action, {}) for label, freq in labels.items(): @@ -627,13 +420,8 @@ cdef class Parser: if sgd is None: sgd = self.create_optimizer() doc_sample = [] - gold_sample = [] for example in islice(get_examples(), 10): - parses = example.get_gold_parses(merge=False, vocab=self.vocab) - for doc, gold in parses: - if len(doc): - doc_sample.append(doc) - gold_sample.append(gold) + doc_sample.append(example.predicted) if pipeline is not None: for name, component in pipeline: @@ -652,12 +440,6 @@ cdef class Parser: link_vectors_to_models(self.vocab) return sgd - def _get_doc(self, example): - """ Use this method if the `example` can be both a Doc or an Example """ - if isinstance(example, Doc): - return example - return example.doc - def to_disk(self, path, exclude=tuple(), **kwargs): serializers = { 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), @@ -714,3 +496,42 @@ cdef class Parser: except AttributeError: raise ValueError(Errors.E149) return self + + def _init_gold_batch(self, examples, min_length=5, max_length=500): + """Make a square batch, of length equal to the shortest doc. A long + doc will get multiple states. Let's say we have a doc of length 2*N, + where N is the shortest doc. 
We'll make two states, one representing + long_doc[:N], and another representing long_doc[N:].""" + cdef: + StateClass state + Transition action + all_states = self.moves.init_batch([eg.predicted for eg in examples]) + kept = [] + for state, eg in zip(all_states, examples): + if self.moves.has_gold(eg) and not state.is_final(): + gold = self.moves.init_gold(state, eg) + kept.append((eg, state, gold)) + max_length = max(min_length, min(max_length, min([len(eg.x) for eg in examples]))) + max_moves = 0 + states = [] + golds = [] + for eg, state, gold in kept: + oracle_actions = self.moves.get_oracle_sequence(eg) + start = 0 + while start < len(eg.predicted): + state = state.copy() + n_moves = 0 + while state.B(0) < start and not state.is_final(): + action = self.moves.c[oracle_actions.pop(0)] + action.do(state.c, action.label) + state.c.push_hist(action.clas) + n_moves += 1 + has_gold = self.moves.has_gold(eg, start=start, + end=start+max_length) + if not state.is_final() and has_gold: + states.append(state) + golds.append(gold) + max_moves = max(max_moves, n_moves) + start += min(max_length, len(eg.x)-start) + max_moves = max(max_moves, len(oracle_actions)) + return states, golds, max_moves diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 1edb2e65c..5ccb11f37 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -7,7 +7,6 @@ from copy import copy from ..tokens.doc cimport Doc, set_children_from_heads -from ..gold import Example from ..errors import Errors @@ -50,8 +49,12 @@ def is_nonproj_arc(tokenid, heads): return False elif head is None: # unattached tokens cannot be non-projective return False - - start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head) + + cdef int start, end + if head < tokenid: + start, end = (head+1, tokenid) + else: + start, end = (tokenid+1, head) for k in range(start, end): for ancestor in ancestors(k, heads): if ancestor is None: # for unattached tokens/subtrees @@ -78,8 +81,8 @@ def is_decorated(label): def count_decorated_labels(gold_data): freqs = {} for example in gold_data: - proj_heads, deco_deps = projectivize(example.token_annotation.heads, - example.token_annotation.deps) + proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"), + example.get_aligned("DEP")) # set the label to ROOT for each root dependent deco_deps = ['ROOT' if head == i else deco_deps[i] for i, head in enumerate(proj_heads)] @@ -90,31 +93,6 @@ def count_decorated_labels(gold_data): return freqs -def preprocess_training_data(gold_data, label_freq_cutoff=30): - preprocessed = [] - freqs = {} - for example in gold_data: - new_example = Example(doc=example.doc) - proj_heads, deco_deps = projectivize(example.token_annotation.heads, - example.token_annotation.deps) - # set the label to ROOT for each root dependent - deco_deps = ['ROOT' if head == i else deco_deps[i] - for i, head in enumerate(proj_heads)] - # count label frequencies - if label_freq_cutoff > 0: - for label in deco_deps: - if is_decorated(label): - freqs[label] = freqs.get(label, 0) + 1 - proj_token_dict = example.token_annotation.to_dict() - proj_token_dict["heads"] = proj_heads - proj_token_dict["deps"] = deco_deps - new_example.set_token_annotation(**proj_token_dict) - preprocessed.append(new_example) - if label_freq_cutoff > 0: - return _filter_labels(preprocessed, label_freq_cutoff, freqs) - return preprocessed - - def projectivize(heads, labels): # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper # tree, i.e. connected and cycle-free. 
Returns a new pair (heads, labels) @@ -200,22 +178,3 @@ def _find_new_head(token, headlabel): next_queue.append(child) queue = next_queue return token.head - - -def _filter_labels(examples, cutoff, freqs): - # throw away infrequent decorated labels - # can't learn them reliably anyway and keeps label set smaller - filtered = [] - for example in examples: - new_example = Example(doc=example.doc) - filtered_labels = [] - for label in example.token_annotation.deps: - if is_decorated(label) and freqs.get(label, 0) < cutoff: - filtered_labels.append(decompose(label)[0]) - else: - filtered_labels.append(label) - filtered_token_dict = example.token_annotation.to_dict() - filtered_token_dict["deps"] = filtered_labels - new_example.set_token_annotation(**filtered_token_dict) - filtered.append(new_example) - return filtered diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 5fd3b5c5f..836c08168 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -2,11 +2,10 @@ from cymem.cymem cimport Pool from ..typedefs cimport attr_t, weight_t from ..structs cimport TokenC -from ..gold cimport GoldParse -from ..gold cimport GoldParseC from ..strings cimport StringStore from .stateclass cimport StateClass from ._state cimport StateC +from ..gold.example cimport Example cdef struct Transition: @@ -17,14 +16,14 @@ cdef struct Transition: weight_t score bint (*is_valid)(const StateC* state, attr_t label) nogil - weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil + weight_t (*get_cost)(StateClass state, const void* gold, attr_t label) nogil int (*do)(StateC* state, attr_t label) nogil -ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, +ctypedef weight_t (*get_cost_func_t)(StateClass state, const void* gold, attr_tlabel) nogil -ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil -ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* +ctypedef weight_t (*move_cost_func_t)(StateClass state, const void* gold) nogil +ctypedef weight_t (*label_cost_func_t)(StateClass state, const void* gold, attr_t label) nogil ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil @@ -41,8 +40,6 @@ cdef class TransitionSystem: cdef int _size cdef public attr_t root_label cdef public freqs - cdef init_state_t init_beam_state - cdef del_state_t del_beam_state cdef public object labels cdef int initialize_state(self, StateC* state) nogil @@ -55,4 +52,4 @@ cdef class TransitionSystem: cdef int set_valid(self, int* output, const StateC* st) nogil cdef int set_costs(self, int* is_valid, weight_t* costs, - StateClass state, GoldParse gold) except -1 + StateClass state, gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 78017c84a..e1ec40e0e 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,13 +1,12 @@ # cython: infer_types=True +from __future__ import print_function from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool -from thinc.extra.search cimport Beam from collections import Counter import srsly from ..typedefs cimport weight_t -from . 
cimport _beam_utils from ..tokens.doc cimport Doc from ..structs cimport TokenC from .stateclass cimport StateClass @@ -47,8 +46,6 @@ cdef class TransitionSystem: if labels_by_action: self.initialize_actions(labels_by_action, min_freq=min_freq) self.root_label = self.strings.add('ROOT') - self.init_beam_state = _init_state - self.del_beam_state = _del_state def __reduce__(self): return (self.__class__, (self.strings, self.labels), None, None) @@ -64,48 +61,55 @@ cdef class TransitionSystem: offset += len(doc) return states - def init_beams(self, docs, beam_width, beam_density=0.): - cdef Doc doc - beams = [] - cdef int offset = 0 - - # Doc objects might contain labels that we need to register actions for. We need to check for that - # *before* we create any Beam objects, because the Beam object needs the correct number of - # actions. It's sort of dumb, but the best way is to just call init_batch() -- that triggers the additions, - # and it doesn't matter that we create and discard the state objects. - self.init_batch(docs) - - for doc in docs: - beam = Beam(self.n_moves, beam_width, min_density=beam_density) - beam.initialize(self.init_beam_state, self.del_beam_state, - doc.length, doc.c) - for i in range(beam.width): - state = <StateC*>beam.at(i) - state.offset = offset - offset += len(doc) - beam.check_done(_beam_utils.check_final_state, NULL) - beams.append(beam) - return beams - - def get_oracle_sequence(self, doc, GoldParse gold): + def get_oracle_sequence(self, Example example, _debug=False): cdef Pool mem = Pool() # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc assert self.n_moves > 0 costs = <float*>mem.alloc(self.n_moves, sizeof(float)) is_valid = <int*>mem.alloc(self.n_moves, sizeof(int)) - cdef StateClass state = StateClass(doc, offset=0) - self.initialize_state(state.c) + cdef StateClass state + states, golds, n_steps = self.init_gold_batch([example]) + if not states: + return [] + state = states[0] + gold = golds[0] history = [] + debug_log = [] while not state.is_final(): self.set_costs(is_valid, costs, state, gold) for i in range(self.n_moves): if is_valid[i] and costs[i] <= 0: action = self.c[i] history.append(i) + s0 = state.S(0) + b0 = state.B(0) + if _debug: + debug_log.append(" ".join(( + self.get_class_name(i), + "S0=", (example.x[s0].text if s0 >= 0 else "__"), + "B0=", (example.x[b0].text if b0 >= 0 else "__"), + "S0 head?", str(state.has_head(state.S(0))), + ))) action.do(state.c, action.label) break else: + if _debug: + print("Actions") + for i in range(self.n_moves): + print(self.get_class_name(i)) + print("Gold") + for token in example.y: + print(token.text, token.dep_, token.head.text) + s0 = state.S(0) + b0 = state.B(0) + debug_log.append(" ".join(( + "?", + "S0=", (example.x[s0].text if s0 >= 0 else "-"), + "B0=", (example.x[b0].text if b0 >= 0 else "-"), + "S0 head?", str(state.has_head(state.S(0))), + ))) + print("\n".join(debug_log)) raise ValueError(Errors.E024) return history @@ -124,12 +128,6 @@ cdef class TransitionSystem: def finalize_doc(self, doc): pass - def preprocess_gold(self, GoldParse gold): - raise NotImplementedError - - def is_gold_parse(self, StateClass state, GoldParse gold): - raise NotImplementedError - cdef Transition lookup_transition(self, object name) except *: raise NotImplementedError @@ -148,18 +146,8 @@ cdef class TransitionSystem: is_valid[i] = self.c[i].is_valid(st, self.c[i].label) cdef int set_costs(self, int* is_valid, weight_t* costs, - StateClass stcls, GoldParse gold) except -1: - 
cdef int i - self.set_valid(is_valid, stcls.c) - cdef int n_gold = 0 - for i in range(self.n_moves): - if is_valid[i]: - costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) - n_gold += costs[i] <= 0 - else: - costs[i] = 9000 - if n_gold <= 0: - raise ValueError(Errors.E024) + StateClass stcls, gold) except -1: + raise NotImplementedError def get_class_name(self, int clas): act = self.c[clas] diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index f44ae1421..e721b3f09 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -1,6 +1,6 @@ import pytest from spacy.tokens import Doc -from spacy.attrs import ORTH, SHAPE, POS, DEP +from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH from ..util import get_doc @@ -44,6 +44,20 @@ def test_doc_array_tag(en_vocab): assert feats_array[3][1] == doc[3].pos +def test_doc_array_morph(en_vocab): + words = ["Eat", "blue", "ham"] + morph = ["Feat=V", "Feat=J", "Feat=N"] + doc = get_doc(en_vocab, words=words, morphs=morph) + assert morph[0] == doc[0].morph_ + assert morph[1] == doc[1].morph_ + assert morph[2] == doc[2].morph_ + + feats_array = doc.to_array((ORTH, MORPH)) + assert feats_array[0][1] == doc[0].morph.key + assert feats_array[1][1] == doc[1].morph.key + assert feats_array[2][1] == doc[2].morph.key + + def test_doc_array_dep(en_vocab): words = ["A", "nice", "sentence", "."] deps = ["det", "amod", "ROOT", "punct"] diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 893465b45..b5fa933cd 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -1,8 +1,9 @@ import pytest from thinc.api import Adam from spacy.attrs import NORM -from spacy.gold import GoldParse from spacy.vocab import Vocab + +from spacy.gold import Example from spacy.pipeline.defaults import default_parser, default_ner from spacy.tokens import Doc from spacy.pipeline import DependencyParser, EntityRecognizer @@ -39,8 +40,9 @@ def _train_parser(parser): for i in range(5): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) - parser.update((doc, gold), sgd=sgd, losses=losses) + gold = {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} + example = Example.from_dict(doc, gold) + parser.update([example], sgd=sgd, losses=losses) return parser @@ -51,10 +53,9 @@ def test_add_label(parser): for i in range(100): losses = {} doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = GoldParse( - doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"] - ) - parser.update((doc, gold), sgd=sgd, losses=losses) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + example = Example.from_dict(doc, gold) + parser.update([example], sgd=sgd, losses=losses) doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert doc[0].dep_ == "right" diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 42b62251e..0ef978bfa 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -1,22 +1,23 @@ import pytest from spacy.vocab import Vocab +from spacy.gold import Example from spacy.pipeline.defaults import default_parser from spacy.pipeline import DependencyParser from spacy.tokens import Doc -from spacy.gold import GoldParse from spacy.syntax.nonproj import projectivize -from spacy.syntax.stateclass import 
StateClass from spacy.syntax.arc_eager import ArcEager def get_sequence_costs(M, words, heads, deps, transitions): doc = Doc(Vocab(), words=words) - gold = GoldParse(doc, heads=heads, deps=deps) - state = StateClass(doc) - M.preprocess_gold(gold) + example = Example.from_dict(doc, {"heads": heads, "deps": deps}) + states, golds, _ = M.init_gold_batch([example]) + state = states[0] + gold = golds[0] cost_history = [] for gold_action in transitions: + gold.update(state) state_costs = {} for i in range(M.n_moves): name = M.class_name(i) @@ -39,31 +40,13 @@ def arc_eager(vocab): return moves -@pytest.fixture -def words(): - return ["a", "b"] - - -@pytest.fixture -def doc(words, vocab): - if vocab is None: - vocab = Vocab() - return Doc(vocab, words=list(words)) - - -@pytest.fixture -def gold(doc, words): - if len(words) == 2: - return GoldParse(doc, words=["a", "b"], heads=[0, 0], deps=["ROOT", "right"]) - else: - raise NotImplementedError - - -@pytest.mark.xfail def test_oracle_four_words(arc_eager, vocab): words = ["a", "b", "c", "d"] heads = [1, 1, 3, 3] deps = ["left", "ROOT", "left", "ROOT"] + for dep in deps: + arc_eager.add_action(2, dep) # Left + arc_eager.add_action(3, dep) # Right actions = ["L-left", "B-ROOT", "L-left"] state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions) assert state.is_final() @@ -72,7 +55,7 @@ def test_oracle_four_words(arc_eager, vocab): assert state_costs[actions[i]] == 0.0, actions[i] for other_action, cost in state_costs.items(): if other_action != actions[i]: - assert cost >= 1 + assert cost >= 1, (i, other_action) annot_tuples = [ @@ -140,7 +123,7 @@ def test_get_oracle_actions(): doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) config = { "learn_tokens": False, - "min_action_freq": 30, + "min_action_freq": 0, "beam_width": 1, "beam_update_prob": 1.0, } @@ -149,12 +132,98 @@ def test_get_oracle_actions(): parser.moves.add_action(1, "") parser.moves.add_action(1, "") parser.moves.add_action(4, "ROOT") + heads, deps = projectivize(heads, deps) for i, (head, dep) in enumerate(zip(heads, deps)): if head > i: parser.moves.add_action(2, dep) elif head < i: parser.moves.add_action(3, dep) - heads, deps = projectivize(heads, deps) - gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps) - parser.moves.preprocess_gold(gold) - parser.moves.get_oracle_sequence(doc, gold) + example = Example.from_dict( + doc, {"words": words, "tags": tags, "heads": heads, "deps": deps} + ) + parser.moves.get_oracle_sequence(example) + + +def test_oracle_dev_sentence(vocab, arc_eager): + words_deps_heads = """ + Rolls-Royce nn Inc. + Motor nn Inc. + Cars nn Inc. + Inc. nsubj said + said ROOT said + it nsubj expects + expects ccomp said + its poss sales + U.S. nn sales + sales nsubj steady + to aux steady + remain cop steady + steady xcomp expects + at prep steady + about quantmod 1,200 + 1,200 num cars + cars pobj at + in prep steady + 1990 pobj in + . punct said + """ + expected_transitions = [ + "S", # Shift 'Motor' + "S", # Shift 'Cars' + "L-nn", # Attach 'Cars' to 'Inc.' + "L-nn", # Attach 'Motor' to 'Inc.' + "L-nn", # Attach 'Rolls-Royce' to 'Inc.', force shift + "L-nsubj", # Attach 'Inc.' to 'said' + "S", # Shift 'it' + "L-nsubj", # Attach 'it.' to 'expects' + "R-ccomp", # Attach 'expects' to 'said' + "S", # Shift 'its' + "S", # Shift 'U.S.' + "L-nn", # Attach 'U.S.' 
to 'sales' + "L-poss", # Attach 'its' to 'sales' + "S", # Shift 'sales' + "S", # Shift 'to' + "S", # Shift 'remain' + "L-cop", # Attach 'remain' to 'steady' + "L-aux", # Attach 'to' to 'steady' + "L-nsubj", # Attach 'sales' to 'steady' + "R-xcomp", # Attach 'steady' to 'expects' + "R-prep", # Attach 'at' to 'steady' + "S", # Shift 'about' + "L-quantmod", # Attach "about" to "1,200" + "S", # Shift "1,200" + "L-num", # Attach "1,200" to "cars" + "R-pobj", # Attach "cars" to "at" + "D", # Reduce "cars" + "D", # Reduce "at" + "R-prep", # Attach "in" to "steady" + "R-pobj", # Attach "1990" to "in" + "D", # Reduce "1990" + "D", # Reduce "in" + "D", # Reduce "steady" + "D", # Reduce "expects" + "R-punct", # Attach "." to "said" + ] + + gold_words = [] + gold_deps = [] + gold_heads = [] + for line in words_deps_heads.strip().split("\n"): + line = line.strip() + if not line: + continue + word, dep, head = line.split() + gold_words.append(word) + gold_deps.append(dep) + gold_heads.append(head) + gold_heads = [gold_words.index(head) for head in gold_heads] + for dep in gold_deps: + arc_eager.add_action(2, dep) # Left + arc_eager.add_action(3, dep) # Right + + doc = Doc(Vocab(), words=gold_words) + example = Example.from_dict(doc, {"heads": gold_heads, "deps": gold_deps}) + + ae_oracle_actions = arc_eager.get_oracle_sequence(example) + ae_oracle_actions = [arc_eager.get_class_name(i) for i in ae_oracle_actions] + assert ae_oracle_actions == expected_transitions diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index db4efcd95..6528a4223 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -1,4 +1,6 @@ import pytest +from spacy.attrs import ENT_IOB + from spacy import util from spacy.lang.en import English @@ -8,12 +10,11 @@ from spacy.pipeline.defaults import default_ner from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown -from spacy.gold import GoldParse +from spacy.gold import Example from spacy.tokens import Doc from ..util import make_tempdir - TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), @@ -52,51 +53,55 @@ def tsys(vocab, entity_types): def test_get_oracle_moves(tsys, doc, entity_annots): - gold = GoldParse(doc, entities=entity_annots) - tsys.preprocess_gold(gold) - act_classes = tsys.get_oracle_sequence(doc, gold) + example = Example.from_dict(doc, {"entities": entity_annots}) + act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"] def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots): entity_annots = [(s, e, "!" 
+ label) for s, e, label in entity_annots] - gold = GoldParse(doc, entities=entity_annots) - for i, tag in enumerate(gold.ner): + example = Example.from_dict(doc, {"entities": entity_annots}) + ex_dict = example.to_dict() + + for i, tag in enumerate(ex_dict["doc_annotation"]["entities"]): if tag == "L-!GPE": - gold.ner[i] = "-" - tsys.preprocess_gold(gold) - act_classes = tsys.get_oracle_sequence(doc, gold) + ex_dict["doc_annotation"]["entities"][i] = "-" + example = Example.from_dict(doc, ex_dict) + + act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names def test_get_oracle_moves_negative_entities2(tsys, vocab): doc = Doc(vocab, words=["A", "B", "C", "D"]) - gold = GoldParse(doc, entities=[]) - gold.ner = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"] - tsys.preprocess_gold(gold) - act_classes = tsys.get_oracle_sequence(doc, gold) + entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"] + example = Example.from_dict(doc, {"entities": entity_annots}) + act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names +@pytest.mark.xfail(reason="Maybe outdated? Unsure") def test_get_oracle_moves_negative_O(tsys, vocab): doc = Doc(vocab, words=["A", "B", "C", "D"]) - gold = GoldParse(doc, entities=[]) - gold.ner = ["O", "!O", "O", "!O"] - tsys.preprocess_gold(gold) - act_classes = tsys.get_oracle_sequence(doc, gold) + entity_annots = ["O", "!O", "O", "!O"] + example = Example.from_dict(doc, {"entities": entity_annots}) + act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names +# We can't easily represent this on a Doc object. Not sure what the best solution +# would be, but I don't think it's an important use case? +@pytest.mark.xfail(reason="No longer supported") def test_oracle_moves_missing_B(en_vocab): words = ["B", "52", "Bomber"] biluo_tags = [None, None, "L-PRODUCT"] doc = Doc(en_vocab, words=words) - gold = GoldParse(doc, words=words, entities=biluo_tags) + example = Example.from_dict(doc, {"words": words, "entities": biluo_tags}) moves = BiluoPushDown(en_vocab.strings) move_types = ("M", "B", "I", "L", "U", "O") @@ -111,16 +116,17 @@ def test_oracle_moves_missing_B(en_vocab): moves.add_action(move_types.index("I"), label) moves.add_action(move_types.index("L"), label) moves.add_action(move_types.index("U"), label) - moves.preprocess_gold(gold) - moves.get_oracle_sequence(doc, gold) - + moves.get_oracle_sequence(example) +# We can't easily represent this on a Doc object. Not sure what the best solution +# would be, but I don't think it's an important use case? 
+@pytest.mark.xfail(reason="No longer supported") def test_oracle_moves_whitespace(en_vocab): words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"] biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"] doc = Doc(en_vocab, words=words) - gold = GoldParse(doc, words=words, entities=biluo_tags) + example = Example.from_dict(doc, {"entities": biluo_tags}) moves = BiluoPushDown(en_vocab.strings) move_types = ("M", "B", "I", "L", "U", "O") @@ -132,8 +138,7 @@ def test_oracle_moves_whitespace(en_vocab): else: action, label = tag.split("-") moves.add_action(move_types.index(action), label) - moves.preprocess_gold(gold) - moves.get_oracle_sequence(doc, gold) + moves.get_oracle_sequence(example) def test_accept_blocked_token(): diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index d88517fb5..93d92e26b 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -1,10 +1,11 @@ import pytest + +from spacy.gold import Example from spacy.pipeline.defaults import default_parser, default_tok2vec from spacy.vocab import Vocab from spacy.syntax.arc_eager import ArcEager from spacy.syntax.nn_parser import Parser from spacy.tokens.doc import Doc -from spacy.gold import GoldParse from thinc.api import Model @@ -52,7 +53,7 @@ def doc(vocab): @pytest.fixture def gold(doc): - return GoldParse(doc, heads=[1, 1, 1], deps=["L", "ROOT", "R"]) + return {"heads": [1, 1, 1], "deps": ["L", "ROOT", "R"]} def test_can_init_nn_parser(parser): @@ -77,7 +78,8 @@ def test_update_doc(parser, model, doc, gold): weights -= 0.001 * gradient return weights, gradient - parser.update((doc, gold), sgd=optimize) + example = Example.from_dict(doc, gold) + parser.update([example], sgd=optimize) @pytest.mark.xfail diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index 841eb058c..e69de29bb 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -1,107 +0,0 @@ -import pytest -import numpy -from spacy.vocab import Vocab -from spacy.language import Language -from spacy.pipeline.defaults import default_parser -from spacy.pipeline import DependencyParser -from spacy.syntax.arc_eager import ArcEager -from spacy.tokens import Doc -from spacy.syntax._beam_utils import ParserBeam -from spacy.syntax.stateclass import StateClass -from spacy.gold import GoldParse - - -@pytest.fixture -def vocab(): - return Vocab() - - -@pytest.fixture -def moves(vocab): - aeager = ArcEager(vocab.strings, {}) - aeager.add_action(2, "nsubj") - aeager.add_action(3, "dobj") - aeager.add_action(2, "aux") - return aeager - - -@pytest.fixture -def docs(vocab): - return [Doc(vocab, words=["Rats", "bite", "things"])] - - -@pytest.fixture -def states(docs): - return [StateClass(doc) for doc in docs] - - -@pytest.fixture -def tokvecs(docs, vector_size): - output = [] - for doc in docs: - vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size)) - output.append(numpy.asarray(vec)) - return output - - -@pytest.fixture -def golds(docs): - return [GoldParse(doc) for doc in docs] - - -@pytest.fixture -def batch_size(docs): - return len(docs) - - -@pytest.fixture -def beam_width(): - return 4 - - -@pytest.fixture -def vector_size(): - return 6 - - -@pytest.fixture -def beam(moves, states, golds, beam_width): - return ParserBeam(moves, states, golds, width=beam_width, density=0.0) - - -@pytest.fixture -def scores(moves, batch_size, beam_width): - return [ - numpy.asarray( - 
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), dtype="f" - ) - for _ in range(batch_size) - ] - - -def test_create_beam(beam): - pass - - -def test_beam_advance(beam, scores): - beam.advance(scores) - - -def test_beam_advance_too_few_scores(beam, scores): - with pytest.raises(IndexError): - beam.advance(scores[:-1]) - - -def test_beam_parse(): - nlp = Language() - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser") - nlp.parser.add_label("nsubj") - nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) - doc = nlp.make_doc("Australia is a country") - nlp.parser(doc, beam_width=2) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 6e13d3044..f13b7e847 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -33,7 +33,7 @@ def test_parser_root(en_tokenizer): @pytest.mark.xfail -@pytest.mark.parametrize("text", ["Hello"]) +# @pytest.mark.parametrize("text", ["Hello"]) def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): tokens = en_tokenizer(text) doc = get_doc( @@ -46,7 +46,8 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): assert doc[0].dep != 0 -@pytest.mark.xfail +# We removed the step_through API a while ago. we should bring it back though +@pytest.mark.xfail(reason="Unsupported") def test_parser_initial(en_tokenizer, en_parser): text = "I ate the pizza with anchovies." # heads = [1, 0, 1, -2, -3, -1, -5] @@ -90,8 +91,8 @@ def test_parser_merge_pp(en_tokenizer): assert doc[2].text == "another phrase" assert doc[3].text == "occurs" - -@pytest.mark.xfail +# We removed the step_through API a while ago. 
we should bring it back though +@pytest.mark.xfail(reason="Unsupported") def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): text = "a b c d e" diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 37a9136aa..ffd0c5df4 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -1,9 +1,9 @@ import pytest from thinc.api import Adam from spacy.attrs import NORM -from spacy.gold import GoldParse from spacy.vocab import Vocab +from spacy.gold import Example from spacy.pipeline.defaults import default_parser from spacy.tokens import Doc from spacy.pipeline import DependencyParser @@ -33,8 +33,10 @@ def parser(vocab): for i in range(10): losses = {} doc = Doc(vocab, words=["a", "b", "c", "d"]) - gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) - parser.update((doc, gold), sgd=sgd, losses=losses) + example = Example.from_dict( + doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} + ) + parser.update([example], sgd=sgd, losses=losses) return parser diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 62c7fbf17..a50ad8499 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -252,10 +252,18 @@ def test_preserving_links_ents_2(nlp): # fmt: off TRAIN_DATA = [ - ("Russ Cochran captured his first major title with his son as caddie.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}), - ("Russ Cochran his reprints include EC Comics.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}), - ("Russ Cochran has been publishing comic art.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}), - ("Russ Cochran was a member of University of Kentucky's golf team.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}), + ("Russ Cochran captured his first major title with his son as caddie.", + {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}, + "entities": [(0, 12, "PERSON")]}), + ("Russ Cochran his reprints include EC Comics.", + {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, + "entities": [(0, 12, "PERSON")]}), + ("Russ Cochran has been publishing comic art.", + {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, + "entities": [(0, 12, "PERSON")]}), + ("Russ Cochran was a member of University of Kentucky's golf team.", + {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}, + "entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}), ] GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] # fmt: on diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index f052c4380..c853de232 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -53,7 +53,7 @@ def test_overfitting_IO(): "Feat=J|POS=ADJ", "Feat=N|POS=NOUN", ] - assert gold_morphs == [t.morph_ for t in doc] + assert [t.morph_ for t in doc] == gold_morphs # Also test the results are still the same after IO with make_tempdir() as tmp_dir: diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index 5c00b97ce..6dfa0acee 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -26,7 +26,7 @@ def test_sentencizer_pipe(): sent_starts = [t.is_sent_start for t in doc] assert sent_starts == [True, False, True, False, False, False, False] assert len(list(doc.sents)) == 2 - for ex 
in nlp.pipe(texts, as_example=True): + for ex in nlp.pipe(texts): doc = ex.doc assert doc.is_sentenced sent_starts = [t.is_sent_start for t in doc] diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 179659597..6f01ada69 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -7,11 +7,11 @@ from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TextCategorizer from spacy.tokens import Doc -from spacy.gold import GoldParse from spacy.util import fix_random_seed from ..util import make_tempdir from spacy.pipeline.defaults import default_tok2vec +from ...gold import Example TRAIN_DATA = [ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), @@ -51,21 +51,20 @@ def test_textcat_learns_multilabel(): cats = {letter: float(w2 == letter) for letter in letters} docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats)) random.shuffle(docs) - model = TextCategorizer(nlp.vocab, width=8) + textcat = TextCategorizer(nlp.vocab, width=8) for letter in letters: - model.add_label(letter) - optimizer = model.begin_training() + textcat.add_label(letter) + optimizer = textcat.begin_training() for i in range(30): losses = {} - Ys = [GoldParse(doc, cats=cats) for doc, cats in docs] - Xs = [doc for doc, cats in docs] - model.update(Xs, Ys, sgd=optimizer, losses=losses) + examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs] + textcat.update(examples, sgd=optimizer, losses=losses) random.shuffle(docs) for w1 in letters: for w2 in letters: doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3) truth = {letter: w2 == letter for letter in letters} - model(doc) + textcat(doc) for cat, score in doc.cats.items(): if not truth[cat]: assert score < 0.5 diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 6a2d16733..8c989a7eb 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -277,11 +277,18 @@ def test_issue1967(label): "beam_update_prob": 1.0, } ner = EntityRecognizer(Vocab(), default_ner(), **config) - example = Example(doc=None) - example.set_token_annotation( - ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label] + example = Example.from_dict( + Doc(ner.vocab, words=["word"]), + { + "ids": [0], + "words": ["word"], + "tags": ["tag"], + "heads": [0], + "deps": ["dep"], + "entities": [label], + }, ) - ner.moves.get_actions(gold_parses=[example]) + assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1] def test_issue1971(en_vocab): diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index 5e2764618..3bddc26ca 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -1,5 +1,7 @@ from collections import defaultdict +import pytest + from spacy.pipeline.defaults import default_ner from spacy.pipeline import EntityRecognizer @@ -7,6 +9,8 @@ from spacy.lang.en import English from spacy.tokens import Span +# skipped after removing Beam stuff during the Example/GoldParse refactor +@pytest.mark.skip def test_issue4313(): """ This should not crash or exit with some strange error code """ beam_width = 16 diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py index 80d37b1e6..fc05444d5 100644 --- a/spacy/tests/regression/test_issue4402.py +++ 
b/spacy/tests/regression/test_issue4402.py @@ -1,24 +1,31 @@ -import srsly -from spacy.gold import GoldCorpus +from spacy.gold import Corpus from spacy.lang.en import English from ..util import make_tempdir +from ...gold.converters import json2docs +from ...tokens import DocBin def test_issue4402(): nlp = English() with make_tempdir() as tmpdir: - json_path = tmpdir / "test4402.json" - srsly.write_json(json_path, json_data) + output_file = tmpdir / "test4402.spacy" + docs = json2docs([json_data]) + data = DocBin(docs=docs, attrs =["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - corpus = GoldCorpus(str(json_path), str(json_path)) + train_data = list(corpus.train_dataset(nlp)) + assert len(train_data) == 2 - train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0)) - # assert that the data got split into 4 sentences - assert len(train_data) == 4 + split_train_data = [] + for eg in train_data: + split_train_data.extend(eg.split_sents()) + assert len(split_train_data) == 4 -json_data = [ +json_data =\ { "id": 0, "paragraphs": [ @@ -89,4 +96,3 @@ json_data = [ }, ], } -] diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py index fa962c053..0708499de 100644 --- a/spacy/tests/regression/test_issue4529.py +++ b/spacy/tests/regression/test_issue4529.py @@ -1,5 +1,6 @@ import pytest -from spacy.gold import GoldParse + +from spacy.gold import Example @pytest.mark.parametrize( @@ -7,4 +8,4 @@ from spacy.gold import GoldParse ) def test_gold_misaligned(en_tokenizer, text, words): doc = en_tokenizer(text) - GoldParse(doc, words=words) + Example.from_dict(doc, {"words": words}) diff --git a/spacy/tests/regression/test_issue4665.py b/spacy/tests/regression/test_issue4665.py index 721ec0098..e28d0f44a 100644 --- a/spacy/tests/regression/test_issue4665.py +++ b/spacy/tests/regression/test_issue4665.py @@ -1,4 +1,7 @@ -from spacy.cli.converters.conllu2json import conllu2json +import pytest + +# TODO +# from spacy.gold.converters.conllu2docs import conllu2docs input_data = """ 1 [ _ PUNCT -LRB- _ _ punct _ _ @@ -22,10 +25,11 @@ input_data = """ """ +@pytest.mark.xfail def test_issue4665(): """ conllu2json should not raise an exception if the HEAD column contains an underscore """ - - conllu2json(input_data) + pass + # conllu2json(input_data) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 132f7ac9f..ca0f3710f 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,9 +1,14 @@ import pytest +from spacy.gold import docs_to_json +from spacy.gold.converters import iob2docs, conll_ner2docs +from spacy.gold.converters.conllu2json import conllu2json from spacy.lang.en import English -from spacy.cli.converters import conllu2json, iob2json, conll_ner2json from spacy.cli.pretrain import make_docs +# TODO +# from spacy.gold.converters import conllu2docs + def test_cli_converters_conllu2json(): # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu @@ -109,7 +114,7 @@ def test_cli_converters_conllu2json_subtokens(): assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"] -def test_cli_converters_iob2json(): +def test_cli_converters_iob2json(en_vocab): lines = [ "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", @@ -117,19 
+122,21 @@ def test_cli_converters_iob2json(): "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O", ] input_data = "\n".join(lines) - converted = iob2json(input_data, n_sents=10) - assert len(converted) == 1 - assert converted[0]["id"] == 0 - assert len(converted[0]["paragraphs"]) == 1 - assert len(converted[0]["paragraphs"][0]["sentences"]) == 4 + converted_docs = iob2docs(input_data, en_vocab, n_sents=10) + assert len(converted_docs) == 1 + converted = docs_to_json(converted_docs) + assert converted["id"] == 0 + assert len(converted["paragraphs"]) == 1 + assert len(converted["paragraphs"][0]["sentences"]) == 4 for i in range(0, 4): - sent = converted[0]["paragraphs"][0]["sentences"][i] + sent = converted["paragraphs"][0]["sentences"][i] assert len(sent["tokens"]) == 8 tokens = sent["tokens"] # fmt: off assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."] - assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"] - # fmt: on + assert len(converted_docs[0].ents) == 8 + for ent in converted_docs[0].ents: + assert(ent.text in ["New York City", "London"]) def test_cli_converters_conll_ner2json(): @@ -182,19 +189,22 @@ def test_cli_converters_conll_ner2json(): ".\t.\t_\tO", ] input_data = "\n".join(lines) - converted = conll_ner2json(input_data, n_sents=10) - assert len(converted) == 1 - assert converted[0]["id"] == 0 - assert len(converted[0]["paragraphs"]) == 1 - assert len(converted[0]["paragraphs"][0]["sentences"]) == 5 + converted_docs = conll_ner2docs(input_data, n_sents=10) + assert len(converted_docs) == 1 + converted = docs_to_json(converted_docs) + assert converted["id"] == 0 + assert len(converted["paragraphs"]) == 1 + assert len(converted["paragraphs"][0]["sentences"]) == 5 for i in range(0, 5): - sent = converted[0]["paragraphs"][0]["sentences"][i] + sent = converted["paragraphs"][0]["sentences"][i] assert len(sent["tokens"]) == 8 tokens = sent["tokens"] # fmt: off assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."] - assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"] # fmt: on + assert len(converted_docs[0].ents) == 10 + for ent in converted_docs[0].ents: + assert (ent.text in ["New York City", "London"]) def test_pretrain_make_docs(): diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 982c0d910..a7c476688 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,15 +1,18 @@ from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags -from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo, align -from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation +from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align +from spacy.gold import Corpus, docs_to_json +from spacy.gold.example import Example +from spacy.gold.converters import json2docs from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree -from spacy.tokens import Doc +from spacy.tokens import Doc, DocBin from spacy.util import get_words_and_spaces, compounding, minibatch import pytest import srsly from .util import make_tempdir +from ..gold.augment import make_orth_variants_example @pytest.fixture @@ -89,11 +92,18 @@ def merged_dict(): return { "ids": [1, 2, 3, 4, 5, 6, 7], "words": ["Hi", "there", "everyone", "It", "is", "just", "me"], + "spaces": [True, True, 
True, True, True, True, False], "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"], - "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0], + "sent_starts": [1, 0, 0, 1, 0, 0, 0], } +@pytest.fixture +def vocab(): + nlp = English() + return nlp.vocab + + def test_gold_biluo_U(en_vocab): words = ["I", "flew", "to", "London", "."] spaces = [True, True, True, False, True] @@ -143,38 +153,182 @@ def test_gold_biluo_misalign(en_vocab): assert tags == ["O", "O", "O", "-", "-", "-"] +def test_example_from_dict_no_ner(en_vocab): + words = ["a", "b", "c", "d"] + spaces = [True, True, False, True] + predicted = Doc(en_vocab, words=words, spaces=spaces) + example = Example.from_dict(predicted, {"words": words}) + ner_tags = example.get_aligned_ner() + assert ner_tags == [None, None, None, None] + + +def test_example_from_dict_some_ner(en_vocab): + words = ["a", "b", "c", "d"] + spaces = [True, True, False, True] + predicted = Doc(en_vocab, words=words, spaces=spaces) + example = Example.from_dict( + predicted, + { + "words": words, + "entities": ["U-LOC", None, None, None] + } + ) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["U-LOC", None, None, None] + + +def test_json2docs_no_ner(en_vocab): + data = [{ + "id":1, + "paragraphs":[ + { + "sentences":[ + { + "tokens":[ + { + "dep":"nn", + "head":1, + "tag":"NNP", + "orth":"Ms." + }, + { + "dep":"nsubj", + "head":1, + "tag":"NNP", + "orth":"Haag" + }, + { + "dep":"ROOT", + "head":0, + "tag":"VBZ", + "orth":"plays" + }, + { + "dep":"dobj", + "head":-1, + "tag":"NNP", + "orth":"Elianti" + }, + { + "dep":"punct", + "head":-2, + "tag":".", + "orth":"." + } + ] + } + ] + } + ] + }] + docs = json2docs(data) + assert len(docs) == 1 + for doc in docs: + assert not doc.is_nered + for token in doc: + assert token.ent_iob == 0 + eg = Example( + Doc( + doc.vocab, + words=[w.text for w in doc], + spaces=[bool(w.whitespace_) for w in doc] + ), + doc + ) + ner_tags = eg.get_aligned_ner() + assert ner_tags == [None, None, None, None, None] + + + +def test_split_sentences(en_vocab): + words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"] + doc = Doc(en_vocab, words=words) + gold_words = [ + "I", + "flew", + "to", + "San", + "Francisco", + "Valley", + "had", + "loads", + "of", + "fun", + ] + sent_starts = [True, False, False, False, False, False, True, False, False, False] + example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts}) + assert example.text == "I flew to San Francisco Valley had loads of fun " + split_examples = example.split_sents() + assert len(split_examples) == 2 + assert split_examples[0].text == "I flew to San Francisco Valley " + assert split_examples[1].text == "had loads of fun " + + words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of fun"] + doc = Doc(en_vocab, words=words) + gold_words = [ + "I", + "flew", + "to", + "San Francisco", + "Valley", + "had", + "loads of", + "fun", + ] + sent_starts = [True, False, False, False, False, True, False, False] + example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts}) + assert example.text == "I flew to San Francisco Valley had loads of fun " + split_examples = example.split_sents() + assert len(split_examples) == 2 + assert split_examples[0].text == "I flew to San Francisco Valley " + assert split_examples[1].text == "had loads of fun " + + def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): # one-to-many words = ["I", "flew to", "San Francisco Valley", "."] spaces = [True, True, 
False, False] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gp = GoldParse( - doc, - words=["I", "flew", "to", "San", "Francisco", "Valley", "."], - entities=entities, - ) - assert gp.ner == ["O", "O", "U-LOC", "O"] - + gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", None, "U-LOC", "O"] + # many-to-one words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] spaces = [True, True, True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gp = GoldParse( - doc, words=["I", "flew to", "San Francisco Valley", "."], entities=entities - ) - assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] + gold_words = ["I", "flew to", "San Francisco Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] # misaligned words = ["I flew", "to", "San Francisco", "Valley", "."] spaces = [True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gp = GoldParse( - doc, words=["I", "flew to", "San", "Francisco Valley", "."], entities=entities, + offset_start = len("I flew to ") + offset_end = len("I flew to San Francisco Valley") + entities = [(offset_start, offset_end, "LOC")] + links = {(offset_start, offset_end): {"Q816843": 1.0}} + gold_words = ["I", "flew to", "San", "Francisco Valley", "."] + example = Example.from_dict( + doc, {"words": gold_words, "entities": entities, "links": links} ) - assert gp.ner == ["O", "O", "B-LOC", "L-LOC", "O"] + ner_tags = example.get_aligned_ner() + assert ner_tags == [None, "O", "B-LOC", "L-LOC", "O"] + #assert example.get_aligned("ENT_KB_ID", as_string=True) == [ + # "", + # "", + # "Q816843", + # "Q816843", + # "", + #] + #assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == { + # "Q816843": 1.0 + #} # additional whitespace tokens in GoldParse words words, spaces = get_words_and_spaces( @@ -183,33 +337,34 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): ) doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gp = GoldParse( - doc, - words=["I", "flew", " ", "to", "San Francisco Valley", "."], - entities=entities, + gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."] + gold_spaces = [True, True, False, True, False, False] + example = Example.from_dict( + doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities} ) - assert gp.ner == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] + ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] # from issue #4791 - data = ( - "I'll return the ₹54 amount", - { - "words": ["I", "'ll", "return", "the", "₹", "54", "amount"], - "entities": [(16, 19, "MONEY")], - }, + doc = en_tokenizer("I'll return the ₹54 amount") + gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"] + gold_spaces = [False, True, True, True, False, True, False] + entities = [(16, 19, "MONEY")] + example = Example.from_dict( + doc, {"words": gold_words, 
"spaces": gold_spaces, "entities": entities} ) - gp = GoldParse(en_tokenizer(data[0]), **data[1]) - assert gp.ner == ["O", "O", "O", "O", "U-MONEY", "O"] + ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", "O", "O", "O", "U-MONEY", "O"] - data = ( - "I'll return the $54 amount", - { - "words": ["I", "'ll", "return", "the", "$", "54", "amount"], - "entities": [(16, 19, "MONEY")], - }, + doc = en_tokenizer("I'll return the $54 amount") + gold_words = ["I", "'ll", "return", "the", "$", "54", "amount"] + gold_spaces = [False, True, True, True, False, True, False] + entities = [(16, 19, "MONEY")] + example = Example.from_dict( + doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities} ) - gp = GoldParse(en_tokenizer(data[0]), **data[1]) - assert gp.ner == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"] + ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"] def test_roundtrip_offsets_biluo_conversion(en_tokenizer): @@ -220,6 +375,7 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer): biluo_tags_converted = biluo_tags_from_offsets(doc, offsets) assert biluo_tags_converted == biluo_tags offsets_converted = offsets_from_biluo_tags(doc, biluo_tags) + offsets_converted = [ent for ent in offsets if ent[2]] assert offsets_converted == offsets @@ -227,6 +383,7 @@ def test_biluo_spans(en_tokenizer): doc = en_tokenizer("I flew to Silicon Valley via London.") biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] spans = spans_from_biluo_tags(doc, biluo_tags) + spans = [span for span in spans if span.label_] assert len(spans) == 2 assert spans[0].text == "Silicon Valley" assert spans[0].label_ == "LOC" @@ -237,7 +394,8 @@ def test_biluo_spans(en_tokenizer): def test_gold_ner_missing_tags(en_tokenizer): doc = en_tokenizer("I flew to Silicon Valley via London.") biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] - gold = GoldParse(doc, entities=biluo_tags) # noqa: F841 + example = Example.from_dict(doc, {"entities": biluo_tags}) + assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2] def test_iob_to_biluo(): @@ -250,159 +408,98 @@ def test_iob_to_biluo(): iob_to_biluo(bad_iob) -def test_roundtrip_docs_to_json(doc): +def test_roundtrip_docs_to_docbin(doc): nlp = English() text = doc.text + idx = [t.idx for t in doc] tags = [t.tag_ for t in doc] pos = [t.pos_ for t in doc] morphs = [t.morph_ for t in doc] lemmas = [t.lemma_ for t in doc] deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] - biluo_tags = iob_to_biluo( - [t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc] - ) cats = doc.cats + ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents] - # roundtrip to JSON + # roundtrip to DocBin with make_tempdir() as tmpdir: json_file = tmpdir / "roundtrip.json" srsly.write_json(json_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file)) - - reloaded_example = next(goldcorpus.dev_dataset(nlp)) - goldparse = reloaded_example.gold - - assert len(doc) == goldcorpus.count_train() - assert text == reloaded_example.text - assert tags == goldparse.tags - assert pos == goldparse.pos - assert morphs == goldparse.morphs - assert lemmas == goldparse.lemmas - assert deps == goldparse.labels - assert heads == goldparse.heads - assert biluo_tags == goldparse.ner - assert "TRAVEL" in goldparse.cats - assert "BAKING" in goldparse.cats - assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] - assert cats["BAKING"] == 
goldparse.cats["BAKING"] - - # roundtrip to JSONL train dicts - with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "roundtrip.jsonl" - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - - reloaded_example = next(goldcorpus.dev_dataset(nlp)) - goldparse = reloaded_example.gold - - assert len(doc) == goldcorpus.count_train() - assert text == reloaded_example.text - assert tags == goldparse.tags - assert pos == goldparse.pos - assert morphs == goldparse.morphs - assert lemmas == goldparse.lemmas - assert deps == goldparse.labels - assert heads == goldparse.heads - assert biluo_tags == goldparse.ner - assert "TRAVEL" in goldparse.cats - assert "BAKING" in goldparse.cats - assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] - assert cats["BAKING"] == goldparse.cats["BAKING"] - - # roundtrip to JSONL tuples - with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "roundtrip.jsonl" - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - # load and rewrite as JSONL tuples - srsly.write_jsonl(jsonl_file, goldcorpus.train_examples) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - - reloaded_example = next(goldcorpus.dev_dataset(nlp)) - goldparse = reloaded_example.gold - - assert len(doc) == goldcorpus.count_train() - assert text == reloaded_example.text - assert tags == goldparse.tags - assert deps == goldparse.labels - assert heads == goldparse.heads - assert lemmas == goldparse.lemmas - assert biluo_tags == goldparse.ner - assert "TRAVEL" in goldparse.cats - assert "BAKING" in goldparse.cats - assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] - assert cats["BAKING"] == goldparse.cats["BAKING"] - - -def test_projective_train_vs_nonprojective_dev(doc): - nlp = English() - deps = [t.dep_ for t in doc] - heads = [t.head.i for t in doc] - - with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) - - train_reloaded_example = next(goldcorpus.train_dataset(nlp)) - train_goldparse = train_reloaded_example.gold - - dev_reloaded_example = next(goldcorpus.dev_dataset(nlp)) - dev_goldparse = dev_reloaded_example.gold - - assert is_nonproj_tree([t.head.i for t in doc]) is True - assert is_nonproj_tree(train_goldparse.heads) is False - assert heads[:-1] == train_goldparse.heads[:-1] - assert heads[-1] != train_goldparse.heads[-1] - assert deps[:-1] == train_goldparse.labels[:-1] - assert deps[-1] != train_goldparse.labels[-1] - - assert heads == dev_goldparse.heads - assert deps == dev_goldparse.labels + goldcorpus = Corpus(str(json_file), str(json_file)) + output_file = tmpdir / "roundtrip.spacy" + data = DocBin(docs=[doc]).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) + reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp)) + assert len(doc) == goldcorpus.count_train(nlp) + assert text == reloaded_example.reference.text + assert idx == [t.idx for t in reloaded_example.reference] + assert tags == [t.tag_ for t in reloaded_example.reference] + assert pos == [t.pos_ for t in reloaded_example.reference] + assert morphs == [t.morph_ for t in reloaded_example.reference] + assert lemmas == [t.lemma_ for t in reloaded_example.reference] + assert deps == [t.dep_ for t in reloaded_example.reference] + 
assert heads == [t.head.i for t in reloaded_example.reference] + assert ents == [ + (e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents + ] + assert "TRAVEL" in reloaded_example.reference.cats + assert "BAKING" in reloaded_example.reference.cats + assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"] + assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] +# Hm, not sure where misalignment check would be handled? In the components too? +# I guess that does make sense. A text categorizer doesn't care if it's +# misaligned... +@pytest.mark.xfail(reason="Outdated") def test_ignore_misaligned(doc): nlp = English() text = doc.text with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" + json_file = tmpdir / "test.json" data = [docs_to_json(doc)] data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, data) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + # write to JSON train dicts + srsly.write_json(json_file, data) + goldcorpus = Corpus(str(json_file), str(json_file)) - with pytest.raises(AlignmentError): - train_reloaded_example = next(goldcorpus.train_dataset(nlp)) + with pytest.raises(AlignmentError): + train_reloaded_example = next(goldcorpus.train_dataset(nlp)) with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" + json_file = tmpdir / "test.json" data = [docs_to_json(doc)] data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, data) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + # write to JSON train dicts + srsly.write_json(json_file, data) + goldcorpus = Corpus(str(json_file), str(json_file)) - # doesn't raise an AlignmentError, but there is nothing to iterate over - # because the only example can't be aligned - train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True)) - assert len(train_reloaded_example) == 0 + # doesn't raise an AlignmentError, but there is nothing to iterate over + # because the only example can't be aligned + train_reloaded_example = list( + goldcorpus.train_dataset(nlp, ignore_misaligned=True) + ) + assert len(train_reloaded_example) == 0 +# We probably want the orth variant logic back, but this test won't be quite +# right -- we need to go from DocBin. 
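# A minimal sketch, assuming the dev API exercised in this test file (DocBin,
# Corpus and make_orth_variants_example); the example text and the
# "train.spacy" file name are illustrative assumptions only.
from pathlib import Path
from spacy.lang.en import English
from spacy.tokens import DocBin
from spacy.gold import Corpus
from spacy.gold.augment import make_orth_variants_example

nlp = English()
doc = nlp("I flew to San Francisco Valley.")
output_file = Path("train.spacy")
# Serialize the Doc(s) to the binary format that Corpus reads.
with output_file.open("wb") as file_:
    file_.write(DocBin(docs=[doc]).to_bytes())
# Read the data back as Example objects.
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
train_example = next(corpus.train_dataset(nlp))
# Orth variants are applied to an Example directly, no longer via the corpus reader.
variant_example = make_orth_variants_example(
    nlp, train_example, orth_variant_level=0.2
)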
def test_make_orth_variants(doc): nlp = English() with make_tempdir() as tmpdir: - jsonl_file = tmpdir / "test.jsonl" - # write to JSONL train dicts - srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + output_file = tmpdir / "roundtrip.spacy" + data = DocBin(docs=[doc]).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - # due to randomness, test only that this runs with no errors for now - train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) - train_goldparse = train_reloaded_example.gold # noqa: F841 + # due to randomness, test only that this runs with no errors for now + train_example = next(goldcorpus.train_dataset(nlp)) + variant_example = make_orth_variants_example( + nlp, train_example, orth_variant_level=0.2 + ) @pytest.mark.parametrize( @@ -439,39 +536,35 @@ def test_align(tokens_a, tokens_b, expected): def test_goldparse_startswith_space(en_tokenizer): text = " a" doc = en_tokenizer(text) - g = GoldParse(doc, words=["a"], entities=["U-DATE"], deps=["ROOT"], heads=[0]) - assert g.words == [" ", "a"] - assert g.ner == [None, "U-DATE"] - assert g.labels == [None, "ROOT"] + gold_words = ["a"] + entities = ["U-DATE"] + deps = ["ROOT"] + heads = [0] + example = Example.from_dict( + doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads} + ) + ner_tags = example.get_aligned_ner() + assert ner_tags == [None, "U-DATE"] + assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"] def test_gold_constructor(): - """Test that the GoldParse constructor works fine""" + """Test that the Example constructor works fine""" nlp = English() doc = nlp("This is a sentence") - gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0}) - - assert gold.cats["cat1"] - assert not gold.cats["cat2"] - assert gold.words == ["This", "is", "a", "sentence"] - - -def test_gold_orig_annot(): - nlp = English() - doc = nlp("This is a sentence") - gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0}) - - assert gold.orig.words == ["This", "is", "a", "sentence"] - assert gold.cats["cat1"] - - doc_annotation = DocAnnotation(cats={"cat1": 0.0, "cat2": 1.0}) - gold2 = GoldParse.from_annotation(doc, doc_annotation, gold.orig) - assert gold2.orig.words == ["This", "is", "a", "sentence"] - assert not gold2.cats["cat1"] + example = Example.from_dict(doc, {"cats": {"cat1": 1.0, "cat2": 0.0}}) + assert example.get_aligned("ORTH", as_string=True) == [ + "This", + "is", + "a", + "sentence", + ] + assert example.reference.cats["cat1"] + assert not example.reference.cats["cat2"] def test_tuple_format_implicit(): - """Test tuple format with implicit GoldParse creation""" + """Test tuple format""" train_data = [ ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}), @@ -486,7 +579,7 @@ def test_tuple_format_implicit(): def test_tuple_format_implicit_invalid(): - """Test that an error is thrown for an implicit invalid GoldParse field""" + """Test that an error is thrown for an implicit invalid field""" train_data = [ ("Uber blew through $1 million a week", {"frumble": [(0, 4, "ORG")]}), @@ -497,10 +590,11 @@ def test_tuple_format_implicit_invalid(): ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}), ] - with pytest.raises(TypeError): + with pytest.raises(KeyError): _train(train_data) + def _train(train_data): nlp = English() ner = nlp.create_pipe("ner") @@ -518,43 +612,23 @@ def 
_train(train_data): def test_split_sents(merged_dict): nlp = English() - example = Example() - example.set_token_annotation(**merged_dict) - assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2 - assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1 + example = Example.from_dict( + Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]), + merged_dict, + ) + assert example.text == "Hi there everyone It is just me" split_examples = example.split_sents() assert len(split_examples) == 2 + assert split_examples[0].text == "Hi there everyone " + assert split_examples[1].text == "It is just me" - token_annotation_1 = split_examples[0].token_annotation - assert token_annotation_1.ids == [1, 2, 3] - assert token_annotation_1.words == ["Hi", "there", "everyone"] - assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"] - assert token_annotation_1.sent_starts == [1, 0, 0] + token_annotation_1 = split_examples[0].to_dict()["token_annotation"] + assert token_annotation_1["words"] == ["Hi", "there", "everyone"] + assert token_annotation_1["tags"] == ["INTJ", "ADV", "PRON"] + assert token_annotation_1["sent_starts"] == [1, 0, 0] - token_annotation_2 = split_examples[1].token_annotation - assert token_annotation_2.ids == [4, 5, 6, 7] - assert token_annotation_2.words == ["It", "is", "just", "me"] - assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"] - assert token_annotation_2.sent_starts == [1, 0, 0, 0] - - -def test_tuples_to_example(merged_dict): - ex = Example() - ex.set_token_annotation(**merged_dict) - cats = {"TRAVEL": 1.0, "BAKING": 0.0} - ex.set_doc_annotation(cats=cats) - ex_dict = ex.to_dict() - - assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"] - assert ex_dict["token_annotation"]["words"] == merged_dict["words"] - assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"] - assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"] - assert ex_dict["doc_annotation"]["cats"] == cats - - -def test_empty_example_goldparse(): - nlp = English() - doc = nlp("") - example = Example(doc=doc) - assert len(example.get_gold_parses()) == 1 + token_annotation_2 = split_examples[1].to_dict()["token_annotation"] + assert token_annotation_2["words"] == ["It", "is", "just", "me"] + assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"] + assert token_annotation_2["sent_starts"] == [1, 0, 0, 0] diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 58db0a040..e5555bbc7 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -1,6 +1,5 @@ import itertools import pytest -from spacy.gold import GoldParse from spacy.language import Language from spacy.tokens import Doc, Span from spacy.vocab import Vocab @@ -24,40 +23,27 @@ def test_language_update(nlp): annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} wrongkeyannots = {"LABEL": True} doc = Doc(nlp.vocab, words=text.split(" ")) - gold = GoldParse(doc, **annots) - # Update with doc and gold objects - nlp.update((doc, gold)) # Update with text and dict nlp.update((text, annots)) # Update with doc object and dict nlp.update((doc, annots)) - # Update with text and gold object - nlp.update((text, gold)) - # Update with empty doc and gold object - nlp.update((None, gold)) # Update badly with pytest.raises(ValueError): nlp.update((doc, None)) - with pytest.raises(TypeError): + with pytest.raises(KeyError): nlp.update((text, wrongkeyannots)) def test_language_evaluate(nlp): text = "hello world" - 
annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} + annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}} doc = Doc(nlp.vocab, words=text.split(" ")) - gold = GoldParse(doc, **annots) - # Evaluate with doc and gold objects - nlp.evaluate([(doc, gold)]) # Evaluate with text and dict nlp.evaluate([(text, annots)]) # Evaluate with doc object and dict nlp.evaluate([(doc, annots)]) - # Evaluate with text and gold object - nlp.evaluate([(text, gold)]) - # Evaluate badly with pytest.raises(Exception): - nlp.evaluate([text, gold]) + nlp.evaluate([text, annots]) def test_evaluate_no_pipe(nlp): diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py new file mode 100644 index 000000000..b89654554 --- /dev/null +++ b/spacy/tests/test_new_example.py @@ -0,0 +1,242 @@ +import pytest +from spacy.gold.example import Example +from spacy.tokens import Doc +from spacy.vocab import Vocab + + +def test_Example_init_requires_doc_objects(): + vocab = Vocab() + with pytest.raises(TypeError): + example = Example(None, None) + with pytest.raises(TypeError): + example = Example(Doc(vocab, words=["hi"]), None) + with pytest.raises(TypeError): + example = Example(None, Doc(vocab, words=["hi"])) + + +def test_Example_from_dict_basic(): + example = Example.from_dict( + Doc(Vocab(), words=["hello", "world"]), {"words": ["hello", "world"]} + ) + assert isinstance(example.x, Doc) + assert isinstance(example.y, Doc) + + +@pytest.mark.parametrize( + "annots", [{"words": ["ice", "cream"], "weirdannots": ["something", "such"]}] +) +def test_Example_from_dict_invalid(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + with pytest.raises(KeyError): + Example.from_dict(predicted, annots) + + +@pytest.mark.parametrize( + "pred_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]] +) +@pytest.mark.parametrize("annots", [{"words": ["icecream"], "tags": ["NN"]}]) +def test_Example_from_dict_with_tags(pred_words, annots): + vocab = Vocab() + predicted = Doc(vocab, words=pred_words) + example = Example.from_dict(predicted, annots) + for i, token in enumerate(example.reference): + assert token.tag_ == annots["tags"][i] + aligned_tags = example.get_aligned("tag", as_string=True) + assert aligned_tags == ["NN" for _ in predicted] + + +def test_aligned_tags(): + pred_words = ["Apply", "some", "sunscreen", "unless", "you", "can", "not"] + gold_words = ["Apply", "some", "sun", "screen", "unless", "you", "cannot"] + gold_tags = ["VERB", "DET", "NOUN", "NOUN", "SCONJ", "PRON", "VERB"] + annots = {"words": gold_words, "tags": gold_tags} + vocab = Vocab() + predicted = Doc(vocab, words=pred_words) + example = Example.from_dict(predicted, annots) + aligned_tags = example.get_aligned("tag", as_string=True) + assert aligned_tags == ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"] + + +def test_aligned_tags_multi(): + pred_words = ["Applysome", "sunscreen", "unless", "you", "can", "not"] + gold_words = ["Apply", "somesun", "screen", "unless", "you", "cannot"] + gold_tags = ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB"] + annots = {"words": gold_words, "tags": gold_tags} + vocab = Vocab() + predicted = Doc(vocab, words=pred_words) + example = Example.from_dict(predicted, annots) + aligned_tags = example.get_aligned("tag", as_string=True) + assert aligned_tags == [None, None, "SCONJ", "PRON", "VERB", "VERB"] + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "London", "and", "Berlin", "."], + "deps": ["nsubj", "ROOT", "dobj", "cc", 
"conj", "punct"], + "heads": [1, 1, 1, 2, 2, 1], + } + ], +) +def test_Example_from_dict_with_parse(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + for i, token in enumerate(example.reference): + assert token.dep_ == annots["deps"][i] + assert token.head.i == annots["heads"][i] + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["Sarah", "'s", "sister", "flew"], + "morphs": [ + "NounType=prop|Number=sing", + "Poss=yes", + "Number=sing", + "Tense=past|VerbForm=fin", + ], + } + ], +) +def test_Example_from_dict_with_morphology(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + for i, token in enumerate(example.reference): + assert token.morph_ == annots["morphs"][i] + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["This", "is", "one", "sentence", "this", "is", "another"], + "sent_starts": [1, 0, 0, 0, 1, 0, 0], + } + ], +) +def test_Example_from_dict_with_sent_start(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + assert len(list(example.reference.sents)) == 2 + for i, token in enumerate(example.reference): + assert bool(token.is_sent_start) == bool(annots["sent_starts"][i]) + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["This", "is", "a", "sentence"], + "cats": {"cat1": 1.0, "cat2": 0.0, "cat3": 0.5}, + } + ], +) +def test_Example_from_dict_with_cats(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + assert len(list(example.reference.cats)) == 3 + assert example.reference.cats["cat1"] == 1.0 + assert example.reference.cats["cat2"] == 0.0 + assert example.reference.cats["cat3"] == 0.5 + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [(7, 15, "LOC"), (20, 26, "LOC")], + } + ], +) +def test_Example_from_dict_with_entities(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + + assert len(list(example.reference.ents)) == 2 + assert [example.reference[i].ent_iob_ for i in range(7)] == [ + "O", + "O", + "B", + "I", + "O", + "B", + "O", + ] + assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2, 3, 2] + + assert example.reference[2].ent_type_ == "LOC" + assert example.reference[3].ent_type_ == "LOC" + assert example.reference[5].ent_type_ == "LOC" + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [ + (0, 4, "LOC"), + (21, 27, "LOC"), + ], # not aligned to token boundaries + } + ], +) +def test_Example_from_dict_with_entities_invalid(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + # TODO: shouldn't this throw some sort of warning ? 
+ assert len(list(example.reference.ents)) == 0 + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [(7, 15, "LOC"), (20, 26, "LOC")], + "links": { + (7, 15): {"Q60": 1.0, "Q64": 0.0}, + (20, 26): {"Q60": 0.0, "Q64": 1.0}, + }, + } + ], +) +def test_Example_from_dict_with_links(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + example = Example.from_dict(predicted, annots) + assert example.reference[0].ent_kb_id_ == "" + assert example.reference[1].ent_kb_id_ == "" + assert example.reference[2].ent_kb_id_ == "Q60" + assert example.reference[3].ent_kb_id_ == "Q60" + assert example.reference[4].ent_kb_id_ == "" + assert example.reference[5].ent_kb_id_ == "Q64" + assert example.reference[6].ent_kb_id_ == "" + + +@pytest.mark.parametrize( + "annots", + [ + { + "words": ["I", "like", "New", "York", "and", "Berlin", "."], + "entities": [(7, 15, "LOC"), (20, 26, "LOC")], + "links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}}, + } + ], +) +def test_Example_from_dict_with_links_invalid(annots): + vocab = Vocab() + predicted = Doc(vocab, words=annots["words"]) + with pytest.raises(ValueError): + Example.from_dict(predicted, annots) diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 2e1cf2730..a6684b706 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -1,12 +1,14 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx -from spacy.gold import Example, GoldParse +from spacy.gold import Example +from spacy.gold.iob_utils import biluo_tags_from_offsets from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from .util import get_doc from spacy.lang.en import English + test_las_apple = [ [ "Apple is looking at buying U.K. 
startup for $ 1 billion", @@ -89,8 +91,9 @@ def test_las_per_type(en_vocab): heads=([h - i for i, h in enumerate(annot["heads"])]), deps=annot["deps"], ) - gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) - scorer.score((doc, gold)) + gold = {"heads": annot["heads"], "deps": annot["deps"]} + example = Example.from_dict(doc, gold) + scorer.score(example) results = scorer.scores assert results["uas"] == 100 @@ -111,9 +114,10 @@ def test_las_per_type(en_vocab): heads=([h - i for i, h in enumerate(annot["heads"])]), deps=annot["deps"], ) - gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) + gold = {"heads": annot["heads"], "deps": annot["deps"]} doc[0].dep_ = "compound" - scorer.score((doc, gold)) + example = Example.from_dict(doc, gold) + scorer.score(example) results = scorer.scores assert results["uas"] == 100 @@ -135,8 +139,8 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]], ) - ex = Example(doc=doc) - ex.set_token_annotation(entities=annot["entities"]) + entities = biluo_tags_from_offsets(doc, annot["entities"]) + ex = Example.from_dict(doc, {"entities": entities}) scorer.score(ex) results = scorer.scores @@ -156,8 +160,8 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]], ) - ex = Example(doc=doc) - ex.set_token_annotation(entities=annot["entities"]) + entities = biluo_tags_from_offsets(doc, annot["entities"]) + ex = Example.from_dict(doc, {"entities": entities}) scorer.score(ex) results = scorer.scores @@ -181,13 +185,13 @@ def test_ner_per_type(en_vocab): def test_tag_score(tagged_doc): # Gold and Doc are identical scorer = Scorer() - gold = GoldParse( - tagged_doc, - tags=[t.tag_ for t in tagged_doc], - pos=[t.pos_ for t in tagged_doc], - morphs=[t.morph_ for t in tagged_doc], - ) - scorer.score((tagged_doc, gold)) + gold = { + "tags": [t.tag_ for t in tagged_doc], + "pos": [t.pos_ for t in tagged_doc], + "morphs": [t.morph_ for t in tagged_doc], + } + example = Example.from_dict(tagged_doc, gold) + scorer.score(example) results = scorer.scores assert results["tags_acc"] == 100 @@ -204,8 +208,9 @@ def test_tag_score(tagged_doc): morphs = [t.morph_ for t in tagged_doc] morphs[1] = "Number=sing" morphs[2] = "Number=plur" - gold = GoldParse(tagged_doc, tags=tags, pos=pos, morphs=morphs) - scorer.score((tagged_doc, gold)) + gold = {"tags": tags, "pos": pos, "morphs": morphs} + example = Example.from_dict(tagged_doc, gold) + scorer.score(example) results = scorer.scores assert results["tags_acc"] == 90 diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index a7258449d..65c33c54a 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -1,5 +1,4 @@ import pytest -from spacy.gold import Example from .util import get_random_doc @@ -25,19 +24,16 @@ from spacy.util import minibatch_by_words ) def test_util_minibatch(doc_sizes, expected_batches): docs = [get_random_doc(doc_size) for doc_size in doc_sizes] - examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 batches = list( - minibatch_by_words( - examples=examples, size=batch_size, tolerance=tol, discard_oversize=True - ) + minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True) ) assert [len(batch) for batch in batches] == expected_batches max_size = batch_size + batch_size * tol for batch in batches: - assert sum([len(example.doc) for example in batch]) < max_size + assert sum([len(doc) for doc in batch]) < max_size 
@pytest.mark.parametrize( @@ -54,12 +50,9 @@ def test_util_minibatch(doc_sizes, expected_batches): def test_util_minibatch_oversize(doc_sizes, expected_batches): """ Test that oversized documents are returned in their own batch""" docs = [get_random_doc(doc_size) for doc_size in doc_sizes] - examples = [Example(doc=doc) for doc in docs] tol = 0.2 batch_size = 1000 batches = list( - minibatch_by_words( - examples=examples, size=batch_size, tolerance=tol, discard_oversize=False - ) + minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False) ) assert [len(batch) for batch in batches] == expected_batches diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 3d0a023c9..7c3eaf8ad 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -7,7 +7,7 @@ from pathlib import Path from spacy import Errors from spacy.tokens import Doc, Span -from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA +from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH from spacy.vocab import Vocab @@ -27,15 +27,23 @@ def make_tempdir(): def get_doc( - vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None + vocab, + words=[], + pos=None, + heads=None, + deps=None, + tags=None, + ents=None, + lemmas=None, + morphs=None, ): """Create Doc object from given vocab, words and annotations.""" if deps and not heads: heads = [0] * len(deps) headings = [] values = [] - annotations = [pos, heads, deps, lemmas, tags] - possible_headings = [POS, HEAD, DEP, LEMMA, TAG] + annotations = [pos, heads, deps, lemmas, tags, morphs] + possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH] for a, annot in enumerate(annotations): if annot is not None: if len(annot) != len(words): @@ -61,6 +69,13 @@ def get_doc( attrs[i] = heads[i] else: attrs[i, j] = heads[i] + elif annot is morphs: + for i in range(len(words)): + morph_key = vocab.morphology.add(morphs[i]) + if attrs.ndim == 1: + attrs[i] = morph_key + else: + attrs[i, j] = morph_key else: for i in range(len(words)): if attrs.ndim == 1: diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index b40113460..2359fd5af 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -218,7 +218,7 @@ cdef class Tokenizer: doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws return doc - def pipe(self, texts, batch_size=1000, n_threads=-1, as_example=False): + def pipe(self, texts, batch_size=1000, n_threads=-1): """Tokenize a stream of texts. texts: A sequence of unicode texts. diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index d3f49550c..a3b089222 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -9,6 +9,9 @@ from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors +ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "LEMMA", "MORPH") + + class DocBin(object): """Pack Doc objects for binary serialization. @@ -39,7 +42,7 @@ class DocBin(object): document from the DocBin. """ - def __init__(self, attrs=None, store_user_data=False): + def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]): """Create a DocBin object to hold serialized annotations. attrs (list): List of attributes to serialize. 
'orth' and 'spacy' are @@ -49,7 +52,6 @@ class DocBin(object): DOCS: https://spacy.io/api/docbin#init """ - attrs = attrs or [] attrs = sorted([intify_attr(attr) for attr in attrs]) self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] @@ -59,6 +61,8 @@ class DocBin(object): self.user_data = [] self.strings = set() self.store_user_data = store_user_data + for doc in docs: + self.add(doc) def __len__(self): """RETURNS: The number of Doc objects added to the DocBin.""" @@ -79,7 +83,12 @@ class DocBin(object): assert array.shape[0] == spaces.shape[0] # this should never happen spaces = spaces.reshape((spaces.shape[0], 1)) self.spaces.append(numpy.asarray(spaces, dtype=bool)) - self.strings.update(w.text for w in doc) + for token in doc: + self.strings.add(token.text) + self.strings.add(token.tag_) + self.strings.add(token.lemma_) + self.strings.add(token.dep_) + self.strings.add(token.ent_type_) self.cats.append(doc.cats) if self.store_user_data: self.user_data.append(srsly.msgpack_dumps(doc.user_data)) @@ -98,8 +107,7 @@ class DocBin(object): for i in range(len(self.tokens)): tokens = self.tokens[i] spaces = self.spaces[i] - words = [vocab.strings[orth] for orth in tokens[:, orth_col]] - doc = Doc(vocab, words=words, spaces=spaces) + doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) doc = doc.from_array(self.attrs, tokens) doc.cats = self.cats[i] if self.store_user_data: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index debab6aeb..be8218967 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -3,6 +3,7 @@ cimport cython cimport numpy as np from libc.string cimport memcpy, memset from libc.math cimport sqrt +from libc.stdint cimport int32_t, uint64_t from collections import Counter import numpy @@ -12,13 +13,14 @@ import srsly from thinc.api import get_array_module from thinc.util import copy_array import warnings +import copy from .span cimport Span from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER -from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB +from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t @@ -52,6 +54,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: return token.pos elif feat_name == TAG: return token.tag + elif feat_name == MORPH: + return token.morph elif feat_name == DEP: return token.dep elif feat_name == HEAD: @@ -184,7 +188,7 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#init """ self.vocab = vocab - size = 20 + size = max(20, (len(words) if words is not None else 0)) self.mem = Pool() # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # However, we need to remember the true starting places, so that we can @@ -209,7 +213,6 @@ cdef class Doc: self.user_data = {} if user_data is None else user_data self._vector = None self.noun_chunks_iterator = _get_chunker(self.vocab.lang) - cdef unicode orth cdef bint has_space if orths_and_spaces is None and words is not None: if spaces is None: @@ -217,19 +220,22 @@ cdef class Doc: elif len(spaces) != len(words): raise ValueError(Errors.E027) orths_and_spaces = zip(words, spaces) + cdef const LexemeC* lexeme if orths_and_spaces is not None: + 
orths_and_spaces = list(orths_and_spaces) for orth_space in orths_and_spaces: if isinstance(orth_space, unicode): - orth = orth_space + lexeme = self.vocab.get(self.mem, orth_space) has_space = True elif isinstance(orth_space, bytes): raise ValueError(Errors.E028.format(value=orth_space)) + elif isinstance(orth_space[0], unicode): + lexeme = self.vocab.get(self.mem, orth_space[0]) + has_space = orth_space[1] else: - orth, has_space = orth_space - # Note that we pass self.mem here --- we have ownership, if LexemeC - # must be created. - self.push_back( - <const LexemeC*>self.vocab.get(self.mem, orth), has_space) + lexeme = self.vocab.get_by_orth(self.mem, orth_space[0]) + has_space = orth_space[1] + self.push_back(lexeme, has_space) # Tough to decide on policy for this. Is an empty doc tagged and parsed? # There's no information we'd like to add to it, so I guess so? if self.length == 0: @@ -517,7 +523,8 @@ cdef class Doc: if start == -1: seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]] raise ValueError(Errors.E093.format(seq=" ".join(seq))) - elif token.ent_iob == 2 or token.ent_iob == 0: + elif token.ent_iob == 2 or token.ent_iob == 0 or \ + (token.ent_iob == 3 and token.ent_type == 0): if start != -1: output.append(Span(self, start, i, label=label, kb_id=kb_id)) start = -1 @@ -531,6 +538,8 @@ cdef class Doc: kb_id = token.ent_kb_id if start != -1: output.append(Span(self, start, self.length, label=label, kb_id=kb_id)) + # remove empty-label spans + output = [o for o in output if o.label_ != ""] return tuple(output) def __set__(self, ents): @@ -699,8 +708,12 @@ cdef class Doc: # Handle inputs like doc.to_array(ORTH) py_attr_ids = [py_attr_ids] # Allow strings, e.g. 'lemma' or 'LEMMA' - py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) + try: + py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in py_attr_ids] + except KeyError as msg: + keys = [k for k in IDS.keys() if not k.startswith("FLAG")] + raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) # Make an array from the attributes --- otherwise our inner loop is # Python dict iteration. cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i") @@ -747,6 +760,8 @@ cdef class Doc: return dict(counts) def _realloc(self, new_size): + if new_size < self.max_length: + return self.max_length = new_size n = new_size + (PADDING * 2) # What we're storing is a "padded" array. 
We've jumped forward PADDING @@ -795,10 +810,14 @@ cdef class Doc: if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) - cdef int i, col, abs_head_index + cdef int i, col + cdef int32_t abs_head_index cdef attr_id_t attr_id cdef TokenC* tokens = self.c cdef int length = len(array) + if length != len(self): + raise ValueError("Cannot set array values longer than the document.") + # Get set up for fast loading cdef Pool mem = Pool() cdef int n_attrs = len(attrs) @@ -809,26 +828,52 @@ cdef class Doc: attr_ids[i] = attr_id if len(array.shape) == 1: array = array.reshape((array.size, 1)) + cdef np.ndarray transposed_array = numpy.ascontiguousarray(array.T) + values = <const uint64_t*>transposed_array.data + stride = transposed_array.shape[1] # Check that all heads are within the document bounds if HEAD in attrs: col = attrs.index(HEAD) for i in range(length): # cast index to signed int - abs_head_index = numpy.int32(array[i, col]) + i + abs_head_index = <int32_t>values[col * stride + i] + abs_head_index += i if abs_head_index < 0 or abs_head_index >= length: - raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=numpy.int32(array[i, col]))) + raise ValueError( + Errors.E190.format( + index=i, + value=array[i, col], + rel_head_index=abs_head_index-i + ) + ) # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA if TAG in attrs: col = attrs.index(TAG) for i in range(length): - if array[i, col] != 0: - self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) + value = values[col * stride + i] + if value != 0: + self.vocab.morphology.assign_tag(&tokens[i], value) + # Verify ENT_IOB are proper integers + if ENT_IOB in attrs: + iob_strings = Token.iob_strings() + col = attrs.index(ENT_IOB) + n_iob_strings = len(iob_strings) + for i in range(length): + value = values[col * stride + i] + if value < 0 or value >= n_iob_strings: + raise ValueError( + Errors.E982.format( + values=iob_strings, + value=value + ) + ) # Now load the data for i in range(length): token = &self.c[i] for j in range(n_attrs): if attr_ids[j] != TAG: - Token.set_struct_attr(token, attr_ids[j], array[i, j]) + value = values[j * stride + i] + Token.set_struct_attr(token, attr_ids[j], value) # Set flags self.is_parsed = bool(self.is_parsed or HEAD in attrs) self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) @@ -849,6 +894,28 @@ cdef class Doc: """ return numpy.asarray(_get_lca_matrix(self, 0, len(self))) + def copy(self): + cdef Doc other = Doc(self.vocab) + other._vector = copy.deepcopy(self._vector) + other._vector_norm = copy.deepcopy(self._vector_norm) + other.tensor = copy.deepcopy(self.tensor) + other.cats = copy.deepcopy(self.cats) + other.user_data = copy.deepcopy(self.user_data) + other.is_tagged = self.is_tagged + other.is_parsed = self.is_parsed + other.is_morphed = self.is_morphed + other.sentiment = self.sentiment + other.user_hooks = dict(self.user_hooks) + other.user_token_hooks = dict(self.user_token_hooks) + other.user_span_hooks = dict(self.user_span_hooks) + other.length = self.length + other.max_length = self.max_length + buff_size = other.max_length + (PADDING*2) + tokens = <TokenC*>other.mem.alloc(buff_size, sizeof(TokenC)) + memcpy(tokens, self.c - PADDING, buff_size * sizeof(TokenC)) + other.c = &tokens[PADDING] + return other + def to_disk(self, path, **kwargs): """Save the current state to a directory. @@ -881,6 +948,32 @@ cdef class Doc: def to_bytes(self, exclude=tuple(), **kwargs): """Serialize, i.e. 
export the document contents to a binary string. + exclude (list): String names of serialization fields to exclude. + RETURNS (bytes): A losslessly serialized copy of the `Doc`, including + all annotations. + + DOCS: https://spacy.io/api/doc#to_bytes + """ + return srsly.msgpack_dumps(self.to_dict(exclude=exclude, **kwargs)) + + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + """Deserialize, i.e. import the document contents from a binary string. + + data (bytes): The string to load from. + exclude (list): String names of serialization fields to exclude. + RETURNS (Doc): Itself. + + DOCS: https://spacy.io/api/doc#from_bytes + """ + return self.from_dict( + srsly.msgpack_loads(bytes_data), + exclude=exclude, + **kwargs + ) + + def to_dict(self, exclude=tuple(), **kwargs): + """Export the document contents to a dictionary for serialization. + exclude (list): String names of serialization fields to exclude. RETURNS (bytes): A losslessly serialized copy of the `Doc`, including all annotations. @@ -917,9 +1010,9 @@ cdef class Doc: serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys) if "user_data_values" not in exclude: serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) - return util.to_bytes(serializers, exclude) + return util.to_dict(serializers, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_dict(self, msg, exclude=tuple(), **kwargs): """Deserialize, i.e. import the document contents from a binary string. data (bytes): The string to load from. @@ -943,7 +1036,6 @@ cdef class Doc: for key in kwargs: if key in deserializers or key in ("user_data",): raise ValueError(Errors.E128.format(arg=key)) - msg = util.from_bytes(bytes_data, deserializers, exclude) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. In values we just have to hope @@ -975,6 +1067,7 @@ cdef class Doc: self.from_array(msg["array_head"][2:], attrs[:, 2:]) return self + def extend_tensor(self, tensor): """Concatenate a new tensor onto the doc.tensor object. diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 320cfaad5..f85a17d69 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -778,6 +778,10 @@ cdef class Token: """ return self.c.ent_iob + @classmethod + def iob_strings(cls): + return ("", "I", "O", "B") + @property def ent_iob_(self): """IOB code of named entity tag. "B" means the token begins an entity, @@ -787,8 +791,7 @@ cdef class Token: RETURNS (str): IOB code of named entity tag. 
""" - iob_strings = ("", "I", "O", "B") - return iob_strings[self.c.ent_iob] + return self.iob_strings()[self.c.ent_iob] property ent_id: """RETURNS (uint64): ID of the entity the token is an instance of, diff --git a/spacy/util.py b/spacy/util.py index ed7ca5b3c..bbd9a73b6 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -499,14 +499,6 @@ def get_async(stream, numpy_array): return array -def eg2doc(example): - """Get a Doc object from an Example (or if it's a Doc, use it directly)""" - # Put the import here to avoid circular import problems - from .tokens.doc import Doc - - return example if isinstance(example, Doc) else example.doc - - def env_opt(name, default=None): if type(default) is float: type_convert = float @@ -725,12 +717,13 @@ def decaying(start, stop, decay): curr -= decay -def minibatch_by_words( - examples, size, count_words=len, tolerance=0.2, discard_oversize=False -): +def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False): """Create minibatches of roughly a given number of words. If any examples are longer than the specified batch length, they will appear in a batch by - themselves, or be discarded if discard_oversize=True.""" + themselves, or be discarded if discard_oversize=True. + The argument 'docs' can be a list of strings, Doc's or Example's. """ + from .gold import Example + if isinstance(size, int): size_ = itertools.repeat(size) elif isinstance(size, List): @@ -745,22 +738,27 @@ def minibatch_by_words( batch_size = 0 overflow_size = 0 - for example in examples: - n_words = count_words(example.doc) + for doc in docs: + if isinstance(doc, Example): + n_words = len(doc.reference) + elif isinstance(doc, str): + n_words = len(doc.split()) + else: + n_words = len(doc) # if the current example exceeds the maximum batch size, it is returned separately # but only if discard_oversize=False. if n_words > target_size + tol_size: if not discard_oversize: - yield [example] + yield [doc] # add the example to the current batch if there's no overflow yet and it still fits elif overflow_size == 0 and (batch_size + n_words) <= target_size: - batch.append(example) + batch.append(doc) batch_size += n_words # add the example to the overflow buffer if it fits in the tolerance margin elif (batch_size + overflow_size + n_words) <= (target_size + tol_size): - overflow.append(example) + overflow.append(doc) overflow_size += n_words # yield the previous batch and start a new one. The new one gets the overflow examples. 
@@ -775,12 +773,12 @@ def minibatch_by_words( # this example still fits if (batch_size + n_words) <= target_size: - batch.append(example) + batch.append(doc) batch_size += n_words # this example fits in overflow elif (batch_size + n_words) <= (target_size + tol_size): - overflow.append(example) + overflow.append(doc) overflow_size += n_words # this example does not fit with the previous overflow: start another new batch @@ -788,7 +786,7 @@ def minibatch_by_words( yield batch target_size = next(size_) tol_size = target_size * tolerance - batch = [example] + batch = [doc] batch_size = n_words # yield the final batch @@ -849,16 +847,23 @@ def filter_spans(spans): def to_bytes(getters, exclude): + return srsly.msgpack_dumps(to_dict(getters, exclude)) + + +def from_bytes(bytes_data, setters, exclude): + return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude) + + +def to_dict(getters, exclude): serialized = {} for key, getter in getters.items(): # Split to support file names like meta.json if key.split(".")[0] not in exclude: serialized[key] = getter() - return srsly.msgpack_dumps(serialized) + return serialized -def from_bytes(bytes_data, setters, exclude): - msg = srsly.msgpack_loads(bytes_data) +def from_dict(msg, setters, exclude): for key, setter in setters.items(): # Split to support file names like meta.json if key.split(".")[0] not in exclude and key in msg: From 1d672e0c12daecbff34b58dcafe237c66fd4d92e Mon Sep 17 00:00:00 2001 From: Matthw Honnibal <honnibal+gh@gmail.com> Date: Fri, 26 Jun 2020 23:42:41 +0200 Subject: [PATCH 140/203] Revert "attempt to fix _guess_spaces" This reverts commit 5b6ed0575275e86762cc58dab7b01b7fb2a97b63. --- spacy/gold/example.pyx | 4 +++- spacy/tests/test_gold.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index febbf50fc..7f8797043 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -73,7 +73,7 @@ cdef class Example: tok_dict["ORTH"] = [tok.text for tok in predicted] tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] if not _has_field(tok_dict, "SPACY"): - tok_dict["SPACY"] = _guess_spaces(predicted.text, tok_dict["ORTH"]) + spaces = _guess_spaces(predicted.text, tok_dict["ORTH"]) return Example( predicted, annotations2doc(predicted.vocab, tok_dict, doc_dict) @@ -336,6 +336,8 @@ def _fix_legacy_dict_data(example_dict): else: raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys())) text = example_dict.get("text", example_dict.get("raw")) + if not _has_field(token_dict, "SPACY"): + token_dict["SPACY"] = _guess_spaces(text, token_dict["ORTH"]) if "HEAD" in token_dict and "SENT_START" in token_dict: # If heads are set, we don't also redundantly specify SENT_START. 
token_dict.pop("SENT_START") diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index a7c476688..17f0933d1 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -161,7 +161,6 @@ def test_example_from_dict_no_ner(en_vocab): ner_tags = example.get_aligned_ner() assert ner_tags == [None, None, None, None] - def test_example_from_dict_some_ner(en_vocab): words = ["a", "b", "c", "d"] spaces = [True, True, False, True] From 4ff9a837fc837adaaccf8f14e3820d858405a2ed Mon Sep 17 00:00:00 2001 From: Matthw Honnibal <honnibal+gh@gmail.com> Date: Fri, 26 Jun 2020 23:46:18 +0200 Subject: [PATCH 141/203] Fix _fix_legacy_dict_data in Example --- spacy/gold/example.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 7f8797043..169965c3d 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -336,7 +336,7 @@ def _fix_legacy_dict_data(example_dict): else: raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys())) text = example_dict.get("text", example_dict.get("raw")) - if not _has_field(token_dict, "SPACY"): + if _has_field(token_dict, "ORTH") and not _has_field(token_dict, "SPACY"): token_dict["SPACY"] = _guess_spaces(text, token_dict["ORTH"]) if "HEAD" in token_dict and "SENT_START" in token_dict: # If heads are set, we don't also redundantly specify SENT_START. From de96a3950c75150d26040f5d92bdfe1023241aeb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Fri, 26 Jun 2020 23:50:22 +0200 Subject: [PATCH 142/203] Update config --- examples/experiments/onto-joint/defaults.cfg | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index e45758196..ae760becc 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -29,7 +29,7 @@ omit_extra_lookups = false [training.batch_size] @schedules = "compounding.v1" -start = 100 +start = 1000 stop = 1000 compound = 1.001 @@ -40,15 +40,15 @@ beta2 = 0.999 L2_is_weight_decay = true L2 = 0.01 grad_clip = 1.0 -use_averages = true +use_averages = false eps = 1e-8 -learn_rate = 0.001 +#learn_rate = 0.001 -#[optimizer.learn_rate] -#@schedules = "warmup_linear.v1" -#warmup_steps = 250 -#total_steps = 20000 -#initial_rate = 0.001 +[optimizer.learn_rate] +@schedules = "warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.001 [nlp] lang = "en" From 8b305253d3746e94f2b0d0f70c94dbfd53e5d194 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 27 Jun 2020 13:02:10 +0200 Subject: [PATCH 143/203] Update with DVC WIP --- spacy/cli/__init__.py | 2 +- spacy/cli/_app.py | 1 - spacy/cli/project.py | 227 +++++++++++++++++++++++++++++++++++------- 3 files changed, 190 insertions(+), 40 deletions(-) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 14623000a..9af1265d1 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,7 +15,7 @@ from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 -from .project import project_clone, project_get_assets, project_run # noqa: F401 +from .project import project_clone, project_assets, project_run # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/_app.py 
b/spacy/cli/_app.py index 6f64dcb59..2b3ad9524 100644 --- a/spacy/cli/_app.py +++ b/spacy/cli/_app.py @@ -1,4 +1,3 @@ -from typing import Optional import typer from typer.main import get_command diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 3cced4057..12578d813 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional import typer import srsly from pathlib import Path @@ -9,14 +9,16 @@ import os import re import shutil import sys +import murmurhash -from ._app import app, Arg, Opt, COMMAND +from ._app import app, Arg, Opt, COMMAND, NAME from .. import about from ..schemas import ProjectConfigSchema, validate from ..util import ensure_path, run_command, make_tempdir, working_dir CONFIG_FILE = "project.yml" +DVC_CONFIG = "dvc.yaml" DIRS = [ "assets", "metas", @@ -34,13 +36,18 @@ CACHES = [ os.environ.get("TORCH_HOME"), Path.home() / ".keras", ] +DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit +# it directly and edit the project.yml instead and re-run the project.""" + project_cli = typer.Typer(help="Command-line interface for spaCy projects") @project_cli.callback(invoke_without_command=True) -def callback(): - # This runs before every project command and ensures DVC is installed +def callback(ctx: typer.Context): + """This runs before every project command and ensures DVC is installed and + everything is up to date. + """ try: subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) except Exception: @@ -59,15 +66,21 @@ def project_clone_cli( name: str = Arg(..., help="The name of the template to fetch"), dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False), repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), + git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information") # fmt: on ): """Clone a project template from a repository.""" - project_clone(name, dest, repo=repo, verbose=verbose) + project_clone(name, dest, repo=repo, git=git, verbose=verbose) def project_clone( - name: str, dest: Path, *, repo: str = about.__projects__, verbose: bool = False + name: str, + dest: Path, + *, + repo: str = about.__projects__, + git: bool = False, + verbose: bool = False, ) -> None: dest = ensure_path(dest) check_clone_dest(dest) @@ -86,52 +99,97 @@ def project_clone( dir_path = dest / sub_dir if not dir_path.exists(): dir_path.mkdir(parents=True) + with working_dir(dest): + # TODO: check that .dvc exists in other commands? 
+ init_cmd = ["dvc", "init"] + if not git: + init_cmd.append("--no-scm") + if git: + run_command(["git", "init"]) + run_command(init_cmd) msg.good(f"Your project is now ready!", dest.resolve()) - print(f"To get the assets, run:\npython -m spacy project get-assets {dest}") + print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}") -@project_cli.command("get-assets") -def project_get_assets_cli( - path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False) +@project_cli.command("assets") +def project_assets_cli( + # fmt: off + path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), + dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't download anything"), + # fmt: on ): """Use Data Version Control to get the assets for the project.""" - project_get_assets(path) + project_assets(path, dry=dry) -def project_get_assets(project_path: Path) -> None: +def project_assets(project_path: Path, *, dry: bool = False) -> None: + if dry: + msg.warn("Performing a dry run and not downloading anything") project_path = ensure_path(project_path) config = load_project_config(project_path) assets = config.get("assets", {}) if not assets: msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) - msg.info(f"Getting {len(assets)} asset(s)") + msg.info(f"Fetching {len(assets)} asset(s)") variables = config.get("variables", {}) for asset in assets: url = asset["url"].format(**variables) dest = asset["dest"].format(**variables) dest_path = project_path / dest check_asset(url) - cmd = ["dvc", "get-url", url, str(dest_path)] + if not dry: + cmd = ["dvc", "get-url", url, str(dest_path)] run_command(cmd) - msg.good(f"Got asset {dest}") + msg.good(f"Fetched asset {dest}") -@project_cli.command("run") +@project_cli.command( + "run-all", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def project_run_all_cli( + # fmt: off + ctx: typer.Context, + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") + # fmt: on +): + """Run all commands. 
Additional arguments are passed to dvc repro.""" + if show_help: + print_run_help(project_dir) + else: + project_run_all(project_dir, *ctx.args) + + +def project_run_all(project_dir: Path, *dvc_args) -> None: + config = load_project_config(project_dir) + with msg.loading("Updating DVC config..."): + updated = update_dvc_config(project_dir, config, silent=True) + if updated: + msg.good(f"Updated DVC config from changed {CONFIG_FILE}") + dvc_cmd = ["dvc", "repro", *dvc_args] + run_command(dvc_cmd) + + +@project_cli.command( + "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) def project_run_cli( # fmt: off + ctx: typer.Context, project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), subcommand: str = Arg(None, help="Name of command defined in project config"), show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") # fmt: on ): """Run scripts defined in the project.""" - if show_help: + if show_help or not subcommand: print_run_help(project_dir, subcommand) else: - project_run(project_dir, subcommand) + project_run(project_dir, subcommand, *ctx.args) -def print_run_help(project_dir: Path, subcommand: str) -> None: +def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: """Simulate a CLI help prompt using the info available in the project config.""" config = load_project_config(project_dir) config_commands = config.get("commands", []) @@ -149,28 +207,60 @@ def print_run_help(project_dir: Path, subcommand: str) -> None: msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) -def project_run(project_dir: Path, subcommand: str) -> None: +def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: + config = load_project_config(project_dir) + with msg.loading("Updating DVC config..."): + updated = update_dvc_config(project_dir, config, silent=True) + if updated: + msg.good(f"Updated DVC config from changed {CONFIG_FILE}") + config_commands = config.get("commands", []) + variables = config.get("variables", {}) + commands = {cmd["name"]: cmd for cmd in config_commands} + if subcommand not in commands: + msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) + if subcommand in config.get("run", []): + # This is one of the pipeline commands tracked in DVC + dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] + run_command(dvc_cmd) + else: + with working_dir(project_dir): + run_commands(commands[subcommand]["script"], variables) + + +@project_cli.command("exec") +def project_exec_cli( + # fmt: off + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + subcommand: str = Arg(..., help="Name of command defined in project config"), + # fmt: on +): + """Internals""" + project_exec(project_dir, subcommand) + + +def project_exec(project_dir: Path, subcommand: str): config = load_project_config(project_dir) config_commands = config.get("commands", []) variables = config.get("variables", {}) commands = {cmd["name"]: cmd for cmd in config_commands} - if subcommand and subcommand not in commands: - msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) with working_dir(project_dir): - if subcommand is None: - all_commands = config.get("run", []) - if not all_commands: - msg.warn("No run commands defined in project config", exits=0) - msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - for command in all_commands: - if command not in 
commands: - msg.fail( f"Can't find command '{command}' in project config", exits=1 ) - msg.divider(command) - run_commands(commands[command]["script"], variables) - else: - run_commands(commands[subcommand]["script"], variables) + run_commands(commands[subcommand]["script"], variables) + + +@project_cli.command("update-dvc") +def project_update_dvc_cli( + # fmt: off + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), + force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), + # fmt: on +): + config = load_project_config(project_dir) + updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) + if updated: + msg.good(f"Updated DVC config from {CONFIG_FILE}") + else: + msg.info(f"No changes found in {CONFIG_FILE}, no update needed") app.add_typer(project_cli, name="project") @@ -187,7 +277,63 @@ def load_project_config(path: Path) -> Dict[str, Any]: return config -def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) -> None: +def update_dvc_config( + path: Path, + config: Dict[str, Any], + verbose: bool = False, + silent: bool = False, + force: bool = False, +) -> bool: + """Re-run the DVC commands in dry mode and update dvc.yml file in the + project directory. The file is auto-generated based on the config. + """ + config_hash = get_hash(config) + dvc_config_path = path / DVC_CONFIG + if dvc_config_path.exists(): + # Check if the file was generated using the current config, if not, redo + with dvc_config_path.open("r", encoding="utf8") as f: + ref_hash = f.readline().strip().replace("# ", "") + if ref_hash == config_hash and not force: + return False # Nothing has changed in project config, don't need to update + dvc_config_path.unlink() + variables = config.get("variables", {}) + commands = [] + # We only want to include commands that are part of the main list of "run" + # commands in project.yml and should be run in sequence + config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} + for name in config.get("run", []): + if name not in config_commands: + msg.fail(f"Can't find command '{name}' in project config", exits=1) + command = config_commands[name] + deps = command.get("deps", []) + outputs = command.get("outputs", []) + outputs_no_cache = command.get("outputs_no_cache", []) + if not deps and not outputs and not outputs_no_cache: + continue + # Default to "."
as the project path since dvc.yaml is auto-generated + # and we don't want arbitrary paths in there + project_cmd = ["python", "-m", NAME, "project", "exec", ".", name] + deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] + outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] + outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] + dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"] + if verbose: + dvc_cmd.append("--verbose") + if silent: + dvc_cmd.append("--quiet") + full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] + commands.append(" ".join(full_cmd)) + run_commands(commands, variables, silent=True) + with dvc_config_path.open("r+", encoding="utf8") as f: + content = f.read() + f.seek(0, 0) + f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}") + return True + + +def run_commands( + commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False +) -> None: for command in commands: # Substitute variables, e.g. "./{NAME}.json" command = command.format(**variables) @@ -195,7 +341,8 @@ def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) # TODO: is this needed / a good idea? if len(command) and command[0] == "python": command[0] = sys.executable - print(" ".join(command)) + if not silent: + print(" ".join(command)) run_command(command) @@ -225,3 +372,7 @@ def check_clone_dest(dest: Path) -> None: f"Can't clone project, parent directory doesn't exist: {dest.parent}", exits=1, ) + + +def get_hash(data) -> str: + return str(murmurhash.hash(srsly.json_dumps(data, sort_keys=True))) From c96b4a37b69d81b298da810a56dd52664698a0a1 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 27 Jun 2020 14:15:41 +0200 Subject: [PATCH 144/203] Update DVC integration --- spacy/cli/project.py | 79 +++++++++++++++++++++++++++++++++++--------- spacy/schemas.py | 3 ++ 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 12578d813..d9c6c402e 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -10,6 +10,7 @@ import re import shutil import sys import murmurhash +import hashlib from ._app import app, Arg, Opt, COMMAND, NAME from .. import about @@ -67,11 +68,12 @@ def project_clone_cli( dest: Path = Arg(Path.cwd(), help="Where to download and work. 
Defaults to current working directory.", exists=False), repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), + no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"), verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information") # fmt: on ): """Clone a project template from a repository.""" - project_clone(name, dest, repo=repo, git=git, verbose=verbose) + project_clone(name, dest, repo=repo, git=git, no_init=no_init, verbose=verbose) def project_clone( @@ -80,6 +82,7 @@ def project_clone( *, repo: str = about.__projects__, git: bool = False, + no_init: bool = False, verbose: bool = False, ) -> None: dest = ensure_path(dest) @@ -99,6 +102,25 @@ def project_clone( dir_path = dest / sub_dir if not dir_path.exists(): dir_path.mkdir(parents=True) + if not no_init: + project_init(dest, git=git) + msg.good(f"Your project is now ready!", dest.resolve()) + print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}") + + +@project_cli.command("init") +def project_init_cli( + path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), + git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), +): + """Initialize a project directory with DVC and Git (optional). This should + typically be taken care of automatically when you run the "project clone" + command. + """ + project_init(path, git=git) + + +def project_init(dest: Path, *, git: bool = False): with working_dir(dest): # TODO: check that .dvc exists in other commands? init_cmd = ["dvc", "init"] @@ -107,26 +129,27 @@ def project_clone( if git: run_command(["git", "init"]) run_command(init_cmd) - msg.good(f"Your project is now ready!", dest.resolve()) - print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}") + # TODO: find a better solution for this? 
+ run_command(["dvc", "config", "core.analytics", "false"]) @project_cli.command("assets") def project_assets_cli( # fmt: off path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), - dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't download anything"), # fmt: on ): """Use Data Version Control to get the assets for the project.""" - project_assets(path, dry=dry) + project_assets(path) -def project_assets(project_path: Path, *, dry: bool = False) -> None: - if dry: - msg.warn("Performing a dry run and not downloading anything") +def project_assets(project_path: Path) -> None: project_path = ensure_path(project_path) config = load_project_config(project_path) + with msg.loading("Updating DVC config..."): + updated = update_dvc_config(project_path, config, silent=True) + if updated: + msg.good(f"Updated DVC config from changed {CONFIG_FILE}") assets = config.get("assets", {}) if not assets: msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) @@ -135,12 +158,30 @@ def project_assets(project_path: Path, *, dry: bool = False) -> None: for asset in assets: url = asset["url"].format(**variables) dest = asset["dest"].format(**variables) - dest_path = project_path / dest - check_asset(url) - if not dry: - cmd = ["dvc", "get-url", url, str(dest_path)] - run_command(cmd) - msg.good(f"Fetched asset {dest}") + fetch_asset(project_path, url, dest, asset.get("checksum")) + + +def fetch_asset(project_path: Path, url: str, dest: Path, checksum: str = None): + check_asset(url) + dest_path = project_path / dest + if dest_path.exists() and checksum: + # If there's already a file, check for checksum + # TODO: add support for chaches + if checksum == get_checksum(dest_path): + msg.good(f"Skipping download with matching checksum: {dest}") + return + with working_dir(project_path): + try: + dvc_cmd = ["dvc", "get-url", url, str(dest_path)] + # If this fails, we don't want to output an error or info message + out = subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL) + print(out) + except subprocess.CalledProcessError: + # TODO: Can we read out Weak ETags error? 
+ # TODO: replace curl + run_command(["curl", url, "--output", str(dest_path)]) + run_command(["dvc", "add", str(dest_path)]) + msg.good(f"Fetched asset {dest}") @project_cli.command( @@ -168,7 +209,8 @@ def project_run_all(project_dir: Path, *dvc_args) -> None: if updated: msg.good(f"Updated DVC config from changed {CONFIG_FILE}") dvc_cmd = ["dvc", "repro", *dvc_args] - run_command(dvc_cmd) + with working_dir(project_dir): + run_command(dvc_cmd) @project_cli.command( @@ -323,7 +365,8 @@ def update_dvc_config( dvc_cmd.append("--quiet") full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] commands.append(" ".join(full_cmd)) - run_commands(commands, variables, silent=True) + with working_dir(path): + run_commands(commands, variables, silent=True) with dvc_config_path.open("r+", encoding="utf8") as f: content = f.read() f.seek(0, 0) @@ -376,3 +419,7 @@ def check_clone_dest(dest: Path) -> None: def get_hash(data) -> str: return str(murmurhash.hash(srsly.json_dumps(data, sort_keys=True))) + + +def get_checksum(path: Path) -> str: + return hashlib.md5(path.read_bytes()).hexdigest() diff --git a/spacy/schemas.py b/spacy/schemas.py index 43694b325..38e08b4cb 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -220,8 +220,11 @@ class TrainingSchema(BaseModel): class ProjectConfigAsset(BaseModel): + # fmt: off dest: StrictStr = Field(..., title="Destination of downloaded asset") url: StrictStr = Field(..., title="URL of asset") + checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") + # fmt: on class ProjectConfigCommand(BaseModel): From 8979dc254f0e2646cc31ab374cb3e89ac5d67f35 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 27 Jun 2020 14:40:28 +0200 Subject: [PATCH 145/203] Update project init --- spacy/cli/project.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index d9c6c402e..f0c7ee964 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -117,20 +117,30 @@ def project_init_cli( typically be taken care of automatically when you run the "project clone" command. """ - project_init(path, git=git) + project_init(path, git=git, silent=True) -def project_init(dest: Path, *, git: bool = False): +def project_init( + dest: Path, *, git: bool = False, silent: bool = False, analytics: bool = False +): with working_dir(dest): # TODO: check that .dvc exists in other commands? init_cmd = ["dvc", "init"] + if silent: + init_cmd.append("--quiet") if not git: init_cmd.append("--no-scm") if git: run_command(["git", "init"]) run_command(init_cmd) - # TODO: find a better solution for this? - run_command(["dvc", "config", "core.analytics", "false"]) + if not analytics: + # TODO: find a better solution for this? 
+ run_command(["dvc", "config", "core.analytics", "false"]) + config = load_project_config(dest) + with msg.loading("Updating DVC config..."): + updated = update_dvc_config(dest, config, silent=True) + if updated: + msg.good(f"Updated DVC config from {CONFIG_FILE}") @project_cli.command("assets") From 165c37ccbae3111a43beb7d1270f907fd0e788ec Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 27 Jun 2020 15:03:21 +0200 Subject: [PATCH 146/203] Update project.py --- spacy/cli/project.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index f0c7ee964..d1c6e603d 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -73,7 +73,9 @@ def project_clone_cli( # fmt: on ): """Clone a project template from a repository.""" - project_clone(name, dest, repo=repo, git=git, no_init=no_init, verbose=verbose) + project_clone( + name, dest, repo=repo, git=git, no_init=no_init, verbose=verbose, silent=True + ) def project_clone( @@ -83,6 +85,7 @@ def project_clone( repo: str = about.__projects__, git: bool = False, no_init: bool = False, + silent: bool = False, verbose: bool = False, ) -> None: dest = ensure_path(dest) @@ -90,10 +93,13 @@ def project_clone( # When cloning a subdirectory with DVC, it will create a folder of that name # within the destination dir, so we use a tempdir and then copy it into the # parent directory to create the cloned directory + dest = dest.resolve() with make_tempdir() as tmp_dir: cmd = ["dvc", "get", repo, name, "-o", str(tmp_dir)] if verbose: - cmd.append("-v") + cmd.append("--verbose") + if silent: + cmd.append("--quiet") print(" ".join(cmd)) run_command(cmd) shutil.move(str(tmp_dir / Path(name).name), str(dest)) @@ -103,8 +109,8 @@ def project_clone( if not dir_path.exists(): dir_path.mkdir(parents=True) if not no_init: - project_init(dest, git=git) - msg.good(f"Your project is now ready!", dest.resolve()) + project_init(dest, git=git, silent=silent) + msg.good(f"Your project is now ready!", dest) print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}") From fe06697150ba7ce0cf28dfe37cdf9d84b58fa234 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 27 Jun 2020 20:36:08 +0200 Subject: [PATCH 147/203] Fix package command and add version option --- spacy/cli/package.py | 65 +++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 24d9a0a08..dbc485848 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -16,8 +16,9 @@ def package_cli( # fmt: off input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False), output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), - meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False), + meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), + version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"), force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"), # fmt: on ): @@ -32,6 +33,7 @@ def package_cli( input_dir, output_dir, meta_path=meta_path, + version=version, create_meta=create_meta, force=force, 
silent=False, @@ -42,6 +44,7 @@ def package( input_dir: Path, output_dir: Path, meta_path: Optional[Path] = None, + version: Optional[str] = None, create_meta: bool = False, force: bool = False, silent: bool = True, @@ -61,10 +64,13 @@ def package( if not meta_path.exists() or not meta_path.is_file(): msg.fail("Can't load model meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) + meta = get_meta(input_dir, meta) + if version is not None: + meta["version"] = version if not create_meta: # only print if user doesn't want to overwrite msg.good("Loaded meta.json from file", meta_path) else: - meta = generate_meta(input_dir, meta, msg) + meta = generate_meta(meta, msg) errors = validate(ModelMetaSchema, meta) if errors: msg.fail("Invalid model meta.json", "\n".join(errors), exits=1) @@ -101,20 +107,20 @@ def create_file(file_path: Path, contents: str) -> None: file_path.open("w", encoding="utf-8").write(contents) -def generate_meta( - model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer +def get_meta( + model_path: Union[str, Path], existing_meta: Dict[str, Any] ) -> Dict[str, Any]: - meta = existing_meta or {} - settings = [ - ("lang", "Model language", meta.get("lang", "en")), - ("name", "Model name", meta.get("name", "model")), - ("version", "Model version", meta.get("version", "0.0.0")), - ("description", "Model description", meta.get("description", False)), - ("author", "Author", meta.get("author", False)), - ("email", "Author email", meta.get("email", False)), - ("url", "Author website", meta.get("url", False)), - ("license", "License", meta.get("license", "MIT")), - ] + meta = { + "lang": "en", + "name": "model", + "version": "0.0.0", + "description": None, + "author": None, + "email": None, + "url": None, + "license": "MIT", + } + meta.update(existing_meta) nlp = util.load_model_from_path(Path(model_path)) meta["spacy_version"] = util.get_model_version_range(about.__version__) meta["pipeline"] = nlp.pipe_names @@ -124,6 +130,23 @@ def generate_meta( "keys": nlp.vocab.vectors.n_keys, "name": nlp.vocab.vectors.name, } + if about.__title__ != "spacy": + meta["parent_package"] = about.__title__ + return meta + + +def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]: + meta = existing_meta or {} + settings = [ + ("lang", "Model language", meta.get("lang", "en")), + ("name", "Model name", meta.get("name", "model")), + ("version", "Model version", meta.get("version", "0.0.0")), + ("description", "Model description", meta.get("description", None)), + ("author", "Author", meta.get("author", None)), + ("email", "Author email", meta.get("email", None)), + ("url", "Author website", meta.get("url", None)), + ("license", "License", meta.get("license", "MIT")), + ] msg.divider("Generating meta.json") msg.text( "Enter the package settings for your model. 
The following information " @@ -132,8 +155,6 @@ def generate_meta( for setting, desc, default in settings: response = get_raw_input(desc, default) meta[setting] = default if response == "" and default else response - if about.__title__ != "spacy": - meta["parent_package"] = about.__title__ return meta @@ -184,12 +205,12 @@ def setup_package(): setup( name=model_name, - description=meta['description'], - author=meta['author'], - author_email=meta['email'], - url=meta['url'], + description=meta.get('description'), + author=meta.get('author'), + author_email=meta.get('email'), + url=meta.get('url'), version=meta['version'], - license=meta['license'], + license=meta.get('license'), packages=[model_name], package_data={model_name: list_files(model_dir)}, install_requires=list_requirements(meta), From 6678bd80c22de22ee4497bc68c6c98480f481198 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 27 Jun 2020 20:57:26 +0200 Subject: [PATCH 148/203] Check if deps exist in non-DVC commands --- spacy/cli/project.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index d1c6e603d..d59537bb8 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -281,8 +281,15 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] run_command(dvc_cmd) else: + cmd = commands[subcommand] + # Deps in non-DVC commands aren't tracked, but if they're defined, + # make sure they exist before running the command + for dep in cmd.get("deps", []): + if not (project_dir / dep).exists(): + err = f"Missing dependency specified by command '{subcommand}': {dep}" + msg.fail(err, exits=1) with working_dir(project_dir): - run_commands(commands[subcommand]["script"], variables) + run_commands(cmd["script"], variables) @project_cli.command("exec") From df22d490b187a4148bc02accf6c631f53a85290b Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 27 Jun 2020 21:13:06 +0200 Subject: [PATCH 149/203] Tidy up types --- spacy/cli/project.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index d59537bb8..8a5e3ded0 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -177,7 +177,9 @@ def project_assets(project_path: Path) -> None: fetch_asset(project_path, url, dest, asset.get("checksum")) -def fetch_asset(project_path: Path, url: str, dest: Path, checksum: str = None): +def fetch_asset( + project_path: Path, url: str, dest: Path, checksum: Optional[str] = None +) -> None: check_asset(url) dest_path = project_path / dest if dest_path.exists() and checksum: From 42eb381ec6107b68e7683d5d0244358b54a15aca Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 27 Jun 2020 21:13:11 +0200 Subject: [PATCH 150/203] Improve output handling in evaluate --- spacy/cli/evaluate.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index a18e51623..fcc7fbf9b 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -2,6 +2,8 @@ from typing import Optional, List from timeit import default_timer as timer from wasabi import Printer from pathlib import Path +import re +import srsly from ..gold import Corpus from ..tokens import Doc @@ -16,13 +18,12 @@ def evaluate_cli( # fmt: off model: str = Arg(..., help="Model name or path"), data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", 
exists=True), + output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False), gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), - return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"), - - # fmt: on + # fmt: on ): """ Evaluate a model. To render a sample of parses in a HTML file, set an @@ -31,24 +32,24 @@ def evaluate_cli( evaluate( model, data_path, + output=output, gpu_id=gpu_id, gold_preproc=gold_preproc, displacy_path=displacy_path, displacy_limit=displacy_limit, silent=False, - return_scores=return_scores, ) def evaluate( model: str, data_path: Path, + output: Optional[Path], gpu_id: int = -1, gold_preproc: bool = False, displacy_path: Optional[Path] = None, displacy_limit: int = 25, silent: bool = True, - return_scores: bool = False, ) -> Scorer: msg = Printer(no_print=silent, pretty=not silent) util.fix_random_seed() @@ -56,6 +57,7 @@ def evaluate( util.use_gpu(gpu_id) util.set_env_log(False) data_path = util.ensure_path(data_path) + output_path = util.ensure_path(output) displacy_path = util.ensure_path(displacy_path) if not data_path.exists(): msg.fail("Evaluation data not found", data_path, exits=1) @@ -105,8 +107,11 @@ def evaluate( ents=render_ents, ) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) - if return_scores: - return scorer.scores + + data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} + if output_path is not None: + srsly.write_json(output_path, data) + return data def render_parses( From e33d2b1bea819472541a8027e6638a538c900994 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 27 Jun 2020 21:15:13 +0200 Subject: [PATCH 151/203] Add success message --- spacy/cli/evaluate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index fcc7fbf9b..039a596d4 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -111,6 +111,7 @@ def evaluate( data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} if output_path is not None: srsly.write_json(output_path, data) + msg.good(f"Saved results to {output_path}") return data From d8c70b415e83ba0753d94551a785fe5f1bc695fa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sat, 27 Jun 2020 21:15:25 +0200 Subject: [PATCH 152/203] Fix Example usage in evaluate --- spacy/cli/evaluate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index fcc7fbf9b..2ff9e3066 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -72,7 +72,7 @@ def evaluate( begin = timer() scorer = nlp.evaluate(dev_dataset, verbose=False) end = timer() - nwords = sum(len(ex.doc) for ex in dev_dataset) + nwords = sum(len(ex.predicted) for ex in dev_dataset) results = { "Time": f"{end - begin:.2f} s", "Words": nwords, @@ -95,7 +95,7 @@ def evaluate( msg.table(results, title="Results") if displacy_path: - docs = [ex.doc for ex in dev_dataset] + docs = [ex.predicted for ex in dev_dataset] render_deps = "parser" in nlp.meta.get("pipeline", []) render_ents = "ner" in nlp.meta.get("pipeline", []) 
render_parses( From cd0dd782761abc83849fbb6ba3fdad5253b5b9c8 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sat, 27 Jun 2020 21:16:57 +0200 Subject: [PATCH 153/203] Simplify model loading (now supported via load_model) --- spacy/cli/evaluate.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 039a596d4..49d02ca9a 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -64,10 +64,7 @@ def evaluate( if displacy_path and not displacy_path.exists(): msg.fail("Visualization output directory not found", displacy_path, exits=1) corpus = Corpus(data_path, data_path) - if model.startswith("blank:"): - nlp = util.get_lang_class(model.replace("blank:", ""))() - else: - nlp = util.load_model(model) + nlp = util.load_model(model) dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc)) begin = timer() scorer = nlp.evaluate(dev_dataset, verbose=False) From ed46951842e2428f74e1ec9a5b2a0b09281ad156 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 28 Jun 2020 12:24:59 +0200 Subject: [PATCH 154/203] Update --- spacy/cli/project.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 8a5e3ded0..f1d98feeb 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -130,7 +130,6 @@ def project_init( dest: Path, *, git: bool = False, silent: bool = False, analytics: bool = False ): with working_dir(dest): - # TODO: check that .dvc exists in other commands? init_cmd = ["dvc", "init"] if silent: init_cmd.append("--quiet") @@ -143,10 +142,7 @@ def project_init( # TODO: find a better solution for this? run_command(["dvc", "config", "core.analytics", "false"]) config = load_project_config(dest) - with msg.loading("Updating DVC config..."): - updated = update_dvc_config(dest, config, silent=True) - if updated: - msg.good(f"Updated DVC config from {CONFIG_FILE}") + setup_check_dvc(dest, config) @project_cli.command("assets") @@ -162,10 +158,7 @@ def project_assets_cli( def project_assets(project_path: Path) -> None: project_path = ensure_path(project_path) config = load_project_config(project_path) - with msg.loading("Updating DVC config..."): - updated = update_dvc_config(project_path, config, silent=True) - if updated: - msg.good(f"Updated DVC config from changed {CONFIG_FILE}") + setup_check_dvc(project_path, config) assets = config.get("assets", {}) if not assets: msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) @@ -222,10 +215,7 @@ def project_run_all_cli( def project_run_all(project_dir: Path, *dvc_args) -> None: config = load_project_config(project_dir) - with msg.loading("Updating DVC config..."): - updated = update_dvc_config(project_dir, config, silent=True) - if updated: - msg.good(f"Updated DVC config from changed {CONFIG_FILE}") + setup_check_dvc(project_dir, config) dvc_cmd = ["dvc", "repro", *dvc_args] with working_dir(project_dir): run_command(dvc_cmd) @@ -269,10 +259,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: config = load_project_config(project_dir) - with msg.loading("Updating DVC config..."): - updated = update_dvc_config(project_dir, config, silent=True) - if updated: - msg.good(f"Updated DVC config from changed {CONFIG_FILE}") + setup_check_dvc(project_dir, config) config_commands = config.get("commands", []) variables = 
config.get("variables", {}) commands = {cmd["name"]: cmd for cmd in config_commands} @@ -399,6 +386,15 @@ def update_dvc_config( return True +def setup_check_dvc(project_path: Path, config: Dict[str, Any]) -> None: + if not (project_path / ".dvc").exists(): + msg.fail("Project not initialized as a DVC project", exits=1) + with msg.loading("Updating DVC config..."): + updated = update_dvc_config(project_path, config, silent=True) + if updated: + msg.good(f"Updated DVC config from changed {CONFIG_FILE}") + + def run_commands( commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False ) -> None: @@ -409,6 +405,8 @@ def run_commands( # TODO: is this needed / a good idea? if len(command) and command[0] == "python": command[0] = sys.executable + elif len(command) and command[0] == "pip": + command = [sys.executable, "-m", "pip", *command[1:]] if not silent: print(" ".join(command)) run_command(command) From d6aa4cb478e9ba9c85f0c20de30f4b991a2b5432 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 28 Jun 2020 12:40:11 +0200 Subject: [PATCH 155/203] Update asset logic --- spacy/cli/project.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index f1d98feeb..4a11c17f3 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -174,10 +174,10 @@ def fetch_asset( project_path: Path, url: str, dest: Path, checksum: Optional[str] = None ) -> None: check_asset(url) - dest_path = project_path / dest + dest_path = (project_path / dest).resolve() if dest_path.exists() and checksum: # If there's already a file, check for checksum - # TODO: add support for chaches + # TODO: add support for caches if checksum == get_checksum(dest_path): msg.good(f"Skipping download with matching checksum: {dest}") return @@ -188,9 +188,8 @@ def fetch_asset( out = subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL) print(out) except subprocess.CalledProcessError: - # TODO: Can we read out Weak ETags error? # TODO: replace curl - run_command(["curl", url, "--output", str(dest_path)]) + run_command(["curl", url, "--output", str(dest_path), "--progress-bar"]) run_command(["dvc", "add", str(dest_path)]) msg.good(f"Fetched asset {dest}") From f3853442864bc691aa12d99b5b0b32819e3cc3f5 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 28 Jun 2020 13:07:31 +0200 Subject: [PATCH 156/203] Update asset logic and add import-url --- spacy/cli/project.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 4a11c17f3..f8338d9b1 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -177,20 +177,27 @@ def fetch_asset( dest_path = (project_path / dest).resolve() if dest_path.exists() and checksum: # If there's already a file, check for checksum - # TODO: add support for caches + # TODO: add support for caches (dvc import-url with local path) if checksum == get_checksum(dest_path): msg.good(f"Skipping download with matching checksum: {dest}") return with working_dir(project_path): try: + # If these fail, we don't want to output an error or info message. + # Try with tracking the source first, then just downloading with + # DVC, then a regular non-DVC download. 
+ dvc_cmd = ["dvc", "import-url", url, str(dest_path)] + print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) + except subprocess.CalledProcessError: dvc_cmd = ["dvc", "get-url", url, str(dest_path)] - # If this fails, we don't want to output an error or info message - out = subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL) - print(out) + print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) + run_command(["dvc", "add", str(dest_path)]) except subprocess.CalledProcessError: # TODO: replace curl run_command(["curl", url, "--output", str(dest_path), "--progress-bar"]) - run_command(["dvc", "add", str(dest_path)]) + run_command(["dvc", "add", str(dest_path)]) + if checksum and checksum != get_checksum(dest_path): + msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") msg.good(f"Fetched asset {dest}") From 1b331237aa419d9e55c41728121726dd389e5d76 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 28 Jun 2020 13:17:19 +0200 Subject: [PATCH 157/203] Update hashing and config update --- spacy/cli/project.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index f8338d9b1..d1e549e96 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -9,7 +9,6 @@ import os import re import shutil import sys -import murmurhash import hashlib from ._app import app, Arg, Opt, COMMAND, NAME @@ -348,6 +347,7 @@ def update_dvc_config( project directory. The file is auto-generated based on the config. """ config_hash = get_hash(config) + path = path.resolve() dvc_config_path = path / DVC_CONFIG if dvc_config_path.exists(): # Cneck if the file was generated using the current config, if not, redo @@ -447,7 +447,8 @@ def check_clone_dest(dest: Path) -> None: def get_hash(data) -> str: - return str(murmurhash.hash(srsly.json_dumps(data, sort_keys=True))) + data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") + return hashlib.md5(data_str).hexdigest() def get_checksum(path: Path) -> str: From e08257d4012b6df624c88b45ba28e8f9829ce2ad Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 28 Jun 2020 14:07:32 +0200 Subject: [PATCH 158/203] Add example of how to do sparse-checkout --- spacy/cli/_git_sparse_checkout_example.py | 62 +++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 spacy/cli/_git_sparse_checkout_example.py diff --git a/spacy/cli/_git_sparse_checkout_example.py b/spacy/cli/_git_sparse_checkout_example.py new file mode 100644 index 000000000..2bb3d6734 --- /dev/null +++ b/spacy/cli/_git_sparse_checkout_example.py @@ -0,0 +1,62 @@ +import tempfile +import typer +from pathlib import Path +import subprocess +import shlex +import shutil +from contextlib import contextmanager + + +@contextmanager +def make_tempdir(): + d = Path(tempfile.mkdtemp()) + yield d + shutil.rmtree(str(d)) + + + +def clone_repo(repo, temp_dir): + subprocess.check_call([ + "git", + "clone", + repo, + temp_dir, + "--no-checkout", + "--depth", "1", + "--config", "core.sparseCheckout=true" + ]) + + +def checkout_and_fetch(temp_dir): + subprocess.check_call([ + "git", + "-C", temp_dir, + "fetch" + ]) + subprocess.check_call([ + "git", + "-C", temp_dir, + "checkout" + ]) + + +def set_sparse_checkout_dir(temp_dir, subpath): + with (temp_dir / ".git" / "info" / "sparse-checkout").open("w") as file_: + file_.write(subpath) + + +def main(repo: str, subpath: str, dest: Path): + with make_tempdir() as temp_dir: + clone_repo(repo, temp_dir) 
+ print("After clone", list(temp_dir.iterdir())) + set_sparse_checkout_dir(temp_dir, subpath) + checkout_and_fetch(temp_dir) + print("After checkout", list(temp_dir.iterdir())) + assert (temp_dir / subpath) in list(temp_dir.iterdir()) + shutil.copytree(temp_dir / subpath, dest / subpath, dirs_exist_ok=True) + print("Exists after cleanup?", temp_dir.exists()) + print("Destination", list(dest.iterdir())) + + +if __name__ == "__main__": + typer.run(main) From 2f6ee0d018998c1c21562c959c89d5427d78fe6b Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 28 Jun 2020 15:08:35 +0200 Subject: [PATCH 159/203] Tidy up, document and add custom clone logic --- spacy/cli/_git_sparse_checkout_example.py | 62 --- spacy/cli/project.py | 487 +++++++++++++++------- spacy/util.py | 25 ++ 3 files changed, 352 insertions(+), 222 deletions(-) delete mode 100644 spacy/cli/_git_sparse_checkout_example.py diff --git a/spacy/cli/_git_sparse_checkout_example.py b/spacy/cli/_git_sparse_checkout_example.py deleted file mode 100644 index 2bb3d6734..000000000 --- a/spacy/cli/_git_sparse_checkout_example.py +++ /dev/null @@ -1,62 +0,0 @@ -import tempfile -import typer -from pathlib import Path -import subprocess -import shlex -import shutil -from contextlib import contextmanager - - -@contextmanager -def make_tempdir(): - d = Path(tempfile.mkdtemp()) - yield d - shutil.rmtree(str(d)) - - - -def clone_repo(repo, temp_dir): - subprocess.check_call([ - "git", - "clone", - repo, - temp_dir, - "--no-checkout", - "--depth", "1", - "--config", "core.sparseCheckout=true" - ]) - - -def checkout_and_fetch(temp_dir): - subprocess.check_call([ - "git", - "-C", temp_dir, - "fetch" - ]) - subprocess.check_call([ - "git", - "-C", temp_dir, - "checkout" - ]) - - -def set_sparse_checkout_dir(temp_dir, subpath): - with (temp_dir / ".git" / "info" / "sparse-checkout").open("w") as file_: - file_.write(subpath) - - -def main(repo: str, subpath: str, dest: Path): - with make_tempdir() as temp_dir: - clone_repo(repo, temp_dir) - print("After clone", list(temp_dir.iterdir())) - set_sparse_checkout_dir(temp_dir, subpath) - checkout_and_fetch(temp_dir) - print("After checkout", list(temp_dir.iterdir())) - assert (temp_dir / subpath) in list(temp_dir.iterdir()) - shutil.copytree(temp_dir / subpath, dest / subpath, dirs_exist_ok=True) - print("Exists after cleanup?", temp_dir.exists()) - print("Destination", list(dest.iterdir())) - - -if __name__ == "__main__": - typer.run(main) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index d1e549e96..70bba0e51 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -9,12 +9,12 @@ import os import re import shutil import sys -import hashlib from ._app import app, Arg, Opt, COMMAND, NAME from .. import about from ..schemas import ProjectConfigSchema, validate from ..util import ensure_path, run_command, make_tempdir, working_dir +from ..util import get_hash, get_checksum CONFIG_FILE = "project.yml" @@ -45,19 +45,13 @@ project_cli = typer.Typer(help="Command-line interface for spaCy projects") @project_cli.callback(invoke_without_command=True) def callback(ctx: typer.Context): - """This runs before every project command and ensures DVC is installed and - everything is up to date. 
- """ - try: - subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) - except Exception: - msg.fail( - "spaCy projects require DVC (Data Version Control) and the 'dvc' command", - "You can install the Python package from pip (pip install dvc) or " - "conda (conda install -c conda-forge dvc). For more details, see the " - "documentation: https://dvc.org/doc/install", - exits=1, - ) + """This runs before every project command and ensures DVC is installed.""" + ensure_dvc() + + +################ +# CLI COMMANDS # +################ @project_cli.command("clone") @@ -68,13 +62,144 @@ def project_clone_cli( repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"), - verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information") # fmt: on ): - """Clone a project template from a repository.""" - project_clone( - name, dest, repo=repo, git=git, no_init=no_init, verbose=verbose, silent=True - ) + """Clone a project template from a repository. Calls into "git" and will + only download the files from the given subdirectory. The GitHub repo + defaults to the official spaCy template repo, but can be customized + (including using a private repo). Setting the --git flag will also + initialize the project directory as a Git repo. If the project is intended + to be a Git repo, it should be initialized with Git first, before + initializing DVC (Data Version Control). This allows DVC to integrate with + Git. + """ + project_clone(name, dest, repo=repo, git=git, no_init=no_init) + + +@project_cli.command("init") +def project_init_cli( + path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), + git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), +): + """Initialize a project directory with DVC and optionally Git. This should + typically be taken care of automatically when you run the "project clone" + command, but you can also run it separately. If the project is intended to + be a Git repo, it should be initialized with Git first, before initializing + DVC. This allows DVC to integrate with Git. + """ + project_init(path, git=git, silent=True) + + +@project_cli.command("assets") +def project_assets_cli( + # fmt: off + project_dir: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), + # fmt: on +): + """Use DVC (Data Version Control) to fetch the assets for the project, + defined in the "assets" section of the project config. If possible, DVC + will try to track the files so you can pull changes from upstream. It will + also try and store the checksum so the assets are versioned. If th file + can't be tracked or checked, it will be downloaded using curl. If a checksum + is provided in the project config, the file is only downloaded if no local + file with the same checksum exists. + """ + project_assets(project_dir) + + +@project_cli.command( + "run-all", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def project_run_all_cli( + # fmt: off + ctx: typer.Context, + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") + # fmt: on +): + """Run all commands defined in the project. 
This command will use DVC and + the defined outputs and dependencies in the project config to determine + which steps need to be re-run and where to start. This means you're only + re-generating data if the inputs have changed. + + This command calls into "dvc repro" and all additional arguments are passed + to the "dvc repro" command: https://dvc.org/doc/command-reference/repro + """ + if show_help: + print_run_help(project_dir) + else: + project_run_all(project_dir, *ctx.args) + + +@project_cli.command( + "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def project_run_cli( + # fmt: off + ctx: typer.Context, + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + subcommand: str = Arg(None, help="Name of command defined in project config"), + show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") + # fmt: on +): + """Run a named script defined in the project config. If the command is + part of the default pipeline defined in the "run" section, DVC is used to + determine whether the step should re-run if its inputs have changed, or + whether everything is up to date. If the script is not part of the default + pipeline, it will be called separately without DVC. + + If DVC is used, the command calls into "dvc repro" and all additional + arguments are passed to the "dvc repro" command: + https://dvc.org/doc/command-reference/repro + """ + if show_help or not subcommand: + print_run_help(project_dir, subcommand) + else: + project_run(project_dir, subcommand, *ctx.args) + + +@project_cli.command("exec", hidden=True) +def project_exec_cli( + # fmt: off + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + subcommand: str = Arg(..., help="Name of command defined in project config"), + # fmt: on +): + """Execute a command defined in the project config. This CLI command is + only called internally in auto-generated DVC pipelines, as a shortcut for + multi-step commands in the project config. You typically shouldn't have to + call it yourself. To run a command, call "run" or "run-all". + """ + project_exec(project_dir, subcommand) + + +@project_cli.command("update-dvc") +def project_update_dvc_cli( + # fmt: off + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), + force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), + # fmt: on +): + """Update the auto-generated DVC config file. Uses the steps defined in the + "run" section of the project config. This typically happens automatically + when running a command, but can also be triggered manually if needed. + """ + config = load_project_config(project_dir) + updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) + if updated: + msg.good(f"Updated DVC config from {CONFIG_FILE}") + else: + msg.info(f"No changes found in {CONFIG_FILE}, no update needed") + + +app.add_typer(project_cli, name="project") + + +################# +# CLI FUNCTIONS # +################# def project_clone( @@ -84,51 +209,55 @@ def project_clone( repo: str = about.__projects__, git: bool = False, no_init: bool = False, - silent: bool = False, - verbose: bool = False, ) -> None: + """Clone a project template from a repository. + + name (str): Name of subdirectory to clone. + dest (Path): Destination path of cloned project. 
+ repo (str): URL of Git repo containing project templates. + git (bool): Initialize project as Git repo. Should be set to True if project + is intended as a repo, since it will allow DVC to integrate with Git. + no_init (bool): Don't initialize DVC and Git automatically. If True, the + "init" command or "git init" and "dvc init" need to be run manually. + """ dest = ensure_path(dest) - check_clone_dest(dest) - # When cloning a subdirectory with DVC, it will create a folder of that name - # within the destination dir, so we use a tempdir and then copy it into the - # parent directory to create the cloned directory - dest = dest.resolve() + check_clone(name, dest, repo) + project_dir = dest.resolve() + # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: - cmd = ["dvc", "get", repo, name, "-o", str(tmp_dir)] - if verbose: - cmd.append("--verbose") - if silent: - cmd.append("--quiet") - print(" ".join(cmd)) - run_command(cmd) - shutil.move(str(tmp_dir / Path(name).name), str(dest)) + cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" + run_command(shlex.split(cmd)) + with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: + f.write(name) + run_command(["git", "-C", tmp_dir, "fetch"]) + run_command(["git", "-C", tmp_dir, "checkout"]) + shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) msg.good(f"Cloned project '{name}' from {repo}") for sub_dir in DIRS: - dir_path = dest / sub_dir + dir_path = project_dir / sub_dir if not dir_path.exists(): dir_path.mkdir(parents=True) if not no_init: - project_init(dest, git=git, silent=silent) + project_init(project_dir, git=git, silent=True) msg.good(f"Your project is now ready!", dest) - print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}") - - -@project_cli.command("init") -def project_init_cli( - path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), - git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), -): - """Initialize a project directory with DVC and Git (optional). This should - typically be taken care of automatically when you run the "project clone" - command. - """ - project_init(path, git=git, silent=True) + print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") def project_init( - dest: Path, *, git: bool = False, silent: bool = False, analytics: bool = False + project_dir: Path, + *, + git: bool = False, + silent: bool = False, + analytics: bool = False, ): - with working_dir(dest): + """Initialize a project as a DVC and (optionally) as a Git repo. + + project_dir (Path): Path to project directory. + git (bool): Also call "git init" to initialize directory as a Git repo. + silent (bool): Don't print any output (via DVC). + analytics (bool): Opt-in to DVC analytics (defaults to False). + """ + with working_dir(project_dir): init_cmd = ["dvc", "init"] if silent: init_cmd.append("--quiet") @@ -137,25 +266,20 @@ def project_init( if git: run_command(["git", "init"]) run_command(init_cmd) + # We don't want to have analytics on by default – our users should + # opt-in explicitly. If they want it, they can always enable it. if not analytics: - # TODO: find a better solution for this? 
run_command(["dvc", "config", "core.analytics", "false"]) - config = load_project_config(dest) - setup_check_dvc(dest, config) + config = load_project_config(project_dir) + setup_check_dvc(project_dir, config) -@project_cli.command("assets") -def project_assets_cli( - # fmt: off - path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), - # fmt: on -): - """Use Data Version Control to get the assets for the project.""" - project_assets(path) +def project_assets(project_dir: Path) -> None: + """Fetch assets for a project using DVC if possible. - -def project_assets(project_path: Path) -> None: - project_path = ensure_path(project_path) + project_dir (Path): Path to project directory. + """ + project_path = ensure_path(project_dir) config = load_project_config(project_path) setup_check_dvc(project_path, config) assets = config.get("assets", {}) @@ -172,7 +296,17 @@ def project_assets(project_path: Path) -> None: def fetch_asset( project_path: Path, url: str, dest: Path, checksum: Optional[str] = None ) -> None: - check_asset(url) + """Fetch an asset from a given URL or path. Will try to import the file + using DVC's import-url if possible (fully tracked and versioned) and falls + back to get-url (versioned) and a non-DVC download if necessary. If a + checksum is provided and a local file exists, it's only re-downloaded if the + checksum doesn't match. + + project_path (Path): Path to project directory. + url (str): URL or path to asset. + checksum (Optional[str]): Optional expected checksum of local file. + """ + url = convert_asset_url(url) dest_path = (project_path / dest).resolve() if dest_path.exists() and checksum: # If there's already a file, check for checksum @@ -185,12 +319,13 @@ def fetch_asset( # If these fail, we don't want to output an error or info message. # Try with tracking the source first, then just downloading with # DVC, then a regular non-DVC download. - dvc_cmd = ["dvc", "import-url", url, str(dest_path)] - print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) - except subprocess.CalledProcessError: - dvc_cmd = ["dvc", "get-url", url, str(dest_path)] - print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) - run_command(["dvc", "add", str(dest_path)]) + try: + dvc_cmd = ["dvc", "import-url", url, str(dest_path)] + print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) + except subprocess.CalledProcessError: + dvc_cmd = ["dvc", "get-url", url, str(dest_path)] + print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) + run_command(["dvc", "add", str(dest_path)]) except subprocess.CalledProcessError: # TODO: replace curl run_command(["curl", url, "--output", str(dest_path), "--progress-bar"]) @@ -200,25 +335,12 @@ def fetch_asset( msg.good(f"Fetched asset {dest}") -@project_cli.command( - "run-all", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def project_run_all_cli( - # fmt: off - ctx: typer.Context, - project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), - show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") - # fmt: on -): - """Run all commands. Additional arguments are passed to dvc repro.""" - if show_help: - print_run_help(project_dir) - else: - project_run_all(project_dir, *ctx.args) - - def project_run_all(project_dir: Path, *dvc_args) -> None: + """Run all commands defined in the project using DVC. + + project_dir (Path): Path to project directory. 
+ *dvc_args: Other arguments passed to "dvc repro". + """ config = load_project_config(project_dir) setup_check_dvc(project_dir, config) dvc_cmd = ["dvc", "repro", *dvc_args] @@ -226,27 +348,16 @@ def project_run_all(project_dir: Path, *dvc_args) -> None: run_command(dvc_cmd) -@project_cli.command( - "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def project_run_cli( - # fmt: off - ctx: typer.Context, - project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), - subcommand: str = Arg(None, help="Name of command defined in project config"), - show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") - # fmt: on -): - """Run scripts defined in the project.""" - if show_help or not subcommand: - print_run_help(project_dir, subcommand) - else: - project_run(project_dir, subcommand, *ctx.args) - - def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: - """Simulate a CLI help prompt using the info available in the project config.""" + """Simulate a CLI help prompt using the info available in the project config. + + project_dir (Path): The project directory. + subcommand (Optional[str]): The subcommand or None. If a subcommand is + provided, the subcommand help is shown. Otherwise, the top-level help + and a list of available commands is printed. + """ config = load_project_config(project_dir) + setup_check_dvc(project_dir, config) config_commands = config.get("commands", []) commands = {cmd["name"]: cmd for cmd in config_commands} if subcommand: @@ -260,9 +371,20 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: print(f"\nAvailable commands in {CONFIG_FILE}") print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]") msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) + msg.text("Run all commands defined in the 'run' block of the project config:") + print(f"{COMMAND} project run-all {project_dir}") def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: + """Run a named script defined in the project config. If the script is part + of the default pipeline (defined in the "run" section), DVC is used to + execute the command, so it can determine whether to rerun it. It then + calls into "exec" to execute it. + + project_dir (Path): Path to project directory. + subcommand (str): Name of command to run. + *dvc_args: Other arguments passed to "dvc repro". + """ config = load_project_config(project_dir) setup_check_dvc(project_dir, config) config_commands = config.get("commands", []) @@ -286,18 +408,12 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: run_commands(cmd["script"], variables) -@project_cli.command("exec") -def project_exec_cli( - # fmt: off - project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), - subcommand: str = Arg(..., help="Name of command defined in project config"), - # fmt: on -): - """Internals""" - project_exec(project_dir, subcommand) - - def project_exec(project_dir: Path, subcommand: str): + """Execute a command defined in the project config. + + project_dir (Path): Path to project directory. + subcommand (str): Name of command to run. 
+ """ config = load_project_config(project_dir) config_commands = config.get("commands", []) variables = config.get("variables", {}) @@ -306,26 +422,17 @@ def project_exec(project_dir: Path, subcommand: str): run_commands(commands[subcommand]["script"], variables) -@project_cli.command("update-dvc") -def project_update_dvc_cli( - # fmt: off - project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), - verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), - force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), - # fmt: on -): - config = load_project_config(project_dir) - updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) - if updated: - msg.good(f"Updated DVC config from {CONFIG_FILE}") - else: - msg.info(f"No changes found in {CONFIG_FILE}, no update needed") - - -app.add_typer(project_cli, name="project") +########### +# HELPERS # +########### def load_project_config(path: Path) -> Dict[str, Any]: + """Load the project config file from a directory and validate it. + + path (Path): The path to the project directory. + RETURNS (Dict[str, Any]): The loaded project config. + """ config_path = path / CONFIG_FILE if not config_path.exists(): msg.fail("Can't find project config", config_path, exits=1) @@ -343,8 +450,17 @@ def update_dvc_config( silent: bool = False, force: bool = False, ) -> bool: - """Re-run the DVC commands in dry mode and update dvc.yml file in the - project directory. The file is auto-generated based on the config. + """Re-run the DVC commands in dry mode and update dvc.yaml file in the + project directory. The file is auto-generated based on the config. The + first line of the auto-generated file specifies the hash of the config + dict, so if any of the config values change, the DVC config is regenerated. + + path (Path): The path to the project directory. + config (Dict[str, Any]): The loaded project config. + verbose (bool): Whether to print additional info (via DVC). + silent (bool): Don't output anything (via DVC). + force (bool): Force update, even if hashes match. + RETURNS (bool): Whether the DVC config file was updated. """ config_hash = get_hash(config) path = path.resolve() @@ -392,11 +508,40 @@ def update_dvc_config( return True -def setup_check_dvc(project_path: Path, config: Dict[str, Any]) -> None: - if not (project_path / ".dvc").exists(): - msg.fail("Project not initialized as a DVC project", exits=1) +def ensure_dvc() -> None: + """Ensure that the "dvc" command is available and show an error if not.""" + try: + subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) + except Exception: + msg.fail( + "spaCy projects require DVC (Data Version Control) and the 'dvc' command", + "You can install the Python package from pip (pip install dvc) or " + "conda (conda install -c conda-forge dvc). For more details, see the " + "documentation: https://dvc.org/doc/install", + exits=1, + ) + + +def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None: + """Check that the project is set up correctly with DVC and update its + config if needed. Will raise an error if the project is not an initialized + DVC project. + + project_dir (Path): The path to the project directory. + config (Dict[str, Any]): The loaded project config. 
+ """ + if not project_dir.exists(): + msg.fail(f"Can't find project directory: {project_dir}") + if not (project_dir / ".dvc").exists(): + msg.fail( + "Project not initialized as a DVC project.", + f"Make sure that the project template was cloned correctly. To " + f"initialize the project directory manually, you can run: " + f"{COMMAND} project init {project_dir}", + exits=1, + ) with msg.loading("Updating DVC config..."): - updated = update_dvc_config(project_path, config, silent=True) + updated = update_dvc_config(project_dir, config, silent=True) if updated: msg.good(f"Updated DVC config from changed {CONFIG_FILE}") @@ -404,6 +549,14 @@ def setup_check_dvc(project_path: Path, config: Dict[str, Any]) -> None: def run_commands( commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False ) -> None: + """Run a sequence of commands in a subprocess, in order. + + commands (List[str]): The split commands. + variables (Dict[str, str]): Dictionary of variable names, mapped to their + values. Will be used to substitute format string variables in the + commands. + silent (boll): Don't print the commands. + """ for command in commands: # Substitute variables, e.g. "./{NAME}.json" command = command.format(**variables) @@ -418,21 +571,44 @@ def run_commands( run_command(command) -def check_asset(url: str) -> None: +def convert_asset_url(url: str) -> str: + """Check and convert the asset URL if needed. + + url (str): The asset URL. + RETURNS (str): The converted URL. + """ # If the asset URL is a regular GitHub URL it's likely a mistake - # TODO: support loading from GitHub URLs? Automatically convert to raw? if re.match("(http(s?)):\/\/github.com", url): + converted = url.replace("github.com", "raw.githubusercontent.com") + converted = re.sub(r"/(tree|blob)/", "/", converted) msg.warn( "Downloading from a regular GitHub URL. This will only download " - "the source of the page, not the actual file. If you want to " - "download the raw file, click on 'Download' on the GitHub page " - "and copy the raw.githubusercontent.com URL instead." + "the source of the page, not the actual file. Converting the URL " + "to a raw URL.", + converted, ) - # url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/").replace("/tree/", "/") + return converted + return url -def check_clone_dest(dest: Path) -> None: - """Check and validate that the destination path can be used to clone.""" +def check_clone(name: str, dest: Path, repo: str) -> None: + """Check and validate that the destination path can be used to clone. Will + check that Git is available and that the destination path is suitable. + + name (str): Name of the directory to clone from the repo. + dest (Path): Local destination of cloned directory. + repo (str): URL of the repo to clone from. + """ + try: + subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL) + except Exception: + msg.fail( + f"Cloning spaCy project templates requires Git and the 'git' command. 
", + f"To clone a project without Git, copy the files from the '{name}' " + f"directory in the {repo} to {dest} manually and then run:", + f"{COMMAND} project init {dest}", + exits=1, + ) if not dest: msg.fail(f"Not a valid directory to clone project: {dest}", exits=1) if dest.exists(): @@ -444,12 +620,3 @@ def check_clone_dest(dest: Path) -> None: f"Can't clone project, parent directory doesn't exist: {dest.parent}", exits=1, ) - - -def get_hash(data) -> str: - data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") - return hashlib.md5(data_str).hexdigest() - - -def get_checksum(path: Path) -> str: - return hashlib.md5(path.read_bytes()).hexdigest() diff --git a/spacy/util.py b/spacy/util.py index 4300a07ff..3f0a1ec6f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -21,6 +21,7 @@ import subprocess from contextlib import contextmanager import tempfile import shutil +import hashlib try: @@ -459,11 +460,35 @@ def working_dir(path: Union[str, Path]) -> None: @contextmanager def make_tempdir(): + """Execute a block in a temporary directory and remove the directory and + its contents at the end of the with block. + + YIELDS (Path): The path of the temp directory. + """ d = Path(tempfile.mkdtemp()) yield d shutil.rmtree(str(d)) +def get_hash(data) -> str: + """Get the hash for a JSON-serializable object. + + data: The data to hash. + RETURNS (str): The hash. + """ + data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") + return hashlib.md5(data_str).hexdigest() + + +def get_checksum(path: Union[Path, str]) -> str: + """Get the checksum for a file given its file path. + + path (Union[Path, str]): The file path. + RETURNS (str): The checksum. + """ + return hashlib.md5(Path(path).read_bytes()).hexdigest() + + def is_in_jupyter(): """Check if user is running spaCy from a Jupyter notebook by detecting the IPython kernel. Mainly used for the displaCy visualizer. 
From 90b7fa8feda2b97d93476731c263c39f6bd74061 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 28 Jun 2020 15:33:53 +0200 Subject: [PATCH 160/203] Run DVC command in project dir --- spacy/cli/project.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 70bba0e51..f5f41cc3a 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -395,7 +395,8 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: if subcommand in config.get("run", []): # This is one of the pipeline commands tracked in DVC dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] - run_command(dvc_cmd) + with working_dir(project_dir): + run_command(dvc_cmd) else: cmd = commands[subcommand] # Deps in non-DVC commands aren't tracked, but if they're defined, From dbfa292ed3cce441462f94787acac9e13fcb572d Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 28 Jun 2020 15:34:28 +0200 Subject: [PATCH 161/203] Output more stats in evaluate --- spacy/cli/evaluate.py | 52 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 67123ecf1..a9ddfe9be 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,4 +1,4 @@ -from typing import Optional, List +from typing import Optional, List, Dict from timeit import default_timer as timer from wasabi import Printer from pathlib import Path @@ -89,8 +89,20 @@ def evaluate( "Sent R": f"{scorer.sent_r:.2f}", "Sent F": f"{scorer.sent_f:.2f}", } + data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} + msg.table(results, title="Results") + if scorer.ents_per_type: + data["ents_per_type"] = scorer.ents_per_type + print_ents_per_type(msg, scorer.ents_per_type) + if scorer.textcats_f_per_cat: + data["textcats_f_per_cat"] = scorer.textcats_f_per_cat + print_textcats_f_per_cat(msg, scorer.textcats_f_per_cat) + if scorer.textcats_auc_per_cat: + data["textcats_auc_per_cat"] = scorer.textcats_auc_per_cat + print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat) + if displacy_path: docs = [ex.predicted for ex in dev_dataset] render_deps = "parser" in nlp.meta.get("pipeline", []) @@ -105,7 +117,6 @@ def evaluate( ) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) - data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} if output_path is not None: srsly.write_json(output_path, data) msg.good(f"Saved results to {output_path}") @@ -131,3 +142,40 @@ def render_parses( ) with (output_path / "parses.html").open("w", encoding="utf8") as file_: file_.write(html) + + +def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: + data = [ + (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}") + for k, v in scores.items() + ] + msg.table( + data, + header=("", "P", "R", "F"), + aligns=("l", "r", "r", "r"), + title="NER (per type)", + ) + + +def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: + data = [ + (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}") + for k, v in scores.items() + ] + msg.table( + data, + header=("", "P", "R", "F"), + aligns=("l", "r", "r", "r"), + title="Textcat F (per type)", + ) + + +def print_textcats_auc_per_cat( + msg: Printer, scores: Dict[str, Dict[str, float]] +) -> None: + msg.table( + [(k, f"{v['roc_auc_score']:.2f}") for k, v in scores.items()], + header=("", "ROC AUC"), + aligns=("l", "r"), + title="Textcat ROC AUC (per 
label)", + ) From dbe86b3453e1883410dba7377799ee3281896349 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 28 Jun 2020 15:45:19 +0200 Subject: [PATCH 162/203] Update project.py --- spacy/cli/project.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index f5f41cc3a..8d06c494a 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -38,9 +38,15 @@ CACHES = [ ] DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit # it directly and edit the project.yml instead and re-run the project.""" +CLI_HELP = f"""Command-line interface for spaCy projects and working with project +templates. You'd typically start by cloning a project template to a local +directory and fetching its assets like datasets etc. See the project's +{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data +Version Control) to manage input and output files and to ensure steps are only +re-run if their inputs change. +""" - -project_cli = typer.Typer(help="Command-line interface for spaCy projects") +project_cli = typer.Typer(help=CLI_HELP) @project_cli.callback(invoke_without_command=True) @@ -96,7 +102,7 @@ def project_assets_cli( project_dir: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), # fmt: on ): - """Use DVC (Data Version Control) to fetch the assets for the project, + """Use DVC (Data Version Control) to fetch project assets. Assets are defined in the "assets" section of the project config. If possible, DVC will try to track the files so you can pull changes from upstream. It will also try and store the checksum so the assets are versioned. If th file From 569376e34ea331f6530dd32d30fe9b9321af098f Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Sun, 28 Jun 2020 16:25:53 +0200 Subject: [PATCH 163/203] Replace curl with requests --- spacy/cli/project.py | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 8d06c494a..5011a13f9 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -9,6 +9,8 @@ import os import re import shutil import sys +import requests +import tqdm from ._app import app, Arg, Opt, COMMAND, NAME from .. import about @@ -106,7 +108,7 @@ def project_assets_cli( defined in the "assets" section of the project config. If possible, DVC will try to track the files so you can pull changes from upstream. It will also try and store the checksum so the assets are versioned. If th file - can't be tracked or checked, it will be downloaded using curl. If a checksum + can't be tracked or checked, it will be downloaded without DVC. If a checksum is provided in the project config, the file is only downloaded if no local file with the same checksum exists. """ @@ -320,6 +322,7 @@ def fetch_asset( if checksum == get_checksum(dest_path): msg.good(f"Skipping download with matching checksum: {dest}") return + dvc_add_cmd = ["dvc", "add", str(dest_path), "--external"] with working_dir(project_path): try: # If these fail, we don't want to output an error or info message. 
@@ -331,11 +334,13 @@ def fetch_asset( except subprocess.CalledProcessError: dvc_cmd = ["dvc", "get-url", url, str(dest_path)] print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) - run_command(["dvc", "add", str(dest_path)]) + run_command(dvc_add_cmd) except subprocess.CalledProcessError: - # TODO: replace curl - run_command(["curl", url, "--output", str(dest_path), "--progress-bar"]) - run_command(["dvc", "add", str(dest_path)]) + try: + download_file(url, dest_path) + except requests.exceptions.HTTPError as e: + msg.fail(f"Download failed: {dest}", e) + run_command(dvc_add_cmd) if checksum and checksum != get_checksum(dest_path): msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") msg.good(f"Fetched asset {dest}") @@ -627,3 +632,26 @@ def check_clone(name: str, dest: Path, repo: str) -> None: f"Can't clone project, parent directory doesn't exist: {dest.parent}", exits=1, ) + + +def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: + """Download a file using requests. + + url (str): The URL of the file. + dest (Path): The destination path. + chunk_size (int): The size of chunks to read/write. + """ + response = requests.get(url, stream=True) + response.raise_for_status() + total = int(response.headers.get("content-length", 0)) + progress_settings = { + "total": total, + "unit": "iB", + "unit_scale": True, + "unit_divisor": chunk_size, + "leave": False, + } + with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar: + for data in response.iter_content(chunk_size=chunk_size): + size = f.write(data) + bar.update(size) From 58c8f731bd7614027d537f984e799d47cdacad1f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 28 Jun 2020 23:53:14 +0200 Subject: [PATCH 164/203] Set version to v3.0.0.dev9 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index c3b2cb091..54753b5a1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0" +__version__ = "3.0.0.dev9" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 7c9178d5035b5e74e97fa7495f121d35ce1b9c41 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 28 Jun 2020 23:56:45 +0200 Subject: [PATCH 165/203] Update requirements --- requirements.txt | 2 +- setup.cfg | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 654c8e278..e67f481fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==8.0.0a9 +thinc==8.0.0a11 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 83085340d..09f43247f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,13 +36,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==8.0.0a9 + thinc==8.0.0a11 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==8.0.0a9 + thinc==8.0.0a11 blis>=0.4.0,<0.5.0 wasabi>=0.7.0,<1.1.0 srsly>=2.1.0,<3.0.0 From e14bf9decb3a278fc03c2f3f743c64f18cacc5f7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Sun, 28 Jun 2020 23:58:10 +0200 Subject: [PATCH 166/203] Set version to v3.0.0.dev9 
--- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 54753b5a1..c58d7d23c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev9" +__version__ = "3.0.0.dev10" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From cfeb2ba4d79aaf351280a223755b3876283cf644 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Date: Mon, 29 Jun 2020 09:51:20 +0200 Subject: [PATCH 167/203] updating thinc also in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 66a06c1d9..592156bd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==8.0.0a9", + "thinc==8.0.0a11", "blis>=0.4.0,<0.5.0" ] build-backend = "setuptools.build_meta" From da504737012e5c92c191b8ad37072f2122ec8666 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal <honnibal+gh@gmail.com> Date: Mon, 29 Jun 2020 12:17:41 +0200 Subject: [PATCH 168/203] Tweak efficiency of arc_eager.set_costs --- spacy/syntax/arc_eager.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index fcc05de3f..f129ee7d1 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -729,12 +729,13 @@ cdef class ArcEager(TransitionSystem): cdef ArcEagerGold gold_ = gold gold_.update(stcls) gold_state = gold_.c - n_gold = 0 + cdef int n_gold = 0 for i in range(self.n_moves): if self.c[i].is_valid(stcls.c, self.c[i].label): is_valid[i] = True costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label) - n_gold += costs[i] <= 0 + if costs[i] <= 0: + n_gold += 1 else: is_valid[i] = False costs[i] = 9000 From acbf6345c9d15e2bfa3d91746a83fa4d32fb9d3d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Mon, 29 Jun 2020 13:56:07 +0200 Subject: [PATCH 169/203] Fix thinc dependency --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 66a06c1d9..592156bd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==8.0.0a9", + "thinc==8.0.0a11", "blis>=0.4.0,<0.5.0" ] build-backend = "setuptools.build_meta" From 2d9604d39cbf30edf7c4170bb872045e36c7090f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Mon, 29 Jun 2020 13:56:46 +0200 Subject: [PATCH 170/203] Set version to v3.0.0.dev11 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index c58d7d23c..7a5e2bd43 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev10" +__version__ = "3.0.0.dev11" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From fc3cb1fa9ebccc9d2604bcdaede3e7961efe29de Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Date: Mon, 29 Jun 2020 13:59:17 +0200 Subject: 
[PATCH 171/203] NER align tests (#5656) * one_to_man works better. misalignment doesn't yet. * fix tests * restore example * xfail alignment tests --- spacy/gold/example.pyx | 8 ++-- spacy/tests/test_gold.py | 81 ++++++++++++++++++++++++++-------------- 2 files changed, 57 insertions(+), 32 deletions(-) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 169965c3d..505c2a633 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -47,7 +47,7 @@ cdef class Example: def __set__(self, doc): self.x = doc - + property reference: def __get__(self): return self.y @@ -60,7 +60,7 @@ cdef class Example: self.x.copy(), self.y.copy() ) - + @classmethod def from_dict(cls, Doc predicted, dict example_dict): if example_dict is None: @@ -78,7 +78,7 @@ cdef class Example: predicted, annotations2doc(predicted.vocab, tok_dict, doc_dict) ) - + @property def alignment(self): if self._alignment is None: @@ -151,7 +151,7 @@ cdef class Example: x_text = self.x.text[end_char:] x_text_offset = end_char x_tags = biluo_tags_from_offsets( - self.x, + self.x, [(e.start_char, e.end_char, e.label_) for e in x_spans], missing=None ) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 17f0933d1..96acb8982 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -230,14 +230,13 @@ def test_json2docs_no_ner(en_vocab): Doc( doc.vocab, words=[w.text for w in doc], - spaces=[bool(w.whitespace_) for w in doc] + spaces=[bool(w.whitespace_) for w in doc], ), - doc + doc, ) ner_tags = eg.get_aligned_ner() assert ner_tags == [None, None, None, None, None] - def test_split_sentences(en_vocab): words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"] @@ -283,8 +282,8 @@ def test_split_sentences(en_vocab): assert split_examples[1].text == "had loads of fun " -def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): - # one-to-many +@pytest.mark.xfail(reason="Alignment should be fixed after example refactor") +def test_gold_biluo_one_to_many(en_vocab, en_tokenizer): words = ["I", "flew to", "San Francisco Valley", "."] spaces = [True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) @@ -292,9 +291,28 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", "O", "U-LOC", "O"] + + entities = [ + (len("I "), len("I flew to"), "ORG"), + (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), + ] + gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", "U-ORG", "U-LOC", "O"] + + entities = [ + (len("I "), len("I flew"), "ORG"), + (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), + ] + gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + ner_tags = example.get_aligned_ner() assert ner_tags == ["O", None, "U-LOC", "O"] - - # many-to-one + + +def test_gold_biluo_many_to_one(en_vocab, en_tokenizer): words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] spaces = [True, True, True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) @@ -304,31 +322,38 @@ def test_gold_biluo_different_tokenization(en_vocab, 
en_tokenizer): ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] - # misaligned + entities = [ + (len("I "), len("I flew to"), "ORG"), + (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), + ] + gold_words = ["I", "flew to", "San Francisco Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["O", "B-ORG", "L-ORG", "B-LOC", "I-LOC", "L-LOC", "O"] + + +@pytest.mark.xfail(reason="Alignment should be fixed after example refactor") +def test_gold_biluo_misaligned(en_vocab, en_tokenizer): words = ["I flew", "to", "San Francisco", "Valley", "."] spaces = [True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - offset_start = len("I flew to ") - offset_end = len("I flew to San Francisco Valley") - entities = [(offset_start, offset_end, "LOC")] - links = {(offset_start, offset_end): {"Q816843": 1.0}} + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] gold_words = ["I", "flew to", "San", "Francisco Valley", "."] - example = Example.from_dict( - doc, {"words": gold_words, "entities": entities, "links": links} - ) + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() - assert ner_tags == [None, "O", "B-LOC", "L-LOC", "O"] - #assert example.get_aligned("ENT_KB_ID", as_string=True) == [ - # "", - # "", - # "Q816843", - # "Q816843", - # "", - #] - #assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == { - # "Q816843": 1.0 - #} + assert ner_tags == ["O", "O", "B-LOC", "L-LOC", "O"] + entities = [ + (len("I "), len("I flew to"), "ORG"), + (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), + ] + gold_words = ["I", "flew to", "San", "Francisco Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + ner_tags = example.get_aligned_ner() + assert ner_tags == [None, None, "B-LOC", "L-LOC", "O"] + + +def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer): # additional whitespace tokens in GoldParse words words, spaces = get_words_and_spaces( ["I", "flew", "to", "San Francisco", "Valley", "."], @@ -344,7 +369,8 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] - # from issue #4791 + +def test_gold_biluo_4791(en_vocab, en_tokenizer): doc = en_tokenizer("I'll return the ₹54 amount") gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"] gold_spaces = [False, True, True, True, False, True, False] @@ -593,7 +619,6 @@ def test_tuple_format_implicit_invalid(): _train(train_data) - def _train(train_data): nlp = English() ner = nlp.create_pipe("ner") From 8d3c0306e17340215cd6195a543b196c80e5e177 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Date: Mon, 29 Jun 2020 14:33:00 +0200 Subject: [PATCH 172/203] refactor fixes (#5664) * fixes in ud_train, UX for morphs * update pyproject with new version of thinc * fixes in debug_data script * cleanup of old unused error messages * remove obsolete TempErrors * move error messages to errors.py * add ENT_KB_ID to default DocBin serialization * few fixes to simple_ner * fix tags --- bin/ud/ud_train.py | 37 +++++---------- examples/training/conllu.py | 2 +- examples/training/train_ner.py | 2 +- spacy/cli/debug_data.py | 35 +++++++------- 
spacy/errors.py | 83 ++++++---------------------------- spacy/gold/corpus.py | 2 +- spacy/gold/example.pyx | 24 +++++----- spacy/gold/gold_io.pyx | 2 +- spacy/morphology.pyx | 4 +- spacy/pipeline/simple_ner.py | 9 ++-- spacy/tokens/_serialize.py | 2 +- spacy/tokens/doc.pyx | 2 +- spacy/tokens/morphanalysis.pyx | 3 ++ 13 files changed, 68 insertions(+), 139 deletions(-) diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index 7bf5dbb5e..88c534d0a 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -78,8 +78,7 @@ def read_data( head = int(head) - 1 if head != "0" else id_ sent["words"].append(word) sent["tags"].append(tag) - sent["morphology"].append(_parse_morph_string(morph)) - sent["morphology"][-1].add("POS_%s" % pos) + sent["morphs"].append(_compile_morph_string(morph, pos)) sent["heads"].append(head) sent["deps"].append("ROOT" if dep == "root" else dep) sent["spaces"].append(space_after == "_") @@ -88,12 +87,12 @@ def read_data( if oracle_segments: docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"])) golds.append(sent) - assert golds[-1].morphology is not None + assert golds[-1]["morphs"] is not None sent_annots.append(sent) if raw_text and max_doc_length and len(sent_annots) >= max_doc_length: doc, gold = _make_gold(nlp, None, sent_annots) - assert gold.morphology is not None + assert gold["morphs"] is not None sent_annots = [] docs.append(doc) golds.append(gold) @@ -109,17 +108,10 @@ def read_data( return golds_to_gold_data(docs, golds) -def _parse_morph_string(morph_string): +def _compile_morph_string(morph_string, pos): if morph_string == '_': - return set() - output = [] - replacements = {'1': 'one', '2': 'two', '3': 'three'} - for feature in morph_string.split('|'): - key, value = feature.split('=') - value = replacements.get(value, value) - value = value.split(',')[0] - output.append('%s_%s' % (key, value.lower())) - return set(output) + return f"POS={pos}" + return morph_string + f"|POS={pos}" def read_conllu(file_): @@ -155,7 +147,7 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0): sent_starts = [] for sent in sent_annots: gold["heads"].extend(len(gold["words"])+head for head in sent["heads"]) - for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]: + for field in ["words", "tags", "deps", "morphs", "entities", "spaces"]: gold[field].extend(sent[field]) sent_starts.append(True) sent_starts.extend([False] * (len(sent["words"]) - 1)) @@ -168,7 +160,7 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0): doc = nlp.make_doc(text) gold.pop("spaces") gold["sent_starts"] = sent_starts - for i in range(len(gold.heads)): + for i in range(len(gold["heads"])): if random.random() < drop_deps: gold["heads"][i] = None gold["labels"][i] = None @@ -185,7 +177,7 @@ def golds_to_gold_data(docs, golds): """Get out the training data format used by begin_training""" data = [] for doc, gold in zip(docs, golds): - example = Example.from_dict(doc, gold) + example = Example.from_dict(doc, dict(gold)) data.append(example) return data @@ -354,8 +346,7 @@ def initialize_pipeline(nlp, examples, config, device): if config.multitask_sent: nlp.parser.add_multitask_objective("sent_start") for eg in examples: - gold = eg.gold - for tag in gold.tags: + for tag in eg.get_aligned("TAG", as_string=True): if tag is not None: nlp.tagger.add_label(tag) if torch is not None and device != -1: @@ -489,10 +480,6 @@ def main( Token.set_extension("begins_fused", default=False) Token.set_extension("inside_fused", default=False) - 
Token.set_extension("get_conllu_lines", method=get_token_conllu) - Token.set_extension("begins_fused", default=False) - Token.set_extension("inside_fused", default=False) - spacy.util.fix_random_seed() lang.zh.Chinese.Defaults.use_jieba = False lang.ja.Japanese.Defaults.use_janome = False @@ -535,10 +522,10 @@ def main( else: batches = minibatch(examples, size=batch_sizes) losses = {} - n_train_words = sum(len(eg.doc) for eg in examples) + n_train_words = sum(len(eg.predicted) for eg in examples) with tqdm.tqdm(total=n_train_words, leave=False) as pbar: for batch in batches: - pbar.update(sum(len(ex.doc) for ex in batch)) + pbar.update(sum(len(ex.predicted) for ex in batch)) nlp.parser.cfg["beam_update_prob"] = next(beam_prob) nlp.update( batch, diff --git a/examples/training/conllu.py b/examples/training/conllu.py index 0758775cf..ecc07ccf2 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -283,7 +283,7 @@ def initialize_pipeline(nlp, examples, config): nlp.parser.moves.add_action(2, "subtok") nlp.add_pipe(nlp.create_pipe("tagger")) for eg in examples: - for tag in eg.gold.tags: + for tag in eg.get_aligned("TAG", as_string=True): if tag is not None: nlp.tagger.add_label(tag) # Replace labels that didn't make the frequency cutoff diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index f439fda23..98b428bf8 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -56,7 +56,7 @@ def main(model=None, output_dir=None, n_iter=100): print("Add label", ent[2]) ner.add_label(ent[2]) - with nlp.select_pipes(enable="ner") and warnings.catch_warnings(): + with nlp.select_pipes(enable="simple_ner") and warnings.catch_warnings(): # show warnings for misaligned entity spans once warnings.filterwarnings("once", category=UserWarning, module="spacy") diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 09c513d89..6026c4b52 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -102,9 +102,6 @@ def debug_data( corpus = Corpus(train_path, dev_path) try: train_dataset = list(corpus.train_dataset(nlp)) - train_dataset_unpreprocessed = list( - corpus.train_dataset_without_preprocessing(nlp) - ) except ValueError as e: loading_train_error_message = f"Training data cannot be loaded: {e}" try: @@ -120,11 +117,9 @@ def debug_data( msg.good("Corpus is loadable") # Create all gold data here to avoid iterating over the train_dataset constantly - gold_train_data = _compile_gold(train_dataset, pipeline, nlp) - gold_train_unpreprocessed_data = _compile_gold( - train_dataset_unpreprocessed, pipeline - ) - gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp) + gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True) + gold_train_unpreprocessed_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=False) + gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True) train_texts = gold_train_data["texts"] dev_texts = gold_dev_data["texts"] @@ -497,7 +492,7 @@ def _load_file(file_path: Path, msg: Printer) -> None: def _compile_gold( - examples: Sequence[Example], pipeline: List[str], nlp: Language + examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool ) -> Dict[str, Any]: data = { "ner": Counter(), @@ -517,9 +512,9 @@ def _compile_gold( "n_cats_multilabel": 0, "texts": set(), } - for example in examples: - gold = example.reference - doc = example.predicted + for eg in examples: + gold = eg.reference + doc = eg.predicted valid_words = [x for x 
in gold if x is not None] data["words"].update(valid_words) data["n_words"] += len(valid_words) @@ -530,7 +525,7 @@ def _compile_gold( if nlp.vocab.strings[word] not in nlp.vocab.vectors: data["words_missing_vectors"].update([word]) if "ner" in pipeline: - for i, label in enumerate(gold.ner): + for i, label in enumerate(eg.get_aligned_ner()): if label is None: continue if label.startswith(("B-", "U-", "L-")) and doc[i].is_space: @@ -556,16 +551,18 @@ def _compile_gold( if list(gold.cats.values()).count(1.0) != 1: data["n_cats_multilabel"] += 1 if "tagger" in pipeline: - data["tags"].update([x for x in gold.tags if x is not None]) + tags = eg.get_aligned("TAG", as_string=True) + data["tags"].update([x for x in tags if x is not None]) if "parser" in pipeline: - data["deps"].update([x for x in gold.labels if x is not None]) - for i, (dep, head) in enumerate(zip(gold.labels, gold.heads)): + aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj) + data["deps"].update([x for x in aligned_deps if x is not None]) + for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)): if head == i: data["roots"].update([dep]) data["n_sents"] += 1 - if nonproj.is_nonproj_tree(gold.heads): + if nonproj.is_nonproj_tree(aligned_heads): data["n_nonproj"] += 1 - if nonproj.contains_cycle(gold.heads): + if nonproj.contains_cycle(aligned_heads): data["n_cycles"] += 1 return data @@ -581,7 +578,7 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int: for eg in data: labels = [ label.split("-")[1] - for label in eg.gold.ner + for label in eg.get_aligned_ner() if label not in ("O", "-", None) ] if label not in labels: diff --git a/spacy/errors.py b/spacy/errors.py index e152bb1ff..b3e6efdd4 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -132,6 +132,7 @@ class Warnings(object): "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") # TODO: fix numbering after merging develop into master + W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") W093 = ("Could not find any data to train the {name} on. Is your " "input data correctly formatted ?") W094 = ("Model '{model}' ({model_version}) specifies an under-constrained " @@ -154,7 +155,7 @@ class Warnings(object): "so a default configuration was used.") W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', " "but got '{type}' instead, so ignoring it.") - W100 = ("Skipping unsupported morphological feature(s): {feature}. " + W100 = ("Skipping unsupported morphological feature(s): '{feature}'. " "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " "string \"Field1=Value1,Value2|Field2=Value3\".") @@ -182,18 +183,13 @@ class Errors(object): "`nlp.select_pipes()`, you should remove them explicitly with " "`nlp.remove_pipe()` before the pipeline is restored. Names of " "the new components: {names}") - E009 = ("The `update` method expects same number of docs and golds, but " - "got: {n_docs} docs, {n_golds} golds.") E010 = ("Word vectors set to length 0. This may be because you don't have " "a model installed or loaded, or because your model doesn't " "include word vectors. For more info, see the docs:\n" "https://spacy.io/usage/models") E011 = ("Unknown operator: '{op}'. Options: {opts}") E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}") - E013 = ("Error selecting action in matcher") E014 = ("Unknown tag ID: {tag}") - E015 = ("Conflicting morphology exception for ({tag}, {orth}). 
Use " - "`force=True` to overwrite.") E016 = ("MultitaskObjective target should be function or one of: dep, " "tag, ent, dep_tag_offset, ent_tag.") E017 = ("Can only add unicode or bytes. Got type: {value_type}") @@ -201,21 +197,8 @@ class Errors(object): "refers to an issue with the `Vocab` or `StringStore`.") E019 = ("Can't create transition with unknown action ID: {action}. Action " "IDs are enumerated in spacy/syntax/{src}.pyx.") - E020 = ("Could not find a gold-standard action to supervise the " - "dependency parser. The tree is non-projective (i.e. it has " - "crossing arcs - see spacy/syntax/nonproj.pyx for definitions). " - "The ArcEager transition system only supports projective trees. " - "To learn non-projective representations, transform the data " - "before training and after parsing. Either pass " - "`make_projective=True` to the GoldParse class, or use " - "spacy.syntax.nonproj.preprocess_training_data.") - E021 = ("Could not find a gold-standard action to supervise the " - "dependency parser. The GoldParse was projective. The transition " - "system has {n_actions} actions. State at failure: {state}") E022 = ("Could not find a transition with the name '{name}' in the NER " "model.") - E023 = ("Error cleaning up beam: The same state occurred twice at " - "memory address {addr} and position {i}.") E024 = ("Could not find an optimal move to supervise the parser. Usually, " "this means that the model can't be updated in a way that's valid " "and satisfies the correct annotations specified in the GoldParse. " @@ -259,7 +242,6 @@ class Errors(object): "offset {start}.") E037 = ("Error calculating span: Can't find a token ending at character " "offset {end}.") - E038 = ("Error finding sentence for span. Infinite loop detected.") E039 = ("Array bounds exceeded while searching for root word. This likely " "means the parse tree is in an invalid state. Please report this " "issue here: http://github.com/explosion/spaCy/issues") @@ -290,8 +272,6 @@ class Errors(object): E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}") E060 = ("Cannot add new key to vectors: the table is full. Current shape: " "({rows}, {cols}).") - E061 = ("Bad file name: {filename}. Example of a valid file name: " - "'vectors.128.f.bin'") E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 " "and 63 are occupied. You can replace one by specifying the " "`flag_id` explicitly, e.g. " @@ -305,39 +285,17 @@ class Errors(object): "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}") E065 = ("Only one of the vector table's width and shape can be specified. " "Got width {width} and shape {shape}.") - E066 = ("Error creating model helper for extracting columns. Can only " - "extract columns by positive integer. Got: {value}.") E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside " "an entity) without a preceding 'B' (beginning of an entity). " "Tag sequence:\n{tags}") E068 = ("Invalid BILUO tag: '{tag}'.") - E069 = ("Invalid gold-standard parse tree. Found cycle between word " - "IDs: {cycle} (tokens: {cycle_tokens}) in the document starting " - "with tokens: {doc_tokens}.") - E070 = ("Invalid gold-standard data. 
Number of documents ({n_docs}) " - "does not align with number of annotations ({n_annots}).") E071 = ("Error creating lexeme: specified orth ID ({orth}) does not " "match the one in the vocab ({vocab_orth}).") - E072 = ("Error serializing lexeme: expected data length {length}, " - "got {bad_length}.") E073 = ("Cannot assign vector of length {new_length}. Existing vectors " "are of length {length}. You can use `vocab.reset_vectors` to " "clear the existing vectors and resize the table.") E074 = ("Error interpreting compiled match pattern: patterns are expected " "to end with the attribute {attr}. Got: {bad_attr}.") - E075 = ("Error accepting match: length ({length}) > maximum length " - "({max_len}).") - E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc " - "has {words} words.") - E077 = ("Error computing {value}: number of Docs ({n_docs}) does not " - "equal number of GoldParse objects ({n_golds}) in batch.") - E078 = ("Error computing score: number of words in Doc ({words_doc}) does " - "not equal number of words in GoldParse ({words_gold}).") - E079 = ("Error computing states in beam: number of predicted beams " - "({pbeams}) does not equal number of gold beams ({gbeams}).") - E080 = ("Duplicate state found in beam: {key}.") - E081 = ("Error getting gradient in beam: number of histories ({n_hist}) " - "does not equal number of losses ({losses}).") E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), " "projective heads ({n_proj_heads}) and labels ({n_labels}) do not " "match.") @@ -345,8 +303,6 @@ class Errors(object): "`getter` (plus optional `setter`) is allowed. Got: {nr_defined}") E084 = ("Error assigning label ID {label} to span: not in StringStore.") E085 = ("Can't create lexeme for string '{string}'.") - E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does " - "not match hash {hash_id} in StringStore.") E087 = ("Unknown displaCy style: {style}.") E088 = ("Text of length {length} exceeds maximum of {max_length}. The " "v2.x parser and NER models require roughly 1GB of temporary " @@ -388,7 +344,6 @@ class Errors(object): E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A " "token can only be part of one entity, so make sure the entities " "you're setting don't overlap.") - E104 = ("Can't find JSON schema for '{name}'.") E105 = ("The Doc.print_tree() method is now deprecated. Please use " "Doc.to_json() instead or write your own function.") E106 = ("Can't find doc._.{attr} attribute specified in the underscore " @@ -411,8 +366,6 @@ class Errors(object): "practically no advantage over pickling the parent Doc directly. " "So instead of pickling the span, pickle the Doc it belongs to or " "use Span.as_doc to convert the span to a standalone Doc object.") - E113 = ("The newly split token can only have one root (head = 0).") - E114 = ("The newly split token needs to have a root (head = 0).") E115 = ("All subtokens must have associated heads.") E116 = ("Cannot currently add labels to pretrained text classifier. Add " "labels before training begins. This functionality was available " @@ -435,12 +388,9 @@ class Errors(object): "equal to span length ({span_len}).") E122 = ("Cannot find token to be split. Did it get merged?") E123 = ("Cannot find head of token to be split. Did it get merged?") - E124 = ("Cannot read from file: {path}. Supported formats: {formats}") E125 = ("Unexpected value: {value}") E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. 
" "This is likely a bug in spaCy, so feel free to open an issue.") - E127 = ("Cannot create phrase pattern representation for length 0. This " - "is likely a bug in spaCy.") E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword " "arguments to exclude fields from being serialized or deserialized " "is now deprecated. Please use the `exclude` argument instead. " @@ -482,8 +432,6 @@ class Errors(object): "provided {found}.") E143 = ("Labels for component '{name}' not initialized. Did you forget to " "call add_label()?") - E144 = ("Could not find parameter `{param}` when building the entity " - "linker model.") E145 = ("Error reading `{param}` from input file.") E146 = ("Could not access `{path}`.") E147 = ("Unexpected error in the {method} functionality of the " @@ -495,8 +443,6 @@ class Errors(object): "the component matches the model being loaded.") E150 = ("The language of the `nlp` object and the `vocab` should be the " "same, but found '{nlp}' and '{vocab}' respectively.") - E151 = ("Trying to call nlp.update without required annotation types. " - "Expected top-level keys: {exp}. Got: {unexp}.") E152 = ("The attribute {attr} is not supported for token patterns. " "Please use the option validate=True with Matcher, PhraseMatcher, " "or EntityRuler for more details.") @@ -533,11 +479,6 @@ class Errors(object): "that case.") E166 = ("Can only merge DocBins with the same pre-defined attributes.\n" "Current DocBin: {current}\nOther DocBin: {other}") - E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can " - "happen if the tagger was trained with a different set of " - "morphological features. If you're using a pretrained model, make " - "sure that your models are up to date:\npython -m spacy validate") - E168 = ("Unknown field: {field}") E169 = ("Can't find module: {module}") E170 = ("Cannot apply transition {name}: invalid for the current state.") E171 = ("Matcher.add received invalid on_match callback argument: expected " @@ -548,8 +489,6 @@ class Errors(object): E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of " "Lookups containing the lemmatization tables. See the docs for " "details: https://spacy.io/api/lemmatizer#init") - E174 = ("Architecture '{name}' not found in registry. Available " - "names: {names}") E175 = ("Can't remove rule for unknown match pattern ID: {key}") E176 = ("Alias '{alias}' is not defined in the Knowledge Base.") E177 = ("Ill-formed IOB input detected: {tag}") @@ -597,10 +536,19 @@ class Errors(object): E198 = ("Unable to return {n} most similar vectors for the current vectors " "table, which contains {n_rows} vectors.") E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") - E200 = ("Specifying a base model with a pretrained component '{component}' " - "can not be combined with adding a pretrained Tok2Vec layer.") # TODO: fix numbering after merging develop into master + E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the " + "array and {doc_length} for the Doc itself.") + E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.") + E973 = ("Unexpected type for NER data") + E974 = ("Unknown {obj} attribute: {key}") + E975 = ("The method Example.from_dict expects a Doc as first argument, " + "but got {type}") + E976 = ("The method Example.from_dict expects a dict as second argument, " + "but received None.") + E977 = ("Can not compare a MorphAnalysis with a string object. 
" + "This is likely a bug in spaCy, so feel free to open an issue.") E978 = ("The {method} method of component {name} takes a list of Example objects, " "but found {types} instead.") E979 = ("Cannot convert {type} to an Example object.") @@ -648,13 +596,8 @@ class Errors(object): @add_codes class TempErrors(object): T003 = ("Resizing pretrained Tagger models is not currently supported.") - T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.") T007 = ("Can't yet set {attr} from Span. Vote for this feature on the " "issue tracker: http://github.com/explosion/spaCy/issues") - T008 = ("Bad configuration of Tagger. This is probably a bug within " - "spaCy. We changed the name of an internal attribute for loading " - "pretrained vectors, and the class has been passed the old name " - "(pretrained_dims) but not the new name (pretrained_vectors).") # fmt: on diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 086c95fb2..42637ce5c 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -45,7 +45,7 @@ class Corpus: def make_examples(self, nlp, reference_docs, max_length=0): for reference in reference_docs: - if max_length >= 1 and len(reference) >= max_length: + if len(reference) >= max_length >= 1: if reference.is_sentenced: for ref_sent in reference.sents: yield Example( diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 505c2a633..5e36156a9 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -2,7 +2,6 @@ import warnings import numpy -from ..tokens import Token from ..tokens.doc cimport Doc from ..tokens.span cimport Span from ..tokens.span import Span @@ -11,9 +10,8 @@ from .align cimport Alignment from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc from .iob_utils import spans_from_biluo_tags from .align import Alignment -from ..errors import Errors, AlignmentError +from ..errors import Errors, Warnings from ..syntax import nonproj -from ..util import get_words_and_spaces cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): @@ -32,11 +30,10 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): cdef class Example: def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None): """ Doc can either be text, or an actual Doc """ - msg = "Example.__init__ got None for '{arg}'. Requires Doc." if predicted is None: - raise TypeError(msg.format(arg="predicted")) + raise TypeError(Errors.E972.format(arg="predicted")) if reference is None: - raise TypeError(msg.format(arg="reference")) + raise TypeError(Errors.E972.format(arg="reference")) self.x = predicted self.y = reference self._alignment = alignment @@ -64,9 +61,9 @@ cdef class Example: @classmethod def from_dict(cls, Doc predicted, dict example_dict): if example_dict is None: - raise ValueError("Example.from_dict expected dict, received None") + raise ValueError(Errors.E976) if not isinstance(predicted, Doc): - raise TypeError(f"Argument 1 should be Doc. 
Got {type(predicted)}") + raise TypeError(Errors.E975.format(type=type(predicted))) example_dict = _fix_legacy_dict_data(example_dict) tok_dict, doc_dict = _parse_example_dict_data(example_dict) if "ORTH" not in tok_dict: @@ -118,7 +115,8 @@ cdef class Example: aligned_deps = [None] * self.x.length heads = [token.head.i for token in self.y] deps = [token.dep_ for token in self.y] - heads, deps = nonproj.projectivize(heads, deps) + if projectivize: + heads, deps = nonproj.projectivize(heads, deps) for cand_i in range(self.x.length): gold_i = cand_to_gold[cand_i] if gold_i is not None: # Alignment found @@ -245,11 +243,11 @@ def _annot2array(vocab, tok_annot, doc_annot): elif key == "cats": pass else: - raise ValueError(f"Unknown doc attribute: {key}") + raise ValueError(Errors.E974.format(obj="doc", key=key)) for key, value in tok_annot.items(): if key not in IDS: - raise ValueError(f"Unknown token attribute: {key}") + raise ValueError(Errors.E974.format(obj="token", key=key)) elif key in ["ORTH", "SPACY"]: pass elif key == "HEAD": @@ -289,7 +287,7 @@ def _add_entities_to_doc(doc, ner_data): doc.ents = ner_data doc.ents = [span for span in ner_data if span.label_] else: - raise ValueError("Unexpected type for NER data") + raise ValueError(Errors.E973) def _parse_example_dict_data(example_dict): @@ -341,7 +339,7 @@ def _fix_legacy_dict_data(example_dict): if "HEAD" in token_dict and "SENT_START" in token_dict: # If heads are set, we don't also redundantly specify SENT_START. token_dict.pop("SENT_START") - warnings.warn("Ignoring annotations for sentence starts, as dependency heads are set") + warnings.warn(Warnings.W092) return { "token_annotation": token_dict, "doc_annotation": doc_dict diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index 8dbb5f395..fbf8ebea7 100644 --- a/spacy/gold/gold_io.pyx +++ b/spacy/gold/gold_io.pyx @@ -145,7 +145,7 @@ def json_to_annotations(doc): example["doc_annotation"] = dict( cats=cats, entities=ner_tags, - links=paragraph.get("links", []) # TODO: fix/test + links=paragraph.get("links", []) ) yield example diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index e9d640f81..78e8e17c0 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -107,9 +107,9 @@ cdef class Morphology: Returns the hash of the new analysis. 
""" cdef MorphAnalysisC* tag_ptr - if features == self.EMPTY_MORPH: - features = "" if isinstance(features, str): + if features == self.EMPTY_MORPH: + features = "" tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features]) if tag_ptr != NULL: return tag_ptr.key diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index 9a8991557..e4a1e15e9 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -70,7 +70,7 @@ class SimpleNER(Pipe): def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None): if not any(_has_ner(eg) for eg in examples): return 0 - docs = [eg.doc for eg in examples] + docs = [eg.predicted for eg in examples] set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update(docs) loss, d_scores = self.get_loss(examples, scores) @@ -89,7 +89,8 @@ class SimpleNER(Pipe): d_scores = [] truths = [] for eg in examples: - gold_tags = [(tag if tag != "-" else None) for tag in eg.gold.ner] + tags = eg.get_aligned("TAG", as_string=True) + gold_tags = [(tag if tag != "-" else None) for tag in tags] if not self.is_biluo: gold_tags = biluo_to_iob(gold_tags) truths.append(gold_tags) @@ -128,8 +129,8 @@ class SimpleNER(Pipe): pass -def _has_ner(eg): - for ner_tag in eg.gold.ner: +def _has_ner(example): + for ner_tag in example.get_aligned_ner(): if ner_tag != "-" and ner_tag is not None: return True else: diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index a3b089222..d16515a57 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors -ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "LEMMA", "MORPH") +ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH") class DocBin(object): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index be8218967..28590e91e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -816,7 +816,7 @@ cdef class Doc: cdef TokenC* tokens = self.c cdef int length = len(array) if length != len(self): - raise ValueError("Cannot set array values longer than the document.") + raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self))) # Get set up for fast loading cdef Pool mem = Pool() diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index 77e499968..c49aede4d 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -1,6 +1,7 @@ from libc.string cimport memset cimport numpy as np +from ..errors import Errors from ..vocab cimport Vocab from ..typedefs cimport hash_t, attr_t from ..morphology cimport list_features, check_feature, get_by_field @@ -49,6 +50,8 @@ cdef class MorphAnalysis: return self.key def __eq__(self, other): + if isinstance(other, str): + raise ValueError(Errors.E977) return self.key == other.key def __ne__(self, other): From 67928036f2e12e886fe6f4334393d53ffedfedb2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal <honnibal+gh@gmail.com> Date: Mon, 29 Jun 2020 14:45:43 +0200 Subject: [PATCH 173/203] Set version to v3.0.0.dev12 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 7a5e2bd43..6749bff15 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev11" +__version__ = "3.0.0.dev12" __release__ = True __download_url__ = 
"https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From bf43ebbf61097c15e4add76b0163d852fbca3953 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Mon, 29 Jun 2020 16:32:25 +0200 Subject: [PATCH 174/203] fix typo's --- spacy/cli/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 5011a13f9..185578392 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -107,7 +107,7 @@ def project_assets_cli( """Use DVC (Data Version Control) to fetch project assets. Assets are defined in the "assets" section of the project config. If possible, DVC will try to track the files so you can pull changes from upstream. It will - also try and store the checksum so the assets are versioned. If th file + also try and store the checksum so the assets are versioned. If the file can't be tracked or checked, it will be downloaded without DVC. If a checksum is provided in the project config, the file is only downloaded if no local file with the same checksum exists. @@ -567,7 +567,7 @@ def run_commands( variables (Dict[str, str]): Dictionary of variable names, mapped to their values. Will be used to substitute format string variables in the commands. - silent (boll): Don't print the commands. + silent (bool): Don't print the commands. """ for command in commands: # Substitute variables, e.g. "./{NAME}.json" From f8dddeda2722cb45f5ef4410b5ff4d7be3faf1bd Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Mon, 29 Jun 2020 16:38:15 +0200 Subject: [PATCH 175/203] print help msg when just calling 'project' without args --- spacy/cli/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 185578392..44ff78f2b 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -202,7 +202,7 @@ def project_update_dvc_cli( msg.info(f"No changes found in {CONFIG_FILE}, no update needed") -app.add_typer(project_cli, name="project") +app.add_typer(project_cli, name="project", no_args_is_help=True) ################# From 24664efa234a7c1138858a4e3734ce234e819371 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 29 Jun 2020 16:54:19 +0200 Subject: [PATCH 176/203] Import project_run_all function --- spacy/cli/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index db409f431..5dc3070b6 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -16,6 +16,7 @@ from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 from .project import project_clone, project_assets, project_run # noqa: F401 +from .project import project_run_all # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) From 7c08713baa75094726ebafbd8c092911b72f839f Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 29 Jun 2020 16:54:47 +0200 Subject: [PATCH 177/203] Improve error messages --- spacy/cli/project.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 5011a13f9..c02c1cf98 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Optional +from typing import List, Dict, Any, Optional, 
Sequence import typer import srsly from pathlib import Path @@ -372,8 +372,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: config_commands = config.get("commands", []) commands = {cmd["name"]: cmd for cmd in config_commands} if subcommand: - if subcommand not in commands: - msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) + validate_subcommand(commands.keys(), subcommand) print(f"Usage: {COMMAND} project run {project_dir} {subcommand}") help_text = commands[subcommand].get("help") if help_text: @@ -401,8 +400,7 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: config_commands = config.get("commands", []) variables = config.get("variables", {}) commands = {cmd["name"]: cmd for cmd in config_commands} - if subcommand not in commands: - msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) + validate_subcommand(commands.keys(), subcommand) if subcommand in config.get("run", []): # This is one of the pipeline commands tracked in DVC dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] @@ -448,10 +446,14 @@ def load_project_config(path: Path) -> Dict[str, Any]: config_path = path / CONFIG_FILE if not config_path.exists(): msg.fail("Can't find project config", config_path, exits=1) - config = srsly.read_yaml(config_path) + invalid_err = f"Invalid project config in {CONFIG_FILE}" + try: + config = srsly.read_yaml(config_path) + except ValueError as e: + msg.fail(invalid_err, e, exits=1) errors = validate(ProjectConfigSchema, config) if errors: - msg.fail(f"Invalid project config in {CONFIG_FILE}", "\n".join(errors), exits=1) + msg.fail(invalid_err, "\n".join(errors), exits=1) return config @@ -490,8 +492,7 @@ def update_dvc_config( # commands in project.yml and should be run in sequence config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} for name in config.get("run", []): - if name not in config_commands: - msg.fail(f"Can't find command '{name}' in project config", exits=1) + validate_subcommand(config_commands.keys(), name) command = config_commands[name] deps = command.get("deps", []) outputs = command.get("outputs", []) @@ -634,6 +635,20 @@ def check_clone(name: str, dest: Path, repo: str) -> None: ) +def validate_subcommand(commands: Sequence[str], subcommand: str) -> None: + """Check that a subcommand is valid and defined. Raises an error otherwise. + + commands (Sequence[str]): The available commands. + subcommand (str): The subcommand. + """ + if subcommand not in commands: + msg.fail( + f"Can't find command '{subcommand}' in {CONFIG_FILE}. " + f"Available commands: {', '.join(commands)}", + exits=1, + ) + + def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: """Download a file using requests. 
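A minimal usage sketch of the validate_subcommand helper added above; the standalone redefinition and the command names here are only for illustration (a real project.yml defines its own commands):

from wasabi import msg

CONFIG_FILE = "project.yml"

def validate_subcommand(commands, subcommand):
    # Mirrors the helper added to spacy/cli/project.py: exit with an error
    # listing the available commands when the subcommand isn't defined.
    if subcommand not in commands:
        msg.fail(
            f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
            f"Available commands: {', '.join(commands)}",
            exits=1,
        )

commands = ["assets", "preprocess", "train"]  # hypothetical example commands
validate_subcommand(commands, "train")        # defined: returns silently
validate_subcommand(commands, "evaluate")     # undefined: prints the error and exits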
From 126050f259c3bfbef43129bc156c0a6ac381d1f5 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 29 Jun 2020 16:55:24 +0200 Subject: [PATCH 178/203] Improve asset fetching Get all paths first and run dvc add once so it only shows one progress bar and one combined git command (if repo is git repo) --- spacy/cli/project.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index c02c1cf98..861a48f4b 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -295,15 +295,21 @@ def project_assets(project_dir: Path) -> None: msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) msg.info(f"Fetching {len(assets)} asset(s)") variables = config.get("variables", {}) + fetched_assets = [] for asset in assets: url = asset["url"].format(**variables) dest = asset["dest"].format(**variables) - fetch_asset(project_path, url, dest, asset.get("checksum")) + fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum")) + if fetched_path: + fetched_assets.append(str(fetched_path)) + if fetched_assets: + with working_dir(project_path): + run_command(["dvc", "add", *fetched_assets, "--external"]) def fetch_asset( project_path: Path, url: str, dest: Path, checksum: Optional[str] = None -) -> None: +) -> Optional[Path]: """Fetch an asset from a given URL or path. Will try to import the file using DVC's import-url if possible (fully tracked and versioned) and falls back to get-url (versioned) and a non-DVC download if necessary. If a @@ -313,6 +319,8 @@ def fetch_asset( project_path (Path): Path to project directory. url (str): URL or path to asset. checksum (Optional[str]): Optional expected checksum of local file. + RETURNS (Optional[Path]): The path to the fetched asset or None if fetching + the asset failed. """ url = convert_asset_url(url) dest_path = (project_path / dest).resolve() @@ -321,8 +329,7 @@ def fetch_asset( # TODO: add support for caches (dvc import-url with local path) if checksum == get_checksum(dest_path): msg.good(f"Skipping download with matching checksum: {dest}") - return - dvc_add_cmd = ["dvc", "add", str(dest_path), "--external"] + return dest_path with working_dir(project_path): try: # If these fail, we don't want to output an error or info message. @@ -334,16 +341,16 @@ def fetch_asset( except subprocess.CalledProcessError: dvc_cmd = ["dvc", "get-url", url, str(dest_path)] print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) - run_command(dvc_add_cmd) except subprocess.CalledProcessError: try: download_file(url, dest_path) except requests.exceptions.HTTPError as e: msg.fail(f"Download failed: {dest}", e) - run_command(dvc_add_cmd) + return None if checksum and checksum != get_checksum(dest_path): msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") msg.good(f"Fetched asset {dest}") + return dest_path def project_run_all(project_dir: Path, *dvc_args) -> None: From 3487214ba182572a8696152ec671f08078e0dc91 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Mon, 29 Jun 2020 17:45:47 +0200 Subject: [PATCH 179/203] fix shlex.split for non-posix --- spacy/cli/project.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 44ff78f2b..1196eb3dd 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -14,6 +14,7 @@ import tqdm from ._app import app, Arg, Opt, COMMAND, NAME from .. 
import about +from ..compat import is_windows from ..schemas import ProjectConfigSchema, validate from ..util import ensure_path, run_command, make_tempdir, working_dir from ..util import get_hash, get_checksum @@ -234,7 +235,10 @@ def project_clone( # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" - run_command(shlex.split(cmd)) + try: + run_command(shlex.split(cmd, posix=not is_windows)) + except: + raise RuntimeError(f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'.") with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: f.write(name) run_command(["git", "-C", tmp_dir, "fetch"]) From efe7eb71f2e3949f41bacbcc390177243fc4c727 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Mon, 29 Jun 2020 17:46:08 +0200 Subject: [PATCH 180/203] create subfolder in working dir --- spacy/cli/project.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 1196eb3dd..7470e23ea 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -82,6 +82,8 @@ def project_clone_cli( initializing DVC (Data Version Control). This allows DVC to integrate with Git. """ + if dest == Path.cwd(): + dest = dest / name project_clone(name, dest, repo=repo, git=git, no_init=no_init) From 894b8e7ff652a26c71ebbe6fa3d75a25104991c8 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Mon, 29 Jun 2020 18:16:39 +0200 Subject: [PATCH 181/203] throw warning (instead of crashing) when temp dir can't be cleaned --- spacy/cli/project.py | 4 ++-- spacy/errors.py | 1 + spacy/util.py | 5 ++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 7470e23ea..dd6b4da11 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -243,8 +243,8 @@ def project_clone( raise RuntimeError(f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'.") with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: f.write(name) - run_command(["git", "-C", tmp_dir, "fetch"]) - run_command(["git", "-C", tmp_dir, "checkout"]) + run_command(["git", "-C", str(tmp_dir), "fetch"]) + run_command(["git", "-C", str(tmp_dir), "checkout"]) shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) msg.good(f"Cloned project '{name}' from {repo}") for sub_dir in DIRS: diff --git a/spacy/errors.py b/spacy/errors.py index b3e6efdd4..c54aa804b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -132,6 +132,7 @@ class Warnings(object): "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") # TODO: fix numbering after merging develop into master + W091 = ("Could not clean/remove the temp directory at {dir}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") W093 = ("Could not find any data to train the {name} on. 
Is your " "input data correctly formatted ?") diff --git a/spacy/util.py b/spacy/util.py index 3f0a1ec6f..5be83e20f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -467,7 +467,10 @@ def make_tempdir(): """ d = Path(tempfile.mkdtemp()) yield d - shutil.rmtree(str(d)) + try: + shutil.rmtree(str(d)) + except PermissionError: + warnings.warn(Warnings.W091.format(dir=d)) def get_hash(data) -> str: From ff233d5743a24d445d6e6c1e70790d71c6147ec2 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Mon, 29 Jun 2020 18:22:33 +0200 Subject: [PATCH 182/203] print details on error msg (e.g. PermissionError on specific file) --- spacy/errors.py | 2 +- spacy/util.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index c54aa804b..1af673569 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -132,7 +132,7 @@ class Warnings(object): "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") # TODO: fix numbering after merging develop into master - W091 = ("Could not clean/remove the temp directory at {dir}.") + W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") W093 = ("Could not find any data to train the {name} on. Is your " "input data correctly formatted ?") diff --git a/spacy/util.py b/spacy/util.py index 5be83e20f..cdaed7a92 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -469,8 +469,8 @@ def make_tempdir(): yield d try: shutil.rmtree(str(d)) - except PermissionError: - warnings.warn(Warnings.W091.format(dir=d)) + except PermissionError as e: + warnings.warn(Warnings.W091.format(dir=d, msg=e)) def get_hash(data) -> str: From 1176783310095c7c31ba9963a2aa1d1266c66707 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Mon, 29 Jun 2020 18:37:42 +0200 Subject: [PATCH 183/203] fix one more shlex.split --- spacy/cli/project.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index dd6b4da11..16e87cb40 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -246,7 +246,7 @@ def project_clone( run_command(["git", "-C", str(tmp_dir), "fetch"]) run_command(["git", "-C", str(tmp_dir), "checkout"]) shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) - msg.good(f"Cloned project '{name}' from {repo}") + msg.good(f"Cloned project '{name}' from {repo} into {project_dir}") for sub_dir in DIRS: dir_path = project_dir / sub_dir if not dir_path.exists(): @@ -484,7 +484,7 @@ def update_dvc_config( path = path.resolve() dvc_config_path = path / DVC_CONFIG if dvc_config_path.exists(): - # Cneck if the file was generated using the current config, if not, redo + # Check if the file was generated using the current config, if not, redo with dvc_config_path.open("r", encoding="utf8") as f: ref_hash = f.readline().strip().replace("# ", "") if ref_hash == config_hash and not force: @@ -578,7 +578,7 @@ def run_commands( for command in commands: # Substitute variables, e.g. "./{NAME}.json" command = command.format(**variables) - command = shlex.split(command) + command = shlex.split(command, posix=not is_windows) # TODO: is this needed / a good idea? 
if len(command) and command[0] == "python": command[0] = sys.executable From 1d2c646e579f0e018e2eed363cc0a0240dde8d47 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 29 Jun 2020 20:07:21 +0200 Subject: [PATCH 184/203] Fix init and remove .dvc/plots --- spacy/cli/project.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 861a48f4b..5382e5bdc 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -21,6 +21,7 @@ from ..util import get_hash, get_checksum CONFIG_FILE = "project.yml" DVC_CONFIG = "dvc.yaml" +DVC_DIR = ".dvc" DIRS = [ "assets", "metas", @@ -88,6 +89,7 @@ def project_clone_cli( def project_init_cli( path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), + force: bool = Opt(False, "--force", "-F", help="Force initiziation"), ): """Initialize a project directory with DVC and optionally Git. This should typically be taken care of automatically when you run the "project clone" @@ -95,7 +97,7 @@ def project_init_cli( be a Git repo, it should be initialized with Git first, before initializing DVC. This allows DVC to integrate with Git. """ - project_init(path, git=git, silent=True) + project_init(path, git=git, force=force, silent=True) @project_cli.command("assets") @@ -246,7 +248,7 @@ def project_clone( if not dir_path.exists(): dir_path.mkdir(parents=True) if not no_init: - project_init(project_dir, git=git, silent=True) + project_init(project_dir, git=git, force=True, silent=True) msg.good(f"Your project is now ready!", dest) print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") @@ -255,6 +257,7 @@ def project_init( project_dir: Path, *, git: bool = False, + force: bool = False, silent: bool = False, analytics: bool = False, ): @@ -265,19 +268,29 @@ def project_init( silent (bool): Don't print any output (via DVC). analytics (bool): Opt-in to DVC analytics (defaults to False). """ + project_dir = project_dir.resolve() with working_dir(project_dir): + if git: + run_command(["git", "init"]) init_cmd = ["dvc", "init"] if silent: init_cmd.append("--quiet") if not git: init_cmd.append("--no-scm") - if git: - run_command(["git", "init"]) + if force: + init_cmd.append("--force") run_command(init_cmd) # We don't want to have analytics on by default – our users should # opt-in explicitly. If they want it, they can always enable it. 
if not analytics: run_command(["dvc", "config", "core.analytics", "false"]) + # Remove unused and confusing plot templates from .dvc directory + # TODO: maybe we shouldn't do this, but it's otherwise super confusing + # once you commit your changes via Git and it creates a bunch of files + # that have no purpose + plots_dir = project_dir / DVC_DIR / "plots" + if plots_dir.exists(): + shutil.rmtree(str(plots_dir)) config = load_project_config(project_dir) setup_check_dvc(project_dir, config) From c874dde66c84d9b162c87e09d7a84548c3c36f6e Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 29 Jun 2020 20:11:34 +0200 Subject: [PATCH 185/203] Show help on "spacy project" --- spacy/cli/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 5382e5bdc..27dc83e29 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -49,7 +49,7 @@ Version Control) to manage input and output files and to ensure steps are only re-run if their inputs change. """ -project_cli = typer.Typer(help=CLI_HELP) +project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True) @project_cli.callback(invoke_without_command=True) From e8033df81ecde6baec2e7f69027827e4ee726939 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Mon, 29 Jun 2020 20:30:42 +0200 Subject: [PATCH 186/203] Also handle python3 and pip3 --- spacy/cli/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 27dc83e29..1bb92f749 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -595,9 +595,9 @@ def run_commands( command = command.format(**variables) command = shlex.split(command) # TODO: is this needed / a good idea? - if len(command) and command[0] == "python": + if len(command) and command[0] in ("python", "python3"): command[0] = sys.executable - elif len(command) and command[0] == "pip": + elif len(command) and command[0] in ("pip", "pip3"): command = [sys.executable, "-m", "pip", *command[1:]] if not silent: print(" ".join(command)) From 7e4cbda89ac86cb2e45701c8335f7bfb9f8f4e33 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Tue, 30 Jun 2020 11:09:53 +0200 Subject: [PATCH 187/203] fix project_init for relative path --- spacy/cli/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 16e87cb40..4588e3336 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -284,8 +284,8 @@ def project_init( # opt-in explicitly. If they want it, they can always enable it. 
if not analytics: run_command(["dvc", "config", "core.analytics", "false"]) - config = load_project_config(project_dir) - setup_check_dvc(project_dir, config) + config = load_project_config(project_dir) + setup_check_dvc(project_dir, config) def project_assets(project_dir: Path) -> None: From d23be563eb8ee4e72b9389e675a7e4dcb7140941 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Tue, 30 Jun 2020 11:23:35 +0200 Subject: [PATCH 188/203] remove redundant setting of no_args_is_help --- spacy/cli/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 3d2aa9440..50517d594 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -207,7 +207,7 @@ def project_update_dvc_cli( msg.info(f"No changes found in {CONFIG_FILE}, no update needed") -app.add_typer(project_cli, name="project", no_args_is_help=True) +app.add_typer(project_cli, name="project") ################# From 140c4896a0c8fb0f5798aaa6785d6f169d5f6ce8 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Tue, 30 Jun 2020 12:54:15 +0200 Subject: [PATCH 189/203] split_command util function --- spacy/cli/project.py | 8 +++----- spacy/util.py | 10 +++++++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 50517d594..c82e4e774 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -4,7 +4,6 @@ import srsly from pathlib import Path from wasabi import msg import subprocess -import shlex import os import re import shutil @@ -14,10 +13,9 @@ import tqdm from ._app import app, Arg, Opt, COMMAND, NAME from .. import about -from ..compat import is_windows from ..schemas import ProjectConfigSchema, validate from ..util import ensure_path, run_command, make_tempdir, working_dir -from ..util import get_hash, get_checksum +from ..util import get_hash, get_checksum, split_command CONFIG_FILE = "project.yml" @@ -240,7 +238,7 @@ def project_clone( with make_tempdir() as tmp_dir: cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" try: - run_command(shlex.split(cmd, posix=not is_windows)) + run_command(split_command(cmd)) except: raise RuntimeError(f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'.") with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: @@ -599,7 +597,7 @@ def run_commands( for command in commands: # Substitute variables, e.g. "./{NAME}.json" command = command.format(**variables) - command = shlex.split(command, posix=not is_windows) + command = split_command(command) # TODO: is this needed / a good idea? if len(command) and command[0] in ("python", "python3"): command[0] = sys.executable diff --git a/spacy/util.py b/spacy/util.py index cdaed7a92..a61bbf044 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -22,7 +22,7 @@ from contextlib import contextmanager import tempfile import shutil import hashlib - +import shlex try: import cupy.random @@ -35,7 +35,7 @@ except ImportError: import importlib_metadata from .symbols import ORTH -from .compat import cupy, CudaStream +from .compat import cupy, CudaStream, is_windows from .errors import Errors, Warnings from . 
import about @@ -925,7 +925,11 @@ def from_disk(path, readers, exclude): # Split to support file names like meta.json if key.split(".")[0] not in exclude: reader(path / key) - return path + return + + +def split_command(command): + return shlex.split(command, posix=not is_windows) def import_file(name, loc): From 7584fdafec2022bb19bd458eca07f1903c98ef20 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 30 Jun 2020 12:59:13 +0200 Subject: [PATCH 190/203] Fix typo --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index a61bbf044..7f70a131a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -925,7 +925,7 @@ def from_disk(path, readers, exclude): # Split to support file names like meta.json if key.split(".")[0] not in exclude: reader(path / key) - return + return path def split_command(command): From 3aca404735eab7be48fffcc19dc905ee6db753a5 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 30 Jun 2020 13:17:00 +0200 Subject: [PATCH 191/203] Make run_command take string and list --- spacy/util.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 7f70a131a..4b0bdbae5 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -434,11 +434,24 @@ def get_package_path(name): return Path(pkg.__file__).parent -def run_command(command: List[str]) -> None: - """Run a command on the command line as a subprocess. +def split_command(command: str) -> List[str]: + """Split a string command using shlex. Handles platform compatibility. - command (list): The split command. + command (str) : The command to split + RETURNS (List[str]): The split command. """ + return shlex.split(command, posix=not is_windows) + + +def run_command(command: Union[str, List[str]]) -> None: + """Run a command on the command line as a subprocess. If the subprocess + returns a non-zero exit code, a system exit is performed. + + command (str / List[str]): The command. If provided as a string, the + string will be split using shlex.split. + """ + if isinstance(command, str): + command = split_command(command) status = subprocess.call(command, env=os.environ.copy()) if status != 0: sys.exit(status) @@ -928,10 +941,6 @@ def from_disk(path, readers, exclude): return path -def split_command(command): - return shlex.split(command, posix=not is_windows) - - def import_file(name, loc): """Import module from a file. Used to load models from a directory. From c5e31acb06bb52358dd0097ebd459f3b9117115c Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 30 Jun 2020 13:17:14 +0200 Subject: [PATCH 192/203] Make working_dir yield absolute cwd path --- spacy/util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 4b0bdbae5..f8acebb63 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -462,11 +462,14 @@ def working_dir(path: Union[str, Path]) -> None: """Change current working directory and returns to previous on exit. path (str / Path): The directory to navigate to. + YIELDS (Path): The absolute path to the current working directory. This + should be used if the block needs to perform actions within the working + directory, to prevent mismatches with relative paths. 
""" prev_cwd = Path.cwd() os.chdir(str(path)) try: - yield + yield Path(path).resolve() finally: os.chdir(prev_cwd) From 72175b5c601c7b76f75d2e688b723489e2331b9d Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 30 Jun 2020 13:17:26 +0200 Subject: [PATCH 193/203] Update project command --- spacy/cli/project.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index c82e4e774..6ac604bdf 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -15,7 +15,7 @@ from ._app import app, Arg, Opt, COMMAND, NAME from .. import about from ..schemas import ProjectConfigSchema, validate from ..util import ensure_path, run_command, make_tempdir, working_dir -from ..util import get_hash, get_checksum, split_command +from ..util import get_hash, get_checksum CONFIG_FILE = "project.yml" @@ -238,9 +238,10 @@ def project_clone( with make_tempdir() as tmp_dir: cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" try: - run_command(split_command(cmd)) - except: - raise RuntimeError(f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'.") + run_command(cmd) + except SystemExit: + err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'" + msg.fail(err) with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: f.write(name) run_command(["git", "-C", str(tmp_dir), "fetch"]) @@ -272,8 +273,7 @@ def project_init( silent (bool): Don't print any output (via DVC). analytics (bool): Opt-in to DVC analytics (defaults to False). """ - project_dir = project_dir.resolve() - with working_dir(project_dir): + with working_dir(project_dir.resolve()) as cwd: if git: run_command(["git", "init"]) init_cmd = ["dvc", "init"] @@ -292,11 +292,11 @@ def project_init( # TODO: maybe we shouldn't do this, but it's otherwise super confusing # once you commit your changes via Git and it creates a bunch of files # that have no purpose - plots_dir = project_dir / DVC_DIR / "plots" + plots_dir = cwd / DVC_DIR / "plots" if plots_dir.exists(): shutil.rmtree(str(plots_dir)) - config = load_project_config(project_dir) - setup_check_dvc(project_dir, config) + config = load_project_config(cwd) + setup_check_dvc(cwd, config) def project_assets(project_dir: Path) -> None: @@ -597,8 +597,13 @@ def run_commands( for command in commands: # Substitute variables, e.g. "./{NAME}.json" command = command.format(**variables) - command = split_command(command) - # TODO: is this needed / a good idea? + # Not sure if this is needed or a good idea. Motivation: users may often + # use commands in their config that reference "python" and we want to + # make sure that it's always executing the same Python that spaCy is + # executed with and the pip in the same env, not some other Python/pip. + # Also ensures cross-compatibility if user 1 writes "python3" (because + # that's how it's set up on their system), and user 2 without the + # shortcut tries to re-run the command. 
if len(command) and command[0] in ("python", "python3"): command[0] = sys.executable elif len(command) and command[0] in ("pip", "pip3"): From 8e205059705a97bd1b7b978b787c1878bbbb5309 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 30 Jun 2020 13:29:45 +0200 Subject: [PATCH 194/203] Resolve within working_dir context manager --- spacy/cli/project.py | 2 +- spacy/util.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 6ac604bdf..f0315dfe0 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -273,7 +273,7 @@ def project_init( silent (bool): Don't print any output (via DVC). analytics (bool): Opt-in to DVC analytics (defaults to False). """ - with working_dir(project_dir.resolve()) as cwd: + with working_dir(project_dir) as cwd: if git: run_command(["git", "init"]) init_cmd = ["dvc", "init"] diff --git a/spacy/util.py b/spacy/util.py index f8acebb63..1d998ec36 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -467,11 +467,12 @@ def working_dir(path: Union[str, Path]) -> None: directory, to prevent mismatches with relative paths. """ prev_cwd = Path.cwd() - os.chdir(str(path)) + current = Path(path).resolve() + os.chdir(str(current)) try: - yield Path(path).resolve() + yield current finally: - os.chdir(prev_cwd) + os.chdir(str(prev_cwd)) @contextmanager From b2281119259681f04ffe178579efcb16a9b06f81 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Tue, 30 Jun 2020 14:54:45 +0200 Subject: [PATCH 195/203] fix funny printing --- spacy/cli/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index f0315dfe0..1c6d36edf 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -609,7 +609,7 @@ def run_commands( elif len(command) and command[0] in ("pip", "pip3"): command = [sys.executable, "-m", "pip", *command[1:]] if not silent: - print(" ".join(command)) + print(f"Running command: {command}") run_command(command) From a46b76f188290adc17e3bf5f883c089c18383ec7 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Tue, 30 Jun 2020 15:39:24 +0200 Subject: [PATCH 196/203] use current working dir as default throughout --- spacy/cli/project.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 1c6d36edf..ce2f801ad 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -88,7 +88,7 @@ def project_clone_cli( @project_cli.command("init") def project_init_cli( - path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), + path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), force: bool = Opt(False, "--force", "-F", help="Force initiziation"), ): @@ -104,7 +104,7 @@ def project_init_cli( @project_cli.command("assets") def project_assets_cli( # fmt: off - project_dir: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), + project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), # fmt: on ): """Use DVC (Data Version Control) to fetch project assets. 
Assets are @@ -125,7 +125,7 @@ def project_assets_cli( def project_run_all_cli( # fmt: off ctx: typer.Context, - project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") # fmt: on ): @@ -149,7 +149,7 @@ def project_run_all_cli( def project_run_cli( # fmt: off ctx: typer.Context, - project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), subcommand: str = Arg(None, help="Name of command defined in project config"), show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") # fmt: on @@ -173,7 +173,7 @@ def project_run_cli( @project_cli.command("exec", hidden=True) def project_exec_cli( # fmt: off - project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), subcommand: str = Arg(..., help="Name of command defined in project config"), # fmt: on ): @@ -188,7 +188,7 @@ def project_exec_cli( @project_cli.command("update-dvc") def project_update_dvc_cli( # fmt: off - project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), # fmt: on From 1ae6fa2554e067f79ea4290a08b36b770e754c07 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Tue, 30 Jun 2020 16:04:53 +0200 Subject: [PATCH 197/203] move subcommand one place up as project_dir has default --- spacy/cli/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index ce2f801ad..ce7c30fb4 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -149,8 +149,8 @@ def project_run_all_cli( def project_run_cli( # fmt: off ctx: typer.Context, - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), subcommand: str = Arg(None, help="Name of command defined in project config"), + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") # fmt: on ): @@ -173,8 +173,8 @@ def project_run_cli( @project_cli.command("exec", hidden=True) def project_exec_cli( # fmt: off - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), subcommand: str = Arg(..., help="Name of command defined in project config"), + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), # fmt: on ): """Execute a command defined in the project config. 
This CLI command is From cd632d8ec23c48d59cad982ad60fd619b993deb0 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Tue, 30 Jun 2020 17:19:36 +0200 Subject: [PATCH 198/203] move folder for exec argument one up --- spacy/cli/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index ce7c30fb4..59c515dcb 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -525,7 +525,7 @@ def update_dvc_config( continue # Default to "." as the project path since dvc.yaml is auto-generated # and we don't want arbitrary paths in there - project_cmd = ["python", "-m", NAME, "project", "exec", ".", name] + project_cmd = ["python", "-m", NAME, "project", ".", "exec", name] deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] From 39953c7c60e97cdd9af5b90e4a26634528548a19 Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Tue, 30 Jun 2020 17:28:09 +0200 Subject: [PATCH 199/203] fix print_run_help with new arg order --- spacy/cli/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 59c515dcb..6d0ec7991 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -397,13 +397,13 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: commands = {cmd["name"]: cmd for cmd in config_commands} if subcommand: validate_subcommand(commands.keys(), subcommand) - print(f"Usage: {COMMAND} project run {project_dir} {subcommand}") + print(f"Usage: {COMMAND} project run {subcommand} {project_dir}") help_text = commands[subcommand].get("help") if help_text: msg.text(f"\n{help_text}\n") else: print(f"\nAvailable commands in {CONFIG_FILE}") - print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]") + print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}") msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) msg.text("Run all commands defined in the 'run' block of the project config:") print(f"{COMMAND} project run-all {project_dir}") From 60f97bc519ecf607f7711c3ae0402f89d939989b Mon Sep 17 00:00:00 2001 From: svlandeg <sofie.vanlandeghem@gmail.com> Date: Tue, 30 Jun 2020 17:28:43 +0200 Subject: [PATCH 200/203] add custom warning when run_command fails --- spacy/errors.py | 1 + spacy/util.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index 1af673569..66a3c61da 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -539,6 +539,7 @@ class Errors(object): E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") # TODO: fix numbering after merging develop into master + E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?") E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the " "array and {doc_length} for the Doc itself.") E972 = ("Example.__init__ got None for '{arg}'. 
Requires Doc.") diff --git a/spacy/util.py b/spacy/util.py index 1d998ec36..7c29bed8e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -452,7 +452,12 @@ def run_command(command: Union[str, List[str]]) -> None: """ if isinstance(command, str): command = split_command(command) - status = subprocess.call(command, env=os.environ.copy()) + try: + status = subprocess.call(command, env=os.environ.copy()) + except FileNotFoundError: + raise FileNotFoundError( + Errors.E970.format(str_command=" ".join(command), tool=command[0]) + ) if status != 0: sys.exit(status) From 6da3500728af537df22f7b147a624956b74c4397 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 30 Jun 2020 20:35:51 +0200 Subject: [PATCH 201/203] Fix command substitution --- spacy/cli/project.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 6d0ec7991..4cda55956 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -15,7 +15,7 @@ from ._app import app, Arg, Opt, COMMAND, NAME from .. import about from ..schemas import ProjectConfigSchema, validate from ..util import ensure_path, run_command, make_tempdir, working_dir -from ..util import get_hash, get_checksum +from ..util import get_hash, get_checksum, split_command CONFIG_FILE = "project.yml" @@ -588,7 +588,7 @@ def run_commands( ) -> None: """Run a sequence of commands in a subprocess, in order. - commands (List[str]): The split commands. + commands (List[str]): The string commands. variables (Dict[str, str]): Dictionary of variable names, mapped to their values. Will be used to substitute format string variables in the commands. @@ -597,6 +597,7 @@ def run_commands( for command in commands: # Substitute variables, e.g. "./{NAME}.json" command = command.format(**variables) + command = split_command(command) # Not sure if this is needed or a good idea. Motivation: users may often # use commands in their config that reference "python" and we want to # make sure that it's always executing the same Python that spaCy is From d64644d9d1f353fbb7167a93ed8d2b79c7a66864 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 30 Jun 2020 20:36:30 +0200 Subject: [PATCH 202/203] Adjust auto-formatting --- spacy/cli/project.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 4cda55956..cba8a07dc 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -88,9 +88,11 @@ def project_clone_cli( @project_cli.command("init") def project_init_cli( + # fmt: off path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), force: bool = Opt(False, "--force", "-F", help="Force initiziation"), + # fmt: on ): """Initialize a project directory with DVC and optionally Git. 
This should typically be taken care of automatically when you run the "project clone" From b032943c34fd79a724ca87cb289efd5972f30b28 Mon Sep 17 00:00:00 2001 From: Ines Montani <ines@ines.io> Date: Tue, 30 Jun 2020 21:33:41 +0200 Subject: [PATCH 203/203] Fix funny printing again --- spacy/cli/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index cba8a07dc..5f125816c 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -612,7 +612,7 @@ def run_commands( elif len(command) and command[0] in ("pip", "pip3"): command = [sys.executable, "-m", "pip", *command[1:]] if not silent: - print(f"Running command: {command}") + print(f"Running command: {' '.join(command)}") run_command(command)
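
Several of the patches above converge on the same `project_init()` behavior: when `--git` is passed, `git init` runs before `dvc init` so DVC can integrate with the repository; `--quiet`, `--no-scm` and `--force` are appended to the DVC call depending on the `silent`, `git` and `force` flags; analytics are switched off explicitly; and DVC's unused `plots` templates are removed. Because callers may pass a relative `project_dir`, `working_dir()` resolves it and yields the absolute path, and all paths inside the block are built from that value. The following is a condensed sketch of that end state, not the verbatim spaCy source — the `run_command()` stub is a simplification, and the final config loading/validation is only indicated in a comment:

    import os
    import shutil
    import subprocess
    import sys
    from contextlib import contextmanager
    from pathlib import Path

    DVC_DIR = ".dvc"

    @contextmanager
    def working_dir(path):
        """Change into `path`, yield its absolute form, restore the previous cwd on exit."""
        prev_cwd = Path.cwd()
        current = Path(path).resolve()
        os.chdir(str(current))
        try:
            yield current
        finally:
            os.chdir(str(prev_cwd))

    def run_command(command):
        # Minimal stand-in for spacy.util.run_command (see the fuller sketch below).
        status = subprocess.call(command, env=os.environ.copy())
        if status != 0:
            sys.exit(status)

    def project_init(project_dir, *, git=False, force=False, silent=False, analytics=False):
        with working_dir(project_dir) as cwd:
            if git:
                run_command(["git", "init"])  # Git first, so DVC can hook into the repo
            init_cmd = ["dvc", "init"]
            if silent:
                init_cmd.append("--quiet")
            if not git:
                init_cmd.append("--no-scm")
            if force:
                init_cmd.append("--force")
            run_command(init_cmd)
            # Analytics stay opt-in: turn them off unless explicitly requested.
            if not analytics:
                run_command(["dvc", "config", "core.analytics", "false"])
            # Build paths from the yielded absolute cwd, not from the possibly
            # relative project_dir, then drop the confusing plot templates.
            plots_dir = cwd / DVC_DIR / "plots"
            if plots_dir.exists():
                shutil.rmtree(str(plots_dir))
            # The real function then loads and validates project.yml and checks the
            # auto-generated DVC config (load_project_config / setup_check_dvc).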
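
The command-handling utilities also settle into a stable shape across these patches: `split_command()` wraps `shlex.split()` with a Windows-aware `posix` flag, `run_command()` accepts either a string or a pre-split list and raises a readable error when the executable is missing, and `run_commands()` substitutes config variables and re-routes bare `python`/`python3` and `pip`/`pip3` calls to the interpreter spaCy itself is running under. A self-contained sketch, assuming inline platform detection and error text in place of `spacy.compat.is_windows` and `Errors.E970`:

    import os
    import shlex
    import subprocess
    import sys
    from typing import Dict, List, Union

    is_windows = sys.platform.startswith("win")  # spaCy imports this from spacy.compat

    def split_command(command: str) -> List[str]:
        """Split a string command using shlex, handling platform differences."""
        return shlex.split(command, posix=not is_windows)

    def run_command(command: Union[str, List[str]]) -> None:
        """Run a command as a subprocess and exit if it returns a non-zero code."""
        if isinstance(command, str):
            command = split_command(command)
        try:
            status = subprocess.call(command, env=os.environ.copy())
        except FileNotFoundError:
            raise FileNotFoundError(
                f"Can not execute command '{' '.join(command)}'. "
                f"Do you have '{command[0]}' installed?"
            )
        if status != 0:
            sys.exit(status)

    def run_commands(commands: List[str], variables: Dict[str, str], silent: bool = False) -> None:
        """Run project commands in order, pinning python/pip to the current environment."""
        for command in commands:
            command = command.format(**variables)  # substitute e.g. "./{NAME}.json"
            parts = split_command(command)
            # Make sure user-defined commands use the same Python and pip as the
            # environment spaCy is executed in, regardless of python/python3 naming.
            if parts and parts[0] in ("python", "python3"):
                parts[0] = sys.executable
            elif parts and parts[0] in ("pip", "pip3"):
                parts = [sys.executable, "-m", "pip", *parts[1:]]
            if not silent:
                print(f"Running command: {' '.join(parts)}")
            run_command(parts)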
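
Finally, the argument reshuffling in the last few patches means the subcommand now comes before the project directory, and the directory falls back to the current working directory everywhere. Assuming a project config that defines a `train` command and a project in `./my_project` (both names are only examples), typical invocations look like this, with `COMMAND` resolving to `python -m spacy` in a default install:

    python -m spacy project run train                # project in the current directory
    python -m spacy project run train ./my_project   # explicit project directory
    python -m spacy project run-all ./my_project     # run every command in the 'run' block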