From ece8be4feca968cd294c00e0423d8daceafa639d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 May 2021 11:32:22 +0200 Subject: [PATCH 001/140] extend test to training with replaced tok2vec layer --- spacy/tests/pipeline/test_tok2vec.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index e3b71c502..7a9e96b14 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -218,6 +218,13 @@ def test_replace_listeners(): nlp.replace_listeners("tok2vec", "tagger", ["model.yolo"]) with pytest.raises(ValueError): nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec", "model.yolo"]) + # attempt training with the new pipeline + optimizer = nlp.initialize(lambda: examples) + for i in range(2): + losses = {} + nlp.update(examples, sgd=optimizer, losses=losses) + assert losses["tok2vec"] == 0.0 + assert losses["tagger"] > 0.0 cfg_string_multi = """ From 44a3a585992bcdf7625aacbb3984796f489cb10e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 May 2021 16:01:02 +0200 Subject: [PATCH 002/140] call replace_listener attr if it's available --- spacy/language.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 95a902380..4959716e2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1801,7 +1801,10 @@ class Language: util.set_dot_to_object(pipe_cfg, listener_path, tok2vec_cfg["model"]) # Go over the listener layers and replace them for listener in pipe_listeners: - util.replace_model_node(pipe.model, listener, tok2vec.model.copy()) + new_model = tok2vec.model.copy() + if "replace_listener" in new_model.attrs: + new_model = new_model.attrs["replace_listener"](new_model) + util.replace_model_node(pipe.model, listener, new_model) tok2vec.remove_listener(listener, pipe_name) def to_disk( From 235e9f548868510611541846d931f53df9c99c95 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 
12 May 2021 17:19:38 +0200 Subject: [PATCH 003/140] call replace_listener_cfg attr if it's available --- spacy/language.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 4959716e2..c30333dc9 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1764,6 +1764,7 @@ class Language: raise ValueError(err) tok2vec = self.get_pipe(tok2vec_name) tok2vec_cfg = self.get_pipe_config(tok2vec_name) + tok2vec_model = tok2vec.model if ( not hasattr(tok2vec, "model") or not hasattr(tok2vec, "listener_map") @@ -1772,6 +1773,7 @@ class Language: ): raise ValueError(Errors.E888.format(name=tok2vec_name, pipe=type(tok2vec))) pipe_listeners = tok2vec.listener_map.get(pipe_name, []) + pipe = self.get_pipe(pipe_name) pipe_cfg = self._pipe_configs[pipe_name] if listeners: util.logger.debug(f"Replacing listeners of component '{pipe_name}'") @@ -1786,7 +1788,6 @@ class Language: n_listeners=len(pipe_listeners), ) raise ValueError(err) - pipe = self.get_pipe(pipe_name) # Update the config accordingly by copying the tok2vec model to all # sections defined in the listener paths for listener_path in listeners: @@ -1798,12 +1799,16 @@ class Language: name=pipe_name, tok2vec=tok2vec_name, path=listener_path ) raise ValueError(err) - util.set_dot_to_object(pipe_cfg, listener_path, tok2vec_cfg["model"]) + new_config = tok2vec_cfg["model"] + if "replace_listener_cfg" in tok2vec_model.attrs: + replace_func = tok2vec_model.attrs["replace_listener_cfg"] + new_config = replace_func(tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]) + util.set_dot_to_object(pipe_cfg, listener_path, new_config) # Go over the listener layers and replace them for listener in pipe_listeners: - new_model = tok2vec.model.copy() - if "replace_listener" in new_model.attrs: - new_model = new_model.attrs["replace_listener"](new_model) + new_model = tok2vec_model.copy() + if "replace_listener" in tok2vec_model.attrs: + new_model = 
tok2vec_model.attrs["replace_listener"](new_model) util.replace_model_node(pipe.model, listener, new_model) tok2vec.remove_listener(listener, pipe_name) From 4e69fcaa50c582e9b4808c453d5cab70f1224071 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 19 May 2021 12:00:07 +0200 Subject: [PATCH 004/140] Disable GPU CI tests (#8143) --- azure-pipelines.yml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bea65cae2..5840b916b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -82,18 +82,18 @@ jobs: python_version: '$(python.version)' architecture: 'x64' - - job: "TestGPU" - dependsOn: "Validate" - strategy: - matrix: - Python38LinuxX64_GPU: - python.version: '3.8' - pool: - name: "LinuxX64_GPU" - steps: - - template: .github/azure-steps.yml - parameters: - python_version: '$(python.version)' - architecture: 'x64' - gpu: true - num_build_jobs: 24 +# - job: "TestGPU" +# dependsOn: "Validate" +# strategy: +# matrix: +# Python38LinuxX64_GPU: +# python.version: '3.8' +# pool: +# name: "LinuxX64_GPU" +# steps: +# - template: .github/azure-steps.yml +# parameters: +# python_version: '$(python.version)' +# architecture: 'x64' +# gpu: true +# num_build_jobs: 24 From cd6bd91c3a17d99674b5ed8c3b1092696ee59373 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 May 2021 14:48:09 +0200 Subject: [PATCH 005/140] Switch default train corpus max_length to 0 in quickstart (#8142) The behavior of `spacy.Corpus.v1` is unexpected enough for `max_length != 0` that `0` is a better default for users creating a new config with the quickstart. If not, documents are skipped, sometimes the entire corpus is skipped, and sometimes documents are (quite unexpectedly for your average user) split into sentences. 
--- spacy/cli/templates/quickstart_training.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index e43c21bbd..0d422318b 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -372,7 +372,7 @@ factory = "{{ pipe }}" [corpora.train] @readers = "spacy.Corpus.v1" path = ${paths.train} -max_length = {{ 500 if hardware == "gpu" else 2000 }} +max_length = 0 [corpora.dev] @readers = "spacy.Corpus.v1" From fc37715cfb75a1c9133ea774f57995b8d03989d1 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 28 May 2021 18:15:31 +0200 Subject: [PATCH 006/140] ensure 'spacy ray' works (#7799) * ensure 'spacy ray' works * better fix by changing entry point --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 846ccf4b3..84ef0e5a7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -61,7 +61,7 @@ install_requires = [options.entry_points] console_scripts = - spacy = spacy.cli:app + spacy = spacy.cli:setup_cli [options.extras_require] lookups = From 04239e94c71bf8e4512676085a92fabcfcb42bb4 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 31 May 2021 16:36:17 +0900 Subject: [PATCH 007/140] Use a context manager when reading model (fix #7036) (#8244) --- spacy/pipeline/trainable_pipe.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx index 64e33f800..fe51f38e5 100644 --- a/spacy/pipeline/trainable_pipe.pyx +++ b/spacy/pipeline/trainable_pipe.pyx @@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe): def load_model(p): try: - self.model.from_bytes(p.open("rb").read()) + with open(p, "rb") as mfile: + self.model.from_bytes(mfile.read()) except AttributeError: raise ValueError(Errors.E149) from None From d1a221a37419c76f69c4a3a7f64cc4afb0eea3c8 Mon Sep 17 00:00:00 2001 
From: Paul O'Leary McCann Date: Mon, 31 May 2021 17:03:40 +0900 Subject: [PATCH 008/140] Add all symbols in Unicode Currency Symbols block (#8212) * Add all symbols in Unicode Currency Symbols block In #8102 it came up that the rupee symbol was treated different from dollar / euro / yen symbols. This adds many symbols not already included. * Fix test * Fix training test --- spacy/lang/char_classes.py | 5 ++++- spacy/tests/training/test_training.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 6fbc45817..9e5441a4f 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -260,7 +260,10 @@ _units = ( "кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб" "كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب" ) -_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴" +_currency = ( + r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴ ₠ ₡ ₢ ₣ ₤ ₥ ₦ ₧ ₨ ₩ ₪ ₫ € ₭ ₮ ₯ ₰ " + r"₱ ₲ ₳ ₴ ₵ ₶ ₷ ₸ ₹ ₺ ₻ ₼ ₽ ₾ ₿" +) # These expressions contain various unicode variations, including characters # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 321c08c1e..0ea5f0fcc 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -336,8 +336,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer): def test_gold_biluo_4791(en_vocab, en_tokenizer): - doc = en_tokenizer("I'll return the ₹54 amount") - gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"] + doc = en_tokenizer("I'll return the A54 amount") + gold_words = ["I", "'ll", "return", "the", "A", "54", "amount"] gold_spaces = [False, True, True, True, False, True, False] entities = [(16, 19, "MONEY")] example = Example.from_dict( From ff91e6dac7c4fab06d950c4d88adcbcdac9488bd Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 31 May 
2021 10:20:27 +0200 Subject: [PATCH 009/140] Show warning if entity_ruler runs without patterns (#7807) * Show warning if entity_ruler runs without patterns * Show warning if matcher runs without patterns * fix wording * unit test for warning once (WIP) * warn W036 only once * cleanup * create filter_warning helper --- spacy/__init__.py | 6 ++--- spacy/errors.py | 30 ++++++++++++++++++++++- spacy/matcher/matcher.pyx | 6 +++++ spacy/pipeline/entityruler.py | 9 ++++++- spacy/tests/matcher/test_matcher_api.py | 9 +++++++ spacy/tests/pipeline/test_entity_ruler.py | 11 +++++++++ 6 files changed, 66 insertions(+), 5 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 1eef7e621..d07931cfd 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,10 +1,10 @@ from typing import Union, Iterable, Dict, Any from pathlib import Path -import warnings import sys -warnings.filterwarnings("ignore", message="numpy.dtype size changed") # noqa -warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa +# set library-specific custom warning handling before doing anything else +from .errors import setup_default_warnings +setup_default_warnings() # These are imported as part of the API from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401 diff --git a/spacy/errors.py b/spacy/errors.py index 7be118503..f26558327 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,3 +1,6 @@ +import warnings + + def add_codes(err_cls): """Add error codes to string messages via class attribute names.""" @@ -12,6 +15,30 @@ def add_codes(err_cls): return ErrorsWithCodes() +def setup_default_warnings(): + # ignore certain numpy warnings + filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa + filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa + + # warn about entity_ruler & matcher having no patterns only once + for pipe in ["matcher", "entity_ruler"]: + filter_warning("once", 
error_msg=Warnings.W036.format(name=pipe)) + + +def filter_warning(action: str, error_msg: str): + """Customize how spaCy should handle a certain warning. + + error_msg (str): e.g. "W006", or a full error message + action (str): "default", "error", "ignore", "always", "module" or "once" + """ + warnings.filterwarnings(action, message=_escape_warning_msg(error_msg)) + + +def _escape_warning_msg(msg): + """To filter with warnings.filterwarnings, the [] brackets need to be escaped""" + return msg.replace("[", "\\[").replace("]", "\\]") + + # fmt: off @add_codes @@ -80,8 +107,9 @@ class Warnings: "@misc = \"spacy.LookupsDataLoader.v1\"\n" "lang = ${{nlp.lang}}\n" "tables = [\"lexeme_norm\"]\n") - W035 = ('Discarding subpattern "{pattern}" due to an unrecognized ' + W035 = ("Discarding subpattern '{pattern}' due to an unrecognized " "attribute or operator.") + W036 = ("The component '{name}' does not have any patterns defined.") # New warnings added in v3.x W086 = ("Component '{listener}' will be (re)trained, but it needs the component " diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index f389b4abd..7b1cfb633 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -138,6 +138,11 @@ cdef class Matcher: self._filter[key] = greedy self._patterns[key].extend(patterns) + def _require_patterns(self) -> None: + """Raise a warning if this component has no patterns defined.""" + if len(self) == 0: + warnings.warn(Warnings.W036.format(name="matcher")) + def remove(self, key): """Remove a rule from the matcher. A KeyError is raised if the key does not exist. @@ -215,6 +220,7 @@ cdef class Matcher: If with_alignments is set to True and as_spans is set to False, A list of `(match_id, start, end, alignments)` tuples is returned. 
""" + self._require_patterns() if isinstance(doclike, Doc): doc = doclike length = len(doc) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 4e61dbca7..2afbc2523 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -1,3 +1,4 @@ +import warnings from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence from collections import defaultdict from pathlib import Path @@ -6,7 +7,7 @@ import srsly from .pipe import Pipe from ..training import Example from ..language import Language -from ..errors import Errors +from ..errors import Errors, Warnings from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher @@ -144,6 +145,7 @@ class EntityRuler(Pipe): error_handler(self.name, self, [doc], e) def match(self, doc: Doc): + self._require_patterns() matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) matches = set( [(m_id, start, end) for m_id, start, end in matches if start != end] @@ -330,6 +332,11 @@ class EntityRuler(Pipe): self.phrase_patterns = defaultdict(list) self._ent_ids = defaultdict(dict) + def _require_patterns(self) -> None: + """Raise a warning if this component has no patterns defined.""" + if len(self) == 0: + warnings.warn(Warnings.W036.format(name=self.name)) + def _split_label(self, label: str) -> Tuple[str, str]: """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 548da7dc6..d3772a931 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -33,6 +33,15 @@ def test_matcher_from_api_docs(en_vocab): assert len(patterns[0]) +def test_matcher_empty_patterns_warns(en_vocab): + matcher = Matcher(en_vocab) + assert len(matcher) == 0 + doc = Doc(en_vocab, words=["This", "is", "quite", "something"]) + with 
pytest.warns(UserWarning): + matcher(doc) + assert len(doc.ents) == 0 + + def test_matcher_from_usage_docs(en_vocab): text = "Wow 😀 This is really cool! 😂 😂" doc = Doc(en_vocab, words=text.split(" ")) diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 2f6da79d6..a382532d2 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -46,6 +46,17 @@ def test_entity_ruler_init(nlp, patterns): assert doc.ents[1].label_ == "BYE" +def test_entity_ruler_no_patterns_warns(nlp): + ruler = EntityRuler(nlp) + assert len(ruler) == 0 + assert len(ruler.labels) == 0 + nlp.add_pipe("entity_ruler") + assert nlp.pipe_names == ["entity_ruler"] + with pytest.warns(UserWarning): + doc = nlp("hello world bye bye") + assert len(doc.ents) == 0 + + def test_entity_ruler_init_patterns(nlp, patterns): # initialize with patterns ruler = nlp.add_pipe("entity_ruler") From fff662e41fc680a26887e178653bf8bfe1f87f9d Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 31 May 2021 10:21:06 +0200 Subject: [PATCH 010/140] Ensemble textcat with listener (#8012) * add unit test for two listeners, with a textcat ensemble in the middle * return zero gradients instead of None in accumulate_gradient --- spacy/pipeline/tok2vec.py | 1 + spacy/tests/pipeline/test_tok2vec.py | 93 +++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 3ee324d50..00d9548a4 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -173,6 +173,7 @@ class Tok2Vec(TrainablePipe): for i in range(len(one_d_tokvecs)): d_tokvecs[i] += one_d_tokvecs[i] losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) + return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] def backprop(one_d_tokvecs): """Callback to actually do the backprop. 
Passed to last listener.""" diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 7a9e96b14..809a79dd6 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -129,8 +129,8 @@ cfg_string = """ """ TRAIN_DATA = [ - ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), - ("Eat blue ham", {"tags": ["V", "J", "N"]}), + ("I like green eggs", {"tags": ["N", "V", "J", "N"], "cats": {"preference": 1.0, "imperative": 0.0}}), + ("Eat blue ham", {"tags": ["V", "J", "N"], "cats": {"preference": 0.0, "imperative": 1.0}}), ] @@ -318,3 +318,92 @@ def test_replace_listeners_from_config(): new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"] == "spacy.Tok2VecListener.v1" ) + + +cfg_string_multi_textcat = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","textcat_multilabel","tagger"] + + [components] + + [components.textcat_multilabel] + factory = "textcat_multilabel" + + [components.textcat_multilabel.model] + @architectures = "spacy.TextCatEnsemble.v2" + nO = null + + [components.textcat_multilabel.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.textcat_multilabel.model.linear_model] + @architectures = "spacy.TextCatBOW.v1" + exclusive_classes = false + ngram_size = 1 + no_output_layer = false + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v1" + nO = null + + [components.tagger.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v1" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = 
false + + [components.tok2vec.model.encode] + @architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + maxout_pieces = 3 + """ + + +def test_tok2vec_listeners_textcat(): + orig_config = Config().from_str(cfg_string_multi_textcat) + nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + assert nlp.pipe_names == ["tok2vec", "textcat_multilabel", "tagger"] + tagger = nlp.get_pipe("tagger") + textcat = nlp.get_pipe("textcat_multilabel") + tok2vec = nlp.get_pipe("tok2vec") + tagger_tok2vec = tagger.model.get_ref("tok2vec") + textcat_tok2vec = textcat.model.get_ref("tok2vec") + assert isinstance(tok2vec, Tok2Vec) + assert isinstance(tagger_tok2vec, Tok2VecListener) + assert isinstance(textcat_tok2vec, Tok2VecListener) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + optimizer = nlp.initialize(lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + docs = list(nlp.pipe(["Eat blue ham", "I like green eggs"])) + cats0 = docs[0].cats + assert cats0["preference"] < 0.1 + assert cats0["imperative"] > 0.9 + cats1 = docs[1].cats + assert cats1["preference"] > 0.1 + assert cats1["imperative"] < 0.9 + assert([t.tag_ for t in docs[0]] == ["V", "J", "N"]) + assert([t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]) From 6b7971408066a6c29dbda5323e0c66183c984e46 Mon Sep 17 00:00:00 2001 From: Narayan Acharya Date: Mon, 31 May 2021 04:36:52 -0400 Subject: [PATCH 011/140] Address missing config overrides post load of models (#8208) --- .github/contributors/narayanacharya6.md | 106 +++++++++++++++++++++++ spacy/language.py | 5 +- spacy/tests/regression/test_issue8190.py | 28 ++++++ spacy/util.py | 5 +- 4 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 .github/contributors/narayanacharya6.md create mode 100644 spacy/tests/regression/test_issue8190.py diff --git 
a/.github/contributors/narayanacharya6.md b/.github/contributors/narayanacharya6.md new file mode 100644 index 000000000..e4bf7703f --- /dev/null +++ b/.github/contributors/narayanacharya6.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Narayan Acharya | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 29 APR 2021 | +| GitHub username | narayanacharya6 | +| Website (optional) | narayanacharya.com | \ No newline at end of file diff --git a/spacy/language.py b/spacy/language.py index 2696d3655..d148df573 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1846,7 +1846,8 @@ class Language: util.to_disk(path, serializers, exclude) def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(), + overrides: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Loads state from a directory. Modifies the object in place and returns it. If the saved `Language` object contains a model, the @@ -1875,7 +1876,7 @@ class Language: deserializers = {} if Path(path / "config.cfg").exists(): deserializers["config.cfg"] = lambda p: self.config.from_disk( - p, interpolate=False + p, interpolate=False, overrides=overrides ) deserializers["meta.json"] = deserialize_meta deserializers["vocab"] = deserialize_vocab diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py new file mode 100644 index 000000000..800a1638d --- /dev/null +++ b/spacy/tests/regression/test_issue8190.py @@ -0,0 +1,28 @@ +import spacy +from spacy.lang.en import English +from ..util import make_tempdir + + +def test_issue8190(): + """Test that config overrides are not lost after load is complete.""" + source_cfg = { + "nlp": { + "lang": "en", + }, + "custom": { + "key": "value" + } + + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + nlp = spacy.load(source_path, config={ + "custom": { + 
"key": "updated_value" + } + }) + + assert nlp.config["custom"]["key"] == "updated_value" diff --git a/spacy/util.py b/spacy/util.py index 84142d5d8..928106dbb 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -387,9 +387,10 @@ def load_model_from_path( if not meta: meta = get_model_meta(model_path) config_path = model_path / "config.cfg" - config = load_config(config_path, overrides=dict_to_dot(config)) + overrides = dict_to_dot(config) + config = load_config(config_path, overrides=overrides) nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude) - return nlp.from_disk(model_path, exclude=exclude) + return nlp.from_disk(model_path, exclude=exclude, overrides=overrides) def load_model_from_config( From b0467d2972467b16c7d9a96780a22602b1c38bce Mon Sep 17 00:00:00 2001 From: Michael K Date: Mon, 31 May 2021 08:38:29 +0000 Subject: [PATCH 012/140] Add project urls to package metadata (#7728) This adds the links to PyPI. To see that in action check out https://pypi.org/project/Django/ (source code: https://github.com/django/django/blob/b8c9e9fae14676d2e81242cb8df1e2eeef9c3a2d/setup.cfg#L27-L32) --- setup.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.cfg b/setup.cfg index 84ef0e5a7..6baa2f7bb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,6 +22,9 @@ classifiers = Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Topic :: Scientific/Engineering +project_urls = + Release notes = https://github.com/explosion/spaCy/releases + Source = https://github.com/explosion/spaCy [options] zip_safe = false From 283f64a98d462badc5febe64da6c6a1704b5aa02 Mon Sep 17 00:00:00 2001 From: Dhruv Naik Date: Mon, 31 May 2021 14:08:53 +0530 Subject: [PATCH 013/140] Fix bug from Entityruler: ent_ids returns None for phrases (#8169) * bugfix for explosion/spaCy#8168 * add test for explosion/spaCy#8168 --- spacy/pipeline/entityruler.py | 2 +- spacy/tests/regression/test_issue8168.py | 11 +++++++++++ 2 files changed, 12 
insertions(+), 1 deletion(-) create mode 100644 spacy/tests/regression/test_issue8168.py diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 2afbc2523..a74d2f303 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -303,7 +303,7 @@ class EntityRuler(Pipe): self.nlp.pipe(phrase_pattern_texts), phrase_pattern_ids, ): - phrase_pattern = {"label": label, "pattern": pattern, "id": ent_id} + phrase_pattern = {"label": label, "pattern": pattern} if ent_id: phrase_pattern["id"] = ent_id phrase_patterns.append(phrase_pattern) diff --git a/spacy/tests/regression/test_issue8168.py b/spacy/tests/regression/test_issue8168.py new file mode 100644 index 000000000..cf5a9fc7a --- /dev/null +++ b/spacy/tests/regression/test_issue8168.py @@ -0,0 +1,11 @@ +from spacy.lang.en import English + +def test_issue8168(): + nlp = English() + ruler = nlp.add_pipe("entity_ruler") + patterns = [{"label": "ORG", "pattern": "Apple"}, + {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"}, + {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}] + ruler.add_patterns(patterns) + + assert ruler._ent_ids == {8043148519967183733: ('GPE', 'san-francisco')} \ No newline at end of file From dc8d8d15d20c89cf246970cfbccea59b1abf6ad0 Mon Sep 17 00:00:00 2001 From: Kristian Boda Date: Mon, 31 May 2021 09:40:48 +0100 Subject: [PATCH 014/140] Add hmrb to spaCy Universe (#8129) * docs: add hmrb to spacy universe * docs: add sentence on spacy versions * docs: update description and images * misc: add spaCy Contributor Agreement --- .github/contributors/bodak.md | 106 ++++++++++++++++++++++++++++++++++ website/meta/universe.json | 49 ++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 .github/contributors/bodak.md diff --git a/.github/contributors/bodak.md b/.github/contributors/bodak.md new file mode 100644 index 000000000..f87224f81 --- /dev/null +++ 
b/.github/contributors/bodak.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Kristian Boda | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 18.05.2021 | +| GitHub username | bodak | +| Website (optional) | | diff --git a/website/meta/universe.json b/website/meta/universe.json index 02f814c8b..d855af8aa 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3083,6 +3083,55 @@ }, "category": ["research", "standalone", "scientific"], "tags": ["Text Analytics", "Coherence", "Cohesion"] + }, + { + "id": "hmrb", + "title": "Hammurabi", + "slogan": "Python Rule Processing Engine 🏺", + "description": "Hammurabi works as a rule engine to parse input using a defined set of rules. It uses a simple and readable syntax to define complex rules to handle phrase matching. The syntax supports nested logical statements, regular expressions, reusable or side-loaded variables and match triggered callback functions to modularize your rules. The latest version works with both spaCy 2.X and 3.X. 
For more information check the documentation on [ReadTheDocs](https://hmrb.readthedocs.io/en/latest/).", + "github": "babylonhealth/hmrb", + "pip": "hmrb", + "code_example": [ + "import spacy # __version__ 3.0+", + "from hmrb.core import SpacyCore", + "", + "grammar = \"\"\"", + "Var is_hurting:", + "(", + " optional (lemma: \"be\")", + " (lemma: \"hurt\")", + ")", + "Law:", + " - package: \"headache\"", + " - callback: \"mark_headache\"", + "(", + " (lemma: \"head\", pos: \"NOUN\")", + " $is_hurting", + ")\"\"\"", + "", + "conf = {", + " \"rules\": grammar", + " \"callbacks\": {", + " \"mark_headache\": \"callbacks.headache_handler\",", + " },", + " \"map_doc\": \"augmenters.jsonify_span\",", + " \"sort_length\": True,", + "}", + "nlp = spacy.load(\"en_core_web_sm\")", + "nlp.add_pipe(\"hammurabi\", config=conf)", + "nlp(sentences)" + ], + "code_language": "python", + "thumb": "https://user-images.githubusercontent.com/6807878/118643685-cae6b880-b7d4-11eb-976e-066aec9505da.png", + "image": "https://user-images.githubusercontent.com/6807878/118643685-cae6b880-b7d4-11eb-976e-066aec9505da.png", + "author": "Kristian Boda", + "author_links": { + "github": "bodak", + "twitter": "bodak", + "website": "https://github.com/babylonhealth/" + }, + "category": ["pipeline", "standalone", "scientific", "biomedical"], + "tags": ["babylonhealth", "rule-engine", "matcher"] } ], From 5aba213349d0d521e9cae3a72044431d38606219 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 24 May 2021 20:31:43 +0900 Subject: [PATCH 015/140] Fix skweak Github URL Github entry should not contain url, just user/repo --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index d855af8aa..add089a0c 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -83,7 +83,7 @@ "title": "skweak", "slogan": "Weak supervision for NLP", "description": "`skweak` brings the power of weak 
supervision to NLP tasks, and in particular sequence labelling and text classification. Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.", - "github": "https://github.com/NorskRegnesentral/skweak", + "github": "NorskRegnesentral/skweak", "pip": "skweak", "code_example": [ "import spacy, re", From d54631f68b2dc739bb6dd215ddc3ac14ee2465c6 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 31 May 2021 18:04:29 +0900 Subject: [PATCH 016/140] Fix other open calls without context managers (#8245) --- spacy/cli/convert.py | 3 ++- spacy/pipeline/entity_linker.py | 3 ++- spacy/tests/tokenizer/test_tokenizer.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index d13a4fc80..c84aa6431 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -115,7 +115,8 @@ def convert( ner_map = srsly.read_json(ner_map) if ner_map is not None else None doc_files = [] for input_loc in walk_directory(Path(input_path), converter): - input_data = input_loc.open("r", encoding="utf-8").read() + with input_loc.open("r", encoding="utf-8") as infile: + input_data = infile.read() # Use converter function to convert data func = CONVERTERS[converter] docs = func( diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 66070916e..21d5e9db1 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -439,7 +439,8 @@ class EntityLinker(TrainablePipe): def load_model(p): try: - self.model.from_bytes(p.open("rb").read()) + with p.open("rb") as infile: + self.model.from_bytes(infile.read()) except AttributeError: raise ValueError(Errors.E149) from None diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 
6cfeaf014..c1ba1df36 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -84,7 +84,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n @pytest.mark.parametrize("file_name", ["sun.txt"]) def test_tokenizer_handle_text_from_file(tokenizer, file_name): loc = ensure_path(__file__).parent / file_name - text = loc.open("r", encoding="utf8").read() + with loc.open("r", encoding="utf8") as infile: + text = infile.read() assert len(text) != 0 tokens = tokenizer(text) assert len(tokens) > 100 From 4aa1a7d5a38cd97980ca9516284b7a5f00c2ed9c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 2 Jun 2021 11:16:57 +0200 Subject: [PATCH 017/140] Remove unsupported attrs from attrs.IDS (#8132) The attributes `PROB`, `CLUSTER` and `SENT_END` are not supported by `Lexeme.get_struct_attr` so should not be included through `attrs.IDS` as supported attributes in `Doc.to_array` and other methods. --- spacy/attrs.pyx | 3 --- spacy/tests/vocab_vectors/test_vocab_api.py | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index b15db7599..9122de17b 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -74,7 +74,6 @@ IDS = { "SUFFIX": SUFFIX, "LENGTH": LENGTH, - "CLUSTER": CLUSTER, "LEMMA": LEMMA, "POS": POS, "TAG": TAG, @@ -85,9 +84,7 @@ IDS = { "ENT_KB_ID": ENT_KB_ID, "HEAD": HEAD, "SENT_START": SENT_START, - "SENT_END": SENT_END, "SPACY": SPACY, - "PROB": PROB, "LANG": LANG, "MORPH": MORPH, "IDX": IDX diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index a687059be..56ef1d108 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -1,5 +1,5 @@ import pytest -from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA +from spacy.attrs import LEMMA, ORTH, IS_ALPHA from spacy.parts_of_speech import NOUN, VERB @@ -30,7 +30,6 @@ def 
test_vocab_api_shape_attr(en_vocab, text): ("VERB", VERB), ("LEMMA", LEMMA), ("ORTH", ORTH), - ("PROB", PROB), ], ) def test_vocab_api_symbols(en_vocab, string, symbol): From 3672464e254c42398a8f576ee22ff28c83e03990 Mon Sep 17 00:00:00 2001 From: Vito De Tullio Date: Wed, 2 Jun 2021 11:25:30 +0200 Subject: [PATCH 018/140] applying suggestion to avoid mypy errors (#8265) * applying suggestion to avoid mypy errors * sign contributor agreement --- .github/contributors/ZeeD.md | 106 ++++++++++++++++++++++++++++++++++ spacy/ml/models/multi_task.py | 4 +- 2 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 .github/contributors/ZeeD.md diff --git a/.github/contributors/ZeeD.md b/.github/contributors/ZeeD.md new file mode 100644 index 000000000..460f91e19 --- /dev/null +++ b/.github/contributors/ZeeD.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Vito De Tullio | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-06-01 | +| GitHub username | ZeeD | +| Website (optional) | | diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index cbfa59eea..d4d2d638b 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -13,7 +13,7 @@ from functools import partial if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports from ...vocab import Vocab # noqa: F401 - from ...tokens import Doc # noqa: F401 + from ...tokens.doc import Doc # noqa: F401 @registry.architectures("spacy.PretrainVectors.v1") @@ -205,7 +205,7 @@ def _apply_mask( docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15 ) -> Tuple[numpy.ndarray, List["Doc"]]: # This needs to be here to avoid circular imports - from ...tokens import Doc # noqa: F811 + from ...tokens.doc import Doc # noqa: F811 N = sum(len(doc) for doc in docs) mask = numpy.random.uniform(0.0, 1.0, (N,)) From ff5cf3606cf354c7639ce40b81cacc93bfb03583 Mon Sep 17 00:00:00 2001 From: Jean-Hugues Roy Date: Wed, 2 Jun 2021 05:50:49 -0400 Subject: [PATCH 019/140] Improvements to French stopwords list (#7941) * "y" etc. 
Many changes described in pull request * Update spacy/lang/fr/stop_words.py * Update spacy/lang/fr/stop_words.py Co-authored-by: Sofie Van Landeghem --- spacy/lang/fr/stop_words.py | 54 ++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py index ab1f2f4a7..b32ee3d71 100644 --- a/spacy/lang/fr/stop_words.py +++ b/spacy/lang/fr/stop_words.py @@ -1,30 +1,31 @@ STOP_WORDS = set( """ a à â abord afin ah ai aie ainsi ait allaient allons -alors anterieur anterieure anterieures apres après as assez attendu au -aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront +alors anterieur anterieure anterieures antérieur antérieure antérieures +apres après as assez attendu au +aupres auquel aura auraient aurait auront aussi autre autrement autres autrui aux auxquelles auxquels avaient avais avait avant avec avoir avons ayant bas basee bat -c' c’ ça car ce ceci cela celle celle-ci celle-là celles celles-ci celles-là celui -celui-ci celui-là cent cependant certain certaine certaines certains certes ces +c' c’ ça car ce ceci cela celle celle-ci celle-la celle-là celles celles-ci celles-la celles-là +celui celui-ci celui-la celui-là cent cependant certain certaine certaines certains certes ces cet cette ceux ceux-ci ceux-là chacun chacune chaque chez ci cinq cinquantaine cinquante cinquantième cinquième combien comme comment compris concernant -d' d’ da dans de debout dedans dehors deja delà depuis derriere +d' d’ da dans de debout dedans dehors deja dejà delà depuis derriere derrière des desormais desquelles desquels dessous dessus deux deuxième -deuxièmement devant devers devra different differentes differents différent +deuxièmement devant devers devra different differente differentes differents différent différente différentes différents dire directe directement dit dite dits divers diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc 
dont -douze douzième du duquel durant dès désormais +douze douzième du duquel durant dès déja déjà désormais -effet egale egalement egales eh elle elle-même elles elles-mêmes en encore +effet egalement eh elle elle-meme elle-même elles elles-memes elles-mêmes en encore enfin entre envers environ es ès est et etaient étaient etais étais etait était -etant étant etc été etre être eu eux eux-mêmes exactement excepté +etant étant etc etre être eu eux eux-mêmes exactement excepté également -fais faisaient faisant fait façon feront font +fais faisaient faisant fait facon façon feront font gens @@ -36,45 +37,48 @@ j' j’ je jusqu jusque juste l' l’ la laisser laquelle le lequel les lesquelles lesquels leur leurs longtemps lors lorsque lui lui-meme lui-même là lès -m' m’ ma maint maintenant mais malgre me meme memes merci mes mien +m' m’ ma maint maintenant mais malgre malgré me meme memes merci mes mien mienne miennes miens mille moi moi-meme moi-même moindres moins mon même mêmes n' n’ na ne neanmoins neuvième ni nombreuses nombreux nos notamment -notre nous nous-mêmes nouvea nul néanmoins nôtre nôtres +notre nous nous-mêmes nouveau nul néanmoins nôtre nôtres -o ô on ont onze onzième ore ou ouias oust outre +o ô on ont onze onzième or ou ouias ouste outre ouvert ouverte ouverts où -par parce parfois parle parlent parler parmi parseme partant +par parce parfois parle parlent parler parmi partant pas pendant pense permet personne peu peut peuvent peux plus -plusieurs plutôt possible possibles pour pourquoi -pourrais pourrait pouvait prealable precisement premier première premièrement -pres procedant proche près pu puis puisque +plusieurs plutot plutôt possible possibles pour pourquoi +pourrais pourrait pouvait prealable precisement +premier première premièrement +pres procedant proche près préalable précisement pu puis puisque -qu' qu’ quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt +qu' qu’ quand quant quant-à-soi quarante quatorze quatre quatre-vingt 
quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque quelques quels qui quiconque quinze quoi quoique relative relativement rend rendre restant reste -restent retour revoici revoilà +restent retour revoici revoila revoilà s' s’ sa sait sans sauf se seize selon semblable semblaient semble semblent sent sept septième sera seraient serait seront ses seul seule -seulement si sien sienne siennes siens sinon six sixième soi soi-même soit -soixante son sont sous souvent specifique specifiques stop +seulement seuls seules si sien sienne siennes siens sinon six sixième soi soi-meme soi-même soit +soixante son sont sous souvent specifique specifiques spécifique spécifiques stop suffisant suffisante suffit suis suit suivant suivante suivantes suivants suivre sur surtout t' t’ ta tant te tel telle tellement telles tels tenant tend tenir tente -tes tien tienne tiennes tiens toi toi-même ton touchant toujours tous -tout toute toutes treize trente tres trois troisième troisièmement +tes tien tienne tiennes tiens toi toi-meme toi-même ton touchant toujours tous +tout toute toutes treize trente tres trois troisième troisièmement très tu té un une unes uns -va vais vas vers via vingt voici voilà vont vos -votre vous vous-mêmes vu vé vôtre vôtres +va vais vas vers via vingt voici voila voilà vont vos +votre votres vous vous-mêmes vu vé vôtre vôtres + +y """.split() ) From d959603d517f49dd90cf36378b93797eef02f67f Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 3 Jun 2021 16:05:26 +0900 Subject: [PATCH 020/140] Don't add duplicate patterns all the time in EntityRuler (fix #8216) (#8246) * Don't add duplicate patterns (fix #8216) * Refactor EntityRuler init This simplifies the EntityRuler init code. This is helpful as prep for allowing the EntityRuler to reset itself. * Make EntityRuler.clear reset matchers Includes a new test for this. * Tidy PhraseMatcher instantiation Since the attr can be None safely now, the guard if is no longer required here. 
Also renamed the `_validate` attr. Maybe it's not needed? * Fix NER test * Add test to make sure patterns aren't increasing * Move test to regression tests --- spacy/matcher/phrasematcher.pyx | 2 ++ spacy/pipeline/entityruler.py | 39 ++++++++++------------- spacy/tests/parser/test_ner.py | 2 +- spacy/tests/pipeline/test_entity_ruler.py | 13 ++++++++ spacy/tests/regression/test_issue8216.py | 34 ++++++++++++++++++++ 5 files changed, 67 insertions(+), 23 deletions(-) create mode 100644 spacy/tests/regression/test_issue8216.py diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index e5ff2202c..d8486b84b 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -50,6 +50,8 @@ cdef class PhraseMatcher: if isinstance(attr, (int, long)): self.attr = attr else: + if attr is None: + attr = "ORTH" attr = attr.upper() if attr == "TEXT": attr = "ORTH" diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 4e61dbca7..03730f772 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -101,17 +101,12 @@ class EntityRuler(Pipe): self.overwrite = overwrite_ents self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) + self._validate = validate self.matcher = Matcher(nlp.vocab, validate=validate) - if phrase_matcher_attr is not None: - if phrase_matcher_attr.upper() == "TEXT": - phrase_matcher_attr = "ORTH" - self.phrase_matcher_attr = phrase_matcher_attr - self.phrase_matcher = PhraseMatcher( - nlp.vocab, attr=self.phrase_matcher_attr, validate=validate - ) - else: - self.phrase_matcher_attr = None - self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate) + self.phrase_matcher_attr = phrase_matcher_attr + self.phrase_matcher = PhraseMatcher( + nlp.vocab, attr=self.phrase_matcher_attr, validate=validate + ) self.ent_id_sep = ent_id_sep self._ent_ids = defaultdict(dict) if patterns is not None: @@ -315,20 +310,22 @@ class EntityRuler(Pipe): 
pattern = entry["pattern"] if isinstance(pattern, Doc): self.phrase_patterns[label].append(pattern) + self.phrase_matcher.add(label, [pattern]) elif isinstance(pattern, list): self.token_patterns[label].append(pattern) + self.matcher.add(label, [pattern]) else: raise ValueError(Errors.E097.format(pattern=pattern)) - for label, patterns in self.token_patterns.items(): - self.matcher.add(label, patterns) - for label, patterns in self.phrase_patterns.items(): - self.phrase_matcher.add(label, patterns) def clear(self) -> None: """Reset all patterns.""" self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) self._ent_ids = defaultdict(dict) + self.matcher = Matcher(self.nlp.vocab, validate=self._validate) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate + ) def _split_label(self, label: str) -> Tuple[str, str]: """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep @@ -374,10 +371,9 @@ class EntityRuler(Pipe): self.add_patterns(cfg.get("patterns", cfg)) self.overwrite = cfg.get("overwrite", False) self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None) - if self.phrase_matcher_attr is not None: - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr - ) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) else: self.add_patterns(cfg) @@ -428,10 +424,9 @@ class EntityRuler(Pipe): self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) - if self.phrase_matcher_attr is not None: - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr - ) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) from_disk(path, deserializers_patterns, {}) return self diff --git a/spacy/tests/parser/test_ner.py 
b/spacy/tests/parser/test_ner.py index dffdff1ec..1b9d0b255 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -252,12 +252,12 @@ def test_ruler_before_ner(): # 1 : Entity Ruler - should set "this" to B and everything else to empty patterns = [{"label": "THING", "pattern": "This"}] ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) # 2: untrained NER - should set everything else to O untrained_ner = nlp.add_pipe("ner") untrained_ner.add_label("MY_LABEL") nlp.initialize() + ruler.add_patterns(patterns) doc = nlp("This is Antti Korhonen speaking in Finland") expected_iobs = ["B", "O", "O", "O", "O", "O", "O"] expected_types = ["THING", "", "", "", "", "", ""] diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 2f6da79d6..79ad44abd 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -78,6 +78,19 @@ def test_entity_ruler_init_clear(nlp, patterns): assert len(ruler.labels) == 0 +def test_entity_ruler_clear(nlp, patterns): + """Test that initialization clears patterns.""" + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + assert len(ruler.labels) == 4 + doc = nlp("hello world") + assert len(doc.ents) == 1 + ruler.clear() + assert len(ruler.labels) == 0 + doc = nlp("hello world") + assert len(doc.ents) == 0 + + def test_entity_ruler_existing(nlp, patterns): ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py new file mode 100644 index 000000000..528d4b6f9 --- /dev/null +++ b/spacy/tests/regression/test_issue8216.py @@ -0,0 +1,34 @@ +import pytest + +from spacy import registry +from spacy.language import Language +from spacy.pipeline import EntityRuler + + +@pytest.fixture +def nlp(): + return Language() + + +@pytest.fixture +@registry.misc("entity_ruler_patterns") +def patterns(): + 
return [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"}, + ] + + +def test_entity_ruler_fix8216(nlp, patterns): + """Test that patterns don't get added excessively.""" + ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) + ruler.add_patterns(patterns) + pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert pattern_count > 0 + ruler.add_patterns([]) + after_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert after_count == pattern_count From 07082c96921f9df9f10f0429379518ae83ff6829 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 4 Jun 2021 14:56:07 +0200 Subject: [PATCH 021/140] Exclude generated .cpp files from package (#8271) --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 8008b4507..99fc174bd 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json recursive-include spacy/lang *.json.gz recursive-include spacy/cli *.json *.yml recursive-include licenses * +recursive-exclude spacy *.cpp From f0277bdeabbcfd59da0242172bc334f821420f87 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 31 May 2021 10:20:27 +0200 Subject: [PATCH 022/140] Show warning if entity_ruler runs without patterns (#7807) * Show warning if entity_ruler runs without patterns * Show warning if matcher runs without patterns * fix wording * unit test for warning once (WIP) * warn W036 only once * cleanup * create filter_warning helper --- spacy/__init__.py | 6 ++--- spacy/errors.py | 30 ++++++++++++++++++++++- spacy/matcher/matcher.pyx | 6 +++++ spacy/pipeline/entityruler.py | 9 ++++++- spacy/tests/matcher/test_matcher_api.py | 9 
+++++++ spacy/tests/pipeline/test_entity_ruler.py | 11 +++++++++ 6 files changed, 66 insertions(+), 5 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 1eef7e621..d07931cfd 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,10 +1,10 @@ from typing import Union, Iterable, Dict, Any from pathlib import Path -import warnings import sys -warnings.filterwarnings("ignore", message="numpy.dtype size changed") # noqa -warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa +# set library-specific custom warning handling before doing anything else +from .errors import setup_default_warnings +setup_default_warnings() # These are imported as part of the API from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401 diff --git a/spacy/errors.py b/spacy/errors.py index 7cf9e54e4..ce0d735af 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,3 +1,6 @@ +import warnings + + def add_codes(err_cls): """Add error codes to string messages via class attribute names.""" @@ -12,6 +15,30 @@ def add_codes(err_cls): return ErrorsWithCodes() +def setup_default_warnings(): + # ignore certain numpy warnings + filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa + filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa + + # warn about entity_ruler & matcher having no patterns only once + for pipe in ["matcher", "entity_ruler"]: + filter_warning("once", error_msg=Warnings.W036.format(name=pipe)) + + +def filter_warning(action: str, error_msg: str): + """Customize how spaCy should handle a certain warning. + + error_msg (str): e.g. 
"W006", or a full error message + action (str): "default", "error", "ignore", "always", "module" or "once" + """ + warnings.filterwarnings(action, message=_escape_warning_msg(error_msg)) + + +def _escape_warning_msg(msg): + """To filter with warnings.filterwarnings, the [] brackets need to be escaped""" + return msg.replace("[", "\\[").replace("]", "\\]") + + # fmt: off @add_codes @@ -80,8 +107,9 @@ class Warnings: "@misc = \"spacy.LookupsDataLoader.v1\"\n" "lang = ${{nlp.lang}}\n" "tables = [\"lexeme_norm\"]\n") - W035 = ('Discarding subpattern "{pattern}" due to an unrecognized ' + W035 = ("Discarding subpattern '{pattern}' due to an unrecognized " "attribute or operator.") + W036 = ("The component '{name}' does not have any patterns defined.") # New warnings added in v3.x W086 = ("Component '{listener}' will be (re)trained, but it needs the component " diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index dae12c3f6..6fd8bdb03 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -138,6 +138,11 @@ cdef class Matcher: self._filter[key] = greedy self._patterns[key].extend(patterns) + def _require_patterns(self) -> None: + """Raise a warning if this component has no patterns defined.""" + if len(self) == 0: + warnings.warn(Warnings.W036.format(name="matcher")) + def remove(self, key): """Remove a rule from the matcher. A KeyError is raised if the key does not exist. @@ -215,6 +220,7 @@ cdef class Matcher: If with_alignments is set to True and as_spans is set to False, A list of `(match_id, start, end, alignments)` tuples is returned. 
""" + self._require_patterns() if isinstance(doclike, Doc): doc = doclike length = len(doc) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 03730f772..78269f180 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -1,3 +1,4 @@ +import warnings from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence from collections import defaultdict from pathlib import Path @@ -6,7 +7,7 @@ import srsly from .pipe import Pipe from ..training import Example from ..language import Language -from ..errors import Errors +from ..errors import Errors, Warnings from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher @@ -139,6 +140,7 @@ class EntityRuler(Pipe): error_handler(self.name, self, [doc], e) def match(self, doc: Doc): + self._require_patterns() matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) matches = set( [(m_id, start, end) for m_id, start, end in matches if start != end] @@ -327,6 +329,11 @@ class EntityRuler(Pipe): self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate ) + def _require_patterns(self) -> None: + """Raise a warning if this component has no patterns defined.""" + if len(self) == 0: + warnings.warn(Warnings.W036.format(name=self.name)) + def _split_label(self, label: str) -> Tuple[str, str]: """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 094bf22a6..4e6b4bfae 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -33,6 +33,15 @@ def test_matcher_from_api_docs(en_vocab): assert len(patterns[0]) +def test_matcher_empty_patterns_warns(en_vocab): + matcher = Matcher(en_vocab) + assert len(matcher) == 0 + doc = Doc(en_vocab, words=["This", "is", "quite", "something"]) + with 
pytest.warns(UserWarning): + matcher(doc) + assert len(doc.ents) == 0 + + def test_matcher_from_usage_docs(en_vocab): text = "Wow 😀 This is really cool! 😂 😂" doc = Doc(en_vocab, words=text.split(" ")) diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 79ad44abd..4a01ce183 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -46,6 +46,17 @@ def test_entity_ruler_init(nlp, patterns): assert doc.ents[1].label_ == "BYE" +def test_entity_ruler_no_patterns_warns(nlp): + ruler = EntityRuler(nlp) + assert len(ruler) == 0 + assert len(ruler.labels) == 0 + nlp.add_pipe("entity_ruler") + assert nlp.pipe_names == ["entity_ruler"] + with pytest.warns(UserWarning): + doc = nlp("hello world bye bye") + assert len(doc.ents) == 0 + + def test_entity_ruler_init_patterns(nlp, patterns): # initialize with patterns ruler = nlp.add_pipe("entity_ruler") From 9dfd3c9484a2cf332bed9f84473c2d419f621fb6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 4 Jun 2021 17:44:04 +0200 Subject: [PATCH 023/140] Use warnings.warn instead of logger.warning --- spacy/errors.py | 3 +++ spacy/language.py | 2 +- spacy/matcher/dependencymatcher.pyx | 4 ++-- spacy/pipeline/lemmatizer.py | 4 +++- spacy/tests/doc/test_doc_api.py | 9 ++------- spacy/tests/pipeline/test_lemmatizer.py | 8 +++----- spacy/tests/pipeline/test_pipe_factories.py | 6 +----- spacy/tokens/doc.pyx | 2 +- 8 files changed, 16 insertions(+), 22 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index ce0d735af..2e8cc4494 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -24,6 +24,9 @@ def setup_default_warnings(): for pipe in ["matcher", "entity_ruler"]: filter_warning("once", error_msg=Warnings.W036.format(name=pipe)) + # warn once about lemmatizer without required POS + filter_warning("once", error_msg="[W108]") + def filter_warning(action: str, error_msg: str): """Customize how spaCy should handle a certain warning. 
diff --git a/spacy/language.py b/spacy/language.py index 1a447c11b..7786089a5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -689,7 +689,7 @@ class Language: if self.vocab.vectors.shape != source.vocab.vectors.shape or \ self.vocab.vectors.key2row != source.vocab.vectors.key2row or \ self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes(): - util.logger.warning(Warnings.W113.format(name=source_name)) + warnings.warn(Warnings.W113.format(name=source_name)) if not source_name in source.component_names: raise KeyError( Errors.E944.format( diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 0e601281a..b6e84a5da 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -4,6 +4,7 @@ from collections import defaultdict from itertools import product import numpy +import warnings from .matcher cimport Matcher from ..vocab cimport Vocab @@ -11,7 +12,6 @@ from ..tokens.doc cimport Doc from ..errors import Errors, Warnings from ..tokens import Span -from ..util import logger DELIMITER = "||" @@ -282,7 +282,7 @@ cdef class DependencyMatcher: keys_to_position_maps = defaultdict(lambda: defaultdict(list)) for match_id, start, end in self._matcher(doc): if start + 1 != end: - logger.warning(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0])) + warnings.warn(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0])) token = doc[start] root = ([token] + list(token.ancestors))[-1] keys_to_position_maps[root.i][match_id].append(start) diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index cfe405efa..87504fade 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -2,6 +2,8 @@ from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple from thinc.api import Model from pathlib import Path +import warnings + from .pipe 
import Pipe from ..errors import Errors, Warnings from ..language import Language @@ -182,7 +184,7 @@ class Lemmatizer(Pipe): univ_pos = token.pos_.lower() if univ_pos in ("", "eol", "space"): if univ_pos == "": - logger.warning(Warnings.W108.format(text=string)) + warnings.warn(Warnings.W108.format(text=string)) return [string.lower()] # See Issue #435 for example of where this logic is requied. if self.is_base_form(token): diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index d7452a802..358724517 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -2,8 +2,6 @@ import weakref import pytest import numpy -import logging -import mock from spacy.lang.xx import MultiLanguage from spacy.tokens import Doc, Span, Token @@ -158,13 +156,10 @@ def test_doc_api_serialize(en_tokenizer, text): def inner_func(d1, d2): return "hello!" - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: - _ = tokens.to_bytes() # noqa: F841 - mock_warning.assert_not_called() + _ = tokens.to_bytes() # noqa: F841 + with pytest.warns(UserWarning): tokens.user_hooks["similarity"] = inner_func _ = tokens.to_bytes() # noqa: F841 - mock_warning.assert_called_once() def test_doc_api_set_ents(en_tokenizer): diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py index 3c16d3bcb..1bec8696c 100644 --- a/spacy/tests/pipeline/test_lemmatizer.py +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -1,6 +1,4 @@ import pytest -import logging -import mock import pickle from spacy import util, registry from spacy.lang.en import English @@ -59,10 +57,10 @@ def test_lemmatizer_config(nlp): # warning if no POS assigned doc = nlp.make_doc("coping") - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: + with pytest.warns(UserWarning): doc = lemmatizer(doc) - mock_warning.assert_called_once() + # warns once by default + doc = 
lemmatizer(doc) # works with POS doc = nlp.make_doc("coping") diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index c5cc62661..b99e9a863 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,6 +1,4 @@ import pytest -import mock -import logging from spacy.language import Language from spacy.lang.en import English from spacy.lang.de import German @@ -437,10 +435,8 @@ def test_pipe_factories_from_source_language_subclass(): nlp = English() nlp.vocab.vectors.resize((1, 4)) nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4]) - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: + with pytest.warns(UserWarning): nlp.add_pipe("tagger", source=source_nlp) - mock_warning.assert_called() def test_pipe_factories_from_source_custom(): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index aae0ff374..28f8debf3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1318,7 +1318,7 @@ cdef class Doc: if "user_data_values" not in exclude: serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)): - util.logger.warning(Warnings.W109) + warnings.warn(Warnings.W109) return util.to_dict(serializers, exclude) def from_dict(self, msg, *, exclude=tuple()): From f34dd0b98f2301b3ac5d6859992907405cd4a25f Mon Sep 17 00:00:00 2001 From: graue70 <23035329+graue70@users.noreply.github.com> Date: Mon, 7 Jun 2021 10:43:54 +0200 Subject: [PATCH 024/140] Fix typos in comments (#8279) --- spacy/pipeline/entity_linker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 002ea71a7..a03b6b384 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -142,7 +142,7 @@ class 
EntityLinker(TrainablePipe): self.get_candidates = get_candidates self.cfg = {} self.distance = CosineDistance(normalize=False) - # how many neightbour sentences to take into account + # how many neighbour sentences to take into account # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. self.kb = empty_kb(entity_vector_length)(self.vocab) @@ -305,7 +305,7 @@ class EntityLinker(TrainablePipe): sent = ent.sent sent_index = sentences.index(sent) assert sent_index >= 0 - # get n_neightbour sentences, clipped to the length of the document + # get n_neighbour sentences, clipped to the length of the document start_sentence = max(0, sent_index - self.n_sents) end_sentence = min( len(sentences) - 1, sent_index + self.n_sents From d52ab13b5fa4cb2153787ac87da388b181eeddde Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Jun 2021 14:46:07 +0200 Subject: [PATCH 025/140] Update CI: update ubuntu image, add download test (#8298) * Update CI: update ubuntu image, add download test * Switch instances to `ubuntu-18.04` * Add model download test, currently only for one job with python 3.8 * Fix variable name * Set variables explicitly --- .github/azure-steps.yml | 10 ++++++++++ azure-pipelines.yml | 10 +++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index d536f2eb8..21d2654ad 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -11,6 +11,10 @@ steps: versionSpec: ${{ parameters.python_version }} architecture: ${{ parameters.architecture }} + - bash: | + echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}" + displayName: 'Set variables' + - script: | ${{ parameters.prefix }} python -m pip install -U pip setuptools ${{ parameters.prefix }} python -m pip install -U -r requirements.txt @@ -55,3 +59,9 @@ steps: ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu displayName: 
"Run GPU tests" condition: eq(${{ parameters.gpu }}, true) + + - script: | + python -m spacy download en_core_web_sm + python -c "import spacy; nlp=spacy.load('en_core_web_sm'); doc=nlp('test')" + displayName: 'Test download CLI' + condition: eq(variables['python_version'], '3.8') diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5840b916b..7ad1b27b3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -22,7 +22,7 @@ jobs: # defined in .flake8 and overwrites the selected codes. - job: "Validate" pool: - vmImage: "ubuntu-16.04" + vmImage: "ubuntu-18.04" steps: - task: UsePythonVersion@0 inputs: @@ -38,7 +38,7 @@ jobs: matrix: # We're only running one platform per Python version to speed up builds Python36Linux: - imageName: "ubuntu-16.04" + imageName: "ubuntu-18.04" python.version: "3.6" # Python36Windows: # imageName: "vs2017-win2016" @@ -47,7 +47,7 @@ jobs: # imageName: "macos-10.14" # python.version: "3.6" # Python37Linux: - # imageName: "ubuntu-16.04" + # imageName: "ubuntu-18.04" # python.version: "3.7" Python37Windows: imageName: "vs2017-win2016" @@ -56,7 +56,7 @@ jobs: # imageName: "macos-10.14" # python.version: "3.7" # Python38Linux: - # imageName: "ubuntu-16.04" + # imageName: "ubuntu-18.04" # python.version: "3.8" # Python38Windows: # imageName: "vs2017-win2016" @@ -65,7 +65,7 @@ jobs: imageName: "macos-10.14" python.version: "3.8" Python39Linux: - imageName: "ubuntu-16.04" + imageName: "ubuntu-18.04" python.version: "3.9" Python39Windows: imageName: "vs2017-win2016" From 6d2789452e23141be61b04b22afe6f3c065add4e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 10 Jun 2021 11:03:30 +0200 Subject: [PATCH 026/140] Restrict cython to <3.0 (#8337) --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3e34a0b2d..7a0e34376 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] requires = [ 
"setuptools", - "cython>=0.25", + "cython>=0.25,<3.0", "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", diff --git a/requirements.txt b/requirements.txt index dda9c7773..46337389c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,7 @@ setuptools packaging>=20.0 typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8" # Development dependencies -cython>=0.25 +cython>=0.25,<3.0 pytest>=5.2.0 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 diff --git a/setup.cfg b/setup.cfg index 6baa2f7bb..0928c8db0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,7 +31,7 @@ zip_safe = false include_package_data = true python_requires = >=3.6 setup_requires = - cython>=0.25 + cython>=0.25,<3.0 numpy>=1.15.0 # We also need our Cython packages here to compile against cymem>=2.0.2,<2.1.0 From 0a1a4c665d9a95f2ae9e25dc450b6156a20584ed Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Thu, 10 Jun 2021 21:53:11 +0200 Subject: [PATCH 027/140] update spacy-wordnet code example (#8327) * update spacy-wordnet code example - include spaCy 2.x and 3.x init alternatives - upgrade recognai logo * fix escape chars --- website/meta/universe.json | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index add089a0c..32363e5d3 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -2080,14 +2080,17 @@ "description": "`spacy-wordnet` creates annotations that easily allow the use of WordNet and [WordNet Domains](http://wndomains.fbk.eu/) by using the [NLTK WordNet interface](http://www.nltk.org/howto/wordnet.html)", "github": "recognai/spacy-wordnet", "tags": ["wordnet", "synsets"], - "thumb": "https://i.imgur.com/3y2uPUv.jpg", + "thumb": "https://i.imgur.com/ud4C7cj.png", "code_example": [ "import spacy", "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ", "", "# Load an spacy model (supported models are \"es\" and \"en\") ", "nlp = 
spacy.load('en')", - "nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')", + "# Spacy 3.x", + "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})", + "# Spacy 2.x", + "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')", "token = nlp('prices')[0]", "", "# wordnet object link spacy token with nltk wordnet interface by giving acces to", From f4008bdb13e262c389e3d0c7017a634605f6e706 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Jun 2021 10:19:22 +0200 Subject: [PATCH 028/140] Restrict pymorphy2 requirement to pymorphy2 mode (#8299) For the Russian and Ukrainian lemmatizers, restrict the `pymorphy2` requirement to the mode `pymorphy2` so that lookup or other lemmatizer modes can be loaded without installing `pymorphy2`. --- spacy/lang/ru/lemmatizer.py | 19 ++++++++++--------- spacy/lang/uk/lemmatizer.py | 21 +++++++++++---------- spacy/tests/conftest.py | 7 +++++++ spacy/tests/lang/uk/test_lemmatizer.py | 8 ++++++++ 4 files changed, 36 insertions(+), 19 deletions(-) create mode 100644 spacy/tests/lang/uk/test_lemmatizer.py diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index c337b9bc3..63aa94a36 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -23,15 +23,16 @@ class RussianLemmatizer(Lemmatizer): mode: str = "pymorphy2", overwrite: bool = False, ) -> None: - try: - from pymorphy2 import MorphAnalyzer - except ImportError: - raise ImportError( - "The Russian lemmatizer requires the pymorphy2 library: " - 'try to fix it with "pip install pymorphy2"' - ) from None - if RussianLemmatizer._morph is None: - RussianLemmatizer._morph = MorphAnalyzer() + if mode == "pymorphy2": + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Russian lemmatizer mode 'pymorphy2' requires the " + "pymorphy2 library. 
Install it with: pip install pymorphy2" + ) from None + if RussianLemmatizer._morph is None: + RussianLemmatizer._morph = MorphAnalyzer() super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) def pymorphy2_lemmatize(self, token: Token) -> List[str]: diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 0b4435a21..e1fdf39fc 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -18,14 +18,15 @@ class UkrainianLemmatizer(RussianLemmatizer): mode: str = "pymorphy2", overwrite: bool = False, ) -> None: - try: - from pymorphy2 import MorphAnalyzer - except ImportError: - raise ImportError( - "The Ukrainian lemmatizer requires the pymorphy2 library and " - "dictionaries: try to fix it with " - '"pip install pymorphy2 pymorphy2-dicts-uk"' - ) from None - if UkrainianLemmatizer._morph is None: - UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk") + if mode == "pymorphy2": + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Ukrainian lemmatizer mode 'pymorphy2' requires the " + "pymorphy2 library and dictionaries. 
Install them with: " + "pip install pymorphy2 pymorphy2-dicts-uk" + ) from None + if UkrainianLemmatizer._morph is None: + UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk") super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 04e254c50..c6be15189 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -281,6 +281,13 @@ def uk_tokenizer(): return get_lang_class("uk")().tokenizer +@pytest.fixture +def uk_lemmatizer(): + pytest.importorskip("pymorphy2") + pytest.importorskip("pymorphy2_dicts_uk") + return get_lang_class("uk")().add_pipe("lemmatizer") + + @pytest.fixture(scope="session") def ur_tokenizer(): return get_lang_class("ur")().tokenizer diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py new file mode 100644 index 000000000..4a0d91f7e --- /dev/null +++ b/spacy/tests/lang/uk/test_lemmatizer.py @@ -0,0 +1,8 @@ +import pytest +from spacy.tokens import Doc + + +def test_uk_lemmatizer(uk_lemmatizer): + """Check that the default uk lemmatizer runs.""" + doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"]) + uk_lemmatizer(doc) From d9be9e6cf9f892d97a26b11e022627abdb9dd07d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Jun 2021 10:20:24 +0200 Subject: [PATCH 029/140] Move README.md and LICENSES_SOURCES in package (#8297) In addition to `LICENSE`, move the files `README.md` and `LICENSES_SOURCES` to the top directory in `spacy package` if present in the model directory. 
--- spacy/cli/package.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 5b8daf048..58e191f65 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -113,7 +113,7 @@ def package( print("\n".join(errors)) sys.exit(1) model_name = meta["name"] - if not model_name.startswith(meta['lang'] + "_"): + if not model_name.startswith(meta["lang"] + "_"): model_name = f"{meta['lang']}_{model_name}" model_name_v = model_name + "-" + meta["version"] main_path = output_dir / model_name_v @@ -130,9 +130,10 @@ def package( ) Path.mkdir(package_path, parents=True) shutil.copytree(str(input_dir), str(package_path / model_name_v)) - license_path = package_path / model_name_v / "LICENSE" - if license_path.exists(): - shutil.move(str(license_path), str(main_path)) + for file_name in FILENAMES_DOCS: + file_path = package_path / model_name_v / file_name + if file_path.exists(): + shutil.move(str(file_path), str(main_path)) imports = [] for code_path in code_paths: imports.append(code_path.stem) @@ -317,3 +318,6 @@ __version__ = get_model_meta(Path(__file__).parent)['version'] def load(**overrides): return load_model_from_init_py(__file__, **overrides) """.lstrip() + + +FILENAMES_DOCS = ["LICENSE", "LICENSES_SOURCES", "README.md"] From b98d216205068024407ae02c27c06da04ea88ff8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Jun 2021 10:21:22 +0200 Subject: [PATCH 030/140] Update Catalan language data (#8308) * Update Catalan language data Update Catalan language data based on contributions from the Text Mining Unit at the Barcelona Supercomputing Center: https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data * Update tokenizer settings for UD Catalan AnCora Update for UD Catalan AnCora v2.7 with merged multi-word tokens. 
* Update test * Move prefix patternt to more generic infix pattern * Clean up --- setup.cfg | 2 +- spacy/lang/ca/__init__.py | 22 +++++++- spacy/lang/ca/lemmatizer.py | 81 +++++++++++++++++++++++++++ spacy/lang/ca/punctuation.py | 44 +++++++++++++-- spacy/lang/ca/syntax_iterators.py | 46 +++++++++++++++ spacy/lang/ca/tokenizer_exceptions.py | 7 +++ spacy/tests/lang/ca/test_text.py | 4 +- spacy/tests/lang/test_lemmatizers.py | 2 +- 8 files changed, 198 insertions(+), 10 deletions(-) create mode 100644 spacy/lang/ca/lemmatizer.py create mode 100644 spacy/lang/ca/syntax_iterators.py diff --git a/setup.cfg b/setup.cfg index 2fedd8f5c..cd55911fe 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,7 +65,7 @@ console_scripts = [options.extras_require] lookups = - spacy_lookups_data>=1.0.0,<1.1.0 + spacy_lookups_data>=1.0.1,<1.1.0 transformers = spacy_transformers>=1.0.1,<1.1.0 ray = diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index 970b23c1e..81f39b13c 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,15 +1,23 @@ +from typing import Optional + +from thinc.api import Model + from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_INFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from .lemmatizer import CatalanLemmatizer class CatalanDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS class Catalan(Language): @@ -17,4 +25,16 @@ class Catalan(Language): Defaults = CatalanDefaults +@Catalan.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "overwrite": False}, + default_score_weights={"lemma_acc": 1.0}, +) +def 
make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + + __all__ = ["Catalan"] diff --git a/spacy/lang/ca/lemmatizer.py b/spacy/lang/ca/lemmatizer.py new file mode 100644 index 000000000..2518eb720 --- /dev/null +++ b/spacy/lang/ca/lemmatizer.py @@ -0,0 +1,81 @@ +from typing import List, Tuple + +from ...pipeline import Lemmatizer +from ...tokens import Token + + +class CatalanLemmatizer(Lemmatizer): + """ + Copied from French Lemmatizer + Catalan language lemmatizer applies the default rule based lemmatization + procedure with some modifications for better Catalan language support. + + The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use + the rule-based lemmatization. As a last resort, the lemmatizer checks in + the lookup table. + """ + + @classmethod + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: + if mode == "rule": + required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + return (required, []) + else: + return super().get_lookups_config(mode) + + def rule_lemmatize(self, token: Token) -> List[str]: + cache_key = (token.orth, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + return [string.lower()] + elif "lemma_rules" not in self.lookups or univ_pos not in ( + "noun", + "verb", + "adj", + "adp", + "adv", + "aux", + "cconj", + "det", + "pron", + "punct", + "sconj", + ): + return self.lookup_lemmatize(token) + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + lookup_table = self.lookups.get_table("lemma_lookup", {}) + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, []) 
+ string = string.lower() + forms = [] + if string in index: + forms.append(string) + self.cache[cache_key] = forms + return forms + forms.extend(exceptions.get(string, [])) + oov_forms = [] + if not forms: + for old, new in rules: + if string.endswith(old): + form = string[: len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) + if not forms: + forms.extend(oov_forms) + if not forms and string in lookup_table.keys(): + forms.append(self.lookup_lemmatize(token)[0]) + if not forms: + forms.append(string) + forms = list(set(forms)) + self.cache[cache_key] = forms + return forms diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py index d50b75589..39db08f17 100644 --- a/spacy/lang/ca/punctuation.py +++ b/spacy/lang/ca/punctuation.py @@ -1,12 +1,46 @@ -from ..punctuation import TOKENIZER_INFIXES -from ..char_classes import ALPHA +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import CURRENCY +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT +from ..char_classes import merge_chars, _units ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") -_infixes = TOKENIZER_INFIXES + [ - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) -] +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION), + ] +) + +_units = _units.replace("% ", "") +UNITS = merge_chars(_units) + +_suffixes = ( + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_ICONS + + [r"-", "—", "–"] + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + 
r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/ca/syntax_iterators.py b/spacy/lang/ca/syntax_iterators.py new file mode 100644 index 000000000..c70d53e80 --- /dev/null +++ b/spacy/lang/ca/syntax_iterators.py @@ -0,0 +1,46 @@ +from ...symbols import NOUN, PROPN +from ...errors import Errors + + +def noun_chunks(doclike): + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" + # fmt: off + labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on + doc = doclike.doc # Ensure works on both Doc and Span. + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + np_deps = [doc.vocab.strings[label] for label in labels] + np_label = doc.vocab.strings.add("NP") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + left = word.left_edge.i + right = word.right_edge.i + 1 + # leave prepositions and punctuation out of the left side of the chunk + if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT": + left = word.left_edge.i + 1 + prev_end = word.right_edge.i + # leave subordinated clauses and appositions out of the chunk + a = word.i + 1 + while a < word.right_edge.i: + paraula = doc[a] + if paraula.pos_ == "VERB": + right = paraula.left_edge.i + prev_end = paraula.left_edge.i - 1 + elif paraula.dep_ == "appos": + right = paraula.left_edge.i + 1 + prev_end = paraula.left_edge.i - 1 + a += 1 + # leave punctuation out of the right side of the chunk + if word.right_edge.pos_ == "PUNCT": + right = right - 1 + yield left, right, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} 
diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py index b465e97ba..5f9a50f5e 100644 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -24,6 +24,13 @@ for exc_data in [ {ORTH: "núm", NORM: "número"}, {ORTH: "St.", NORM: "sant"}, {ORTH: "Sta.", NORM: "santa"}, + {ORTH: "'l"}, + {ORTH: "'ls"}, + {ORTH: "'m"}, + {ORTH: "'n"}, + {ORTH: "'ns"}, + {ORTH: "'s"}, + {ORTH: "'t"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py index 38f5fc708..55bad0e94 100644 --- a/spacy/tests/lang/ca/test_text.py +++ b/spacy/tests/lang/ca/test_text.py @@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer): una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida.""" tokens = ca_tokenizer(text) - assert len(tokens) == 138 + assert len(tokens) == 140 @pytest.mark.parametrize( "text,length", [ - ("Perquè va anar-hi?", 6), + ("Perquè va anar-hi?", 4), ("“Ah no?”", 5), ("""Sí! "Anem", va contestar el Joan Carles""", 11), ("Van córrer aprox. 
10km", 5), diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py index e755da22d..e419f0a14 100644 --- a/spacy/tests/lang/test_lemmatizers.py +++ b/spacy/tests/lang/test_lemmatizers.py @@ -8,7 +8,7 @@ from spacy.util import get_lang_class # Only include languages with no external dependencies # excluded: ru, uk # excluded for custom tables: es, pl -LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"] +LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"] # fmt: on From 7f0f674a1bccfa96bee3dad4683c7e4f4c68ac30 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 14 Jun 2021 10:18:06 +1000 Subject: [PATCH 031/140] Fix universe.json and auto-format [ci skip] --- website/meta/universe.json | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 32363e5d3..6d061ff0a 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -14,14 +14,14 @@ "client.entities('John Doe is a Go Developer at Google')", "# [{'end': 8, 'start': 0, 'text': 'John Doe', 'type': 'PERSON'}, {'end': 25, 'start': 13, 'text': 'Go Developer', 'type': 'POSITION'}, {'end': 35,'start': 30, 'text': 'Google', 'type': 'ORG'}]" ], - "thumb":"https://avatars.githubusercontent.com/u/77671902", - "image":"https://nlpcloud.io/assets/images/logo.svg", + "thumb": "https://avatars.githubusercontent.com/u/77671902", + "image": "https://nlpcloud.io/assets/images/logo.svg", "code_language": "python", "author": "NLPCloud.io", "author_links": { - "github": "nlpcloud", - "twitter": "cloud_nlp", - "website": "https://nlpcloud.io" + "github": "nlpcloud", + "twitter": "cloud_nlp", + "website": "https://nlpcloud.io" }, "category": ["apis", "nonpython", "standalone"], "tags": ["api", "deploy", "production"] @@ -42,17 +42,17 @@ "print(doc._.person_name)", "# ['Meghana S.R Bhange', 'Asha']" ], - 
"thumb":"https://i.ibb.co/jwGVWPZ/rainbow-bohemian-logo-removebg-preview.png", + "thumb": "https://i.ibb.co/jwGVWPZ/rainbow-bohemian-logo-removebg-preview.png", "code_language": "python", "author": "Meghana Bhange", "author_links": { - "github": "meghanabhange", - "twitter": "_aspiringcat" + "github": "meghanabhange", + "twitter": "_aspiringcat" }, "category": ["standalone"], "tags": ["person-name-detection"] }, - { + { "id": "eMFDscore", "title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python", "slogan": "Extended Moral Foundation Dictionary Scoring for Python", @@ -65,20 +65,20 @@ "DICT_TYPE = 'emfd'", "PROB_MAP = 'single'", "SCORE_METHOD = 'bow'", - "OUT_METRICS = 'vice-virtue'", - "OUT_CSV_PATH = 'single-vv.csv'", - "df = score_docs(template_input,DICT_TYPE,PROB_MAP,SCORE_METHOD,OUT_METRICS,num_docs)", + "OUT_METRICS = 'vice-virtue'", + "OUT_CSV_PATH = 'single-vv.csv'", + "df = score_docs(template_input,DICT_TYPE,PROB_MAP,SCORE_METHOD,OUT_METRICS,num_docs)" ], "code_language": "python", "author": "Media Neuroscience Lab", "author_links": { - "github": "medianeuroscience", - "twitter": "medianeuro" + "github": "medianeuroscience", + "twitter": "medianeuro" }, "category": ["research", "teaching"], "tags": ["morality", "dictionary", "sentiment"] }, - { + { "id": "skweak", "title": "skweak", "slogan": "Weak supervision for NLP", @@ -2087,9 +2087,9 @@ "", "# Load an spacy model (supported models are \"es\" and \"en\") ", "nlp = spacy.load('en')", - "# Spacy 3.x", - "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})", - "# Spacy 2.x", + "# Spacy 3.x", + "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})", + "# Spacy 2.x", "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')", "token = nlp('prices')[0]", "", From 3259faad42b78a510d748b4fef1042e6c2ae26e3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 14 Jun 2021 10:21:01 +1000 Subject: [PATCH 032/140] Update YouTube embed [ci skip] 
--- website/src/components/embed.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/components/embed.js b/website/src/components/embed.js index dc25ae079..8d82bfaae 100644 --- a/website/src/components/embed.js +++ b/website/src/components/embed.js @@ -13,7 +13,7 @@ const YouTube = ({ id, ratio = '16x9', className }) => { [classes.ratio16x9]: ratio === '16x9', [classes.ratio4x3]: ratio === '4x3', }) - const url = `https://www.youtube.com/embed/${id}` + const url = `https://www.youtube-nocookie.com/embed/${id}` return (