From 02e18926c37d9c2e8965356c595fedd692507902 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 21 Jul 2021 15:32:37 +0200 Subject: [PATCH] Revert "Backport bugfixes from v3.1.0 to v3.0 (#8739)" (#8786) This reverts commit f94168a41e41dc67a3724a33d0e10a45423085dc. --- MANIFEST.in | 1 - spacy/about.py | 2 +- spacy/cli/convert.py | 3 +- spacy/cli/package.py | 2 +- spacy/cli/templates/quickstart_training.jinja | 2 +- spacy/errors.py | 5 -- spacy/lang/az/__init__.py | 5 ++ spacy/lang/el/lemmatizer.py | 2 +- spacy/lang/ru/lemmatizer.py | 5 +- spacy/lang/uk/lemmatizer.py | 6 ++- spacy/matcher/phrasematcher.pyx | 2 - spacy/ml/models/multi_task.py | 3 +- spacy/pipeline/entity_linker.py | 3 +- spacy/pipeline/entityruler.py | 46 ++++++++++--------- spacy/pipeline/textcat.py | 2 - spacy/pipeline/trainable_pipe.pyx | 3 +- spacy/tests/doc/test_doc_api.py | 40 ++++------------ spacy/tests/lang/test_initialize.py | 13 +++--- spacy/tests/matcher/test_matcher_api.py | 1 - spacy/tests/matcher/test_matcher_logic.py | 1 - spacy/tests/parser/test_ner.py | 2 +- spacy/tests/pipeline/test_entity_linker.py | 2 - spacy/tests/pipeline/test_entity_ruler.py | 14 ------ spacy/tests/pipeline/test_pipe_factories.py | 31 +++++-------- spacy/tests/pipeline/test_textcat.py | 6 --- spacy/tests/regression/test_issue8216.py | 34 -------------- spacy/tests/tokenizer/test_tokenizer.py | 3 +- spacy/tests/training/test_new_example.py | 21 --------- spacy/tokens/doc.pyx | 6 +-- spacy/training/example.pyx | 2 +- spacy/util.py | 30 +++++++++--- website/docs/api/textcategorizer.md | 9 ++-- website/meta/languages.json | 7 +-- 33 files changed, 105 insertions(+), 209 deletions(-) delete mode 100644 spacy/tests/regression/test_issue8216.py diff --git a/MANIFEST.in b/MANIFEST.in index 99fc174bd..8008b4507 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -8,4 +8,3 @@ recursive-exclude spacy/lang *.json recursive-include spacy/lang *.json.gz recursive-include spacy/cli *.json *.yml recursive-include licenses * -recursive-exclude spacy *.cpp diff --git a/spacy/about.py b/spacy/about.py index 123e5ea7c..c351076c5 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.7" +__version__ = "3.0.6" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index c84aa6431..d13a4fc80 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -115,8 +115,7 @@ def convert( ner_map = srsly.read_json(ner_map) if ner_map is not None else None doc_files = [] for input_loc in walk_directory(Path(input_path), converter): - with input_loc.open("r", encoding="utf-8") as infile: - input_data = infile.read() + input_data = input_loc.open("r", encoding="utf-8").read() # Use converter function to convert data func = CONVERTERS[converter] docs = func( diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 275476307..eaffde1d7 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -18,7 +18,7 @@ def package_cli( output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"), meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), - create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"), + create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"), version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"), build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."), diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index c139bcaae..e43c21bbd 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -418,7 +418,7 @@ compound = 1.001 [initialize] {% if use_transformer or optimize == "efficiency" or not word_vectors -%} -vectors = ${paths.vectors} +vectors = null {% else -%} vectors = "{{ word_vectors }}" {% endif -%} diff --git a/spacy/errors.py b/spacy/errors.py index de04ca641..e8eccaece 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -518,11 +518,6 @@ class Errors: E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") # New errors added in v3.x - E867 = ("The 'textcat' component requires at least two labels because it " - "uses mutually exclusive classes where exactly one label is True " - "for each doc. For binary classification tasks, you can use two " - "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you " - "can use the 'textcat_multilabel' component with one label.") E870 = ("Could not serialize the DocBin because it is too large. Consider " "splitting up your documents into several doc bins and serializing " "each separately. spacy.Corpus.v1 will search recursively for all " diff --git a/spacy/lang/az/__init__.py b/spacy/lang/az/__init__.py index 2937e2ecf..6a4288d1e 100644 --- a/spacy/lang/az/__init__.py +++ b/spacy/lang/az/__init__.py @@ -1,11 +1,16 @@ +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .stop_words import STOP_WORDS +from .syntax_iterators import SYNTAX_ITERATORS from .lex_attrs import LEX_ATTRS from ...language import Language class AzerbaijaniDefaults(Language.Defaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS + token_match = TOKEN_MATCH + syntax_iterators = SYNTAX_ITERATORS class Azerbaijani(Language): diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py index 631848af4..a049601dc 100644 --- a/spacy/lang/el/lemmatizer.py +++ b/spacy/lang/el/lemmatizer.py @@ -57,6 +57,6 @@ class GreekLemmatizer(Lemmatizer): forms.extend(oov_forms) if not forms: forms.append(string) - forms = list(dict.fromkeys(forms)) + forms = list(set(forms)) self.cache[cache_key] = forms return forms diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 5a49a4e00..63aa94a36 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -12,6 +12,7 @@ PUNCT_RULES = {"«": '"', "»": '"'} class RussianLemmatizer(Lemmatizer): + _morph = None def __init__( self, @@ -30,8 +31,8 @@ class RussianLemmatizer(Lemmatizer): "The Russian lemmatizer mode 'pymorphy2' requires the " "pymorphy2 library. Install it with: pip install pymorphy2" ) from None - if getattr(self, "_morph", None) is None: - self._morph = MorphAnalyzer() + if RussianLemmatizer._morph is None: + RussianLemmatizer._morph = MorphAnalyzer() super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) def pymorphy2_lemmatize(self, token: Token) -> List[str]: diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 1fb030e06..e1fdf39fc 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -7,6 +7,8 @@ from ...vocab import Vocab class UkrainianLemmatizer(RussianLemmatizer): + _morph = None + def __init__( self, vocab: Vocab, @@ -25,6 +27,6 @@ class UkrainianLemmatizer(RussianLemmatizer): "pymorphy2 library and dictionaries. Install them with: " "pip install pymorphy2 pymorphy2-dicts-uk" ) from None - if getattr(self, "_morph", None) is None: - self._morph = MorphAnalyzer(lang="uk") + if UkrainianLemmatizer._morph is None: + UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk") super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index d8486b84b..e5ff2202c 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -50,8 +50,6 @@ cdef class PhraseMatcher: if isinstance(attr, (int, long)): self.attr = attr else: - if attr is None: - attr = "ORTH" attr = attr.upper() if attr == "TEXT": attr = "ORTH" diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 97bef2d0e..d4d2d638b 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -3,7 +3,7 @@ from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Mode from thinc.api import MultiSoftmax, list2array from thinc.api import to_categorical, CosineDistance, L2Distance -from ...util import registry, OOV_RANK +from ...util import registry from ...errors import Errors from ...attrs import ID @@ -70,7 +70,6 @@ def get_vectors_loss(ops, docs, prediction, distance): # and look them up all at once. This prevents data copying. ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = docs[0].vocab.vectors.data[ids] - target[ids == OOV_RANK] = 0 d_target, loss = distance(prediction, target) return loss, d_target diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 243ad9094..a03b6b384 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -481,8 +481,7 @@ class EntityLinker(TrainablePipe): def load_model(p): try: - with p.open("rb") as infile: - self.model.from_bytes(infile.read()) + self.model.from_bytes(p.open("rb").read()) except AttributeError: raise ValueError(Errors.E149) from None diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 761ff12bf..a74d2f303 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -3,7 +3,6 @@ from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, from collections import defaultdict from pathlib import Path import srsly -import warnings from .pipe import Pipe from ..training import Example @@ -103,12 +102,17 @@ class EntityRuler(Pipe): self.overwrite = overwrite_ents self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) - self._validate = validate self.matcher = Matcher(nlp.vocab, validate=validate) - self.phrase_matcher_attr = phrase_matcher_attr - self.phrase_matcher = PhraseMatcher( - nlp.vocab, attr=self.phrase_matcher_attr, validate=validate - ) + if phrase_matcher_attr is not None: + if phrase_matcher_attr.upper() == "TEXT": + phrase_matcher_attr = "ORTH" + self.phrase_matcher_attr = phrase_matcher_attr + self.phrase_matcher = PhraseMatcher( + nlp.vocab, attr=self.phrase_matcher_attr, validate=validate + ) + else: + self.phrase_matcher_attr = None + self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate) self.ent_id_sep = ent_id_sep self._ent_ids = defaultdict(dict) if patterns is not None: @@ -142,9 +146,7 @@ class EntityRuler(Pipe): def match(self, doc: Doc): self._require_patterns() - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="\\[W036") - matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) + matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) matches = set( [(m_id, start, end) for m_id, start, end in matches if start != end] ) @@ -279,7 +281,7 @@ class EntityRuler(Pipe): current_index = i break subsequent_pipes = [ - pipe for pipe in self.nlp.pipe_names[current_index :] + pipe for pipe in self.nlp.pipe_names[current_index + 1 :] ] except ValueError: subsequent_pipes = [] @@ -315,22 +317,20 @@ class EntityRuler(Pipe): pattern = entry["pattern"] if isinstance(pattern, Doc): self.phrase_patterns[label].append(pattern) - self.phrase_matcher.add(label, [pattern]) elif isinstance(pattern, list): self.token_patterns[label].append(pattern) - self.matcher.add(label, [pattern]) else: raise ValueError(Errors.E097.format(pattern=pattern)) + for label, patterns in self.token_patterns.items(): + self.matcher.add(label, patterns) + for label, patterns in self.phrase_patterns.items(): + self.phrase_matcher.add(label, patterns) def clear(self) -> None: """Reset all patterns.""" self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) self._ent_ids = defaultdict(dict) - self.matcher = Matcher(self.nlp.vocab, validate=self._validate) - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate - ) def _require_patterns(self) -> None: """Raise a warning if this component has no patterns defined.""" @@ -381,9 +381,10 @@ class EntityRuler(Pipe): self.add_patterns(cfg.get("patterns", cfg)) self.overwrite = cfg.get("overwrite", False) self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None) - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr - ) + if self.phrase_matcher_attr is not None: + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) else: self.add_patterns(cfg) @@ -434,9 +435,10 @@ class EntityRuler(Pipe): self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr - ) + if self.phrase_matcher_attr is not None: + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) from_disk(path, deserializers_patterns, {}) return self diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 8edd99361..1d652a483 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -332,8 +332,6 @@ class TextCategorizer(TrainablePipe): else: for label in labels: self.add_label(label) - if len(self.labels) < 2: - raise ValueError(Errors.E867) if positive_label is not None: if positive_label not in self.labels: err = Errors.E920.format(pos_label=positive_label, labels=self.labels) diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx index fe51f38e5..64e33f800 100644 --- a/spacy/pipeline/trainable_pipe.pyx +++ b/spacy/pipeline/trainable_pipe.pyx @@ -324,8 +324,7 @@ cdef class TrainablePipe(Pipe): def load_model(p): try: - with open(p, "rb") as mfile: - self.model.from_bytes(mfile.read()) + self.model.from_bytes(p.open("rb").read()) except AttributeError: raise ValueError(Errors.E149) from None diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 3dafb6956..3aae063d3 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -351,25 +351,17 @@ def test_doc_from_array_morph(en_vocab): @pytest.mark.usefixtures("clean_underscore") def test_doc_api_from_docs(en_tokenizer, de_tokenizer): - en_texts = [ - "Merging the docs is fun.", - "", - "They don't think alike. ", - "Another doc.", - ] + en_texts = ["Merging the docs is fun.", "", "They don't think alike."] en_texts_without_empty = [t for t in en_texts if len(t)] de_text = "Wie war die Frage?" en_docs = [en_tokenizer(text) for text in en_texts] en_docs[0].spans["group"] = [en_docs[0][1:4]] en_docs[2].spans["group"] = [en_docs[2][1:4]] - en_docs[3].spans["group"] = [en_docs[3][0:1]] - span_group_texts = sorted( - [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text] - ) + span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text]) de_doc = de_tokenizer(de_text) Token.set_extension("is_ambiguous", default=False) - en_docs[0][2]._.is_ambiguous = True # docs - en_docs[2][3]._.is_ambiguous = True # think + en_docs[0][2]._.is_ambiguous = True # docs + en_docs[2][3]._.is_ambiguous = True # think assert Doc.from_docs([]) is None assert de_doc is not Doc.from_docs([de_doc]) assert str(de_doc) == str(Doc.from_docs([de_doc])) @@ -379,8 +371,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): m_doc = Doc.from_docs(en_docs) assert len(en_texts_without_empty) == len(list(m_doc.sents)) - assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1]) - assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty]) + assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) + assert str(m_doc) == " ".join(en_texts_without_empty) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] @@ -392,12 +384,11 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert not any([t._.is_ambiguous for t in m_doc[3:8]]) assert "group" in m_doc.spans assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) - assert bool(m_doc[11].whitespace_) m_doc = Doc.from_docs(en_docs, ensure_whitespace=False) assert len(en_texts_without_empty) == len(list(m_doc.sents)) - assert len(m_doc.text) == sum(len(t) for t in en_texts) - assert m_doc.text == "".join(en_texts_without_empty) + assert len(str(m_doc)) == sum(len(t) for t in en_texts) + assert str(m_doc) == "".join(en_texts) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and not bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] @@ -406,12 +397,11 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert m_doc[9].idx == think_idx assert "group" in m_doc.spans assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) - assert bool(m_doc[11].whitespace_) m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"]) - assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1]) + assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) # space delimiter considered, although spacy attribute was missing - assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty]) + assert str(m_doc) == " ".join(en_texts_without_empty) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] @@ -424,16 +414,6 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): # can merge empty docs doc = Doc.from_docs([en_tokenizer("")] * 10) - # empty but set spans keys are preserved - en_docs = [en_tokenizer(text) for text in en_texts] - m_doc = Doc.from_docs(en_docs) - assert "group" not in m_doc.spans - for doc in en_docs: - doc.spans["group"] = [] - m_doc = Doc.from_docs(en_docs) - assert "group" in m_doc.spans - assert len(m_doc.spans["group"]) == 0 - def test_doc_api_from_docs_ents(en_tokenizer): texts = ["Merging the docs is fun.", "They don't think alike."] diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 36f4a75e0..46f1f2bd1 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -4,13 +4,12 @@ from spacy.util import get_lang_class # fmt: off # Only include languages with no external dependencies -# excluded: ja, ko, th, vi, zh -LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", - "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi", - "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv", - "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", - "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn", - "tr", "tt", "uk", "ur", "xx", "yo"] +# excluded: ja, ru, th, uk, vi, zh +LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", + "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", + "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", + "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur", + "yo"] # fmt: on diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index e0f655bbe..d3772a931 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -481,7 +481,6 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text): assert len(matches) == 1 -@pytest.mark.filterwarnings("ignore:\\[W036") def test_matcher_valid_callback(en_vocab): """Test that on_match can only be None or callable.""" matcher = Matcher(en_vocab) diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index 36708edd0..9f575fe05 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -180,7 +180,6 @@ def test_matcher_sets_return_correct_tokens(en_vocab): assert texts == ["zero", "one", "two"] -@pytest.mark.filterwarnings("ignore:\\[W036") def test_matcher_remove(): nlp = English() matcher = Matcher(nlp.vocab) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index d7c37fbd1..bebadf7e9 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -252,12 +252,12 @@ def test_ruler_before_ner(): # 1 : Entity Ruler - should set "this" to B and everything else to empty patterns = [{"label": "THING", "pattern": "This"}] ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) # 2: untrained NER - should set everything else to O untrained_ner = nlp.add_pipe("ner") untrained_ner.add_label("MY_LABEL") nlp.initialize() - ruler.add_patterns(patterns) doc = nlp("This is Antti Korhonen speaking in Finland") expected_iobs = ["B", "O", "O", "O", "O", "O", "O"] expected_types = ["THING", "", "", "", "", "", ""] diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 13c8cb72e..a7f9364e9 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -324,7 +324,6 @@ def test_append_alias(nlp): assert len(mykb.get_alias_candidates("douglas")) == 3 -@pytest.mark.filterwarnings("ignore:\\[W036") def test_append_invalid_alias(nlp): """Test that append an alias will throw an error if prior probs are exceeding 1""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) @@ -343,7 +342,6 @@ def test_append_invalid_alias(nlp): mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) -@pytest.mark.filterwarnings("ignore:\\[W036") def test_preserving_links_asdoc(nlp): """Test that Span.as_doc preserves the existing entity links""" vector_length = 1 diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index dc0ca0301..a382532d2 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -89,20 +89,6 @@ def test_entity_ruler_init_clear(nlp, patterns): assert len(ruler.labels) == 0 -def test_entity_ruler_clear(nlp, patterns): - """Test that initialization clears patterns.""" - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - assert len(ruler.labels) == 4 - doc = nlp("hello world") - assert len(doc.ents) == 1 - ruler.clear() - assert len(ruler.labels) == 0 - with pytest.warns(UserWarning): - doc = nlp("hello world") - assert len(doc.ents) == 0 - - def test_entity_ruler_existing(nlp, patterns): ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index c5cc62661..a7071abfd 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -334,31 +334,24 @@ def test_language_factories_invalid(): @pytest.mark.parametrize( - "weights,override,expected", + "weights,expected", [ - ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}), - ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}), + ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}), + ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}), ( [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}], - {}, {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17}, ), ( - [{"a": 100, "b": 300}, {"c": 50, "d": 50}], - {}, - {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1}, + [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], + {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, ), - ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}), - ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}), - ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}), - ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}), - ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}), - ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}), - ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}), + ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}), ], ) -def test_language_factories_combine_score_weights(weights, override, expected): - result = combine_score_weights(weights, override) +def test_language_factories_combine_score_weights(weights, expected): + result = combine_score_weights(weights) assert sum(result.values()) in (0.99, 1.0, 0.0) assert result == expected @@ -384,17 +377,17 @@ def test_language_factories_scores(): # Test with custom defaults config = nlp.config.copy() config["training"]["score_weights"]["a1"] = 0.0 - config["training"]["score_weights"]["b3"] = 1.3 + config["training"]["score_weights"]["b3"] = 1.0 nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65} + expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34} assert score_weights == expected # Test with null values config = nlp.config.copy() config["training"]["score_weights"]["a1"] = None nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66} + expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35} assert score_weights == expected diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index c251f3dfd..43dfff147 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -108,12 +108,6 @@ def test_label_types(name): textcat.add_label("answer") with pytest.raises(ValueError): textcat.add_label(9) - # textcat requires at least two labels - if name == "textcat": - with pytest.raises(ValueError): - nlp.initialize() - else: - nlp.initialize() @pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"]) diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py deleted file mode 100644 index 528d4b6f9..000000000 --- a/spacy/tests/regression/test_issue8216.py +++ /dev/null @@ -1,34 +0,0 @@ -import pytest - -from spacy import registry -from spacy.language import Language -from spacy.pipeline import EntityRuler - - -@pytest.fixture -def nlp(): - return Language() - - -@pytest.fixture -@registry.misc("entity_ruler_patterns") -def patterns(): - return [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"}, - ] - - -def test_entity_ruler_fix8216(nlp, patterns): - """Test that patterns don't get added excessively.""" - ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) - ruler.add_patterns(patterns) - pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) - assert pattern_count > 0 - ruler.add_patterns([]) - after_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) - assert after_count == pattern_count diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index c1ba1df36..6cfeaf014 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -84,8 +84,7 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n @pytest.mark.parametrize("file_name", ["sun.txt"]) def test_tokenizer_handle_text_from_file(tokenizer, file_name): loc = ensure_path(__file__).parent / file_name - with loc.open("r", encoding="utf8") as infile: - text = infile.read() + text = loc.open("r", encoding="utf8").read() assert len(text) != 0 tokens = tokenizer(text) assert len(tokens) > 100 diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py index 4dd90f416..ba58ea96d 100644 --- a/spacy/tests/training/test_new_example.py +++ b/spacy/tests/training/test_new_example.py @@ -182,27 +182,6 @@ def test_Example_from_dict_with_entities(annots): assert example.reference[5].ent_type_ == "LOC" -def test_Example_from_dict_with_empty_entities(): - annots = { - "words": ["I", "like", "New", "York", "and", "Berlin", "."], - "entities": [], - } - vocab = Vocab() - predicted = Doc(vocab, words=annots["words"]) - example = Example.from_dict(predicted, annots) - # entities as empty list sets everything to O - assert example.reference.has_annotation("ENT_IOB") - assert len(list(example.reference.ents)) == 0 - assert all(token.ent_iob_ == "O" for token in example.reference) - # various unset/missing entities leaves entities unset - annots["entities"] = None - example = Example.from_dict(predicted, annots) - assert not example.reference.has_annotation("ENT_IOB") - annots.pop("entities", None) - example = Example.from_dict(predicted, annots) - assert not example.reference.has_annotation("ENT_IOB") - - @pytest.mark.parametrize( "annots", [ diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index a6732c7c9..7a50d3d53 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1141,10 +1141,6 @@ cdef class Doc: else: warnings.warn(Warnings.W102.format(key=key, value=value)) for key in doc.spans: - # if a spans key is in any doc, include it in the merged doc - # even if it is empty - if key not in concat_spans: - concat_spans[key] = [] for span in doc.spans[key]: concat_spans[key].append(( span.start_char + char_offset, @@ -1154,7 +1150,7 @@ cdef class Doc: span.text, # included as a check )) char_offset += len(doc.text) - if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_): + if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space: char_offset += 1 arrays = [doc.to_array(attrs) for doc in docs] diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 1c80dc268..07a83bfec 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -416,7 +416,7 @@ def _fix_legacy_dict_data(example_dict): token_dict = example_dict.get("token_annotation", {}) doc_dict = example_dict.get("doc_annotation", {}) for key, value in example_dict.items(): - if value is not None: + if value: if key in ("token_annotation", "doc_annotation"): pass elif key == "ids": diff --git a/spacy/util.py b/spacy/util.py index 52b48dcdb..928106dbb 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1370,14 +1370,32 @@ def combine_score_weights( should be preserved. RETURNS (Dict[str, float]): The combined and normalized weights. """ - # We divide each weight by the total weight sum. # We first need to extract all None/null values for score weights that # shouldn't be shown in the table *or* be weighted - result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()} - weight_sum = sum([v if v else 0.0 for v in result.values()]) - for key, value in result.items(): - if value and weight_sum > 0: - result[key] = round(value / weight_sum, 2) + result = {} + all_weights = [] + for w_dict in weights: + filtered_weights = {} + for key, value in w_dict.items(): + value = overrides.get(key, value) + if value is None: + result[key] = None + else: + filtered_weights[key] = value + all_weights.append(filtered_weights) + for w_dict in all_weights: + # We need to account for weights that don't sum to 1.0 and normalize + # the score weights accordingly, then divide score by the number of + # components. + total = sum(w_dict.values()) + for key, value in w_dict.items(): + if total == 0: + weight = 0.0 + else: + weight = round(value / total / len(all_weights), 2) + prev_weight = result.get(key, 0.0) + prev_weight = 0.0 if prev_weight is None else prev_weight + result[key] = prev_weight + weight return result diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index baa30ae01..fdd235b85 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -10,12 +10,11 @@ api_trainable: true --- The text categorizer predicts **categories over a whole document**. and comes in -two flavors: `textcat` and `textcat_multilabel`. When you need to predict +two flavours: `textcat` and `textcat_multilabel`. When you need to predict exactly one true label per document, use the `textcat` which has mutually exclusive labels. If you want to perform multi-label classification and predict -zero, one or more true labels per document, use the `textcat_multilabel` -component instead. For a binary classification task, you can use `textcat` with -**two** labels or `textcat_multilabel` with **one** label. +zero, one or more labels per document, use the `textcat_multilabel` component +instead. Both components are documented on this page. @@ -190,7 +189,7 @@ This method was previously called `begin_training`. | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | -| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is only used during scoring. It is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ | +| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ | ## TextCategorizer.predict {#predict tag="method"} diff --git a/website/meta/languages.json b/website/meta/languages.json index f6ac45b15..e05718047 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -262,12 +262,7 @@ }, { "code": "mk", - "name": "Macedonian", - "models": [ - "mk_core_news_sm", - "mk_core_news_md", - "mk_core_news_lg" - ] + "name": "Macedonian" }, { "code": "ml",