Mirror of https://github.com/explosion/spaCy.git
Backport bugfixes from v3.1.0 to v3.0 (#8739)
* Fix scoring normalization (#7629)
  * Score weights by total sum instead of per component
  * Cleanup
* Use a context manager when reading model (fix #7036) (#8244)
* Fix other open calls without context managers (#8245)
* Don't add duplicate patterns all the time in EntityRuler (fix #8216) (#8246)
  * Refactor EntityRuler init: this simplifies the EntityRuler init code and is helpful as prep for allowing the EntityRuler to reset itself
  * Make EntityRuler.clear reset matchers (includes a new test for this)
  * Tidy PhraseMatcher instantiation: since the attr can now safely be None, the guard `if` is no longer required; also renamed the `_validate` attr (maybe it's not needed?)
  * Fix NER test
  * Add a test to make sure patterns aren't increasing, and move it to the regression tests
* Exclude generated .cpp files from package (#8271)
* Fix non-deterministic deduplication in Greek lemmatizer (#8421)
* Fix setting empty entities in Example.from_dict (#8426)
* Filter W036 for entity ruler, etc. (#8424)
* Preserve paths.vectors/initialize.vectors setting in quickstart template
* Various fixes for spans in Doc.from_docs (#8487)
  * Fix span offsets if a doc ends in a single space and no space is inserted
  * Also include the spans key in the merged doc for empty spans lists
* Fix duplicate spacy package CLI opts (#8551): use `-c` for `--code` and not additionally for `--create-meta`, in line with the docs
* Raise an error for textcat with <2 labels (#8584): raise an error if initializing a `textcat` component without at least two labels; add a similar note to the docs and update the positive_label description in the API docs
* Add Macedonian models to website (#8637)
* Fix Azerbaijani init, extend lang init tests (#8656)
  * Extend langs in initialize tests
  * Fix az init
* Fix ru/uk lemmatizer multiprocessing with spawn (#8657): use an instance variable instead of a class variable for the morphological analyzer so that multiprocessing with spawn is possible
* Use 0-vector for OOV lexemes (#8639)
* Set version to v3.0.7

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
This commit is contained in:
parent bb97e7bf8a
commit f94168a41e
@@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
 recursive-include spacy/cli *.json *.yml
 recursive-include licenses *
+recursive-exclude spacy *.cpp
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.6"
+__version__ = "3.0.7"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -115,7 +115,8 @@ def convert(
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(Path(input_path), converter):
-        input_data = input_loc.open("r", encoding="utf-8").read()
+        with input_loc.open("r", encoding="utf-8") as infile:
+            input_data = infile.read()
         # Use converter function to convert data
         func = CONVERTERS[converter]
         docs = func(
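This hunk and several later ones apply the same pattern: replace a bare open(...).read() with a context manager so the file handle is always closed. A minimal standalone sketch of the pattern (the helper name and path argument here are just illustrative, not part of the commit):

from pathlib import Path

def read_text(path: Path) -> str:
    # The context manager closes the handle even if .read() raises,
    # unlike path.open("r").read(), which leaks the handle on error.
    with path.open("r", encoding="utf-8") as infile:
        return infile.read()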
@@ -18,7 +18,7 @@ def package_cli(
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
     code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
-    create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
+    create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
@@ -418,7 +418,7 @@ compound = 1.001
 
 [initialize]
 {% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
+vectors = ${paths.vectors}
 {% else -%}
 vectors = "{{ word_vectors }}"
 {% endif -%}
@@ -518,6 +518,11 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
     # New errors added in v3.x
+    E867 = ("The 'textcat' component requires at least two labels because it "
+            "uses mutually exclusive classes where exactly one label is True "
+            "for each doc. For binary classification tasks, you can use two "
+            "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
+            "can use the 'textcat_multilabel' component with one label.")
     E870 = ("Could not serialize the DocBin because it is too large. Consider "
             "splitting up your documents into several doc bins and serializing "
            "each separately. spacy.Corpus.v1 will search recursively for all "
@@ -1,16 +1,11 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .stop_words import STOP_WORDS
-from .syntax_iterators import SYNTAX_ITERATORS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 
 
 class AzerbaijaniDefaults(Language.Defaults):
-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
-    token_match = TOKEN_MATCH
-    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Azerbaijani(Language):
@@ -57,6 +57,6 @@ class GreekLemmatizer(Lemmatizer):
            forms.extend(oov_forms)
        if not forms:
            forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
        self.cache[cache_key] = forms
        return forms
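For context (not part of the diff): list(set(...)) removes duplicates but does not guarantee a stable order between runs, while dict.fromkeys keeps the first-seen order, which is what makes the deduplication deterministic. A tiny illustration:

forms = ["πόλη", "πόλεις", "πόλη", "πόλις"]
assert list(dict.fromkeys(forms)) == ["πόλη", "πόλεις", "πόλις"]  # stable, insertion-ordered
# list(set(forms)) also deduplicates, but its ordering is not guaranteed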
@@ -12,7 +12,6 @@ PUNCT_RULES = {"«": '"', "»": '"'}
 
 
 class RussianLemmatizer(Lemmatizer):
-    _morph = None
 
     def __init__(
         self,
@@ -31,8 +30,8 @@ class RussianLemmatizer(Lemmatizer):
                     "The Russian lemmatizer mode 'pymorphy2' requires the "
                     "pymorphy2 library. Install it with: pip install pymorphy2"
                 ) from None
-            if RussianLemmatizer._morph is None:
-                RussianLemmatizer._morph = MorphAnalyzer()
+            if getattr(self, "_morph", None) is None:
+                self._morph = MorphAnalyzer()
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
 
     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
@@ -7,8 +7,6 @@ from ...vocab import Vocab
 
 
 class UkrainianLemmatizer(RussianLemmatizer):
-    _morph = None
-
     def __init__(
         self,
         vocab: Vocab,
@@ -27,6 +25,6 @@ class UkrainianLemmatizer(RussianLemmatizer):
                     "pymorphy2 library and dictionaries. Install them with: "
                     "pip install pymorphy2 pymorphy2-dicts-uk"
                 ) from None
-            if UkrainianLemmatizer._morph is None:
-                UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
+            if getattr(self, "_morph", None) is None:
+                self._morph = MorphAnalyzer(lang="uk")
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
@@ -50,6 +50,8 @@ cdef class PhraseMatcher:
         if isinstance(attr, (int, long)):
             self.attr = attr
         else:
+            if attr is None:
+                attr = "ORTH"
             attr = attr.upper()
             if attr == "TEXT":
                 attr = "ORTH"
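With this change, PhraseMatcher treats attr=None the same as the default "ORTH", which is what lets EntityRuler pass its phrase_matcher_attr straight through (see the EntityRuler hunks below). A small usage sketch, assuming only a blank English pipeline:

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

nlp = English()
# attr=None now falls back to "ORTH" instead of failing on attr.upper()
matcher = PhraseMatcher(nlp.vocab, attr=None)
matcher.add("GREETING", [nlp.make_doc("hello world")])
assert len(matcher(nlp("hello world again"))) == 1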
@@ -3,7 +3,7 @@ from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Mode
 from thinc.api import MultiSoftmax, list2array
 from thinc.api import to_categorical, CosineDistance, L2Distance
 
-from ...util import registry
+from ...util import registry, OOV_RANK
 from ...errors import Errors
 from ...attrs import ID
 
@@ -70,6 +70,7 @@ def get_vectors_loss(ops, docs, prediction, distance):
     # and look them up all at once. This prevents data copying.
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
+    target[ids == OOV_RANK] = 0
     d_target, loss = distance(prediction, target)
     return loss, d_target
 
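The one-line fix above zeroes the target rows for out-of-vocabulary lexemes instead of letting them point at an arbitrary vector. The same masking idea in plain NumPy, with made-up toy values (the OOV_RANK value is an assumption mirroring spacy.util.OOV_RANK):

import numpy as np

OOV_RANK = np.iinfo(np.uint64).max              # assumed sentinel rank for OOV lexemes
ids = np.array([3, 7, OOV_RANK, 1], dtype="uint64")
target = np.random.rand(4, 5).astype("float32")  # stand-in for rows looked up from the vectors table

target[ids == OOV_RANK] = 0                      # boolean mask: OOV rows become 0-vectors
assert not target[2].any()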
@@ -481,7 +481,8 @@ class EntityLinker(TrainablePipe):
 
         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with p.open("rb") as infile:
+                    self.model.from_bytes(infile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None
 
@@ -3,6 +3,7 @@ from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable,
 from collections import defaultdict
 from pathlib import Path
 import srsly
+import warnings
 
 from .pipe import Pipe
 from ..training import Example
@@ -102,17 +103,12 @@ class EntityRuler(Pipe):
         self.overwrite = overwrite_ents
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
+        self._validate = validate
         self.matcher = Matcher(nlp.vocab, validate=validate)
-        if phrase_matcher_attr is not None:
-            if phrase_matcher_attr.upper() == "TEXT":
-                phrase_matcher_attr = "ORTH"
-            self.phrase_matcher_attr = phrase_matcher_attr
-            self.phrase_matcher = PhraseMatcher(
-                nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-            )
-        else:
-            self.phrase_matcher_attr = None
-            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
+        self.phrase_matcher_attr = phrase_matcher_attr
+        self.phrase_matcher = PhraseMatcher(
+            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
+        )
         self.ent_id_sep = ent_id_sep
         self._ent_ids = defaultdict(dict)
         if patterns is not None:
@@ -146,7 +142,9 @@ class EntityRuler(Pipe):
 
     def match(self, doc: Doc):
         self._require_patterns()
-        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="\\[W036")
+            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
         matches = set(
             [(m_id, start, end) for m_id, start, end in matches if start != end]
         )
@@ -281,7 +279,7 @@ class EntityRuler(Pipe):
                     current_index = i
                     break
             subsequent_pipes = [
-                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
+                pipe for pipe in self.nlp.pipe_names[current_index :]
             ]
         except ValueError:
             subsequent_pipes = []
@@ -317,20 +315,22 @@ class EntityRuler(Pipe):
             pattern = entry["pattern"]
             if isinstance(pattern, Doc):
                 self.phrase_patterns[label].append(pattern)
+                self.phrase_matcher.add(label, [pattern])
             elif isinstance(pattern, list):
                 self.token_patterns[label].append(pattern)
+                self.matcher.add(label, [pattern])
             else:
                 raise ValueError(Errors.E097.format(pattern=pattern))
-        for label, patterns in self.token_patterns.items():
-            self.matcher.add(label, patterns)
-        for label, patterns in self.phrase_patterns.items():
-            self.phrase_matcher.add(label, patterns)
 
     def clear(self) -> None:
         """Reset all patterns."""
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
         self._ent_ids = defaultdict(dict)
+        self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
+        self.phrase_matcher = PhraseMatcher(
+            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
+        )
 
     def _require_patterns(self) -> None:
         """Raise a warning if this component has no patterns defined."""
@@ -381,10 +381,9 @@ class EntityRuler(Pipe):
             self.add_patterns(cfg.get("patterns", cfg))
             self.overwrite = cfg.get("overwrite", False)
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            if self.phrase_matcher_attr is not None:
-                self.phrase_matcher = PhraseMatcher(
-                    self.nlp.vocab, attr=self.phrase_matcher_attr
-                )
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab, attr=self.phrase_matcher_attr
+            )
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
         else:
             self.add_patterns(cfg)
@@ -435,10 +434,9 @@ class EntityRuler(Pipe):
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
 
-            if self.phrase_matcher_attr is not None:
-                self.phrase_matcher = PhraseMatcher(
-                    self.nlp.vocab, attr=self.phrase_matcher_attr
-                )
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab, attr=self.phrase_matcher_attr
+            )
             from_disk(path, deserializers_patterns, {})
         return self
 
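Taken together, the EntityRuler changes mean patterns are added to the matchers once, when add_patterns is called, and clear() now really empties the component. A hedged usage sketch of the behavior the new tests further down exercise (blank pipeline, illustrative pattern):

from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "GREETING", "pattern": "hello world"}])
assert len(nlp("hello world").ents) == 1

ruler.clear()  # now also resets the underlying Matcher/PhraseMatcher, not just the pattern lists
assert len(ruler.labels) == 0
assert len(nlp("hello world").ents) == 0  # warns about the missing patterns (W036), as the new test expects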
@@ -332,6 +332,8 @@ class TextCategorizer(TrainablePipe):
         else:
             for label in labels:
                 self.add_label(label)
+        if len(self.labels) < 2:
+            raise ValueError(Errors.E867)
         if positive_label is not None:
             if positive_label not in self.labels:
                 err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
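With E867 in place, a `textcat` component must end up with at least two labels before nlp.initialize() succeeds. A sketch of the two supported setups for a binary task (label names here are arbitrary):

import spacy

# Option 1: exclusive binary classes with textcat and two labels
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.initialize()  # would raise E867 if only one label had been added

# Option 2: a single label with textcat_multilabel
nlp2 = spacy.blank("en")
textcat_ml = nlp2.add_pipe("textcat_multilabel")
textcat_ml.add_label("POSITIVE")
nlp2.initialize()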
@@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe):
 
         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with open(p, "rb") as mfile:
+                    self.model.from_bytes(mfile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None
 
@@ -351,17 +351,25 @@ def test_doc_from_array_morph(en_vocab):
 
 @pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
-    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
+    en_texts = [
+        "Merging the docs is fun.",
+        "",
+        "They don't think alike. ",
+        "Another doc.",
+    ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
+    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    span_group_texts = sorted(
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+    )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
-    en_docs[0][2]._.is_ambiguous = True  # docs
-    en_docs[2][3]._.is_ambiguous = True  # think
+    en_docs[0][2]._.is_ambiguous = True  # docs
+    en_docs[2][3]._.is_ambiguous = True  # think
     assert Doc.from_docs([]) is None
     assert de_doc is not Doc.from_docs([de_doc])
     assert str(de_doc) == str(Doc.from_docs([de_doc]))
@@ -371,8 +379,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
 
     m_doc = Doc.from_docs(en_docs)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -384,11 +392,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert not any([t._.is_ambiguous for t in m_doc[3:8]])
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) == sum(len(t) for t in en_texts)
-    assert str(m_doc) == "".join(en_texts)
+    assert len(m_doc.text) == sum(len(t) for t in en_texts)
+    assert m_doc.text == "".join(en_texts_without_empty)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and not bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -397,11 +406,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
     # space delimiter considered, although spacy attribute was missing
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -414,6 +424,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     # can merge empty docs
     doc = Doc.from_docs([en_tokenizer("")] * 10)
 
+    # empty but set spans keys are preserved
+    en_docs = [en_tokenizer(text) for text in en_texts]
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" not in m_doc.spans
+    for doc in en_docs:
+        doc.spans["group"] = []
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" in m_doc.spans
+    assert len(m_doc.spans["group"]) == 0
+
 
 def test_doc_api_from_docs_ents(en_tokenizer):
     texts = ["Merging the docs is fun.", "They don't think alike."]
@@ -4,12 +4,13 @@ from spacy.util import get_lang_class
 
 # fmt: off
 # Only include languages with no external dependencies
-# excluded: ja, ru, th, uk, vi, zh
-LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
-             "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
-             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
-             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
-             "yo"]
+# excluded: ja, ko, th, vi, zh
+LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
+             "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
+             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
+             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
+             "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
+             "tr", "tt", "uk", "ur", "xx", "yo"]
 # fmt: on
 
 
@@ -481,6 +481,7 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text):
     assert len(matches) == 1
 
 
+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_matcher_valid_callback(en_vocab):
     """Test that on_match can only be None or callable."""
     matcher = Matcher(en_vocab)
@@ -180,6 +180,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab):
     assert texts == ["zero", "one", "two"]
 
 
+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_matcher_remove():
     nlp = English()
     matcher = Matcher(nlp.vocab)
@@ -252,12 +252,12 @@ def test_ruler_before_ner():
     # 1 : Entity Ruler - should set "this" to B and everything else to empty
     patterns = [{"label": "THING", "pattern": "This"}]
     ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)
 
     # 2: untrained NER - should set everything else to O
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
     nlp.initialize()
+    ruler.add_patterns(patterns)
     doc = nlp("This is Antti Korhonen speaking in Finland")
     expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
     expected_types = ["THING", "", "", "", "", "", ""]
@@ -324,6 +324,7 @@ def test_append_alias(nlp):
     assert len(mykb.get_alias_candidates("douglas")) == 3
 
 
+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_append_invalid_alias(nlp):
     """Test that append an alias will throw an error if prior probs are exceeding 1"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -342,6 +343,7 @@ def test_append_invalid_alias(nlp):
         mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
 
 
+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_preserving_links_asdoc(nlp):
     """Test that Span.as_doc preserves the existing entity links"""
     vector_length = 1
@@ -89,6 +89,20 @@ def test_entity_ruler_init_clear(nlp, patterns):
     assert len(ruler.labels) == 0
 
 
+def test_entity_ruler_clear(nlp, patterns):
+    """Test that initialization clears patterns."""
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world")
+    assert len(doc.ents) == 1
+    ruler.clear()
+    assert len(ruler.labels) == 0
+    with pytest.warns(UserWarning):
+        doc = nlp("hello world")
+    assert len(doc.ents) == 0
+
+
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
@@ -334,24 +334,31 @@ def test_language_factories_invalid():
 
 
 @pytest.mark.parametrize(
-    "weights,expected",
+    "weights,override,expected",
     [
-        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}),
-        ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}),
         (
            [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
+            {},
            {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
        ),
        (
-            [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
-            {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
+            [{"a": 100, "b": 300}, {"c": 50, "d": 50}],
+            {},
+            {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1},
        ),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
-        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}),
+        ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}),
     ],
 )
-def test_language_factories_combine_score_weights(weights, expected):
-    result = combine_score_weights(weights)
+def test_language_factories_combine_score_weights(weights, override, expected):
+    result = combine_score_weights(weights, override)
     assert sum(result.values()) in (0.99, 1.0, 0.0)
     assert result == expected
 
@@ -377,17 +384,17 @@ def test_language_factories_scores():
     # Test with custom defaults
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = 0.0
-    config["training"]["score_weights"]["b3"] = 1.0
+    config["training"]["score_weights"]["b3"] = 1.3
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
+    expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
     assert score_weights == expected
     # Test with null values
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = None
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
+    expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
     assert score_weights == expected
 
 
@@ -108,6 +108,12 @@ def test_label_types(name):
     textcat.add_label("answer")
     with pytest.raises(ValueError):
         textcat.add_label(9)
+    # textcat requires at least two labels
+    if name == "textcat":
+        with pytest.raises(ValueError):
+            nlp.initialize()
+    else:
+        nlp.initialize()
 
 
 @pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
spacy/tests/regression/test_issue8216.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+import pytest
+
+from spacy import registry
+from spacy.language import Language
+from spacy.pipeline import EntityRuler
+
+
+@pytest.fixture
+def nlp():
+    return Language()
+
+
+@pytest.fixture
+@registry.misc("entity_ruler_patterns")
+def patterns():
+    return [
+        {"label": "HELLO", "pattern": "hello world"},
+        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
+        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
+        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
+        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
+        {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"},
+    ]
+
+
+def test_entity_ruler_fix8216(nlp, patterns):
+    """Test that patterns don't get added excessively."""
+    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
+    ruler.add_patterns(patterns)
+    pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
+    assert pattern_count > 0
+    ruler.add_patterns([])
+    after_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
+    assert after_count == pattern_count
@@ -84,7 +84,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
 @pytest.mark.parametrize("file_name", ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
     loc = ensure_path(__file__).parent / file_name
-    text = loc.open("r", encoding="utf8").read()
+    with loc.open("r", encoding="utf8") as infile:
+        text = infile.read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100
@@ -182,6 +182,27 @@ def test_Example_from_dict_with_entities(annots):
     assert example.reference[5].ent_type_ == "LOC"
 
 
+def test_Example_from_dict_with_empty_entities():
+    annots = {
+        "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+        "entities": [],
+    }
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    example = Example.from_dict(predicted, annots)
+    # entities as empty list sets everything to O
+    assert example.reference.has_annotation("ENT_IOB")
+    assert len(list(example.reference.ents)) == 0
+    assert all(token.ent_iob_ == "O" for token in example.reference)
+    # various unset/missing entities leaves entities unset
+    annots["entities"] = None
+    example = Example.from_dict(predicted, annots)
+    assert not example.reference.has_annotation("ENT_IOB")
+    annots.pop("entities", None)
+    example = Example.from_dict(predicted, annots)
+    assert not example.reference.has_annotation("ENT_IOB")
+
+
 @pytest.mark.parametrize(
     "annots",
     [
@@ -1141,6 +1141,10 @@ cdef class Doc:
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
             for key in doc.spans:
+                # if a spans key is in any doc, include it in the merged doc
+                # even if it is empty
+                if key not in concat_spans:
+                    concat_spans[key] = []
                 for span in doc.spans[key]:
                     concat_spans[key].append((
                         span.start_char + char_offset,
@@ -1150,7 +1154,7 @@ cdef class Doc:
                         span.text, # included as a check
                     ))
             char_offset += len(doc.text)
-            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
+            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
                 char_offset += 1
 
         arrays = [doc.to_array(attrs) for doc in docs]
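The two Doc.from_docs changes are easiest to see from the behavior exercised in the updated test above: an empty-but-present spans key survives the merge, and no extra offset is added when a doc already ends in whitespace. A small hedged sketch (blank English pipeline, illustrative texts):

from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
docs = [nlp("They don't think alike. "), nlp("Another doc.")]
docs[0].spans["group"] = []  # empty span group; the key should still be preserved
merged = Doc.from_docs(docs)

assert "group" in merged.spans and len(merged.spans["group"]) == 0
# the trailing space of the first doc is reused, so no extra character is inserted
assert merged.text == "They don't think alike. Another doc."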
@@ -416,7 +416,7 @@ def _fix_legacy_dict_data(example_dict):
     token_dict = example_dict.get("token_annotation", {})
     doc_dict = example_dict.get("doc_annotation", {})
     for key, value in example_dict.items():
-        if value:
+        if value is not None:
             if key in ("token_annotation", "doc_annotation"):
                 pass
             elif key == "ids":
@@ -1370,32 +1370,14 @@ def combine_score_weights(
         should be preserved.
     RETURNS (Dict[str, float]): The combined and normalized weights.
     """
+    # We divide each weight by the total weight sum.
     # We first need to extract all None/null values for score weights that
     # shouldn't be shown in the table *or* be weighted
-    result = {}
-    all_weights = []
-    for w_dict in weights:
-        filtered_weights = {}
-        for key, value in w_dict.items():
-            value = overrides.get(key, value)
-            if value is None:
-                result[key] = None
-            else:
-                filtered_weights[key] = value
-        all_weights.append(filtered_weights)
-    for w_dict in all_weights:
-        # We need to account for weights that don't sum to 1.0 and normalize
-        # the score weights accordingly, then divide score by the number of
-        # components.
-        total = sum(w_dict.values())
-        for key, value in w_dict.items():
-            if total == 0:
-                weight = 0.0
-            else:
-                weight = round(value / total / len(all_weights), 2)
-            prev_weight = result.get(key, 0.0)
-            prev_weight = 0.0 if prev_weight is None else prev_weight
-            result[key] = prev_weight + weight
+    result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()}
+    weight_sum = sum([v if v else 0.0 for v in result.values()])
+    for key, value in result.items():
+        if value and weight_sum > 0:
+            result[key] = round(value / weight_sum, 2)
     return result
 
 
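The rewritten helper normalizes by the total of all weights rather than per component. A standalone sketch of the same arithmetic (not spaCy's code, just the formula), reproducing one row of the updated test parametrization above:

def combine(weights, overrides=None):
    overrides = overrides or {}
    # merge all component weights, apply overrides, then divide by the grand total
    result = {k: overrides.get(k, v) for w in weights for k, v in w.items()}
    total = sum(v if v else 0.0 for v in result.values())
    return {k: (round(v / total, 2) if v and total > 0 else v) for k, v in result.items()}

assert combine([{"a": 1.0}, {"b": 50}, {"c": 100}]) == {"a": 0.01, "b": 0.33, "c": 0.66}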
@@ -10,11 +10,12 @@ api_trainable: true
 ---
 
 The text categorizer predicts **categories over a whole document**. and comes in
-two flavours: `textcat` and `textcat_multilabel`. When you need to predict
+two flavors: `textcat` and `textcat_multilabel`. When you need to predict
 exactly one true label per document, use the `textcat` which has mutually
 exclusive labels. If you want to perform multi-label classification and predict
-zero, one or more labels per document, use the `textcat_multilabel` component
-instead.
+zero, one or more true labels per document, use the `textcat_multilabel`
+component instead. For a binary classification task, you can use `textcat` with
+**two** labels or `textcat_multilabel` with **one** label.
 
 Both components are documented on this page.
 
@@ -189,7 +190,7 @@ This method was previously called `begin_training`.
 | _keyword-only_ | |
 | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
 | `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
-| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ |
+| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is only used during scoring. It is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ |
 
 ## TextCategorizer.predict {#predict tag="method"}
 
@@ -262,7 +262,12 @@
         },
         {
             "code": "mk",
-            "name": "Macedonian"
+            "name": "Macedonian",
+            "models": [
+                "mk_core_news_sm",
+                "mk_core_news_md",
+                "mk_core_news_lg"
+            ]
         },
         {
             "code": "ml",