Merge pull request #8523 from adrianeboyd/chore/cleanup-v3.1.0

This commit is contained in:
Ines Montani 2021-06-28 21:45:38 +10:00 committed by GitHub
commit 8bc235dcc0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
80 changed files with 489 additions and 357 deletions

View File

@ -111,7 +111,7 @@ universal = false
formats = gztar
[flake8]
ignore = E203, E266, E501, E731, W503, E741
ignore = E203, E266, E501, E731, W503, E741, F541
max-line-length = 80
select = B,C,E,F,W,T4,B9
exclude =

View File

@ -4,6 +4,7 @@ import sys
# set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings
setup_default_warnings()
# These are imported as part of the API

View File

@ -6,7 +6,6 @@ import logging
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from ..training.initialize import init_nlp
from .. import util
from ..util import get_sourced_components, load_model_from_config

View File

@ -1,11 +1,11 @@
from typing import Dict, Any, Optional, Iterable
from typing import Dict, Any, Optional
from pathlib import Path
import itertools
from spacy.training import Example
from spacy.util import resolve_dot_names
from wasabi import msg
from thinc.api import fix_random_seed, set_dropout_rate, Adam
from thinc.api import fix_random_seed, set_dropout_rate
from thinc.api import Model, data_validation, set_gpu_allocator
import typer
@ -133,15 +133,16 @@ def debug_model(
_print_model(model, print_settings)
# STEP 2: Updating the model and printing again
optimizer = Adam(0.001)
set_dropout_rate(model, 0.2)
# ugly hack to deal with Tok2Vec/Transformer listeners
upstream_component = None
if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name:
upstream_component = nlp.get_pipe("tok2vec")
if model.has_ref("tok2vec") and "transformer-listener" in model.get_ref("tok2vec").name:
if (
model.has_ref("tok2vec")
and "transformer-listener" in model.get_ref("tok2vec").name
):
upstream_component = nlp.get_pipe("transformer")
goldY = None
for e in range(3):
if upstream_component:
upstream_component.update(examples)

View File

@ -127,7 +127,9 @@ def evaluate(
data["ents_per_type"] = scores["ents_per_type"]
if f"spans_{spans_key}_per_type" in scores:
if scores[f"spans_{spans_key}_per_type"]:
print_prf_per_type(msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type")
print_prf_per_type(
msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type"
)
data[f"spans_{spans_key}_per_type"] = scores[f"spans_{spans_key}_per_type"]
if "cats_f_per_type" in scores:
if scores["cats_f_per_type"]:

View File

@ -331,7 +331,7 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
continue
col1 = md.bold(md.code(pipe))
col2 = ", ".join(
[md.code(label.replace("|", "\|")) for label in labels]
[md.code(label.replace("|", "\\|")) for label in labels]
) # noqa: W605
label_data.append((col1, col2))
n_labels += len(labels)

View File

@ -5,7 +5,6 @@ import requests
from wasabi import msg, Printer
import warnings
from ..errors import Warnings
from ._util import app
from .. import about
from ..util import get_package_version, get_installed_models, get_minor_version

View File

@ -120,7 +120,9 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
doc (Doc): Document do parse.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data", "user_hooks"]))
doc = Doc(orig_doc.vocab).from_bytes(
orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
)
if not doc.has_annotation("DEP"):
warnings.warn(Warnings.W005)
if options.get("collapse_phrases", False):

View File

@ -22,13 +22,13 @@ _num_words = [
"тринадесет",
"тринайсет",
"четиринадесет",
"четиринайсет"
"четиринайсет",
"петнадесет",
"петнайсет"
"петнайсет",
"шестнадесет",
"шестнайсет",
"седемнадесет",
"седемнайсет"
"седемнайсет",
"осемнадесет",
"осемнайсет",
"деветнадесет",
@ -36,7 +36,7 @@ _num_words = [
"двадесет",
"двайсет",
"тридесет",
"трийсет"
"трийсет",
"четиридесет",
"четиресет",
"петдесет",

View File

@ -58,7 +58,6 @@ _abbr_dot_exc = [
{ORTH: "стр.", NORM: "страница"},
{ORTH: "ул.", NORM: "улица"},
{ORTH: "чл.", NORM: "член"},
]
for abbr in _abbr_dot_exc:

View File

@ -81,16 +81,32 @@ for exc_data in [
# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141
conj_contraction_bases = [
("ett", "että"), ("jott", "jotta"), ("kosk", "koska"), ("mutt", "mutta"),
("vaikk", "vaikka"), ("ehk", "ehkä"), ("miks", "miksi"), ("siks", "siksi"),
("joll", "jos"), ("ell", "jos")
("ett", "että"),
("jott", "jotta"),
("kosk", "koska"),
("mutt", "mutta"),
("vaikk", "vaikka"),
("ehk", "ehkä"),
("miks", "miksi"),
("siks", "siksi"),
("joll", "jos"),
("ell", "jos"),
]
conj_contraction_negations = [
("en", "en"), ("et", "et"), ("ei", "ei"), ("emme", "emme"),
("ette", "ette"), ("eivat", "eivät"), ("eivät", "eivät")]
("en", "en"),
("et", "et"),
("ei", "ei"),
("emme", "emme"),
("ette", "ette"),
("eivat", "eivät"),
("eivät", "eivät"),
]
for (base_lower, base_norm) in conj_contraction_bases:
for base in [base_lower, base_lower.title()]:
for (suffix, suffix_norm) in conj_contraction_negations:
_exc[base + suffix] = [{ORTH: base, NORM: base_norm}, {ORTH: suffix, NORM: suffix_norm}]
_exc[base + suffix] = [
{ORTH: base, NORM: base_norm},
{ORTH: suffix, NORM: suffix_norm},
]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -4,12 +4,12 @@ from ...pipeline import Lemmatizer
from ...tokens import Token
class ItalianLemmatizer(Lemmatizer):
"""This lemmatizer was adapted from the Polish one (version of April 2021).
It implements lookup lemmatization based on the morphological lexicon
morph-it (Baroni and Zanchetta). The table lemma_lookup with non-POS-aware
entries is used as a backup for words that aren't handled by morph-it."""
@classmethod
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "pos_lookup":

View File

@ -25,7 +25,7 @@ for orth in [
"artt.",
"att.",
"avv.",
"Avv."
"Avv.",
"by-pass",
"c.d.",
"c/c",

View File

@ -35,8 +35,8 @@ URL_PATTERN = (
# host & domain names
# mods: match is case-sensitive, so include [A-Z]
r"(?:" # noqa: E131
r"(?:"
r"[A-Za-z0-9\u00a1-\uffff]"
r"(?:" # noqa: E131
r"[A-Za-z0-9\u00a1-\uffff]" # noqa: E131
r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
r")?"
r"[A-Za-z0-9\u00a1-\uffff]\."

View File

@ -687,11 +687,13 @@ class Language:
if not isinstance(source, Language):
raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
# Check vectors, with faster checks first
if self.vocab.vectors.shape != source.vocab.vectors.shape or \
self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
if (
self.vocab.vectors.shape != source.vocab.vectors.shape
or self.vocab.vectors.key2row != source.vocab.vectors.key2row
or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes()
):
warnings.warn(Warnings.W113.format(name=source_name))
if not source_name in source.component_names:
if source_name not in source.component_names:
raise KeyError(
Errors.E944.format(
name=source_name,
@ -1539,15 +1541,21 @@ class Language:
# Cycle channels not to break the order of docs.
# The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
byte_tuples = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
byte_tuples = chain.from_iterable(
recv.recv() for recv in cycle(bytedocs_recv_ch)
)
try:
for i, (_, (byte_doc, byte_error)) in enumerate(zip(raw_texts, byte_tuples), 1):
for i, (_, (byte_doc, byte_error)) in enumerate(
zip(raw_texts, byte_tuples), 1
):
if byte_doc is not None:
doc = Doc(self.vocab).from_bytes(byte_doc)
yield doc
elif byte_error is not None:
error = srsly.msgpack_loads(byte_error)
self.default_error_handler(None, None, None, ValueError(Errors.E871.format(error=error)))
self.default_error_handler(
None, None, None, ValueError(Errors.E871.format(error=error))
)
if i % batch_size == 0:
# tell `sender` that one batch was consumed.
sender.step()
@ -1707,7 +1715,9 @@ class Language:
if "replace_listeners" in pipe_cfg:
for name, proc in source_nlps[model].pipeline:
if source_name in getattr(proc, "listening_components", []):
source_nlps[model].replace_listeners(name, source_name, pipe_cfg["replace_listeners"])
source_nlps[model].replace_listeners(
name, source_name, pipe_cfg["replace_listeners"]
)
listeners_replaced = True
nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
# Delete from cache if listeners were replaced
@ -1727,12 +1737,16 @@ class Language:
for name, proc in nlp.pipeline:
# Remove listeners not in the pipeline
listener_names = getattr(proc, "listening_components", [])
unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names]
unused_listener_names = [
ll for ll in listener_names if ll not in nlp.pipe_names
]
for listener_name in unused_listener_names:
for listener in proc.listener_map.get(listener_name, []):
proc.remove_listener(listener, listener_name)
for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer
for listener in getattr(
proc, "listening_components", []
): # e.g. tok2vec/transformer
# If it's a component sourced from another pipeline, we check if
# the tok2vec listeners should be replaced with standalone tok2vec
# models (e.g. so component can be frozen without its performance
@ -1827,7 +1841,9 @@ class Language:
new_config = tok2vec_cfg["model"]
if "replace_listener_cfg" in tok2vec_model.attrs:
replace_func = tok2vec_model.attrs["replace_listener_cfg"]
new_config = replace_func(tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"])
new_config = replace_func(
tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]
)
util.set_dot_to_object(pipe_cfg, listener_path, new_config)
# Go over the listener layers and replace them
for listener in pipe_listeners:
@ -1866,8 +1882,11 @@ class Language:
util.to_disk(path, serializers, exclude)
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
overrides: Dict[str, Any] = SimpleFrozenDict(),
self,
path: Union[str, Path],
*,
exclude: Iterable[str] = SimpleFrozenList(),
overrides: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language":
"""Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the

View File

@ -12,9 +12,7 @@ from .strings import get_string_id
UNSET = object()
def load_lookups(
lang: str, tables: List[str], strict: bool = True
) -> 'Lookups':
def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
"""Load the data from the spacy-lookups-data package for a given language,
if available. Returns an empty `Lookups` container if there's no data or if the package
is not installed.

View File

@ -1,7 +1,7 @@
from .entity_linker import * # noqa
from .multi_task import * # noqa
from .parser import * # noqa
from .spancat import * # noqa
from .spancat import * # noqa
from .tagger import * # noqa
from .textcat import * # noqa
from .tok2vec import * # noqa

View File

@ -309,9 +309,7 @@ class EntityLinker(TrainablePipe):
assert sent_index >= 0
# get n_neighbour sentences, clipped to the length of the document
start_sentence = max(0, sent_index - self.n_sents)
end_sentence = min(
len(sentences) - 1, sent_index + self.n_sents
)
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc()
@ -337,22 +335,16 @@ class EntityLinker(TrainablePipe):
else:
random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False
prior_probs = xp.asarray(
[c.prior_prob for c in candidates]
)
prior_probs = xp.asarray([c.prior_prob for c in candidates])
if not self.incl_prior:
prior_probs = xp.asarray(
[0.0 for _ in candidates]
)
prior_probs = xp.asarray([0.0 for _ in candidates])
scores = prior_probs
# add in similarity from the context
if self.incl_context:
entity_encodings = xp.asarray(
[c.entity_vector for c in candidates]
)
entity_norm = xp.linalg.norm(
entity_encodings, axis=1
)
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
if len(entity_encodings) != len(prior_probs):
raise RuntimeError(
Errors.E147.format(
@ -361,14 +353,12 @@ class EntityLinker(TrainablePipe):
)
)
# cosine similarity
sims = xp.dot(
entity_encodings, sentence_encoding_t
) / (sentence_norm * entity_norm)
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
sentence_norm * entity_norm
)
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
scores = (
prior_probs + sims - (prior_probs * sims)
)
scores = prior_probs + sims - (prior_probs * sims)
# TODO: thresholding
best_index = scores.argmax().item()
best_candidate = candidates[best_index]

View File

@ -3,7 +3,6 @@ from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable,
from collections import defaultdict
from pathlib import Path
import srsly
import warnings
from .pipe import Pipe
from ..training import Example
@ -278,9 +277,7 @@ class EntityRuler(Pipe):
if self == pipe:
current_index = i
break
subsequent_pipes = [
pipe for pipe in self.nlp.pipe_names[current_index :]
]
subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
except ValueError:
subsequent_pipes = []
with self.nlp.select_pipes(disable=subsequent_pipes):

View File

@ -61,7 +61,7 @@ def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]:
length = 0
for size in sizes:
if size <= len(doc):
starts_size = starts[:len(doc) - (size - 1)]
starts_size = starts[: len(doc) - (size - 1)]
spans.append(ops.xp.hstack((starts_size, starts_size + size)))
length += spans[-1].shape[0]
if spans:
@ -70,7 +70,7 @@ def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]:
if len(spans) > 0:
output = Ragged(ops.xp.vstack(spans), ops.asarray(lengths, dtype="i"))
else:
output = Ragged(ops.xp.zeros((0,0)), ops.asarray(lengths, dtype="i"))
output = Ragged(ops.xp.zeros((0, 0)), ops.asarray(lengths, dtype="i"))
assert output.dataXd.ndim == 2
return output

View File

@ -299,7 +299,9 @@ class TextCategorizer(TrainablePipe):
self._allow_extra_label()
self.cfg["labels"].append(label)
if self.model and "resize_output" in self.model.attrs:
self.model = self.model.attrs["resize_output"](self.model, len(self.cfg["labels"]))
self.model = self.model.attrs["resize_output"](
self.model, len(self.cfg["labels"])
)
self.vocab.strings.add(label)
return 1

View File

@ -365,7 +365,9 @@ class Scorer:
gold_spans.add(gold_span)
gold_per_type[span.label_].add(gold_span)
pred_per_type = {label: set() for label in labels}
for span in example.get_aligned_spans_x2y(getter(pred_doc, attr), allow_overlap):
for span in example.get_aligned_spans_x2y(
getter(pred_doc, attr), allow_overlap
):
if labeled:
pred_span = (span.label_, span.start, span.end - 1)
else:
@ -381,10 +383,10 @@ class Scorer:
score.score_set(pred_spans, gold_spans)
# Assemble final result
final_scores = {
f"{attr}_p": None,
f"{attr}_r": None,
f"{attr}_f": None,
}
f"{attr}_p": None,
f"{attr}_r": None,
f"{attr}_f": None,
}
if labeled:
final_scores[f"{attr}_per_type"] = None
if len(score) > 0:
@ -392,7 +394,9 @@ class Scorer:
final_scores[f"{attr}_r"] = score.recall
final_scores[f"{attr}_f"] = score.fscore
if labeled:
final_scores[f"{attr}_per_type"] = {k: v.to_dict() for k, v in score_per_type.items()}
final_scores[f"{attr}_per_type"] = {
k: v.to_dict() for k, v in score_per_type.items()
}
return final_scores
@staticmethod

View File

@ -381,9 +381,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
assert m_doc[2]._.is_ambiguous == True
assert m_doc[2]._.is_ambiguous is True
assert m_doc[9].idx == think_idx
assert m_doc[9]._.is_ambiguous == True
assert m_doc[9]._.is_ambiguous is True
assert not any([t._.is_ambiguous for t in m_doc[3:8]])
assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])

View File

@ -484,7 +484,7 @@ def test_doc_retokenize_merge_without_parse_keeps_sents(en_tokenizer):
assert len(list(doc.sents)) == 2
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[3:6])
assert doc[3].is_sent_start == None
assert doc[3].is_sent_start is None
# merging over a sentence boundary and setting sent_start
doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)

View File

@ -1,5 +1,5 @@
import pytest
from spacy.lang.bg.lex_attrs import like_num
@pytest.mark.parametrize(
"word,match",

View File

@ -40,20 +40,21 @@ CONTRACTION_TESTS = [
(
"Päätimme ettemme tule.",
["Päätimme", "ett", "emme", "tule", "."],
["päätimme", "että", "emme", "tule", "."]
["päätimme", "että", "emme", "tule", "."],
),
(
"Miksei puhuttaisi?",
["Miks", "ei", "puhuttaisi", "?"],
["miksi", "ei", "puhuttaisi", "?"]
["miksi", "ei", "puhuttaisi", "?"],
),
(
"He tottelivat vaikkeivat halunneet",
["He", "tottelivat", "vaikk", "eivat", "halunneet"],
["he", "tottelivat", "vaikka", "eivät", "halunneet"]
["he", "tottelivat", "vaikka", "eivät", "halunneet"],
),
]
@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
tokens = fi_tokenizer(text)

View File

@ -1,4 +1,3 @@
import pytest
from spacy.tokens import Doc

View File

@ -23,11 +23,11 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
nlp_r = Vietnamese()
nlp_r.from_bytes(nlp_bytes)
assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.use_pyvi == False
assert nlp_r.tokenizer.use_pyvi is False
with make_tempdir() as d:
nlp.to_disk(d)
nlp_r = Vietnamese()
nlp_r.from_disk(d)
assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.use_pyvi == False
assert nlp_r.tokenizer.use_pyvi is False

View File

@ -354,7 +354,6 @@ def test_dependency_matcher_span_user_data(en_tokenizer):
for token in doc:
token.head = doc[0]
token.dep_ = "a"
get_is_c = lambda token: token.text in ("c",)
Token.set_extension("is_c", default=False)
doc[2]._.is_c = True
pattern = [

View File

@ -255,13 +255,23 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
(0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]),
(1, "baab", "b a* b", [[0, 1, 1, 2]]),
(2, "aaab", "a a a b", [[0, 1, 2, 3]]),
(3, "aaab", "a+ b", [[0, 1], [0, 0, 1], [0, 0, 0, 1]]),
(3, "aaab", "a+ b", [[0, 1], [0, 0, 1], [0, 0, 0, 1]]),
(4, "aaba", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2]]),
(5, "aabaa", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2] ]),
(
5,
"aabaa",
"a+ b a+",
[[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2]],
),
(6, "aaba", "a+ b a*", [[0, 1], [0, 0, 1], [0, 0, 1, 2], [0, 1, 2]]),
(7, "aaaa", "a*", [[0], [0, 0], [0, 0, 0], [0, 0, 0, 0]]),
(8, "baab", "b a* b b*", [[0, 1, 1, 2]]),
(9, "aabb", "a* b* a*", [[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]]),
(
9,
"aabb",
"a* b* a*",
[[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]],
),
(10, "aaab", "a+ a+ a b", [[0, 1, 2, 3]]),
(11, "aaab", "a+ a+ a+ b", [[0, 1, 2, 3]]),
(12, "aaab", "a+ a a b", [[0, 1, 2, 3]]),

View File

@ -557,7 +557,11 @@ def test_neg_annotation(neg_key):
ner.add_label("PERSON")
ner.add_label("ORG")
example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "ORG"), Span(neg_doc, 2, 3, "PERSON"), Span(neg_doc, 1, 4, "PERSON")]
example.reference.spans[neg_key] = [
Span(neg_doc, 2, 4, "ORG"),
Span(neg_doc, 2, 3, "PERSON"),
Span(neg_doc, 1, 4, "PERSON"),
]
optimizer = nlp.initialize()
for i in range(2):

View File

@ -1,6 +1,5 @@
from typing import Callable, Iterable, Iterator
import pytest
import io
from thinc.api import Config
from spacy.language import Language

View File

@ -11,7 +11,7 @@ from spacy.ml import load_kb
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.lang.en import English
from spacy.tests.util import make_tempdir, make_tempfile
from spacy.tests.util import make_tempdir
from spacy.tokens import Span
@ -254,7 +254,9 @@ def test_nel_nsents(nlp):
"""Test that n_sents can be set through the configuration"""
entity_linker = nlp.add_pipe("entity_linker", config={})
assert entity_linker.n_sents == 0
entity_linker = nlp.replace_pipe("entity_linker", "entity_linker", config={"n_sents": 2})
entity_linker = nlp.replace_pipe(
"entity_linker", "entity_linker", config={"n_sents": 2}
)
assert entity_linker.n_sents == 2
@ -596,7 +598,9 @@ def test_kb_to_bytes():
kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5])
kb_1.add_alias(alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2])
kb_1.add_alias(
alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2]
)
assert kb_1.contains_alias("Russ Cochran")
kb_bytes = kb_1.to_bytes()
kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
@ -611,8 +615,12 @@ def test_kb_to_bytes():
assert kb_2.contains_alias("Russ Cochran")
assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(kb_2.get_alias_candidates("Russ Cochran"))
assert len(kb_1.get_alias_candidates("Randomness")) == len(kb_2.get_alias_candidates("Randomness"))
assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
kb_2.get_alias_candidates("Russ Cochran")
)
assert len(kb_1.get_alias_candidates("Randomness")) == len(
kb_2.get_alias_candidates("Randomness")
)
def test_nel_to_bytes():
@ -640,7 +648,9 @@ def test_nel_to_bytes():
kb_2 = nlp_2.get_pipe("entity_linker").kb
assert kb_2.contains_alias("Russ Cochran")
assert kb_2.get_vector("Q2146908") == [6, -4, 3]
assert_almost_equal(kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8)
assert_almost_equal(
kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8
)
def test_scorer_links():

View File

@ -82,7 +82,9 @@ def util_batch_unbatch_docs_list(
Y_batched = model.predict(in_data)
Y_not_batched = [model.predict([u])[0] for u in in_data]
for i in range(len(Y_batched)):
assert_almost_equal(OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4)
assert_almost_equal(
OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4
)
def util_batch_unbatch_docs_array(

View File

@ -351,9 +351,21 @@ def test_language_factories_invalid():
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}),
([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}),
([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0, "f": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5, "f": 0.0}),
(
[{"a": 0.0, "b": 0.0}, {"c": 0.0}],
{"c": 0.2},
{"a": 0.0, "b": 0.0, "c": 1.0},
),
(
[{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}],
{"a": 0.0, "b": 0.0},
{"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5},
),
(
[{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}],
{"a": 0.0, "b": 0.0, "f": 0.0},
{"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5, "f": 0.0},
),
],
)
def test_language_factories_combine_score_weights(weights, override, expected):

View File

@ -446,7 +446,12 @@ def test_update_with_annotates():
for text in texts:
examples.append(Example(nlp.make_doc(text), nlp.make_doc(text)))
for components_to_annotate in [[], [f"{name}1"], [f"{name}1", f"{name}2"], [f"{name}2", f"{name}1"]]:
for components_to_annotate in [
[],
[f"{name}1"],
[f"{name}1", f"{name}2"],
[f"{name}2", f"{name}1"],
]:
for key in results:
results[key] = ""
nlp = English(vocab=nlp.vocab)

View File

@ -79,10 +79,7 @@ def test_ngram_suggester(en_tokenizer):
assert spans.shape[0] == len(spans_set)
offset += ngrams.lengths[i]
# the number of spans is correct
assert_equal(
ngrams.lengths,
[max(0, len(doc) - (size - 1)) for doc in docs]
)
assert_equal(ngrams.lengths, [max(0, len(doc) - (size - 1)) for doc in docs])
# test 1-3-gram suggestions
ngram_suggester = registry.misc.get("ngram_suggester.v1")(sizes=[1, 2, 3])

View File

@ -132,8 +132,8 @@ def test_incomplete_data():
# test the trained model
test_text = "I like blue eggs"
doc = nlp(test_text)
assert doc[1].tag_ is "V"
assert doc[2].tag_ is "J"
assert doc[1].tag_ == "V"
assert doc[2].tag_ == "J"
def test_overfitting_IO():
@ -154,20 +154,20 @@ def test_overfitting_IO():
# test the trained model
test_text = "I like blue eggs"
doc = nlp(test_text)
assert doc[0].tag_ is "N"
assert doc[1].tag_ is "V"
assert doc[2].tag_ is "J"
assert doc[3].tag_ is "N"
assert doc[0].tag_ == "N"
assert doc[1].tag_ == "V"
assert doc[2].tag_ == "J"
assert doc[3].tag_ == "N"
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
assert doc2[0].tag_ is "N"
assert doc2[1].tag_ is "V"
assert doc2[2].tag_ is "J"
assert doc2[3].tag_ is "N"
assert doc2[0].tag_ == "N"
assert doc2[1].tag_ == "V"
assert doc2[2].tag_ == "J"
assert doc2[3].tag_ == "N"
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
texts = [

View File

@ -131,7 +131,7 @@ def test_implicit_label(name, get_examples):
nlp.initialize(get_examples=get_examples(nlp))
#fmt: off
# fmt: off
@pytest.mark.parametrize(
"name,textcat_config",
[
@ -150,7 +150,7 @@ def test_implicit_label(name, get_examples):
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
],
)
#fmt: on
# fmt: on
def test_no_resize(name, textcat_config):
"""The old textcat architectures weren't resizable"""
nlp = Language()
@ -165,7 +165,7 @@ def test_no_resize(name, textcat_config):
textcat.add_label("NEUTRAL")
#fmt: off
# fmt: off
@pytest.mark.parametrize(
"name,textcat_config",
[
@ -179,7 +179,7 @@ def test_no_resize(name, textcat_config):
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
],
)
#fmt: on
# fmt: on
def test_resize(name, textcat_config):
"""The new textcat architectures are resizable"""
nlp = Language()
@ -194,7 +194,7 @@ def test_resize(name, textcat_config):
assert textcat.model.maybe_get_dim("nO") in [3, None]
#fmt: off
# fmt: off
@pytest.mark.parametrize(
"name,textcat_config",
[
@ -208,7 +208,7 @@ def test_resize(name, textcat_config):
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
],
)
#fmt: on
# fmt: on
def test_resize_same_results(name, textcat_config):
# Ensure that the resized textcat classifiers still produce the same results for old labels
fix_random_seed(0)
@ -511,7 +511,9 @@ def test_textcat_threshold():
macro_f = scores["cats_score"]
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"})
scores = nlp.evaluate(
train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"}
)
pos_f = scores["cats_score"]
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
assert pos_f > macro_f

View File

@ -129,8 +129,14 @@ cfg_string = """
"""
TRAIN_DATA = [
("I like green eggs", {"tags": ["N", "V", "J", "N"], "cats": {"preference": 1.0, "imperative": 0.0}}),
("Eat blue ham", {"tags": ["V", "J", "N"], "cats": {"preference": 0.0, "imperative": 1.0}}),
(
"I like green eggs",
{"tags": ["N", "V", "J", "N"], "cats": {"preference": 1.0, "imperative": 0.0}},
),
(
"Eat blue ham",
{"tags": ["V", "J", "N"], "cats": {"preference": 0.0, "imperative": 1.0}},
),
]
@ -405,5 +411,5 @@ def test_tok2vec_listeners_textcat():
cats1 = docs[1].cats
assert cats1["preference"] > 0.1
assert cats1["imperative"] < 0.9
assert([t.tag_ for t in docs[0]] == ["V", "J", "N"])
assert([t.tag_ for t in docs[1]] == ["N", "V", "J", "N"])
assert [t.tag_ for t in docs[0]] == ["V", "J", "N"]
assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]

View File

@ -152,7 +152,8 @@ labels = ['label1', 'label2']
@pytest.mark.parametrize(
"component_name", ["textcat", "textcat_multilabel"],
"component_name",
["textcat", "textcat_multilabel"],
)
def test_issue6908(component_name):
"""Test intializing textcat with labels in a list"""

View File

@ -8,8 +8,7 @@ def test_issue7056():
sentence segmentation errors."""
vocab = Vocab()
ae = ArcEager(
vocab.strings,
ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"])
vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"])
)
doc = Doc(vocab, words="Severe pain , after trauma".split())
state = ae.init_batch([doc])[0]

View File

@ -41,7 +41,7 @@ def test_partial_links():
nlp.add_pipe("sentencizer", first=True)
patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
{"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}
{"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
]
ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
ruler.add_patterns(patterns)

View File

@ -8,7 +8,17 @@ def test_issue7065():
nlp = English()
nlp.add_pipe("sentencizer")
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "THING", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]}]
patterns = [
{
"label": "THING",
"pattern": [
{"LOWER": "symphony"},
{"LOWER": "no"},
{"LOWER": "."},
{"LOWER": "8"},
],
}
]
ruler.add_patterns(patterns)
doc = nlp(text)
@ -28,11 +38,15 @@ def test_issue7065_b():
text = "Mahler 's Symphony No. 8 was beautiful."
entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
links = {(0, 6): {"Q7304": 1.0, "Q270853": 0.0},
(10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
links = {
(0, 6): {"Q7304": 1.0, "Q270853": 0.0},
(10, 24): {"Q7304": 0.0, "Q270853": 1.0},
}
sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
doc = nlp(text)
example = Example.from_dict(doc, {"entities": entities, "links": links, "sent_starts": sent_starts})
example = Example.from_dict(
doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
)
train_examples = [example]
def create_kb(vocab):
@ -65,7 +79,15 @@ def test_issue7065_b():
# Add a custom rule-based component to mimick NER
patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
{"label": "WORK", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]}
{
"label": "WORK",
"pattern": [
{"LOWER": "symphony"},
{"LOWER": "no"},
{"LOWER": "."},
{"LOWER": "8"},
],
},
]
ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
ruler.add_patterns(patterns)

View File

@ -1,11 +1,22 @@
from spacy.lang.en import English
def test_issue8168():
nlp = English()
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "Apple"},
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
patterns = [
{"label": "ORG", "pattern": "Apple"},
{
"label": "GPE",
"pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}],
"id": "san-francisco",
},
{
"label": "GPE",
"pattern": [{"LOWER": "san"}, {"LOWER": "fran"}],
"id": "san-francisco",
},
]
ruler.add_patterns(patterns)
assert ruler._ent_ids == {8043148519967183733: ('GPE', 'san-francisco')}
assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")}

View File

@ -9,20 +9,13 @@ def test_issue8190():
"nlp": {
"lang": "en",
},
"custom": {
"key": "value"
}
"custom": {"key": "value"},
}
source_nlp = English.from_config(source_cfg)
with make_tempdir() as dir_path:
# We need to create a loadable source pipeline
source_path = dir_path / "test_model"
source_nlp.to_disk(source_path)
nlp = spacy.load(source_path, config={
"custom": {
"key": "updated_value"
}
})
nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}})
assert nlp.config["custom"]["key"] == "updated_value"

View File

@ -2,7 +2,6 @@ import pytest
from spacy import registry
from spacy.language import Language
from spacy.pipeline import EntityRuler
@pytest.fixture

View File

@ -4,7 +4,12 @@ import spacy
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
from spacy.util import registry, load_model_from_config, load_config, load_config_from_str
from spacy.util import (
registry,
load_model_from_config,
load_config,
load_config_from_str,
)
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
@ -493,4 +498,4 @@ def test_hyphen_in_config():
self.punctuation = punctuation
nlp = English.from_config(load_config_from_str(hyphen_config_str))
assert nlp.get_pipe("my_punctual_component").punctuation == ['?', '-']
assert nlp.get_pipe("my_punctual_component").punctuation == ["?", "-"]

View File

@ -64,7 +64,9 @@ def test_serialize_doc_span_groups(en_vocab):
def test_serialize_doc_bin():
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True)
doc_bin = DocBin(
attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True
)
texts = ["Some text", "Lots of texts...", "..."]
cats = {"A": 0.5}
nlp = English()

View File

@ -5,7 +5,6 @@ from catalogue import RegistryError
def test_get_architecture():
@registry.architectures("my_test_function")
def create_model(nr_in, nr_out):
return Linear(nr_in, nr_out)

View File

@ -8,7 +8,7 @@ from spacy.vocab import Vocab
from spacy.training import Example
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.util import registry, ignore_error, raise_error, logger
from spacy.util import registry, ignore_error, raise_error
import spacy
from thinc.api import NumpyOps, get_current_ops
@ -143,7 +143,9 @@ def sample_vectors():
@pytest.fixture
def nlp2(nlp, sample_vectors):
Language.component("test_language_vector_modification_pipe", func=vector_modification_pipe)
Language.component(
"test_language_vector_modification_pipe", func=vector_modification_pipe
)
Language.component("test_language_userdata_pipe", func=userdata_pipe)
Language.component("test_language_ner_pipe", func=ner_pipe)
add_vecs_to_vocab(nlp.vocab, sample_vectors)

View File

@ -9,7 +9,7 @@ from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
from spacy.util import dot_to_object, SimpleFrozenList, import_file
from spacy.util import to_ternary_int
from thinc.api import Config, Optimizer, ConfigValidationError, get_current_ops
from thinc.api import Config, Optimizer, ConfigValidationError
from thinc.api import set_current_ops
from spacy.training.batchers import minibatch_by_words
from spacy.lang.en import English

View File

@ -444,7 +444,9 @@ def test_score_spans():
assert f"{key}_per_type" in scores
# Discard labels from the evaluation
scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False)
scores = Scorer.score_spans(
[eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False
)
assert scores[f"{key}_p"] == 1.0
assert scores[f"{key}_r"] == 1.0
assert f"{key}_per_type" not in scores
@ -467,4 +469,6 @@ def test_prf_score():
assert (c.precision, c.recall, c.fscore) == approx((0.25, 0.5, 0.33333333))
a += b
assert (a.precision, a.recall, a.fscore) == approx((c.precision, c.recall, c.fscore))
assert (a.precision, a.recall, a.fscore) == approx(
(c.precision, c.recall, c.fscore)
)

View File

@ -209,10 +209,6 @@ def test_tokenizer_flush_specials(en_vocab):
suffix_search=suffix_re.search,
rules=rules,
)
tokenizer2 = Tokenizer(
en_vocab,
suffix_search=suffix_re.search,
)
assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
tokenizer1.rules = {}
assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]

View File

@ -278,7 +278,9 @@ def test_pretraining_training():
filled = filled.interpolate()
P = filled["pretraining"]
nlp_base = init_nlp(filled)
model_base = nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
model_base = (
nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
)
embed_base = None
for node in model_base.walk():
if node.name == "hashembed":
@ -331,11 +333,12 @@ def write_sample_training(tmp_dir):
def write_vectors_model(tmp_dir):
import numpy
vocab = Vocab()
vector_data = {
"dog": numpy.random.uniform(-1, 1, (300,)),
"cat": numpy.random.uniform(-1, 1, (300,)),
"orange": numpy.random.uniform(-1, 1, (300,))
"orange": numpy.random.uniform(-1, 1, (300,)),
}
for word, vector in vector_data.items():
vocab.set_vector(word, vector)

View File

@ -434,8 +434,14 @@ def test_aligned_spans_y2x_overlap(en_vocab, en_tokenizer):
gold_doc = nlp.make_doc(text)
spans = []
prefix = "I flew to "
spans.append(gold_doc.char_span(len(prefix), len(prefix + "San Francisco"), label="CITY"))
spans.append(gold_doc.char_span(len(prefix), len(prefix + "San Francisco Valley"), label="VALLEY"))
spans.append(
gold_doc.char_span(len(prefix), len(prefix + "San Francisco"), label="CITY")
)
spans.append(
gold_doc.char_span(
len(prefix), len(prefix + "San Francisco Valley"), label="VALLEY"
)
)
spans_key = "overlap_ents"
gold_doc.spans[spans_key] = spans
example = Example(doc, gold_doc)
@ -443,7 +449,9 @@ def test_aligned_spans_y2x_overlap(en_vocab, en_tokenizer):
assert [(ent.start, ent.end) for ent in spans_gold] == [(3, 5), (3, 6)]
# Ensure that 'get_aligned_spans_y2x' has the aligned entities correct
spans_y2x_no_overlap = example.get_aligned_spans_y2x(spans_gold, allow_overlap=False)
spans_y2x_no_overlap = example.get_aligned_spans_y2x(
spans_gold, allow_overlap=False
)
assert [(ent.start, ent.end) for ent in spans_y2x_no_overlap] == [(3, 5)]
spans_y2x_overlap = example.get_aligned_spans_y2x(spans_gold, allow_overlap=True)
assert [(ent.start, ent.end) for ent in spans_y2x_overlap] == [(3, 5), (3, 6)]

View File

@ -12,6 +12,7 @@ from ..util import add_vecs_to_vocab, get_cosine, make_tempdir
OPS = get_current_ops()
@pytest.fixture
def strings():
return ["apple", "orange"]

View File

@ -66,7 +66,11 @@ def configure_minibatch_by_words(
"""
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(
minibatch_by_words, size=size, tolerance=tolerance, discard_oversize=discard_oversize, **optionals
minibatch_by_words,
size=size,
tolerance=tolerance,
discard_oversize=discard_oversize,
**optionals
)

View File

@ -70,14 +70,18 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
nlp._link_components()
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
if T["max_epochs"] == -1:
logger.debug("Due to streamed train corpus, using only first 100 examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labels")
logger.debug(
"Due to streamed train corpus, using only first 100 examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labels"
)
nlp.initialize(lambda: islice(train_corpus(nlp), 100), sgd=optimizer)
else:
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer
for listener in getattr(
proc, "listening_components", []
): # e.g. tok2vec/transformer
# Don't warn about components not in the pipeline
if listener not in nlp.pipe_names:
continue

View File

@ -110,7 +110,8 @@ def wandb_logger(
):
try:
import wandb
from wandb import init, log, join # test that these are available
# test that these are available
from wandb import init, log, join # noqa: F401
except ImportError:
raise ImportError(Errors.E880)

View File

@ -1,4 +1,4 @@
from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any, IO
from typing import List, Callable, Tuple, Dict, Iterable, Union, Any, IO
from typing import Optional, TYPE_CHECKING
from pathlib import Path
from timeit import default_timer as timer
@ -96,8 +96,7 @@ def train(
stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
if annotating_components:
stdout.write(
msg.info(f"Set annotations on update for: {annotating_components}")
+ "\n"
msg.info(f"Set annotations on update for: {annotating_components}") + "\n"
)
stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
with nlp.select_pipes(disable=frozen_components):

View File

@ -57,13 +57,13 @@ if TYPE_CHECKING:
from .vocab import Vocab # noqa: F401
# fmt: off
OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config.cfg. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
# fmt: off
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
# fmt: on
@ -649,8 +649,7 @@ def get_model_version_range(spacy_version: str) -> str:
def get_model_lower_version(constraint: str) -> Optional[str]:
"""From a version range like >=1.2.3,<1.3.0 return the lower pin.
"""
"""From a version range like >=1.2.3,<1.3.0 return the lower pin."""
try:
specset = SpecifierSet(constraint)
for spec in specset:

View File

@ -285,8 +285,8 @@ Encode context using bidirectional LSTM layers. Requires
Embed [`Doc`](/api/doc) objects with their vocab's vectors table, applying a
learned linear projection to control the dimensionality. Unknown tokens are
mapped to a zero vector. See the documentation on [static
vectors](/usage/embeddings-transformers#static-vectors) for details.
mapped to a zero vector. See the documentation on
[static vectors](/usage/embeddings-transformers#static-vectors) for details.
| Name |  Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -449,7 +449,7 @@ For more information, see the section on
> ```ini
> [pretraining]
> component = "tok2vec"
>
>
> [initialize]
> vectors = "en_core_web_lg"
> ...
@ -462,8 +462,8 @@ For more information, see the section on
> ```
Predict the word's vector from a static embeddings table as pretraining
objective for a Tok2Vec layer. To use this objective, make sure that the
`initialize.vectors` section in the config refers to a model with static
objective for a Tok2Vec layer. To use this objective, make sure that the
`initialize.vectors` section in the config refers to a model with static
vectors.
| Name | Description |
@ -649,8 +649,8 @@ from the linear model, where it is stored in `model.attrs["multi_label"]`.
<Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>
[TextCatEnsemble.v1](/api/legacy#TextCatEnsemble_v1) was functionally similar, but used an internal `tok2vec` instead of
taking it as argument:
[TextCatEnsemble.v1](/api/legacy#TextCatEnsemble_v1) was functionally similar,
but used an internal `tok2vec` instead of taking it as argument:
| Name | Description |
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -701,8 +701,9 @@ architecture is usually less accurate than the ensemble, but runs faster.
<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was not yet resizable.
Since v2, new labels can be added to this component, even after training.
[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
not yet resizable. Since v2, new labels can be added to this component, even
after training.
</Accordion>
@ -732,8 +733,9 @@ the others, but may not be as accurate, especially if texts are short.
<Accordion title="spacy.TextCatBOW.v1 definition" spaced>
[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was not yet resizable.
Since v2, new labels can be added to this component, even after training.
[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was
not yet resizable. Since v2, new labels can be added to this component, even
after training.
</Accordion>
@ -747,7 +749,7 @@ Since v2, new labels can be added to this component, even after training.
> [model]
> @architectures = "spacy.SpanCategorizer.v1"
> scorer = {"@layers": "spacy.LinearLogistic.v1"}
>
>
> [model.reducer]
> @layers = "spacy.mean_max_reducer.v1"
> hidden_size = 128

View File

@ -231,14 +231,14 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
> losses = parser.update(examples, sgd=optimizer)
> ```
| Name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## DependencyParser.get_loss {#get_loss tag="method"}

View File

@ -35,11 +35,11 @@ how the component should be configured. You can override its settings via the
> ```
| Setting | Description |
| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/entityruler.py
@ -64,14 +64,14 @@ be a token pattern (list) or a phrase pattern (string). For example:
> ```
| Name | Description |
| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
| `name` <Tag variant="new">3</Tag> | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
| _keyword-only_ | |
| `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |
## EntityRuler.initialize {#initialize tag="method" new="3"}

View File

@ -245,8 +245,8 @@ certain prior probability.
### Candidate.\_\_init\_\_ {#candidate-init tag="method"}
Construct a `Candidate` object. Usually this constructor is not called directly,
but instead these objects are returned by the
`get_candidates` method of the [`entity_linker`](/api/entitylinker) pipe.
but instead these objects are returned by the `get_candidates` method of the
[`entity_linker`](/api/entitylinker) pipe.
> #### Example
>

View File

@ -178,8 +178,9 @@ added to an existing vectors table. See more details in
### spacy.TextCatCNN.v1 {#TextCatCNN_v1}
Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means that you can add
labels to a previously trained textcat. `TextCatCNN` v1 did not yet support that.
Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means
that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not
yet support that.
> #### Example Config
>
@ -213,8 +214,9 @@ architecture is usually less accurate than the ensemble, but runs faster.
### spacy.TextCatBOW.v1 {#TextCatBOW_v1}
Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means that you can add
labels to a previously trained textcat. `TextCatBOW` v1 did not yet support that.
Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
yet support that.
> #### Example Config
>

View File

@ -120,14 +120,14 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
> matches = matcher(doc)
> ```
| Name | Description |
| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
| _keyword-only_ | |
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
| `allow_missing` <Tag variant="new">3</Tag> | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~ |
| `with_alignments` <Tag variant="new">3.0.6</Tag> | Return match alignment information as part of the match tuple as `List[int]` with the same length as the matched span. Each entry denotes the corresponding index of the token pattern. If `as_spans` is set to `True`, this setting is ignored. Defaults to `False`. ~~bool~~ |
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
| Name | Description |
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
| _keyword-only_ | |
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
| `allow_missing` <Tag variant="new">3</Tag> | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~ |
| `with_alignments` <Tag variant="new">3.0.6</Tag> | Return match alignment information as part of the match tuple as `List[int]` with the same length as the matched span. Each entry denotes the corresponding index of the token pattern. If `as_spans` is set to `True`, this setting is ignored. Defaults to `False`. ~~bool~~ |
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
## Matcher.\_\_len\_\_ {#len tag="method" new="2"}

View File

@ -61,11 +61,11 @@ shortcut for this and instantiate the component using its string name and
> morphologizer = Morphologizer(nlp.vocab, model)
> ```
| Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| Name | Description |
| ------- | -------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
## Morphologizer.\_\_call\_\_ {#call tag="method"}
@ -200,14 +200,14 @@ Delegates to [`predict`](/api/morphologizer#predict) and
> losses = morphologizer.update(examples, sgd=optimizer)
> ```
| Name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Morphologizer.get_loss {#get_loss tag="method"}

View File

@ -98,18 +98,18 @@ representation.
> assert f == "Feat1=Val1|Feat2=Val2"
> ```
| Name | Description |
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ |
| Name | Description |
| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
| `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ |
| **RETURNS** | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
## Attributes {#attributes}
| Name | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `FEATURE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) feature separator. Default is `|`. ~~str~~ |
| `FIELD_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) field separator. Default is `=`. ~~str~~ |
| `VALUE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) value separator. Default is `,`. ~~str~~ |
| Name | Description |
| ------------- | ---------------------------------------------------------------------------------------------------------------------------- |
| `FEATURE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) feature separator. Default is `\|`. ~~str~~ |
| `FIELD_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) field separator. Default is `=`. ~~str~~ |
| `VALUE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) value separator. Default is `,`. ~~str~~ |
## MorphAnalysis {#morphanalysis tag="class" source="spacy/tokens/morphanalysis.pyx"}

View File

@ -59,7 +59,7 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
| Name | Description |
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
| _keyword-only_ | |
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
@ -149,8 +149,8 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")]
</Infobox>
| Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `match_id` | An ID for the thing you're matching. ~~str~~ | |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `match_id` | An ID for the thing you're matching. ~~str~~ |
| `docs` | `Doc` objects of the phrases to match. ~~List[Doc]~~ |
| _keyword-only_ | |
| `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple], Any]]~~ |

View File

@ -187,14 +187,14 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
> losses = senter.update(examples, sgd=optimizer)
> ```
| Name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"}

View File

@ -28,7 +28,7 @@ how the component should be configured. You can override its settings via the
> ```
| Setting | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ |
| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` |
```python

View File

@ -491,8 +491,8 @@ document by the `parser`, `senter`, `sentencizer` or some custom function. It
will raise an error otherwise.
If the span happens to cross sentence boundaries, only the first sentence will
be returned. If it is required that the sentence always includes the
full span, the result can be adjusted as such:
be returned. If it is required that the sentence always includes the full span,
the result can be adjusted as such:
```python
sent = span.sent

View File

@ -213,14 +213,14 @@ Delegates to [`predict`](/api/spancategorizer#predict) and
> losses = spancat.update(examples, sgd=optimizer)
> ```
| Name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## SpanCategorizer.get_loss {#get_loss tag="method"}

View File

@ -25,9 +25,9 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("tagger", config=config)
> ```
| Setting | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
| Setting | Description |
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/tagger.pyx
@ -54,11 +54,11 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
| Name | Description |
| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| Name | Description |
| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
## Tagger.\_\_call\_\_ {#call tag="method"}
@ -198,14 +198,14 @@ Delegates to [`predict`](/api/tagger#predict) and
> losses = tagger.update(examples, sgd=optimizer)
> ```
| Name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Tagger.rehearse {#rehearse tag="method,experimental" new="3"}

View File

@ -196,14 +196,14 @@ Delegates to [`predict`](/api/tok2vec#predict).
> losses = tok2vec.update(examples, sgd=optimizer)
> ```
| Name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Tok2Vec.create_optimizer {#create_optimizer tag="method"}

View File

@ -362,8 +362,8 @@ unknown. Defaults to `True` for the first token in the `Doc`.
> assert not doc[5].is_sent_start
> ```
| Name | Description |
| ----------- | --------------------------------------------- |
| Name | Description |
| ----------- | ------------------------------------------------------- |
| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
## Token.has_vector {#has_vector tag="property" model="vectors"}
@ -420,73 +420,73 @@ The L2 norm of the token's vector representation.
## Attributes {#attributes}
| Name | Description |
| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | The parent document. ~~Doc~~ |
| `lex` <Tag variant="new">3</Tag> | The underlying lexeme. ~~Lexeme~~ |
| `sent` <Tag variant="new">2.0.12</Tag> | The sentence span that this token is a part of. ~~Span~~ |
| `text` | Verbatim text content. ~~str~~ |
| `text_with_ws` | Text content, with trailing space character if present. ~~str~~ |
| `whitespace_` | Trailing space character if present. ~~str~~ |
| `orth` | ID of the verbatim text content. ~~int~~ |
| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
| `vocab` | The vocab object of the parent `Doc`. ~~vocab~~ |
| `tensor` <Tag variant="new">2.1.7</Tag> | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ |
| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ |
| `left_edge` | The leftmost token of this token's syntactic descendants. ~~Token~~ |
| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ |
| `i` | The index of the token within the parent document. ~~int~~ |
| `ent_type` | Named entity type. ~~int~~ |
| `ent_type_` | Named entity type. ~~str~~ |
| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ |
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ |
| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ |
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ |
| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~ |
| `lower` | Lowercase form of the token. ~~int~~ |
| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ |
| `shape` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
| `shape_` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ |
| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ |
| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ |
| `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~ |
| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ |
| `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ |
| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ |
| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ |
| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ |
| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ |
| `is_punct` | Is the token punctuation? ~~bool~~ |
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
| `is_bracket` | Is the token a bracket? ~~bool~~ |
| `is_quote` | Is the token a quotation mark? ~~bool~~ |
| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the token a currency symbol? ~~bool~~ |
| `like_url` | Does the token resemble a URL? ~~bool~~ |
| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ |
| `like_email` | Does the token resemble an email address? ~~bool~~ |
| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ |
| `is_stop` | Is the token part of a "stop list"? ~~bool~~ |
| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ |
| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ |
| `tag` | Fine-grained part-of-speech. ~~int~~ |
| `tag_` | Fine-grained part-of-speech. ~~str~~ |
| `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ |
| `dep` | Syntactic dependency relation. ~~int~~ |
| `dep_` | Syntactic dependency relation. ~~str~~ |
| `lang` | Language of the parent document's vocabulary. ~~int~~ |
| `lang_` | Language of the parent document's vocabulary. ~~str~~ |
| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ |
| `idx` | The character offset of the token within the parent document. ~~int~~ |
| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ |
| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
| `cluster` | Brown cluster ID. ~~int~~ |
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
| Name | Description |
| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | The parent document. ~~Doc~~ |
| `lex` <Tag variant="new">3</Tag> | The underlying lexeme. ~~Lexeme~~ |
| `sent` <Tag variant="new">2.0.12</Tag> | The sentence span that this token is a part of. ~~Span~~ |
| `text` | Verbatim text content. ~~str~~ |
| `text_with_ws` | Text content, with trailing space character if present. ~~str~~ |
| `whitespace_` | Trailing space character if present. ~~str~~ |
| `orth` | ID of the verbatim text content. ~~int~~ |
| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
| `vocab` | The vocab object of the parent `Doc`. ~~Vocab~~ |
| `tensor` <Tag variant="new">2.1.7</Tag> | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ |
| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ |
| `left_edge` | The leftmost token of this token's syntactic descendants. ~~Token~~ |
| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ |
| `i` | The index of the token within the parent document. ~~int~~ |
| `ent_type` | Named entity type. ~~int~~ |
| `ent_type_` | Named entity type. ~~str~~ |
| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ |
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ |
| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ |
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ |
| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~ |
| `lower` | Lowercase form of the token. ~~int~~ |
| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ |
| `shape` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ |
| `shape_` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ |
| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ |
| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ |
| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ |
| `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~ |
| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ |
| `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ |
| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ |
| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ |
| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ |
| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ |
| `is_punct` | Is the token punctuation? ~~bool~~ |
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("`? ~~bool~~ |
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"`? ~~bool~~ |
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
| `is_bracket` | Is the token a bracket? ~~bool~~ |
| `is_quote` | Is the token a quotation mark? ~~bool~~ |
| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the token a currency symbol? ~~bool~~ |
| `like_url` | Does the token resemble a URL? ~~bool~~ |
| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ |
| `like_email` | Does the token resemble an email address? ~~bool~~ |
| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ |
| `is_stop` | Is the token part of a "stop list"? ~~bool~~ |
| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ |
| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ |
| `tag` | Fine-grained part-of-speech. ~~int~~ |
| `tag_` | Fine-grained part-of-speech. ~~str~~ |
| `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ |
| `dep` | Syntactic dependency relation. ~~int~~ |
| `dep_` | Syntactic dependency relation. ~~str~~ |
| `lang` | Language of the parent document's vocabulary. ~~int~~ |
| `lang_` | Language of the parent document's vocabulary. ~~str~~ |
| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ |
| `idx` | The character offset of the token within the parent document. ~~int~~ |
| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ |
| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
| `cluster` | Brown cluster ID. ~~int~~ |
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |

View File

@ -239,6 +239,7 @@ it.
| `infix_finditer` | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) sequence of `re.MatchObject` objects. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
| `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ |
| `rules` | A dictionary of tokenizer exceptions and special cases. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore

View File

@ -290,8 +290,8 @@ If a table is full, it can be resized using
## Vectors.n_keys {#n_keys tag="property"}
Get the number of keys in the table. Note that this is the number of _all_ keys,
not just unique vectors. If several keys are mapped to the same
vectors, they will be counted individually.
not just unique vectors. If several keys are mapped to the same vectors, they
will be counted individually.
> #### Example
>
@ -321,7 +321,7 @@ performed in chunks to avoid consuming too much memory. You can set the
> ```
| Name | Description |
| -------------- | --------------------------------------------------------------------------- |
| -------------- | --------------------------------------------------------------------------- |
| `queries` | An array with one or more vectors. ~~numpy.ndarray~~ |
| _keyword-only_ | |
| `batch_size` | The batch size to use. Defaults to `1024`. ~~int~~ |