Merge pull request #8523 from adrianeboyd/chore/cleanup-v3.1.0

commit 8bc235dcc0
@@ -111,7 +111,7 @@ universal = false
 formats = gztar

 [flake8]
-ignore = E203, E266, E501, E731, W503, E741
+ignore = E203, E266, E501, E731, W503, E741, F541
 max-line-length = 80
 select = B,C,E,F,W,T4,B9
 exclude =
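Note on the new F541 entry: that is the pyflakes code for an f-string that contains no placeholders, which this cleanup adds to the project-wide ignore list. A minimal standalone illustration, not taken from the diff:

    greeting = f"hello"            # F541: f-string without any placeholders
    name = "world"
    greeting = f"hello {name}"     # fine: the f-string actually interpolates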
@@ -4,6 +4,7 @@ import sys

 # set library-specific custom warning handling before doing anything else
 from .errors import setup_default_warnings

 setup_default_warnings()

 # These are imported as part of the API
@@ -6,7 +6,6 @@ import logging

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code
-from ..training.initialize import init_nlp
 from .. import util
 from ..util import get_sourced_components, load_model_from_config

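Several hunks in this commit, like the one above, simply drop imports, apparently because nothing in the module references them any longer; flake8 reports such imports as F401. A small standalone illustration of the warning, not taken from the diff:

    from pathlib import Path    # used below
    import itertools            # F401: imported but unused

    def exists(p: str) -> bool:
        return Path(p).exists()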
@@ -1,11 +1,11 @@
-from typing import Dict, Any, Optional, Iterable
+from typing import Dict, Any, Optional
 from pathlib import Path
 import itertools

 from spacy.training import Example
 from spacy.util import resolve_dot_names
 from wasabi import msg
-from thinc.api import fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate
 from thinc.api import Model, data_validation, set_gpu_allocator
 import typer

@@ -133,15 +133,16 @@ def debug_model(
 _print_model(model, print_settings)

 # STEP 2: Updating the model and printing again
-optimizer = Adam(0.001)
 set_dropout_rate(model, 0.2)
 # ugly hack to deal with Tok2Vec/Transformer listeners
 upstream_component = None
 if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name:
 upstream_component = nlp.get_pipe("tok2vec")
-if model.has_ref("tok2vec") and "transformer-listener" in model.get_ref("tok2vec").name:
+if (
+model.has_ref("tok2vec")
+and "transformer-listener" in model.get_ref("tok2vec").name
+):
 upstream_component = nlp.get_pipe("transformer")
-goldY = None
 for e in range(3):
 if upstream_component:
 upstream_component.update(examples)
@@ -127,7 +127,9 @@ def evaluate(
 data["ents_per_type"] = scores["ents_per_type"]
 if f"spans_{spans_key}_per_type" in scores:
 if scores[f"spans_{spans_key}_per_type"]:
-print_prf_per_type(msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type")
+print_prf_per_type(
+msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type"
+)
 data[f"spans_{spans_key}_per_type"] = scores[f"spans_{spans_key}_per_type"]
 if "cats_f_per_type" in scores:
 if scores["cats_f_per_type"]:
@@ -331,7 +331,7 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
 continue
 col1 = md.bold(md.code(pipe))
 col2 = ", ".join(
-[md.code(label.replace("|", "\|")) for label in labels]
+[md.code(label.replace("|", "\\|")) for label in labels]
 ) # noqa: W605
 label_data.append((col1, col2))
 n_labels += len(labels)
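The "\|" to "\\|" change addresses W605 (invalid escape sequence): in a normal string literal "\|" is not a recognized escape, so Python 3 emits a DeprecationWarning and only keeps the backslash by accident. A standalone illustration, not from the diff:

    s1 = "\\|"      # explicit backslash + pipe: two characters
    s2 = r"\|"      # raw string spelling of the same two characters
    print(s1 == s2, len(s1))    # True 2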
@@ -5,7 +5,6 @@ import requests
 from wasabi import msg, Printer
 import warnings

-from ..errors import Warnings
 from ._util import app
 from .. import about
 from ..util import get_package_version, get_installed_models, get_minor_version
@@ -120,7 +120,9 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
 doc (Doc): Document do parse.
 RETURNS (dict): Generated dependency parse keyed by words and arcs.
 """
-doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data", "user_hooks"]))
+doc = Doc(orig_doc.vocab).from_bytes(
+orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
+)
 if not doc.has_annotation("DEP"):
 warnings.warn(Warnings.W005)
 if options.get("collapse_phrases", False):
@@ -22,13 +22,13 @@ _num_words = [
 "тринадесет",
 "тринайсет",
 "четиринадесет",
-"четиринайсет"
+"четиринайсет",
 "петнадесет",
-"петнайсет"
+"петнайсет",
 "шестнадесет",
 "шестнайсет",
 "седемнадесет",
-"седемнайсет"
+"седемнайсет",
 "осемнадесет",
 "осемнайсет",
 "деветнадесет",
@@ -36,7 +36,7 @@ _num_words = [
 "двадесет",
 "двайсет",
 "тридесет",
-"трийсет"
+"трийсет",
 "четиридесет",
 "четиресет",
 "петдесет",
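The trailing commas added in these two hunks are not cosmetic: without them, Python silently concatenates adjacent string literals, so the number-word list contained merged entries such as "четиринайсетпетнадесет" instead of two separate words. A standalone illustration of the pitfall:

    words = [
        "four"          # missing comma here...
        "five",         # ...so these two literals merge into "fourfive"
        "six",
    ]
    print(words)        # ['fourfive', 'six']
    print(len(words))   # 2, not 3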
@@ -58,7 +58,6 @@ _abbr_dot_exc = [
 {ORTH: "стр.", NORM: "страница"},
 {ORTH: "ул.", NORM: "улица"},
 {ORTH: "чл.", NORM: "член"},

 ]

 for abbr in _abbr_dot_exc:
@@ -81,16 +81,32 @@ for exc_data in [

 # Source: https://kaino.kotus.fi/visk/sisallys.php?p=141
 conj_contraction_bases = [
-("ett", "että"), ("jott", "jotta"), ("kosk", "koska"), ("mutt", "mutta"),
-("vaikk", "vaikka"), ("ehk", "ehkä"), ("miks", "miksi"), ("siks", "siksi"),
-("joll", "jos"), ("ell", "jos")
+("ett", "että"),
+("jott", "jotta"),
+("kosk", "koska"),
+("mutt", "mutta"),
+("vaikk", "vaikka"),
+("ehk", "ehkä"),
+("miks", "miksi"),
+("siks", "siksi"),
+("joll", "jos"),
+("ell", "jos"),
 ]
 conj_contraction_negations = [
-("en", "en"), ("et", "et"), ("ei", "ei"), ("emme", "emme"),
-("ette", "ette"), ("eivat", "eivät"), ("eivät", "eivät")]
+("en", "en"),
+("et", "et"),
+("ei", "ei"),
+("emme", "emme"),
+("ette", "ette"),
+("eivat", "eivät"),
+("eivät", "eivät"),
+]
 for (base_lower, base_norm) in conj_contraction_bases:
 for base in [base_lower, base_lower.title()]:
 for (suffix, suffix_norm) in conj_contraction_negations:
-_exc[base + suffix] = [{ORTH: base, NORM: base_norm}, {ORTH: suffix, NORM: suffix_norm}]
+_exc[base + suffix] = [
+{ORTH: base, NORM: base_norm},
+{ORTH: suffix, NORM: suffix_norm},
+]

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -4,12 +4,12 @@ from ...pipeline import Lemmatizer
 from ...tokens import Token



 class ItalianLemmatizer(Lemmatizer):
 """This lemmatizer was adapted from the Polish one (version of April 2021).
 It implements lookup lemmatization based on the morphological lexicon
 morph-it (Baroni and Zanchetta). The table lemma_lookup with non-POS-aware
 entries is used as a backup for words that aren't handled by morph-it."""

 @classmethod
 def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
 if mode == "pos_lookup":
@@ -25,7 +25,7 @@ for orth in [
 "artt.",
 "att.",
 "avv.",
-"Avv."
+"Avv.",
 "by-pass",
 "c.d.",
 "c/c",
@@ -35,8 +35,8 @@ URL_PATTERN = (
 # host & domain names
 # mods: match is case-sensitive, so include [A-Z]
 r"(?:" # noqa: E131
-r"(?:"
-r"[A-Za-z0-9\u00a1-\uffff]"
+r"(?:" # noqa: E131
+r"[A-Za-z0-9\u00a1-\uffff]" # noqa: E131
 r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
 r")?"
 r"[A-Za-z0-9\u00a1-\uffff]\."
@@ -687,11 +687,13 @@ class Language:
 if not isinstance(source, Language):
 raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
 # Check vectors, with faster checks first
-if self.vocab.vectors.shape != source.vocab.vectors.shape or \
-self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
-self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
+if (
+self.vocab.vectors.shape != source.vocab.vectors.shape
+or self.vocab.vectors.key2row != source.vocab.vectors.key2row
+or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes()
+):
 warnings.warn(Warnings.W113.format(name=source_name))
-if not source_name in source.component_names:
+if source_name not in source.component_names:
 raise KeyError(
 Errors.E944.format(
 name=source_name,
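The membership-test rewrite in this hunk fixes flake8 E713: `not source_name in ...` parses as `not (source_name in ...)`, which behaves the same but reads ambiguously, so `x not in y` is the preferred spelling. A standalone illustration:

    names = ["tok2vec", "ner"]
    print(not "parser" in names)    # works, but flagged by E713
    print("parser" not in names)    # same result, idiomatic form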
@@ -1539,15 +1541,21 @@ class Language:

 # Cycle channels not to break the order of docs.
 # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
-byte_tuples = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
+byte_tuples = chain.from_iterable(
+recv.recv() for recv in cycle(bytedocs_recv_ch)
+)
 try:
-for i, (_, (byte_doc, byte_error)) in enumerate(zip(raw_texts, byte_tuples), 1):
+for i, (_, (byte_doc, byte_error)) in enumerate(
+zip(raw_texts, byte_tuples), 1
+):
 if byte_doc is not None:
 doc = Doc(self.vocab).from_bytes(byte_doc)
 yield doc
 elif byte_error is not None:
 error = srsly.msgpack_loads(byte_error)
-self.default_error_handler(None, None, None, ValueError(Errors.E871.format(error=error)))
+self.default_error_handler(
+None, None, None, ValueError(Errors.E871.format(error=error))
+)
 if i % batch_size == 0:
 # tell `sender` that one batch was consumed.
 sender.step()
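For readers unfamiliar with the itertools calls being wrapped here: `cycle()` round-robins over the receiving channels and `chain.from_iterable()` flattens the per-channel batches into one stream. A standalone sketch of the two helpers, unrelated to the actual multiprocessing channels:

    from itertools import chain, cycle, islice

    batches = [[1, 2], [3, 4], [5, 6]]
    print(list(chain.from_iterable(batches)))    # [1, 2, 3, 4, 5, 6]

    receivers = ["ch0", "ch1"]
    print(list(islice(cycle(receivers), 5)))     # ['ch0', 'ch1', 'ch0', 'ch1', 'ch0']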
@@ -1707,7 +1715,9 @@ class Language:
 if "replace_listeners" in pipe_cfg:
 for name, proc in source_nlps[model].pipeline:
 if source_name in getattr(proc, "listening_components", []):
-source_nlps[model].replace_listeners(name, source_name, pipe_cfg["replace_listeners"])
+source_nlps[model].replace_listeners(
+name, source_name, pipe_cfg["replace_listeners"]
+)
 listeners_replaced = True
 nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
 # Delete from cache if listeners were replaced
@@ -1727,12 +1737,16 @@ class Language:
 for name, proc in nlp.pipeline:
 # Remove listeners not in the pipeline
 listener_names = getattr(proc, "listening_components", [])
-unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names]
+unused_listener_names = [
+ll for ll in listener_names if ll not in nlp.pipe_names
+]
 for listener_name in unused_listener_names:
 for listener in proc.listener_map.get(listener_name, []):
 proc.remove_listener(listener, listener_name)

-for listener in getattr(proc, "listening_components", []): # e.g. tok2vec/transformer
+for listener in getattr(
+proc, "listening_components", []
+): # e.g. tok2vec/transformer
 # If it's a component sourced from another pipeline, we check if
 # the tok2vec listeners should be replaced with standalone tok2vec
 # models (e.g. so component can be frozen without its performance
@@ -1827,7 +1841,9 @@ class Language:
 new_config = tok2vec_cfg["model"]
 if "replace_listener_cfg" in tok2vec_model.attrs:
 replace_func = tok2vec_model.attrs["replace_listener_cfg"]
-new_config = replace_func(tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"])
+new_config = replace_func(
+tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]
+)
 util.set_dot_to_object(pipe_cfg, listener_path, new_config)
 # Go over the listener layers and replace them
 for listener in pipe_listeners:
@@ -1866,7 +1882,10 @@ class Language:
 util.to_disk(path, serializers, exclude)

 def from_disk(
-self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
+self,
+path: Union[str, Path],
+*,
+exclude: Iterable[str] = SimpleFrozenList(),
 overrides: Dict[str, Any] = SimpleFrozenDict(),
 ) -> "Language":
 """Loads state from a directory. Modifies the object in place and
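The reflowed `from_disk` signature keeps the bare `*`, which makes the arguments after it keyword-only. A standalone sketch of the mechanism (hypothetical function, not spaCy's):

    def load(path, *, exclude=(), overrides=None):
        return path, list(exclude), overrides

    load("my_model", exclude=["vocab"])   # fine
    # load("my_model", ["vocab"])         # TypeError: takes 1 positional argument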
@@ -12,9 +12,7 @@ from .strings import get_string_id
 UNSET = object()


-def load_lookups(
-lang: str, tables: List[str], strict: bool = True
-) -> 'Lookups':
+def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
 """Load the data from the spacy-lookups-data package for a given language,
 if available. Returns an empty `Lookups` container if there's no data or if the package
 is not installed.
@@ -309,9 +309,7 @@ class EntityLinker(TrainablePipe):
 assert sent_index >= 0
 # get n_neighbour sentences, clipped to the length of the document
 start_sentence = max(0, sent_index - self.n_sents)
-end_sentence = min(
-len(sentences) - 1, sent_index + self.n_sents
-)
+end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
 start_token = sentences[start_sentence].start
 end_token = sentences[end_sentence].end
 sent_doc = doc[start_token:end_token].as_doc()
@@ -337,22 +335,16 @@ class EntityLinker(TrainablePipe):
 else:
 random.shuffle(candidates)
 # set all prior probabilities to 0 if incl_prior=False
-prior_probs = xp.asarray(
-[c.prior_prob for c in candidates]
-)
+prior_probs = xp.asarray([c.prior_prob for c in candidates])
 if not self.incl_prior:
-prior_probs = xp.asarray(
-[0.0 for _ in candidates]
-)
+prior_probs = xp.asarray([0.0 for _ in candidates])
 scores = prior_probs
 # add in similarity from the context
 if self.incl_context:
 entity_encodings = xp.asarray(
 [c.entity_vector for c in candidates]
 )
-entity_norm = xp.linalg.norm(
-entity_encodings, axis=1
-)
+entity_norm = xp.linalg.norm(entity_encodings, axis=1)
 if len(entity_encodings) != len(prior_probs):
 raise RuntimeError(
 Errors.E147.format(
@@ -361,14 +353,12 @@ class EntityLinker(TrainablePipe):
 )
 )
 # cosine similarity
-sims = xp.dot(
-entity_encodings, sentence_encoding_t
-) / (sentence_norm * entity_norm)
+sims = xp.dot(entity_encodings, sentence_encoding_t) / (
+sentence_norm * entity_norm
+)
 if sims.shape != prior_probs.shape:
 raise ValueError(Errors.E161)
-scores = (
-prior_probs + sims - (prior_probs * sims)
-)
+scores = prior_probs + sims - (prior_probs * sims)
 # TODO: thresholding
 best_index = scores.argmax().item()
 best_candidate = candidates[best_index]
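The scoring line collapsed here combines the alias prior with the context similarity as `p + s - p*s`, i.e. the probability that at least one of two independent signals fires, so the result stays in [0, 1] when both inputs do. A tiny numeric check, not part of the diff:

    import numpy as np

    prior_probs = np.array([0.2, 0.7])
    sims = np.array([0.5, 0.1])
    scores = prior_probs + sims - (prior_probs * sims)
    print(scores)    # [0.6  0.73]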
@@ -3,7 +3,6 @@ from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable,
 from collections import defaultdict
 from pathlib import Path
 import srsly
-import warnings

 from .pipe import Pipe
 from ..training import Example
@@ -278,9 +277,7 @@ class EntityRuler(Pipe):
 if self == pipe:
 current_index = i
 break
-subsequent_pipes = [
-pipe for pipe in self.nlp.pipe_names[current_index :]
-]
+subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
 except ValueError:
 subsequent_pipes = []
 with self.nlp.select_pipes(disable=subsequent_pipes):
@@ -299,7 +299,9 @@ class TextCategorizer(TrainablePipe):
 self._allow_extra_label()
 self.cfg["labels"].append(label)
 if self.model and "resize_output" in self.model.attrs:
-self.model = self.model.attrs["resize_output"](self.model, len(self.cfg["labels"]))
+self.model = self.model.attrs["resize_output"](
+self.model, len(self.cfg["labels"])
+)
 self.vocab.strings.add(label)
 return 1

@@ -365,7 +365,9 @@ class Scorer:
 gold_spans.add(gold_span)
 gold_per_type[span.label_].add(gold_span)
 pred_per_type = {label: set() for label in labels}
-for span in example.get_aligned_spans_x2y(getter(pred_doc, attr), allow_overlap):
+for span in example.get_aligned_spans_x2y(
+getter(pred_doc, attr), allow_overlap
+):
 if labeled:
 pred_span = (span.label_, span.start, span.end - 1)
 else:
@@ -392,7 +394,9 @@ class Scorer:
 final_scores[f"{attr}_r"] = score.recall
 final_scores[f"{attr}_f"] = score.fscore
 if labeled:
-final_scores[f"{attr}_per_type"] = {k: v.to_dict() for k, v in score_per_type.items()}
+final_scores[f"{attr}_per_type"] = {
+k: v.to_dict() for k, v in score_per_type.items()
+}
 return final_scores

 @staticmethod
@@ -381,9 +381,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
 en_docs_tokens = [t for doc in en_docs for t in doc]
 assert len(m_doc) == len(en_docs_tokens)
 think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
-assert m_doc[2]._.is_ambiguous == True
+assert m_doc[2]._.is_ambiguous is True
 assert m_doc[9].idx == think_idx
-assert m_doc[9]._.is_ambiguous == True
+assert m_doc[9]._.is_ambiguous is True
 assert not any([t._.is_ambiguous for t in m_doc[3:8]])
 assert "group" in m_doc.spans
 assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
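The `== True` to `is True` changes in this and the following hunks address flake8 E712 (comparison to True should be `is True`, or simply the bare expression). A standalone illustration:

    flag = True
    # if flag == True:   ->  flagged by E712
    if flag is True:      # identity check against the singleton
        pass
    if flag:              # usually the clearest spelling
        pass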
@@ -484,7 +484,7 @@ def test_doc_retokenize_merge_without_parse_keeps_sents(en_tokenizer):
 assert len(list(doc.sents)) == 2
 with doc.retokenize() as retokenizer:
 retokenizer.merge(doc[3:6])
-assert doc[3].is_sent_start == None
+assert doc[3].is_sent_start is None

 # merging over a sentence boundary and setting sent_start
 doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
@@ -1,5 +1,5 @@
 import pytest
-from spacy.lang.bg.lex_attrs import like_num

 @pytest.mark.parametrize(
 "word,match",
@@ -40,20 +40,21 @@ CONTRACTION_TESTS = [
 (
 "Päätimme ettemme tule.",
 ["Päätimme", "ett", "emme", "tule", "."],
-["päätimme", "että", "emme", "tule", "."]
+["päätimme", "että", "emme", "tule", "."],
 ),
 (
 "Miksei puhuttaisi?",
 ["Miks", "ei", "puhuttaisi", "?"],
-["miksi", "ei", "puhuttaisi", "?"]
+["miksi", "ei", "puhuttaisi", "?"],
 ),
 (
 "He tottelivat vaikkeivat halunneet",
 ["He", "tottelivat", "vaikk", "eivat", "halunneet"],
-["he", "tottelivat", "vaikka", "eivät", "halunneet"]
+["he", "tottelivat", "vaikka", "eivät", "halunneet"],
 ),
 ]


 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
 tokens = fi_tokenizer(text)
@@ -1,4 +1,3 @@
-import pytest
 from spacy.tokens import Doc


@@ -23,11 +23,11 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
 nlp_r = Vietnamese()
 nlp_r.from_bytes(nlp_bytes)
 assert nlp_bytes == nlp_r.to_bytes()
-assert nlp_r.tokenizer.use_pyvi == False
+assert nlp_r.tokenizer.use_pyvi is False

 with make_tempdir() as d:
 nlp.to_disk(d)
 nlp_r = Vietnamese()
 nlp_r.from_disk(d)
 assert nlp_bytes == nlp_r.to_bytes()
-assert nlp_r.tokenizer.use_pyvi == False
+assert nlp_r.tokenizer.use_pyvi is False
@@ -354,7 +354,6 @@ def test_dependency_matcher_span_user_data(en_tokenizer):
 for token in doc:
 token.head = doc[0]
 token.dep_ = "a"
-get_is_c = lambda token: token.text in ("c",)
 Token.set_extension("is_c", default=False)
 doc[2]._.is_c = True
 pattern = [
@@ -257,11 +257,21 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
 (2, "aaab", "a a a b", [[0, 1, 2, 3]]),
 (3, "aaab", "a+ b", [[0, 1], [0, 0, 1], [0, 0, 0, 1]]),
 (4, "aaba", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2]]),
-(5, "aabaa", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2] ]),
+(
+5,
+"aabaa",
+"a+ b a+",
+[[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2]],
+),
 (6, "aaba", "a+ b a*", [[0, 1], [0, 0, 1], [0, 0, 1, 2], [0, 1, 2]]),
 (7, "aaaa", "a*", [[0], [0, 0], [0, 0, 0], [0, 0, 0, 0]]),
 (8, "baab", "b a* b b*", [[0, 1, 1, 2]]),
-(9, "aabb", "a* b* a*", [[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]]),
+(
+9,
+"aabb",
+"a* b* a*",
+[[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]],
+),
 (10, "aaab", "a+ a+ a b", [[0, 1, 2, 3]]),
 (11, "aaab", "a+ a+ a+ b", [[0, 1, 2, 3]]),
 (12, "aaab", "a+ a a b", [[0, 1, 2, 3]]),
@@ -557,7 +557,11 @@ def test_neg_annotation(neg_key):
 ner.add_label("PERSON")
 ner.add_label("ORG")
 example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
-example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "ORG"), Span(neg_doc, 2, 3, "PERSON"), Span(neg_doc, 1, 4, "PERSON")]
+example.reference.spans[neg_key] = [
+Span(neg_doc, 2, 4, "ORG"),
+Span(neg_doc, 2, 3, "PERSON"),
+Span(neg_doc, 1, 4, "PERSON"),
+]

 optimizer = nlp.initialize()
 for i in range(2):
@@ -1,6 +1,5 @@
 from typing import Callable, Iterable, Iterator
 import pytest
-import io

 from thinc.api import Config
 from spacy.language import Language
@@ -11,7 +11,7 @@ from spacy.ml import load_kb
 from spacy.scorer import Scorer
 from spacy.training import Example
 from spacy.lang.en import English
-from spacy.tests.util import make_tempdir, make_tempfile
+from spacy.tests.util import make_tempdir
 from spacy.tokens import Span

@@ -254,7 +254,9 @@ def test_nel_nsents(nlp):
 """Test that n_sents can be set through the configuration"""
 entity_linker = nlp.add_pipe("entity_linker", config={})
 assert entity_linker.n_sents == 0
-entity_linker = nlp.replace_pipe("entity_linker", "entity_linker", config={"n_sents": 2})
+entity_linker = nlp.replace_pipe(
+"entity_linker", "entity_linker", config={"n_sents": 2}
+)
 assert entity_linker.n_sents == 2

@@ -596,7 +598,9 @@ def test_kb_to_bytes():
 kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
 kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
 kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5])
-kb_1.add_alias(alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2])
+kb_1.add_alias(
+alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2]
+)
 assert kb_1.contains_alias("Russ Cochran")
 kb_bytes = kb_1.to_bytes()
 kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
@@ -611,8 +615,12 @@ def test_kb_to_bytes():
 assert kb_2.contains_alias("Russ Cochran")
 assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
 assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
-assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(kb_2.get_alias_candidates("Russ Cochran"))
-assert len(kb_1.get_alias_candidates("Randomness")) == len(kb_2.get_alias_candidates("Randomness"))
+assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
+kb_2.get_alias_candidates("Russ Cochran")
+)
+assert len(kb_1.get_alias_candidates("Randomness")) == len(
+kb_2.get_alias_candidates("Randomness")
+)


 def test_nel_to_bytes():
@@ -640,7 +648,9 @@ def test_nel_to_bytes():
 kb_2 = nlp_2.get_pipe("entity_linker").kb
 assert kb_2.contains_alias("Russ Cochran")
 assert kb_2.get_vector("Q2146908") == [6, -4, 3]
-assert_almost_equal(kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8)
+assert_almost_equal(
+kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8
+)


 def test_scorer_links():
@@ -82,7 +82,9 @@ def util_batch_unbatch_docs_list(
 Y_batched = model.predict(in_data)
 Y_not_batched = [model.predict([u])[0] for u in in_data]
 for i in range(len(Y_batched)):
-assert_almost_equal(OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4)
+assert_almost_equal(
+OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4
+)


 def util_batch_unbatch_docs_array(
@@ -351,9 +351,21 @@ def test_language_factories_invalid():
 ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
 ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
 ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
-([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}),
-([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}),
-([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0, "f": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5, "f": 0.0}),
+(
+[{"a": 0.0, "b": 0.0}, {"c": 0.0}],
+{"c": 0.2},
+{"a": 0.0, "b": 0.0, "c": 1.0},
+),
+(
+[{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}],
+{"a": 0.0, "b": 0.0},
+{"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5},
+),
+(
+[{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}],
+{"a": 0.0, "b": 0.0, "f": 0.0},
+{"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5, "f": 0.0},
+),
 ],
 )
 def test_language_factories_combine_score_weights(weights, override, expected):
@@ -446,7 +446,12 @@ def test_update_with_annotates():
 for text in texts:
 examples.append(Example(nlp.make_doc(text), nlp.make_doc(text)))

-for components_to_annotate in [[], [f"{name}1"], [f"{name}1", f"{name}2"], [f"{name}2", f"{name}1"]]:
+for components_to_annotate in [
+[],
+[f"{name}1"],
+[f"{name}1", f"{name}2"],
+[f"{name}2", f"{name}1"],
+]:
 for key in results:
 results[key] = ""
 nlp = English(vocab=nlp.vocab)
@@ -79,10 +79,7 @@ def test_ngram_suggester(en_tokenizer):
 assert spans.shape[0] == len(spans_set)
 offset += ngrams.lengths[i]
 # the number of spans is correct
-assert_equal(
-ngrams.lengths,
-[max(0, len(doc) - (size - 1)) for doc in docs]
-)
+assert_equal(ngrams.lengths, [max(0, len(doc) - (size - 1)) for doc in docs])

 # test 1-3-gram suggestions
 ngram_suggester = registry.misc.get("ngram_suggester.v1")(sizes=[1, 2, 3])
@@ -132,8 +132,8 @@ def test_incomplete_data():
 # test the trained model
 test_text = "I like blue eggs"
 doc = nlp(test_text)
-assert doc[1].tag_ is "V"
-assert doc[2].tag_ is "J"
+assert doc[1].tag_ == "V"
+assert doc[2].tag_ == "J"


 def test_overfitting_IO():
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_text = "I like blue eggs"
|
test_text = "I like blue eggs"
|
||||||
doc = nlp(test_text)
|
doc = nlp(test_text)
|
||||||
assert doc[0].tag_ is "N"
|
assert doc[0].tag_ == "N"
|
||||||
assert doc[1].tag_ is "V"
|
assert doc[1].tag_ == "V"
|
||||||
assert doc[2].tag_ is "J"
|
assert doc[2].tag_ == "J"
|
||||||
assert doc[3].tag_ is "N"
|
assert doc[3].tag_ == "N"
|
||||||
|
|
||||||
# Also test the results are still the same after IO
|
# Also test the results are still the same after IO
|
||||||
with make_tempdir() as tmp_dir:
|
with make_tempdir() as tmp_dir:
|
||||||
nlp.to_disk(tmp_dir)
|
nlp.to_disk(tmp_dir)
|
||||||
nlp2 = util.load_model_from_path(tmp_dir)
|
nlp2 = util.load_model_from_path(tmp_dir)
|
||||||
doc2 = nlp2(test_text)
|
doc2 = nlp2(test_text)
|
||||||
assert doc2[0].tag_ is "N"
|
assert doc2[0].tag_ == "N"
|
||||||
assert doc2[1].tag_ is "V"
|
assert doc2[1].tag_ == "V"
|
||||||
assert doc2[2].tag_ is "J"
|
assert doc2[2].tag_ == "J"
|
||||||
assert doc2[3].tag_ is "N"
|
assert doc2[3].tag_ == "N"
|
||||||
|
|
||||||
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
|
||||||
texts = [
|
texts = [
|
||||||
|
|
|
@@ -511,7 +511,9 @@ def test_textcat_threshold():
 macro_f = scores["cats_score"]
 assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0

-scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"})
+scores = nlp.evaluate(
+train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"}
+)
 pos_f = scores["cats_score"]
 assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
 assert pos_f > macro_f
@@ -129,8 +129,14 @@ cfg_string = """
 """

 TRAIN_DATA = [
-("I like green eggs", {"tags": ["N", "V", "J", "N"], "cats": {"preference": 1.0, "imperative": 0.0}}),
-("Eat blue ham", {"tags": ["V", "J", "N"], "cats": {"preference": 0.0, "imperative": 1.0}}),
+(
+"I like green eggs",
+{"tags": ["N", "V", "J", "N"], "cats": {"preference": 1.0, "imperative": 0.0}},
+),
+(
+"Eat blue ham",
+{"tags": ["V", "J", "N"], "cats": {"preference": 0.0, "imperative": 1.0}},
+),
 ]

@@ -405,5 +411,5 @@ def test_tok2vec_listeners_textcat():
 cats1 = docs[1].cats
 assert cats1["preference"] > 0.1
 assert cats1["imperative"] < 0.9
-assert([t.tag_ for t in docs[0]] == ["V", "J", "N"])
-assert([t.tag_ for t in docs[1]] == ["N", "V", "J", "N"])
+assert [t.tag_ for t in docs[0]] == ["V", "J", "N"]
+assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]
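Dropping the parentheses around the assert expressions is more than style: `assert` is a statement, and parenthesizing invites the classic mistake of adding a message inside the parentheses, which turns the whole thing into a tuple that is always truthy. A standalone illustration:

    x, y = 1, 2
    # Looks like a call with a message, but it is a 2-tuple and always truthy,
    # so this "assertion" can never fail (Python 3.8+ warns about it):
    assert (x == y, "x and y differ")
    # Intended spelling: the message goes after a comma, outside any parentheses
    # assert x == y, "x and y differ"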
@@ -152,7 +152,8 @@ labels = ['label1', 'label2']


 @pytest.mark.parametrize(
-"component_name", ["textcat", "textcat_multilabel"],
+"component_name",
+["textcat", "textcat_multilabel"],
 )
 def test_issue6908(component_name):
 """Test intializing textcat with labels in a list"""
@@ -8,8 +8,7 @@ def test_issue7056():
 sentence segmentation errors."""
 vocab = Vocab()
 ae = ArcEager(
-vocab.strings,
-ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"])
+vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"])
 )
 doc = Doc(vocab, words="Severe pain , after trauma".split())
 state = ae.init_batch([doc])[0]
@@ -41,7 +41,7 @@ def test_partial_links():
 nlp.add_pipe("sentencizer", first=True)
 patterns = [
 {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
-{"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}
+{"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
 ]
 ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
 ruler.add_patterns(patterns)
@@ -8,7 +8,17 @@ def test_issue7065():
 nlp = English()
 nlp.add_pipe("sentencizer")
 ruler = nlp.add_pipe("entity_ruler")
-patterns = [{"label": "THING", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]}]
+patterns = [
+{
+"label": "THING",
+"pattern": [
+{"LOWER": "symphony"},
+{"LOWER": "no"},
+{"LOWER": "."},
+{"LOWER": "8"},
+],
+}
+]
 ruler.add_patterns(patterns)

 doc = nlp(text)
@@ -28,11 +38,15 @@ def test_issue7065_b():

 text = "Mahler 's Symphony No. 8 was beautiful."
 entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
-links = {(0, 6): {"Q7304": 1.0, "Q270853": 0.0},
-(10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
+links = {
+(0, 6): {"Q7304": 1.0, "Q270853": 0.0},
+(10, 24): {"Q7304": 0.0, "Q270853": 1.0},
+}
 sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
 doc = nlp(text)
-example = Example.from_dict(doc, {"entities": entities, "links": links, "sent_starts": sent_starts})
+example = Example.from_dict(
+doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
+)
 train_examples = [example]

 def create_kb(vocab):
@@ -65,7 +79,15 @@ def test_issue7065_b():
 # Add a custom rule-based component to mimick NER
 patterns = [
 {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
-{"label": "WORK", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]}
+{
+"label": "WORK",
+"pattern": [
+{"LOWER": "symphony"},
+{"LOWER": "no"},
+{"LOWER": "."},
+{"LOWER": "8"},
+],
+},
 ]
 ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
 ruler.add_patterns(patterns)
@@ -1,11 +1,22 @@
 from spacy.lang.en import English


 def test_issue8168():
 nlp = English()
 ruler = nlp.add_pipe("entity_ruler")
-patterns = [{"label": "ORG", "pattern": "Apple"},
-{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
-{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
+patterns = [
+{"label": "ORG", "pattern": "Apple"},
+{
+"label": "GPE",
+"pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}],
+"id": "san-francisco",
+},
+{
+"label": "GPE",
+"pattern": [{"LOWER": "san"}, {"LOWER": "fran"}],
+"id": "san-francisco",
+},
+]
 ruler.add_patterns(patterns)

-assert ruler._ent_ids == {8043148519967183733: ('GPE', 'san-francisco')}
+assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")}
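For context on the patterns being reformatted in this test: entity-ruler entries that share an "id" make different surface forms resolve to the same `ent_id_` on the matched spans. A minimal usage sketch under the assumption of a blank English pipeline, separate from the test itself:

    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns([
        {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
        {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"},
    ])
    doc = nlp("I moved from San Fran to San Francisco.")
    print([(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents])
    # [('San Fran', 'GPE', 'san-francisco'), ('San Francisco', 'GPE', 'san-francisco')]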
@@ -9,20 +9,13 @@ def test_issue8190():
 "nlp": {
 "lang": "en",
 },
-"custom": {
-"key": "value"
-}
-
+"custom": {"key": "value"},
 }
 source_nlp = English.from_config(source_cfg)
 with make_tempdir() as dir_path:
 # We need to create a loadable source pipeline
 source_path = dir_path / "test_model"
 source_nlp.to_disk(source_path)
-nlp = spacy.load(source_path, config={
-"custom": {
-"key": "updated_value"
-}
-})
+nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}})

 assert nlp.config["custom"]["key"] == "updated_value"
@@ -2,7 +2,6 @@ import pytest

 from spacy import registry
 from spacy.language import Language
-from spacy.pipeline import EntityRuler


 @pytest.fixture
@@ -4,7 +4,12 @@ import spacy
 from spacy.lang.en import English
 from spacy.lang.de import German
 from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
-from spacy.util import registry, load_model_from_config, load_config, load_config_from_str
+from spacy.util import (
+registry,
+load_model_from_config,
+load_config,
+load_config_from_str,
+)
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
 from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
@@ -493,4 +498,4 @@ def test_hyphen_in_config():
 self.punctuation = punctuation

 nlp = English.from_config(load_config_from_str(hyphen_config_str))
-assert nlp.get_pipe("my_punctual_component").punctuation == ['?', '-']
+assert nlp.get_pipe("my_punctual_component").punctuation == ["?", "-"]
@@ -64,7 +64,9 @@ def test_serialize_doc_span_groups(en_vocab):


 def test_serialize_doc_bin():
-doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True)
+doc_bin = DocBin(
+attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True
+)
 texts = ["Some text", "Lots of texts...", "..."]
 cats = {"A": 0.5}
 nlp = English()
@@ -5,7 +5,6 @@ from catalogue import RegistryError


 def test_get_architecture():

 @registry.architectures("my_test_function")
 def create_model(nr_in, nr_out):
 return Linear(nr_in, nr_out)
@@ -8,7 +8,7 @@ from spacy.vocab import Vocab
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.util import registry, ignore_error, raise_error, logger
+from spacy.util import registry, ignore_error, raise_error
 import spacy
 from thinc.api import NumpyOps, get_current_ops

@@ -143,7 +143,9 @@ def sample_vectors():

 @pytest.fixture
 def nlp2(nlp, sample_vectors):
-Language.component("test_language_vector_modification_pipe", func=vector_modification_pipe)
+Language.component(
+"test_language_vector_modification_pipe", func=vector_modification_pipe
+)
 Language.component("test_language_userdata_pipe", func=userdata_pipe)
 Language.component("test_language_ner_pipe", func=ner_pipe)
 add_vecs_to_vocab(nlp.vocab, sample_vectors)
@@ -9,7 +9,7 @@ from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
 from spacy.util import dot_to_object, SimpleFrozenList, import_file
 from spacy.util import to_ternary_int
-from thinc.api import Config, Optimizer, ConfigValidationError, get_current_ops
+from thinc.api import Config, Optimizer, ConfigValidationError
 from thinc.api import set_current_ops
 from spacy.training.batchers import minibatch_by_words
 from spacy.lang.en import English
@@ -444,7 +444,9 @@ def test_score_spans():
 assert f"{key}_per_type" in scores

 # Discard labels from the evaluation
-scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False)
+scores = Scorer.score_spans(
+[eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False
+)
 assert scores[f"{key}_p"] == 1.0
 assert scores[f"{key}_r"] == 1.0
 assert f"{key}_per_type" not in scores
@@ -467,4 +469,6 @@ def test_prf_score():
 assert (c.precision, c.recall, c.fscore) == approx((0.25, 0.5, 0.33333333))

 a += b
-assert (a.precision, a.recall, a.fscore) == approx((c.precision, c.recall, c.fscore))
+assert (a.precision, a.recall, a.fscore) == approx(
+(c.precision, c.recall, c.fscore)
+)
@@ -209,10 +209,6 @@ def test_tokenizer_flush_specials(en_vocab):
 suffix_search=suffix_re.search,
 rules=rules,
 )
-tokenizer2 = Tokenizer(
-en_vocab,
-suffix_search=suffix_re.search,
-)
 assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
 tokenizer1.rules = {}
 assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
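The deleted `tokenizer2` was unused; what the remaining assertions check is that special-case `rules` take precedence over the suffix pattern until they are flushed. A small self-contained sketch of that behaviour (the blank English vocab and the trailing-period regex are stand-ins I chose, not part of the commit):

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
suffix_re = re.compile(r"\.$")  # split a trailing period off as a suffix
rules = {"a a.": [{"ORTH": "a a"}, {"ORTH": "."}]}  # special case keeps "a a" together
tokenizer = Tokenizer(nlp.vocab, rules=rules, suffix_search=suffix_re.search)

print([t.text for t in tokenizer("a a.")])  # ['a a', '.']
tokenizer.rules = {}  # flush the special cases
print([t.text for t in tokenizer("a a.")])  # ['a', 'a', '.']
```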
@@ -278,7 +278,9 @@ def test_pretraining_training():
 filled = filled.interpolate()
 P = filled["pretraining"]
 nlp_base = init_nlp(filled)
-model_base = nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+model_base = (
+nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+)
 embed_base = None
 for node in model_base.walk():
 if node.name == "hashembed":
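The surrounding test then walks the returned layer with Thinc's `Model.walk()` to find a node named `"hashembed"`. A tiny illustration of that traversal pattern on a throwaway model (the exact node names are whatever Thinc assigns; only the `walk()`/`name` usage is the point):

```python
from thinc.api import Relu, Softmax, chain

model = chain(Relu(nO=8, nI=4), Relu(nO=8), Softmax())
print([node.name for node in model.walk()])  # composite node first, then each layer
relu_nodes = [node for node in model.walk() if node.name == "relu"]
print(len(relu_nodes))  # 2
```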
@@ -331,11 +333,12 @@ def write_sample_training(tmp_dir):

+
 def write_vectors_model(tmp_dir):
 import numpy

 vocab = Vocab()
 vector_data = {
 "dog": numpy.random.uniform(-1, 1, (300,)),
 "cat": numpy.random.uniform(-1, 1, (300,)),
-"orange": numpy.random.uniform(-1, 1, (300,))
+"orange": numpy.random.uniform(-1, 1, (300,)),
 }
 for word, vector in vector_data.items():
 vocab.set_vector(word, vector)

@@ -434,8 +434,14 @@ def test_aligned_spans_y2x_overlap(en_vocab, en_tokenizer):
 gold_doc = nlp.make_doc(text)
 spans = []
 prefix = "I flew to "
-spans.append(gold_doc.char_span(len(prefix), len(prefix + "San Francisco"), label="CITY"))
-spans.append(gold_doc.char_span(len(prefix), len(prefix + "San Francisco Valley"), label="VALLEY"))
+spans.append(
+gold_doc.char_span(len(prefix), len(prefix + "San Francisco"), label="CITY")
+)
+spans.append(
+gold_doc.char_span(
+len(prefix), len(prefix + "San Francisco Valley"), label="VALLEY"
+)
+)
 spans_key = "overlap_ents"
 gold_doc.spans[spans_key] = spans
 example = Example(doc, gold_doc)
@@ -443,7 +449,9 @@ def test_aligned_spans_y2x_overlap(en_vocab, en_tokenizer):
 assert [(ent.start, ent.end) for ent in spans_gold] == [(3, 5), (3, 6)]

 # Ensure that 'get_aligned_spans_y2x' has the aligned entities correct
-spans_y2x_no_overlap = example.get_aligned_spans_y2x(spans_gold, allow_overlap=False)
+spans_y2x_no_overlap = example.get_aligned_spans_y2x(
+spans_gold, allow_overlap=False
+)
 assert [(ent.start, ent.end) for ent in spans_y2x_no_overlap] == [(3, 5)]
 spans_y2x_overlap = example.get_aligned_spans_y2x(spans_gold, allow_overlap=True)
 assert [(ent.start, ent.end) for ent in spans_y2x_overlap] == [(3, 5), (3, 6)]
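These two hunks exercise character-offset spans stored in a named span group and `Example.get_aligned_spans_y2x()`, which projects reference spans onto the predicted doc. A condensed, runnable sketch of the same pieces (using a blank English pipeline and identical tokenization on both sides, so the alignment is trivial):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
text = "I flew to San Francisco Valley"
doc = nlp.make_doc(text)
gold_doc = nlp.make_doc(text)

prefix = "I flew to "
gold_doc.spans["overlap_ents"] = [
    gold_doc.char_span(len(prefix), len(prefix + "San Francisco"), label="CITY"),
    gold_doc.char_span(len(prefix), len(prefix + "San Francisco Valley"), label="VALLEY"),
]

example = Example(doc, gold_doc)
aligned = example.get_aligned_spans_y2x(gold_doc.spans["overlap_ents"], allow_overlap=True)
print([(s.start, s.end) for s in aligned])  # [(3, 5), (3, 6)]
```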
@@ -12,6 +12,7 @@ from ..util import add_vecs_to_vocab, get_cosine, make_tempdir

 OPS = get_current_ops()

+
 @pytest.fixture
 def strings():
 return ["apple", "orange"]

@@ -66,7 +66,11 @@ def configure_minibatch_by_words(
 """
 optionals = {"get_length": get_length} if get_length is not None else {}
 return partial(
-minibatch_by_words, size=size, tolerance=tolerance, discard_oversize=discard_oversize, **optionals
+minibatch_by_words,
+size=size,
+tolerance=tolerance,
+discard_oversize=discard_oversize,
+**optionals
 )

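The wrapper above just partially applies `minibatch_by_words`. A quick sketch of what that batcher does with plain sequences (the toy data and `get_length=len` are mine; real training passes `Example` objects):

```python
from spacy.training.batchers import minibatch_by_words

docs = [["tok"] * n for n in (4, 2, 8, 3, 1)]  # stand-ins for tokenized texts
batches = list(
    minibatch_by_words(docs, size=8, tolerance=0.2, discard_oversize=False, get_length=len)
)
print([sum(len(d) for d in batch) for batch in batches])  # each batch holds roughly size * (1 + tolerance) words
```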
@@ -70,14 +70,18 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
 nlp._link_components()
 with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
 if T["max_epochs"] == -1:
-logger.debug("Due to streamed train corpus, using only first 100 examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labels")
+logger.debug(
+"Due to streamed train corpus, using only first 100 examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+)
 nlp.initialize(lambda: islice(train_corpus(nlp), 100), sgd=optimizer)
 else:
 nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
 logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
 # Detect components with listeners that are not frozen consistently
 for name, proc in nlp.pipeline:
-for listener in getattr(proc, "listening_components", []):  # e.g. tok2vec/transformer
+for listener in getattr(
+proc, "listening_components", []
+):  # e.g. tok2vec/transformer
 # Don't warn about components not in the pipeline
 if listener not in nlp.pipe_names:
 continue
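When `max_epochs == -1` the train corpus is treated as an endless stream, so initialization only materializes the first 100 examples via `itertools.islice`. The same pattern in isolation (the generator below is a stand-in for a streamed corpus):

```python
from itertools import islice

def stream_examples():
    n = 0
    while True:  # a stand-in for an infinite, streamed train corpus
        yield f"example-{n}"
        n += 1

first_100 = list(islice(stream_examples(), 100))
print(len(first_100), first_100[:2])  # 100 ['example-0', 'example-1']
```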
@@ -110,7 +110,8 @@ def wandb_logger(
 ):
 try:
 import wandb
-from wandb import init, log, join  # test that these are available
+# test that these are available
+from wandb import init, log, join  # noqa: F401
 except ImportError:
 raise ImportError(Errors.E880)

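The `# noqa: F401` silences flake8's unused-import warning; the import itself exists only to fail fast when the optional `wandb` dependency is missing. The same optional-dependency pattern in a generic, hypothetical helper (the function name and error message are mine, not spaCy's `Errors.E880`):

```python
def make_experiment_logger():
    try:
        import wandb  # noqa: F401  # imported up front so a missing package fails early
    except ImportError as err:
        raise ImportError("the wandb package is required for this logger") from err

    def log_step(info: dict) -> None:
        wandb.log(info)  # forward one dict of metrics per step

    return log_step
```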
@@ -1,4 +1,4 @@
-from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any, IO
+from typing import List, Callable, Tuple, Dict, Iterable, Union, Any, IO
 from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer

@@ -96,8 +96,7 @@ def train(
 stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
 if annotating_components:
 stdout.write(
-msg.info(f"Set annotations on update for: {annotating_components}")
-+ "\n"
+msg.info(f"Set annotations on update for: {annotating_components}") + "\n"
 )
 stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
 with nlp.select_pipes(disable=frozen_components):

@@ -57,13 +57,13 @@ if TYPE_CHECKING:
 from .vocab import Vocab  # noqa: F401


+# fmt: off
 OOV_RANK = numpy.iinfo(numpy.uint64).max
 DEFAULT_OOV_PROB = -20
 LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]

 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
-# fmt: off
 CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
 # fmt: on
@@ -649,8 +649,7 @@ def get_model_version_range(spacy_version: str) -> str:


 def get_model_lower_version(constraint: str) -> Optional[str]:
-"""From a version range like >=1.2.3,<1.3.0 return the lower pin.
-"""
+"""From a version range like >=1.2.3,<1.3.0 return the lower pin."""
 try:
 specset = SpecifierSet(constraint)
 for spec in specset:
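The helper above iterates a `packaging` `SpecifierSet` to pull out the lower pin. A rough sketch of the idea (my own simplified version, not the exact body of `get_model_lower_version`, whose remaining lines fall outside this hunk):

```python
from typing import Optional
from packaging.specifiers import SpecifierSet

def lower_pin(constraint: str) -> Optional[str]:
    """From a version range like >=1.2.3,<1.3.0 return the lower pin."""
    try:
        specset = SpecifierSet(constraint)
    except Exception:  # invalid specifier strings simply yield no pin
        return None
    for spec in specset:
        if spec.operator in (">=", "==", "~="):
            return spec.version
    return None

print(lower_pin(">=3.1.0,<3.2.0"))  # 3.1.0
```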
@@ -285,8 +285,8 @@ Encode context using bidirectional LSTM layers. Requires

 Embed [`Doc`](/api/doc) objects with their vocab's vectors table, applying a
 learned linear projection to control the dimensionality. Unknown tokens are
-mapped to a zero vector. See the documentation on [static
-vectors](/usage/embeddings-transformers#static-vectors) for details.
+mapped to a zero vector. See the documentation on
+[static vectors](/usage/embeddings-transformers#static-vectors) for details.

 | Name | Description |
 | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |

@@ -649,8 +649,8 @@ from the linear model, where it is stored in `model.attrs["multi_label"]`.

 <Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>

-[TextCatEnsemble.v1](/api/legacy#TextCatEnsemble_v1) was functionally similar, but used an internal `tok2vec` instead of
-taking it as argument:
+[TextCatEnsemble.v1](/api/legacy#TextCatEnsemble_v1) was functionally similar,
+but used an internal `tok2vec` instead of taking it as argument:

 | Name | Description |
 | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |

@@ -701,8 +701,9 @@ architecture is usually less accurate than the ensemble, but runs faster.

 <Accordion title="spacy.TextCatCNN.v1 definition" spaced>

-[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was not yet resizable.
-Since v2, new labels can be added to this component, even after training.
+[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
+not yet resizable. Since v2, new labels can be added to this component, even
+after training.

 </Accordion>

@@ -732,8 +733,9 @@ the others, but may not be as accurate, especially if texts are short.

 <Accordion title="spacy.TextCatBOW.v1 definition" spaced>

-[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was not yet resizable.
-Since v2, new labels can be added to this component, even after training.
+[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was
+not yet resizable. Since v2, new labels can be added to this component, even
+after training.

 </Accordion>

@@ -232,7 +232,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
 > ```

 | Name | Description |
-| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `drop` | The dropout rate. ~~float~~ |

@@ -35,7 +35,7 @@ how the component should be configured. You can override its settings via the
 > ```

 | Setting | Description |
-| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | ----------- |
 | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
 | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
 | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |

@@ -64,7 +64,7 @@ be a token pattern (list) or a phrase pattern (string). For example:
 > ```

 | Name | Description |
-| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | ----------- |
 | `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
 | `name` <Tag variant="new">3</Tag> | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
 | _keyword-only_ | |

@@ -245,8 +245,8 @@ certain prior probability.
 ### Candidate.\_\_init\_\_ {#candidate-init tag="method"}

 Construct a `Candidate` object. Usually this constructor is not called directly,
-but instead these objects are returned by the
-`get_candidates` method of the [`entity_linker`](/api/entitylinker) pipe.
+but instead these objects are returned by the `get_candidates` method of the
+[`entity_linker`](/api/entitylinker) pipe.

 > #### Example
 >

@@ -178,8 +178,9 @@ added to an existing vectors table. See more details in
 ### spacy.TextCatCNN.v1 {#TextCatCNN_v1}

-Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means that you can add
-labels to a previously trained textcat. `TextCatCNN` v1 did not yet support that.
+Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means
+that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not
+yet support that.

 > #### Example Config
 >

@@ -213,8 +214,9 @@ architecture is usually less accurate than the ensemble, but runs faster.
 ### spacy.TextCatBOW.v1 {#TextCatBOW_v1}

-Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means that you can add
-labels to a previously trained textcat. `TextCatBOW` v1 did not yet support that.
+Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
+that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
+yet support that.

 > #### Example Config
 >
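For readers who need to keep the v1 behaviour, the legacy architecture can still be requested explicitly by name. A hypothetical sketch (the parameter names follow the TextCatBOW documentation, but the exact values and the pipeline setup here are assumptions, not part of this commit):

```python
import spacy

nlp = spacy.blank("en")
model_config = {
    "@architectures": "spacy.TextCatBOW.v1",  # pin the legacy, non-resizable variant
    "exclusive_classes": True,
    "ngram_size": 1,
    "no_output_layer": False,
}
textcat = nlp.add_pipe("textcat", config={"model": model_config})
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
```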
@@ -121,7 +121,7 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 > ```

 | Name | Description |
-| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
 | _keyword-only_ | |
 | `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |

@@ -62,7 +62,7 @@ shortcut for this and instantiate the component using its string name and
 > ```

 | Name | Description |
-| -------------- | -------------------------------------------------------------------------------------------------------------------- |
+| ------- | -------------------------------------------------------------------------------------------------------------------- |
 | `vocab` | The shared vocabulary. ~~Vocab~~ |
 | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |

@@ -201,7 +201,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and
 > ```

 | Name | Description |
-| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `drop` | The dropout rate. ~~float~~ |

@@ -99,14 +99,14 @@ representation.
 > ```

 | Name | Description |
-| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ |
 | **RETURNS** | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |

 ## Attributes {#attributes}

 | Name | Description |
-| ------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| ------------- | ---------------------------------------------------------------------------------------------------------------------------- | ---------- |
 | `FEATURE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) feature separator. Default is ` | `. ~~str~~ |
 | `FIELD_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) field separator. Default is `=`. ~~str~~ |
 | `VALUE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) value separator. Default is `,`. ~~str~~ |

@@ -149,7 +149,7 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")]
 </Infobox>

 | Name | Description |
-| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | --- |
 | `match_id` | An ID for the thing you're matching. ~~str~~ | |
 | `docs` | `Doc` objects of the phrases to match. ~~List[Doc]~~ |
 | _keyword-only_ | |
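The `match_id`/`docs` arguments in the table above are easiest to see in use. A short sketch (the pattern texts echo the example in the surrounding docs; the sample sentence is mine):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp("health care reform"), nlp("healthcare reform")]
matcher.add("HEALTH", patterns)  # match_id plus the pattern Docs

doc = nlp("He voted against the health care reform bill.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)  # HEALTH health care reform
```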
@@ -188,7 +188,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
 > ```

 | Name | Description |
-| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `drop` | The dropout rate. ~~float~~ |

@@ -28,7 +28,7 @@ how the component should be configured. You can override its settings via the
 > ```

 | Setting | Description |
-| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ |
 | `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` |

 ```python
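The `punct_chars` setting above can be overridden when the component is added. A minimal sketch (the punctuation list and sample text are arbitrary):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer", config={"punct_chars": [".", "!", "?"]})
doc = nlp("First sentence. Second sentence! Third?")
print([sent.text for sent in doc.sents])
# ['First sentence.', 'Second sentence!', 'Third?']
```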
@@ -491,8 +491,8 @@ document by the `parser`, `senter`, `sentencizer` or some custom function. It
 will raise an error otherwise.

 If the span happens to cross sentence boundaries, only the first sentence will
-be returned. If it is required that the sentence always includes the
-full span, the result can be adjusted as such:
+be returned. If it is required that the sentence always includes the full span,
+the result can be adjusted as such:

 ```python
 sent = span.sent

@@ -214,7 +214,7 @@ Delegates to [`predict`](/api/spancategorizer#predict) and
 > ```

 | Name | Description |
-| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `drop` | The dropout rate. ~~float~~ |

@@ -26,7 +26,7 @@ architectures and their arguments and hyperparameters.
 > ```

 | Setting | Description |
-| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |

 ```python

@@ -55,7 +55,7 @@ shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

 | Name | Description |
-| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab` | The shared vocabulary. ~~Vocab~~ |
 | `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |

@@ -199,7 +199,7 @@ Delegates to [`predict`](/api/tagger#predict) and
 > ```

 | Name | Description |
-| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `drop` | The dropout rate. ~~float~~ |

@@ -197,7 +197,7 @@ Delegates to [`predict`](/api/tok2vec#predict).
 > ```

 | Name | Description |
-| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `drop` | The dropout rate. ~~float~~ |

@@ -363,7 +363,7 @@ unknown. Defaults to `True` for the first token in the `Doc`.
 > ```

 | Name | Description |
-| ----------- | --------------------------------------------- |
+| ----------- | ------------------------------------------------------- |
 | **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |

 ## Token.has_vector {#has_vector tag="property" model="vectors"}

@@ -421,7 +421,7 @@ The L2 norm of the token's vector representation.
 ## Attributes {#attributes}

 | Name | Description |
-| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `doc` | The parent document. ~~Doc~~ |
 | `lex` <Tag variant="new">3</Tag> | The underlying lexeme. ~~Lexeme~~ |
 | `sent` <Tag variant="new">2.0.12</Tag> | The sentence span that this token is a part of. ~~Span~~ |

@@ -239,6 +239,7 @@ it.
 | `infix_finditer` | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) sequence of `re.MatchObject` objects. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
 | `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ |
 | `rules` | A dictionary of tokenizer exceptions and special cases. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |

 ## Serialization fields {#serialization-fields}

 During serialization, spaCy will export several data fields used to restore

@@ -290,8 +290,8 @@ If a table is full, it can be resized using
 ## Vectors.n_keys {#n_keys tag="property"}

 Get the number of keys in the table. Note that this is the number of _all_ keys,
-not just unique vectors. If several keys are mapped to the same
-vectors, they will be counted individually.
+not just unique vectors. If several keys are mapped to the same vectors, they
+will be counted individually.

 > #### Example
 >

@@ -321,7 +321,7 @@ performed in chunks to avoid consuming too much memory. You can set the
 > ```

 | Name | Description |
-| -------------- | --------------------------------------------------------------------------- |
+| -------------- | --------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
 | `queries` | An array with one or more vectors. ~~numpy.ndarray~~ |
 | _keyword-only_ | |
 | `batch_size` | The batch size to use. Default to `1024`. ~~int~~ |