Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 01:46:28 +03:00)

Commit 5fbb8dfcbc: Merge remote-tracking branch 'upstream/develop' into docs/various-v3-2
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a19"
+__version__ = "3.0.0a20"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -6,15 +6,16 @@ from wasabi import msg
 import srsly
 import hashlib
 import typer
 import subprocess
 from click import NoSuchOption
+from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
 from thinc.config import Config, ConfigValidationError
 from configparser import InterpolationError
+import os

 from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry
+from ..util import import_file, run_command, make_tempdir, registry, logger

 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401

@@ -38,6 +39,7 @@ commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
 INIT_HELP = """Commands for initializing configs and pipeline packages."""
+OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"

 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
@@ -62,24 +64,41 @@ def setup_cli() -> None:
     command(prog_name=COMMAND)


-def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
+def parse_config_overrides(
+    args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
+) -> Dict[str, Any]:
     """Generate a dictionary of config overrides based on the extra arguments
     provided on the CLI, e.g. --training.batch_size to override
     "training.batch_size". Arguments without a "." are considered invalid,
     since the config only allows top-level sections to exist.

     args (List[str]): The extra arguments from the command line.
+    env_var (Optional[str]): Optional environment variable to read from.
     RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
     """
+    env_string = os.environ.get(env_var, "") if env_var else ""
+    env_overrides = _parse_overrides(split_arg_string(env_string))
+    cli_overrides = _parse_overrides(args, is_cli=True)
+    if cli_overrides:
+        keys = [k for k in cli_overrides if k not in env_overrides]
+        logger.debug(f"Config overrides from CLI: {keys}")
+    if env_overrides:
+        logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+    return {**cli_overrides, **env_overrides}
+
+
+def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
     result = {}
     while args:
         opt = args.pop(0)
-        err = f"Invalid CLI argument '{opt}'"
+        err = f"Invalid config override '{opt}'"
         if opt.startswith("--"):  # new argument
             orig_opt = opt
             opt = opt.replace("--", "")
             if "." not in opt:
-                raise NoSuchOption(orig_opt)
+                if is_cli:
+                    raise NoSuchOption(orig_opt)
+                else:
+                    msg.fail(f"{err}: can't override top-level sections", exits=1)
             if "=" in opt:  # we have --opt=value
                 opt, value = opt.split("=", 1)
                 opt = opt.replace("-", "_")

@@ -98,7 +117,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
             except ValueError:
                 result[opt] = str(value)
         else:
-            msg.fail(f"{err}: override option should start with --", exits=1)
+            msg.fail(f"{err}: name should start with --", exits=1)
     return result
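As a usage illustration (not part of the commit, and with made-up argument values), the override parser turns dotted `--section.option` arguments into a flat dictionary, JSON-parsing values where possible. A minimal sketch:

```python
# Hypothetical sketch of calling the parser directly; env_var=None skips the
# SPACY_CONFIG_OVERRIDES environment variable, whose values would otherwise
# take precedence over the CLI overrides.
from spacy.cli._util import parse_config_overrides

overrides = parse_config_overrides(
    ["--training.batch_size", "128", "--nlp.lang=en"], env_var=None
)
print(overrides)  # expected roughly: {"training.batch_size": 128, "nlp.lang": "en"}
```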
@@ -287,7 +306,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb") as input_file:
+    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
         with dest.open(mode="wb") as output_file:
             output_file.write(input_file.read())

@@ -327,7 +346,7 @@ def git_checkout(
     )
     with make_tempdir() as tmp_dir:
         cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
-        ret = run_command(cmd, capture=True)
+        run_command(cmd, capture=True)
         # We need Path(name) to make sure we also support subdirectories
         shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
@@ -57,7 +57,10 @@ class Warnings:
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
-    W026 = ("Unable to set all sentence boundaries from dependency parses.")
+    W026 = ("Unable to set all sentence boundaries from dependency parses. If "
+            "you are constructing a parse tree incrementally by setting "
+            "token.head values, you can probably ignore this warning. Consider "
+            "using Doc(words, ..., heads=heads, deps=deps) instead.")
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
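The new W026 text points at the Doc constructor arguments added elsewhere in this commit. A hedged sketch of the recommended pattern (words and labels are invented for illustration):

```python
# Instead of building a parse incrementally via token.head, which can leave
# sentence boundaries unset and trigger W026, pass heads and deps up front.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["I", "like", "trees"]
heads = [1, 1, 1]                      # absolute token indices; "like" is the root
deps = ["nsubj", "ROOT", "dobj"]
doc = Doc(nlp.vocab, words=words, heads=heads, deps=deps)
print([t.is_sent_start for t in doc])  # boundaries now come from the parse
```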
@@ -156,11 +156,7 @@ class Language:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(
-                self.lang,
-                self.Defaults,
-                vectors_name=vectors_name,
-            )
+            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))

@@ -1462,7 +1458,7 @@ class Language:
         # here :(
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
-                for name2, proc2 in self.pipeline[i+1:]:
+                for name2, proc2 in self.pipeline[i + 1 :]:
                     if isinstance(getattr(proc2, "model", None), Model):
                         proc1.find_listeners(proc2.model)
@@ -164,7 +164,9 @@ def MultiHashEmbed(


 @registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
+def CharacterEmbed(
+    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
+):
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
     each word, taken from the beginning and end of the word equally. Padding is

@@ -202,9 +204,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                 ),
                 StaticVectors(width, dropout=0.0),
             ),
-            with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
         )
     else:
         model = chain(
             concatenate(

@@ -215,9 +219,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),
             ),
-            with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
         )
     return model
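As a usage note (not part of the commit, and the parameter values are assumptions rather than defaults from the code), the architecture registered above can be resolved and built through the registry:

```python
# Hypothetical sketch: look up the registered architecture by name and build it.
from spacy.util import registry

char_embed = registry.architectures.get("spacy.CharacterEmbed.v1")
model = char_embed(width=128, rows=7000, nM=64, nC=8, also_use_static_vectors=False)
print(model.name)  # the composed Thinc model built by the function above
```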
@@ -1,4 +1,4 @@
-from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator

@@ -255,7 +255,7 @@ class ConfigSchemaPretrain(BaseModel):
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")

     # TODO: use a more detailed schema for this?
     objective: Dict[str, Any] = Field(..., title="Pretraining objective")
     # fmt: on
@@ -9,6 +9,26 @@ from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
 from ..util import get_doc


+def test_doc_api_init(en_vocab):
+    # set sent_start by sent_starts
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], sent_starts=[True, False, True, False]
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+    # set sent_start by heads
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], heads=[0, 0, 2, 2], deps=["dep"] * 4
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+    # heads override sent_starts
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], sent_starts=[True] * 4, heads=[0, 0, 2, 2], deps=["dep"] * 4
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+
 @pytest.mark.parametrize("text", [["one", "two", "three"]])
 def test_doc_api_compare_by_string_position(en_vocab, text):
     doc = Doc(en_vocab, words=text)
@@ -92,7 +92,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
 def test_spans_lca_matrix(en_tokenizer):
     """Test span's lca matrix generation"""
     tokens = en_tokenizer("the lazy dog slept")
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=[2, 1, 1, 0],
+        deps=["dep"] * 4,
+    )
     lca = doc[:2].get_lca_matrix()
     assert lca.shape == (2, 2)
     assert lca[0, 0] == 0  # the & the -> the

@@ -63,7 +63,12 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):

 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=["dep"] * len(heads),
+    )

     lefts = {}
     rights = {}
@@ -345,10 +345,7 @@ def test_language_factories_invalid():
             [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
             {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
         ),
-        (
-            [{"a": 0.5, "b": 0.5}, {"b": 1.0}],
-            {"a": 0.25, "b": 0.75},
-        ),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
     ],
 )
 def test_language_factories_combine_score_weights(weights, expected):

@@ -363,16 +360,10 @@ def test_language_factories_scores():
     weights1 = {"a1": 0.5, "a2": 0.5}
     weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
     Language.factory(
-        f"{name}1",
-        scores=list(weights1),
-        default_score_weights=weights1,
-        func=func,
+        f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
     )
     Language.factory(
-        f"{name}2",
-        scores=list(weights2),
-        default_score_weights=weights2,
-        func=func,
+        f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
     )
     meta1 = Language.get_factory_meta(f"{name}1")
     assert meta1.default_score_weights == weights1
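For readers puzzling over the expected values in this parametrization, here is a hedged sketch of how they can be derived (it mirrors the behavior the test expects, not necessarily spaCy's exact implementation): each component's weights are normalized to sum to 1 and then scaled by 1/n_components before being merged.

```python
def combine(weight_dicts):
    # Normalize each dict to sum to 1, scale by 1 / number of dicts, then merge.
    combined = {}
    for weights in weight_dicts:
        total = sum(weights.values())
        for key, value in weights.items():
            combined[key] = combined.get(key, 0.0) + (value / total) / len(weight_dicts)
    return combined

print(combine([{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}]))  # {'a': 0.1, 'b': 0.4, 'c': 0.25, 'd': 0.25}
print(combine([{"a": 0.5, "b": 0.5}, {"b": 1.0}]))            # {'a': 0.25, 'b': 0.75}
```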
@@ -212,9 +212,17 @@ def test_issue1834():
         heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
         deps=["dep"] * len(words),
     )
-    print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc])
+    print(
+        doc.has_annotation("DEP"),
+        [t.head.i for t in doc],
+        [t.is_sent_start for t in doc],
+    )
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc])
+    print(
+        new_doc.has_annotation("DEP"),
+        [t.head.i for t in new_doc],
+        [t.is_sent_start for t in new_doc],
+    )
     assert new_doc[6].sent_start
     assert new_doc.has_annotation("DEP")
     assert new_doc.has_annotation("TAG")
@@ -136,7 +136,13 @@ def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     cfg = {"model": DEFAULT_TEXTCAT_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
-    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None)
+    textcat = TextCategorizer(
+        en_vocab,
+        model,
+        labels=["ENTITY", "ACTION", "MODIFIER"],
+        threshold=0.5,
+        positive_label=None,
+    )
     textcat.to_bytes(exclude=["vocab"])
@@ -1,16 +1,15 @@
 import pytest
 from click import NoSuchOption

 from spacy.training import docs_to_json, biluo_tags_from_offsets
 from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.lang.en import English
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
-from spacy.cli._util import string_to_list
+from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
 from thinc.config import ConfigValidationError
 import srsly
 import os

 from .util import make_tempdir

@@ -342,6 +341,24 @@ def test_parse_config_overrides_invalid_2(args):
         parse_config_overrides(args)


+def test_parse_cli_overrides():
+    os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
+    result = parse_config_overrides([])
+    assert len(result) == 4
+    assert result["x.foo"] == "bar"
+    assert result["x.bar"] == 12
+    assert result["x.baz"] is False
+    assert result["y.foo"] == "hello"
+    os.environ[OVERRIDES_ENV_VAR] = "--x"
+    assert parse_config_overrides([], env_var=None) == {}
+    with pytest.raises(SystemExit):
+        parse_config_overrides([])
+    os.environ[OVERRIDES_ENV_VAR] = "hello world"
+    with pytest.raises(SystemExit):
+        parse_config_overrides([])
+    del os.environ[OVERRIDES_ENV_VAR]
+
+
 @pytest.mark.parametrize("lang", ["en", "nl"])
 @pytest.mark.parametrize(
     "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
@@ -291,8 +291,7 @@ def test_spacy_blank():


 @pytest.mark.parametrize(
-    "value",
-    [False, None, ["x", "y"], Language, Vocab],
+    "value", [False, None, ["x", "y"], Language, Vocab],
 )
 def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"

@@ -95,7 +95,7 @@ def test_util_dot_section():
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
     assert en_config["nlp"]["pipeline"] == ["textcat"]
-    assert nl_config["nlp"]["pipeline"] == []  # default value []
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
@@ -1,7 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
-
 from spacy import Language
 from spacy.util import load_model_from_config, registry, dot_to_object
 from spacy.training import Example

@@ -10,19 +9,19 @@ from spacy.training import Example
 def test_readers():
     config_string = """
     [training]

+    [corpora]
+    @readers = "myreader.v1"

     [nlp]
     lang = "en"
     pipeline = ["tok2vec", "textcat"]

     [components]

     [components.tok2vec]
     factory = "tok2vec"

     [components.textcat]
     factory = "textcat"
     """

@@ -69,19 +68,19 @@
 def test_cat_readers(reader, additional_config):
     nlp_config_string = """
     [training]

+    [corpora]
+    @readers = "PLACEHOLDER"

     [nlp]
     lang = "en"
     pipeline = ["tok2vec", "textcat"]

     [components]

     [components.tok2vec]
     factory = "tok2vec"

     [components.textcat]
     factory = "textcat"
     """
@@ -34,7 +34,17 @@ def doc():
     # fmt: on
     nlp = English()
     words = [t.text for t in nlp.make_doc(text)]
-    doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
+    doc = get_doc(
+        nlp.vocab,
+        words=words,
+        tags=tags,
+        pos=pos,
+        morphs=morphs,
+        heads=heads,
+        deps=deps,
+        lemmas=lemmas,
+        ents=ents,
+    )
     doc.cats = cats
     return doc
@@ -30,60 +30,21 @@ def get_doc(
     morphs=None,
 ):
     """Create Doc object from given vocab, words and annotations."""
-    if deps and not heads:
-        heads = [0] * len(deps)
-    headings = []
-    values = []
-    annotations = [pos, heads, deps, lemmas, tags, morphs]
-    possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
-    for a, annot in enumerate(annotations):
-        if annot is not None:
-            if len(annot) != len(words):
-                raise ValueError(Errors.E189)
-            headings.append(possible_headings[a])
-            if annot is not heads:
-                values.extend(annot)
-    for value in values:
-        vocab.strings.add(value)
-
-    doc = Doc(vocab, words=words)
-
-    # if there are any other annotations, set them
-    if headings:
-        attrs = doc.to_array(headings)
-
-        j = 0
-        for annot in annotations:
-            if annot:
-                if annot is heads:
-                    for i in range(len(words)):
-                        if attrs.ndim == 1:
-                            attrs[i] = heads[i]
-                        else:
-                            attrs[i, j] = heads[i]
-                elif annot is morphs:
-                    for i in range(len(words)):
-                        morph_key = vocab.morphology.add(morphs[i])
-                        if attrs.ndim == 1:
-                            attrs[i] = morph_key
-                        else:
-                            attrs[i, j] = morph_key
-                else:
-                    for i in range(len(words)):
-                        if attrs.ndim == 1:
-                            attrs[i] = doc.vocab.strings[annot[i]]
-                        else:
-                            attrs[i, j] = doc.vocab.strings[annot[i]]
-                j += 1
-        doc.from_array(headings, attrs)
-
-    # finally, set the entities
-    if ents:
-        doc.ents = [
-            Span(doc, start, end, label=doc.vocab.strings[label])
-            for start, end, label in ents
-        ]
-    return doc
+    if heads is not None:
+        heads = [i + head for i, head in enumerate(heads)]
+    if ents is not None:
+        ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents]
+    return Doc(
+        vocab,
+        words=words,
+        pos=pos,
+        heads=heads,
+        deps=deps,
+        tags=tags,
+        ents=ents,
+        lemmas=lemmas,
+        morphs=morphs,
+    )


 def get_batch(batch_size):
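The only non-obvious step in the rewrite above is the head conversion: the test helper historically took heads as offsets relative to each token, while the Doc constructor expects absolute token indices. A small worked example using the sentence from the lca-matrix test:

```python
# "the lazy dog slept" with relative heads [2, 1, 1, 0]
relative_heads = [2, 1, 1, 0]
absolute_heads = [i + head for i, head in enumerate(relative_heads)]
print(absolute_heads)  # [2, 2, 3, 3] -> "dog", "dog", "slept", "slept"
```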
@@ -158,17 +158,50 @@ cdef class Doc:
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.doc_extensions.pop(name)

-    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
+    def __init__(
+        self,
+        Vocab vocab,
+        words=None,
+        spaces=None,
+        *,
+        user_data=None,
+        tags=None,
+        pos=None,
+        morphs=None,
+        lemmas=None,
+        heads=None,
+        deps=None,
+        sent_starts=None,
+        ents=None,
+    ):
         """Create a Doc object.

         vocab (Vocab): A vocabulary object, which must match any models you
             want to use (e.g. tokenizer, parser, entity recognizer).
-        words (list or None): A list of unicode strings to add to the document
+        words (Optional[List[str]]): A list of unicode strings to add to the document
             as words. If `None`, defaults to empty list.
-        spaces (list or None): A list of boolean values, of the same length as
+        spaces (Optional[List[bool]]): A list of boolean values, of the same length as
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
+        tags (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.tag. Defaults to None.
+        pos (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.pos. Defaults to None.
+        morphs (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.morph. Defaults to None.
+        lemmas (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.lemma. Defaults to None.
+        heads (Optional[List[int]]): A list of values, of the same length as
+            words, to assign as heads. Head indices are the position of the
+            head in the doc. Defaults to None.
+        deps (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.dep. Defaults to None.
+        sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
+            the same length as words, to assign as token.is_sent_start. Will be
+            overridden by heads if heads is provided. Defaults to None.
+        ents (Optional[List[Span]]): A list of spans to assign as doc.ents.
+            Defaults to None.

         DOCS: https://nightly.spacy.io/api/doc#init
         """
@@ -217,6 +250,63 @@ cdef class Doc:
                 lexeme = self.vocab.get_by_orth(self.mem, word)
             self.push_back(lexeme, has_space)

+        if heads is not None:
+            heads = [head - i for i, head in enumerate(heads)]
+        if deps and not heads:
+            heads = [0] * len(deps)
+        if sent_starts is not None:
+            for i in range(len(sent_starts)):
+                if sent_starts[i] is True:
+                    sent_starts[i] = 1
+                elif sent_starts[i] is False:
+                    sent_starts[i] = -1
+                elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
+                    sent_starts[i] = 0
+        headings = []
+        values = []
+        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
+        for a, annot in enumerate(annotations):
+            if annot is not None:
+                if len(annot) != len(words):
+                    raise ValueError(Errors.E189)
+                headings.append(possible_headings[a])
+                if annot is not heads and annot is not sent_starts:
+                    values.extend(annot)
+        for value in values:
+            self.vocab.strings.add(value)
+
+        # if there are any other annotations, set them
+        if headings:
+            attrs = self.to_array(headings)
+
+            j = 0
+            for annot in annotations:
+                if annot:
+                    if annot is heads or annot is sent_starts:
+                        for i in range(len(words)):
+                            if attrs.ndim == 1:
+                                attrs[i] = annot[i]
+                            else:
+                                attrs[i, j] = annot[i]
+                    elif annot is morphs:
+                        for i in range(len(words)):
+                            morph_key = vocab.morphology.add(morphs[i])
+                            if attrs.ndim == 1:
+                                attrs[i] = morph_key
+                            else:
+                                attrs[i, j] = morph_key
+                    else:
+                        for i in range(len(words)):
+                            if attrs.ndim == 1:
+                                attrs[i] = self.vocab.strings[annot[i]]
+                            else:
+                                attrs[i, j] = self.vocab.strings[annot[i]]
+                    j += 1
+            self.from_array(headings, attrs)
+        if ents is not None:
+            self.ents = ents

     @property
     def _(self):
         """Custom extension attributes registered via `set_extension`."""
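A brief illustration of what the sent_starts handling above means for callers (values are invented; True/False/None are normalized to 1/-1/0 on the underlying SENT_START attribute):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

words = ["Hello", "world", ".", "Bye", "now", "."]
doc = Doc(Vocab(), words=words, sent_starts=[True, False, False, True, None, False])
# is_sent_start mirrors the normalized values: 1 -> True, -1 -> False, 0 -> None
print([t.is_sent_start for t in doc])
# expected roughly: [True, False, False, True, None, False]
```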
@@ -199,13 +199,17 @@ def doc_from_conllu_sentence(
         heads.append(head)
         deps.append(dep)

-    doc = Doc(vocab, words=words, spaces=spaces)
+    doc = Doc(
+        vocab,
+        words=words,
+        spaces=spaces,
+        tags=tags,
+        pos=poses,
+        deps=deps,
+        lemmas=lemmas,
+        heads=heads,
+    )
     for i in range(len(doc)):
-        doc[i].tag_ = tags[i]
-        doc[i].pos_ = poses[i]
-        doc[i].dep_ = deps[i]
-        doc[i].lemma_ = lemmas[i]
-        doc[i].head = doc[heads[i]]
         doc[i]._.merged_orth = words[i]
         doc[i]._.merged_morph = morphs[i]
         doc[i]._.merged_lemma = lemmas[i]

@@ -232,14 +236,17 @@ def doc_from_conllu_sentence(
         heads.append(t.head.i)
         deps.append(t.dep_)

-    doc_x = Doc(vocab, words=words, spaces=spaces)
-    for i in range(len(doc)):
-        doc_x[i].tag_ = tags[i]
-        doc_x[i].morph_ = morphs[i]
-        doc_x[i].lemma_ = lemmas[i]
-        doc_x[i].pos_ = poses[i]
-        doc_x[i].dep_ = deps[i]
-        doc_x[i].head = doc_x[heads[i]]
+    doc_x = Doc(
+        vocab,
+        words=words,
+        spaces=spaces,
+        tags=tags,
+        morphs=morphs,
+        lemmas=lemmas,
+        pos=poses,
+        deps=deps,
+        heads=heads,
+    )
     doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]

     return doc_x
@@ -30,11 +30,21 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 > doc = Doc(nlp.vocab, words=words, spaces=spaces)
 > ```

-| Name     | Description                                                                                                                                                                                   |
-| -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`  | A storage container for lexical types. ~~Vocab~~                                                                                                                                              |
-| `words`  | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                            |
-| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~  |
+| Name           | Description                                                                                                                                                                                      |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`        | A storage container for lexical types. ~~Vocab~~                                                                                                                                                 |
+| `words`        | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                               |
+| `spaces`       | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~     |
+| _keyword-only_ |                                                                                                                                                                                                  |
+| `user_data`    | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                               |
+| `tags`         | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                             |
+| `pos`          | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                             |
+| `morphs`       | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                           |
+| `lemmas`       | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                           |
+| `heads`        | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~  |
+| `deps`         | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                             |
+| `sent_starts`  | A list of values, of the same length as words, to assign as `token.is_sent_start`. Will be overridden by heads if heads is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~  |
+| `ents`         | A list of spans to assign as `doc.ents`. Defaults to `None`. ~~Optional[List[Span]]~~                                                                                                           |

 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
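For reference, a hedged end-to-end sketch of the constructor this table documents (all annotation values below are invented for illustration):

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["She", "runs", "fast"]
doc = Doc(
    nlp.vocab,
    words=words,
    spaces=[True, True, False],
    tags=["PRP", "VBZ", "RB"],
    pos=["PRON", "VERB", "ADV"],
    lemmas=["she", "run", "fast"],
    heads=[1, 1, 1],                  # absolute indices: "runs" is the root
    deps=["nsubj", "ROOT", "advmod"],
)
print(doc[0].head.text, doc[2].lemma_)  # expected roughly: runs fast
```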
@@ -921,6 +921,14 @@ package is installed in the same environment as spaCy, it will automatically add
 [parallel training](/usage/training#parallel-training) for more details on how
 it works under the hood.

+<Project id="integrations/ray">
+
+Get started with parallel training using our project template. It trains a
+simple model on a Universal Dependencies Treebank and lets you parallelize the
+training with Ray.
+
+</Project>
+
 You can integrate [`spacy ray train`](/api/cli#ray-train) into your
 `project.yml` just like the regular training command and pass it the config, and
 optional output directory or remote storage URL and config overrides if needed.

@@ -940,10 +948,6 @@ commands:
       - "training/model-best"
 ```

-<!-- TODO: <Project id="integrations/ray">
-
-</Project> -->
-
 ---

 ### Weights & Biases {#wandb} <IntegrationLogo name="wandb" width={175} height="auto" align="right" />
@@ -214,6 +214,24 @@ overrides. Overrides are added before [variables](#config-interpolation) are
 resolved, by the way – so if you need to use a value in multiple places,
 reference it across your config and override it on the CLI once.

+> #### 💡 Tip: Verbose logging
+>
+> If you're using config overrides, you can set the `--verbose` flag on
+> [`spacy train`](/api/cli#train) to make spaCy log more info, including which
+> overrides were set via the CLI and environment variables.
+
+#### Adding overrides via environment variables {#config-overrides-env}
+
+Instead of defining the overrides as CLI arguments, you can also use the
+`SPACY_CONFIG_OVERRIDES` environment variable using the same argument syntax.
+This is especially useful if you're training models as part of an automated
+process. Environment variables **take precedence** over CLI overrides and values
+defined in the config file.
+
+```cli
+$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
+```
+
 ### Defining pipeline components {#config-components}

 You typically train a [pipeline](/usage/processing-pipelines) of **one or more

@@ -895,9 +913,13 @@ cluster. If it's not set, Ray will run locally.
 python -m spacy ray train config.cfg --n-workers 2
 ```

-<!-- TODO: <Project id="integrations/ray">
-
-</Project> -->
+<Project id="integrations/ray">
+
+Get started with parallel training using our project template. It trains a
+simple model on a Universal Dependencies Treebank and lets you parallelize the
+training with Ray.
+
+</Project>

 ### How parallel training works {#parallel-training-details}
@@ -75,63 +75,63 @@
     {
       "label": "Containers",
      "items": [
-        { "text": "Language", "url": "/api/language" },
         { "text": "Doc", "url": "/api/doc" },
-        { "text": "Token", "url": "/api/token" },
-        { "text": "Span", "url": "/api/span" },
-        { "text": "Lexeme", "url": "/api/lexeme" },
+        { "text": "DocBin", "url": "/api/docbin" },
         { "text": "Example", "url": "/api/example" },
-        { "text": "DocBin", "url": "/api/docbin" }
+        { "text": "Language", "url": "/api/language" },
+        { "text": "Lexeme", "url": "/api/lexeme" },
+        { "text": "Span", "url": "/api/span" },
+        { "text": "Token", "url": "/api/token" }
       ]
     },
     {
       "label": "Pipeline",
       "items": [
-        { "text": "Tokenizer", "url": "/api/tokenizer" },
-        { "text": "Tok2Vec", "url": "/api/tok2vec" },
-        { "text": "Transformer", "url": "/api/transformer" },
-        { "text": "Lemmatizer", "url": "/api/lemmatizer" },
-        { "text": "Morphologizer", "url": "/api/morphologizer" },
-        { "text": "Tagger", "url": "/api/tagger" },
         { "text": "AttributeRuler", "url": "/api/attributeruler" },
         { "text": "DependencyParser", "url": "/api/dependencyparser" },
+        { "text": "EntityLinker", "url": "/api/entitylinker" },
         { "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
         { "text": "EntityRuler", "url": "/api/entityruler" },
-        { "text": "EntityLinker", "url": "/api/entitylinker" },
-        { "text": "TextCategorizer", "url": "/api/textcategorizer" },
-        { "text": "Sentencizer", "url": "/api/sentencizer" },
+        { "text": "Lemmatizer", "url": "/api/lemmatizer" },
+        { "text": "Morphologizer", "url": "/api/morphologizer" },
+        { "text": "Pipe", "url": "/api/pipe" },
         { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" },
-        { "text": "Other Functions", "url": "/api/pipeline-functions" },
-        { "text": "Pipe", "url": "/api/pipe" }
+        { "text": "Sentencizer", "url": "/api/sentencizer" },
+        { "text": "Tagger", "url": "/api/tagger" },
+        { "text": "TextCategorizer", "url": "/api/textcategorizer" },
+        { "text": "Tok2Vec", "url": "/api/tok2vec" },
+        { "text": "Tokenizer", "url": "/api/tokenizer" },
+        { "text": "Transformer", "url": "/api/transformer" },
+        { "text": "Other Functions", "url": "/api/pipeline-functions" }
       ]
     },
     {
       "label": "Matchers",
       "items": [
+        { "text": "DependencyMatcher", "url": "/api/dependencymatcher" },
         { "text": "Matcher", "url": "/api/matcher" },
-        { "text": "PhraseMatcher", "url": "/api/phrasematcher" },
-        { "text": "DependencyMatcher", "url": "/api/dependencymatcher" }
+        { "text": "PhraseMatcher", "url": "/api/phrasematcher" }
       ]
     },
     {
       "label": "Other",
       "items": [
-        { "text": "Vocab", "url": "/api/vocab" },
-        { "text": "StringStore", "url": "/api/stringstore" },
-        { "text": "Vectors", "url": "/api/vectors" },
+        { "text": "Corpus", "url": "/api/corpus" },
+        { "text": "KnowledgeBase", "url": "/api/kb" },
         { "text": "Lookups", "url": "/api/lookups" },
         { "text": "Morphology", "url": "/api/morphology" },
-        { "text": "KnowledgeBase", "url": "/api/kb" },
         { "text": "Scorer", "url": "/api/scorer" },
-        { "text": "Corpus", "url": "/api/corpus" }
+        { "text": "StringStore", "url": "/api/stringstore" },
+        { "text": "Vectors", "url": "/api/vectors" },
+        { "text": "Vocab", "url": "/api/vocab" }
      ]
     },
     {
       "label": "Cython",
       "items": [
         { "text": "Architecture", "url": "/api/cython" },
-        { "text": "Structs", "url": "/api/cython-structs" },
-        { "text": "Classes", "url": "/api/cython-classes" }
+        { "text": "Classes", "url": "/api/cython-classes" },
+        { "text": "Structs", "url": "/api/cython-structs" }
       ]
     }
   ]